# TODO: Based on current understanding (I/O-bound performance that can be mitigated by in-memory databases),
# TODO: best approach towards parallelization is to create dedicated workers for a specific database. These
# TODO: workers process all gene signatures using an in-memory version of the database.
# Create a process pool sized to the requested number of workers
# (default: one per available core).
p = Pool(num_workers if num_workers else cpu_count())
# Pre-bind every keyword argument shared across all (module, database) pairs,
# so pool tasks only need to carry the pair itself.
module2regulome4pair = partial(module2regulome, motif_annotations=motif_annotations,
                               rank_threshold=rank_threshold, auc_threshold=auc_threshold,
                               nes_threshold=nes_threshold, avgrcc_sample_frac=avgrcc_sample_frac,
                               weighted_recovery=weighted_recovery)
# --- After change ---
// of the algorithm.
assert len(rnkdbs) <= num_workers if num_workers else cpu_count(), "The number of databases is larger than the number of cores."
print("Using {} workers.".format(len(rnkdbs)))
receivers = []
for db in rnkdbs:
sender, receiver = Pipe()
receivers.append(receiver)
Worker(db, modules, motif_annotations_fname, sender).start()
return reduce(concat, (recv.recv() for recv in receivers))
else:
// Create dask graph.
from cytoolz.curried import filter as filtercur
dask_graph = delayed(compose(list, filtercur(is_not_none)))(