This article collects typical usage examples of the Python method pbcore.io.SubreadSet.toFofn. If you are unsure what SubreadSet.toFofn does or how to use it, the examples selected below may help; you can also read further about the containing class, pbcore.io.SubreadSet.
Below are 2 code examples of SubreadSet.toFofn.
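Before the full examples, here is a minimal sketch of what toFofn itself does: it returns the list of resource paths (e.g. BAM files) backing a dataset. The input file name below is a hypothetical placeholder, not taken from the examples.

from pbcore.io import SubreadSet

# Hypothetical input file; a .subreadset.xml path works here as well.
dset = SubreadSet('movie.subreads.bam')
for path in dset.toFofn():  # one entry per external resource
    print(path)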
Example 1: split_dataset
# Required import: from pbcore.io import SubreadSet [as alias]
# Alternatively: from pbcore.io.SubreadSet import toFofn [as alias]
import copy
import logging
import os

from pbcore.io import SubreadSet

log = logging.getLogger(__name__)  # module-level logger used below

def split_dataset(subreadset, out_prefix):
"""
Takes an input dataset, and for each entry generates one separate dataset
file, while maintaining all the filters.
    Returns the list of generated dataset filenames (the contents of a FOFN).
To create an example filtered dataset for testing:
dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
"""
out_prefix_abs = os.path.abspath(out_prefix)
dset = SubreadSet(subreadset, strict=True)
fns = dset.toFofn()
log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
fofn = []
for i, bam_fn in enumerate(fns):
out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
new_dataset = SubreadSet(bam_fn)
new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)  # carry the filters over (uses a private attribute)
new_dataset.write(out_fn)
fofn.append(out_fn)
return fofn
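A possible invocation, reusing the filtered dataset created in the docstring above; the output prefix and FOFN name are assumptions for illustration:

# Hypothetical usage: split the filtered dataset and record the pieces.
xml_fns = split_dataset('test.filtered.subreadset.xml', 'out/split')
with open('split.fofn', 'w') as ofs:
    ofs.write('\n'.join(xml_fns) + '\n')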
Example 2: run
# Required import: from pbcore.io import SubreadSet [as alias]
# Alternatively: from pbcore.io.SubreadSet import toFofn [as alias]
import logging
import os
import pprint

from pbcore.io import SubreadSet

log = logging.getLogger(__name__)  # module-level logger used below

def run(subreadset, fofn):
dir_name = os.getcwd()
maxChunks = 0
dset = SubreadSet(subreadset, strict=True)
fns = dset.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
nrecs = len(dset)
# HG with 70x coverage => 200G bases total
    # ts = 50000  # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000  # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
# and we expect about 7-10min per chunk.
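    # Worked example with the numbers above: 200G bases at ~20k bases/read
    # is ~10M subreads, and 10M // 500k = 20 chunks.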
chunks = nrecs // ts
log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
log.info('Splitting with dset.split(zmws=False, chunks={}, ignoreSubDatasets=True, maxChunks={},)'.format(
chunks, maxChunks))
dset_chunks = dset.split(zmws=False, chunks=chunks, ignoreSubDatasets=True, maxChunks=maxChunks,
updateCounts=False,
#targetSize=1, breakContigs=True
)
chunk_fns = []
    for i, chunk_dset in enumerate(dset_chunks):  # renamed to avoid shadowing 'dset'
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i)  # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        chunk_dset.updateCounts()
        chunk_dset.write(chunk_fn, validate=False)  # , relPaths=True
chunk_fns.append(chunk_fn)
with open(fofn, 'w') as ofs:
for fn in chunk_fns:
ofs.write('{}\n'.format(fn))
log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
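A possible driver for Example 2; the dataset and FOFN names are assumptions, and basicConfig is included only so the log.info calls are visible:

# Hypothetical usage: chunk a dataset, then inspect the resulting FOFN.
import logging
logging.basicConfig(level=logging.INFO)
run('movie.subreadset.xml', 'chunks.fofn')
with open('chunks.fofn') as ifs:
    print(ifs.read())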