本文整理汇总了Python中CGAT.Bed.getNumColumns方法的典型用法代码示例。如果您正苦于以下问题:Python Bed.getNumColumns方法的具体用法?Python Bed.getNumColumns怎么用?Python Bed.getNumColumns使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类CGAT.Bed的用法示例。
在下文中一共展示了Bed.getNumColumns方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: aggregateWindowsReadCounts
# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import getNumColumns [as 别名]
def aggregateWindowsReadCounts(infiles,
                               outfile,
                               regex=r"(.*)\..*"):
    '''aggregate several results from coverageBed
    into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.

    Arguments
    ---------
    infiles : list
        Filenames of coverageBed results. They are read with ``zcat``;
        the column layout is detected from the first file only.
    outfile : string
        Output filename. Receives a tab-separated table with an
        ``interval_id`` column (``contig:start-stop``) followed by one
        count column per input file.
    regex : string
        Regular expression applied to the basename of each input file;
        its first group becomes the track (column) name.

    Raises
    ------
    AssertionError
        If the pasted files disagree on the interval found on a line.
    AttributeError
        If *regex* does not match the basename of an input file.
    '''
    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # The count is the first of the four columns appended by coverageBed.
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # One process substitution per input file, each emitting
    # "contig:start-stop<TAB>count".
    src = " ".join(['''<( zcat %s |
    awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' %
                    (x, column) for x in infiles])
    tmpfile = P.getTempFilename(".")

    try:
        # NOTE(review): P.run() is called without arguments; it presumably
        # picks up ``statement`` and interpolates ``src``/``tmpfile`` from
        # this frame's locals, so these local names must not be renamed.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [re.search(regex, os.path.basename(x)).groups()[0]
                  for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))
            with open(tmpfile, "r") as inf:
                for line in inf:
                    # rstrip instead of line[:-1] so that a final line
                    # without a newline does not lose a digit.
                    data = line.rstrip("\n").split("\t")
                    # paste interleaves the inputs: even fields hold the
                    # interval id, odd fields the count.
                    genes = set(data[0::2])
                    values = [int(x) for x in data[1::2]]
                    if sum(values) == 0:
                        continue
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" %
                               (data[0], "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # Remove the intermediate file even when a step above failed.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
示例2: aggregateWindowsReadCounts
# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import getNumColumns [as 别名]
def aggregateWindowsReadCounts( infiles, outfile ):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:

    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Tiles with no counts will not be output.

    Arguments
    ---------
    infiles : list
        Filenames of coverageBed results. They are read with ``zcat``;
        the column layout is detected from the first file only.
    outfile : string
        Output filename. Receives a tab-separated table with an
        ``interval_id`` column (``contig:start-stop``) followed by one
        count column per input file. Column names are the input
        basenames with everything from the first ``.`` removed.

    Raises
    ------
    AssertionError
        If the pasted files disagree on the interval found on a line.
    '''
    # NOTE(review): ``to_cluster`` is not read in this function; it is
    # presumably picked up by P.run() from this frame's locals - confirm.
    to_cluster = True

    # get bed format
    bed_columns = Bed.getNumColumns( infiles[0] )
    # The count is the first of the four columns appended by coverageBed.
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # One process substitution per input file, each emitting
    # "contig:start-stop<TAB>count".
    src = " ".join( [ '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x,column) for x in infiles] )
    tmpfile = P.getTempFilename( "." )

    try:
        # NOTE(review): P.run() is called without arguments; it presumably
        # picks up ``statement`` and interpolates ``src``/``tmpfile`` from
        # this frame's locals, so these local names must not be renamed.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # Raw string: "\." in a normal literal is an invalid escape sequence.
        tracks = [ re.sub( r"\..*", '', os.path.basename(x) ) for x in infiles ]

        outf = IOTools.openFile( outfile, "w")
        try:
            outf.write( "interval_id\t%s\n" % "\t".join( tracks ) )
            with open( tmpfile, "r" ) as inf:
                for line in inf:
                    # rstrip instead of line[:-1] so that a final line
                    # without a newline does not lose a digit.
                    data = line.rstrip("\n").split("\t")
                    # paste interleaves the inputs: even fields hold the
                    # interval id, odd fields the count.
                    genes = set( data[0::2] )
                    values = [ int(x) for x in data[1::2] ]
                    if sum(values) == 0:
                        continue
                    assert len(genes) == 1, "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write( "%s\t%s\n" % (data[0], "\t".join(map(str, values) ) ) )
        finally:
            outf.close()
    finally:
        # Remove the intermediate file even when a step above failed.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
示例3: aggregateWindowsTagCounts
# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import getNumColumns [as 别名]
def aggregateWindowsTagCounts(infiles,
                              outfile,
                              regex=r"(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

       1 Contig
       2 Start
       3 Stop
       4 Name
       5 The number of features in A that overlapped (by at least one
         base pair) the B interval.
       6 The number of bases in B that had non-zero coverage from features in A.
       7 The length of the entry in B.
       8 The fraction of bases in B that had non-zero coverage from
         features in A.

    This method autodetects the number of columns in the :term:`infiles`
    and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    A line whose interval is identical to the interval written
    immediately before it is counted as a duplicate and skipped.

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``.
        They are read with ``zcat``; the column layout is detected
        from the first file only.
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename. The default removes any suffix.

    Raises
    ------
    AssertionError
        If the pasted files disagree on the interval found on a line.
    AttributeError
        If *regex* does not match the basename of an input file.
    '''
    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # The count is the first of the four columns appended by the tool.
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # One process substitution per input file, each emitting
    # "contig:start-stop<TAB>count".
    src = " ".join(["""<( zcat %s |
    awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
                    (x, column) for x in infiles])
    tmpfile = P.getTempFilename(".")

    c = E.Counter()
    try:
        # NOTE(review): P.run() is called without arguments; it presumably
        # picks up ``statement`` and interpolates ``src``/``tmpfile`` from
        # this frame's locals, so these local names must not be renamed.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [re.search(regex, os.path.basename(x)).groups()[0]
                  for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            # filter for uniqueness - keys with the same value as the
            # previous line will be ignored.
            last_gene = None
            with open(tmpfile, "r") as inf:
                for line in inf:
                    c.input += 1
                    # rstrip instead of line[:-1] so that a final line
                    # without a newline does not lose a digit.
                    data = line.rstrip("\n").split("\t")
                    # paste interleaves the inputs: even fields hold the
                    # interval id, odd fields the count.
                    genes = set(data[0::2])
                    values = [int(x) for x in data[1::2]]

                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    gene = data[0]
                    if gene == last_gene:
                        c.duplicates += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%s\n" %
                               (gene, "\t".join(map(str, values))))
                    last_gene = gene
        finally:
            outf.close()
    finally:
        # Remove the intermediate file even when a step above failed.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)