本文整理匯總了Python中HTSeq.pair_SAM_alignments方法的典型用法代碼示例。如果您正苦於以下問題:Python HTSeq.pair_SAM_alignments方法的具體用法?Python HTSeq.pair_SAM_alignments怎麽用?Python HTSeq.pair_SAM_alignments使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類HTSeq
的用法示例。
在下文中一共展示了HTSeq.pair_SAM_alignments方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: HTseq_count
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def HTseq_count(bam_file, gtf_file, out_dir, identifier, parallel = True ):
gtf_file = HTSeq.GFF_Reader(gtf_file)
features = HTSeq.GenomicArrayOfSets( "auto", stranded=True )
print "extracting features from gtf file"
for feature in gtf_file:
# if feature.type == "exon":
features[feature.iv] += feature.attr[identifier]
counts = collections.Counter( )
almnt_file = HTSeq.SAM_Reader(bam_file)
counts = collections.Counter( )
for bundle in HTSeq.pair_SAM_alignments( almnt_file, bundle=True ):
if len(bundle) != 1:
continue # Skip multiple alignments
first_almnt, second_almnt = bundle[0] # extract pair
if not first_almnt.aligned and second_almnt.aligned:
count[ "_unmapped" ] += 1
continue
gene_ids = set()
for iv, val in features[ left_almnt.iv ].steps():
gene_ids |= val
for iv, val in features[ right_almnt.iv ].steps():
gene_ids |= val
if len(gene_ids) == 1:
gene_id = list(gene_ids)[0]
counts[ gene_id ] += 1
elif len(gene_ids) == 0:
counts[ "_no_feature" ] += 1
else:
counts[ "_ambiguous" ] += 1
for gene_id in counts:
print gene_id, counts[ gene_id ]
示例2: count_reads_paired
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def count_reads_paired(read_seq, counter, order, stranded,
quiet, minaqual, write_to_samout ):
if order == "name":
read_seq = HTSeq.pair_SAM_alignments( read_seq )
elif order == "pos":
read_seq = HTSeq.pair_SAM_alignments_with_buffer( read_seq )
else:
raise ValueError, "Illegal order specified."
i = 0
for r in read_seq:
if i > 0 and i % 100000 == 0 and not quiet:
sys.stderr.write( "%d SAM alignment record%s processed.\n" % ( i, "s" if not pe_mode else " pairs" ) )
i += 1
if r[0] is not None and r[0].aligned:
if stranded != "reverse":
iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 )
else:
iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 )
else:
iv_seq = tuple()
if r[1] is not None and r[1].aligned:
if stranded != "reverse":
iv_seq = itertools.chain(iv_seq,
( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
else:
iv_seq = itertools.chain( iv_seq,
( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
else:
if ( r[0] is None ) or not ( r[0].aligned ):
write_to_samout( r, "__not_aligned" )
counter.notaligned += 1
continue
try:
if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
counter.nonunique += 1
write_to_samout( r, "__alignment_not_unique" )
continue
except KeyError:
pass
if ( r[0] and r[0].aQual < minaqual ) or ( r[1] and r[1].aQual < minaqual ):
lowqual += 1
write_to_samout( r, "__too_low_aQual" )
continue
counter.count(iv_seq, r)
if not quiet:
sys.stderr.write( "%d SAM %s processed.\n" % ( i, "alignments " if not pe_mode else "alignment pairs" ) )
示例3: count_reads_paired
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def count_reads_paired(read_seq, counter, order, quiet, minaqual):
if order == "name":
read_seq = HTSeq.pair_SAM_alignments(read_seq)
elif order == "pos":
read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
else:
raise ValueError("Illegal order specified.")
i = 0
for r in read_seq:
if i > 0 and i % 100000 == 0 and not quiet:
msg = "%d SAM alignment record pairs processed.\n" % (i)
sys.stderr.write(msg)
i += 1
if r[0] is not None and r[0].aligned:
forward_iv_seq = (co.ref_iv for co in r[0].cigar
if co.type == "M" and co.size > 0)
reverse_iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
if co.type == "M" and co.size > 0)
else:
forward_iv_seq = tuple()
reverse_iv_seq = tuple()
if r[1] is not None and r[1].aligned:
rest = (invert_strand(co.ref_iv) for co in r[1].cigar
if co.type == "M" and co.size > 0)
forward_iv_seq = itertools.chain(forward_iv_seq, rest)
rest = (co.ref_iv for co in r[1].cigar
if co.type == "M" and co.size > 0)
reverse_iv_seq = itertools.chain(reverse_iv_seq, rest)
else:
if (r[0] is None) or not (r[0].aligned):
counter.not_aligned(r)
continue
try:
if (r[0] is not None and r[0].optional_field("NH") > 1) or \
(r[1] is not None and r[1].optional_field("NH") > 1):
counter.non_unique(r)
continue
except KeyError:
pass
if (r[0] and r[0].aQual < minaqual) or \
(r[1] and r[1].aQual < minaqual):
counter.too_low_quality(r)
continue
counter.forward_count(forward_iv_seq, r)
counter.reverse_count(reverse_iv_seq, r)
if not quiet:
sys.stderr.write("%d SAM alignment pairs processed.\n" % (i))
示例4: ungapped_pe_counter
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def ungapped_pe_counter(sam_reader, feature_array):
counts = collections.Counter( )
pair_iterator = hts.pair_SAM_alignments( sam_reader, bundle=True )
# bundle puts all multiply-mapped pairs together.
t0 = datetime.datetime.now()
for ic, bundle in enumerate(pair_iterator):
# report progress (to prove that it is still alive):
if ic % 1000000 == 0:
t1 = datetime.datetime.now()
print "\r%d read bundles counted in %s\r" % (ic, t1-t0)
sys.stdout.flush()
if bundle == []: # first bundle for some reason is always an empty list
continue
bcounts = assess_bundle(bundle, feature_array)
"""
To evaluate the multiply mapped bundles, each pair in a bundle must still ALWAYS
and ONLY map to a single feature. Thus, every aligned pair has come from the same
feature (gene), and this bundle counts as evidence of one read for this gene.
If any of the read pairs maps to a different gene, or no gene, or multiple genes,
then the bundle is considered ambiguous.
If all pairs in a bundle map as _no_feature, _unmapped or _ambiguous, then the
bundle counts as one count towards this feature type. (ie, it is passed on to
the final counter to increment by 1).
"""
if len(bcounts) > 1: # ie, is a multiply mapped feature with multiple gene mappings
counts[ "_ambiguous" ] += 1
continue
elif len(bcounts) == 0: # uh oh! There is an error somewhere.
print "#" * 40
print "Error! bundle was not assigned any status"
print "Contents of bundle:"
print bundle
continue
else:
counts[ bcounts.keys()[0] ] += 1
return counts
示例5: bam_parser_2
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def bam_parser_2(bam_file, min_len, max_clip, min_id, mode):
bam_dict = {}
query_counter = 0
output_list = list()
if mode == 'paired':
#import itertools
#for aln in itertools.islice( HTSeq.pair_SAM_alignments(bam_file), 1000 ): # printing first N reads
for aln in HTSeq.pair_SAM_alignments(bam_file):
query_counter += 1
query_1, query_2 = aln
q1_aln = parser_aln_list(query_1, aln_number = query_counter, pair_pos = 1, min_len=min_len, max_clip=max_clip, min_id=min_id)
q2_aln = parser_aln_list(query_2, aln_number = query_counter, pair_pos = 2, min_len=min_len, max_clip=max_clip, min_id=min_id)
alns = [q1_aln, q2_aln]
if alns == [None, None]:
continue
else:
if None in alns:
alns.remove(None)
output_list.append(alns)
elif mode == 'single':
for aln in bam_file:
query_counter += 1
query_1 = aln
q1_aln = parser_aln_list(query_1, aln_number = query_counter, pair_pos = 1, min_len=min_len, max_clip=max_clip, min_id=min_id)
alns = [q1_aln]
if q1_aln != None:
output_list.append(alns)
df_columns = ['ALN','QUERY','REF','SEQ','LEN','ID','SCORE','CLIP_PCT']
output_list = [item for sublist in output_list for item in sublist]
return pd.DataFrame(output_list, columns=df_columns)
示例6: htseq_count
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def htseq_count(data):
""" adapted from Simon Anders htseq-count.py script
http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
"""
sam_filename, gff_filename, out_file, stats_file = _get_files(data)
stranded = _get_stranded_flag(data["config"])
overlap_mode = "union"
feature_type = "exon"
id_attribute = "gene_id"
minaqual = 0
if file_exists(out_file):
return out_file
logger.info("Counting reads mapping to exons in %s using %s as the "
"annotation and strandedness as %s." % (os.path.basename(sam_filename),
os.path.basename(gff_filename), _get_strandedness(data["config"])))
features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
counts = {}
# Try to open samfile to fail early in case it is not there
open(sam_filename).close()
gff = HTSeq.GFF_Reader(gff_filename)
i = 0
try:
for f in gff:
if f.type == feature_type:
try:
feature_id = f.attr[id_attribute]
except KeyError:
sys.exit("Feature %s does not contain a '%s' attribute" %
(f.name, id_attribute))
if stranded != "no" and f.iv.strand == ".":
sys.exit("Feature %s at %s does not have strand "
"information but you are running htseq-count "
"in stranded mode. Use '--stranded=no'." %
(f.name, f.iv))
features[f.iv] += feature_id
counts[f.attr[id_attribute]] = 0
i += 1
if i % 100000 == 0:
sys.stderr.write("%d GFF lines processed.\n" % i)
except:
sys.stderr.write("Error occured in %s.\n"
% gff.get_line_number_string())
raise
sys.stderr.write("%d GFF lines processed.\n" % i)
if len(counts) == 0:
sys.stderr.write("Warning: No features of type '%s' found.\n"
% feature_type)
try:
align_reader = htseq_reader(sam_filename)
first_read = iter(align_reader).next()
pe_mode = first_read.paired_end
except:
sys.stderr.write("Error occured when reading first line of sam "
"file.\n")
raise
try:
if pe_mode:
read_seq_pe_file = align_reader
read_seq = HTSeq.pair_SAM_alignments(align_reader)
empty = 0
ambiguous = 0
notaligned = 0
lowqual = 0
nonunique = 0
i = 0
for r in read_seq:
i += 1
if not pe_mode:
if not r.aligned:
notaligned += 1
continue
try:
if r.optional_field("NH") > 1:
nonunique += 1
continue
except KeyError:
pass
if r.aQual < minaqual:
lowqual += 1
continue
if stranded != "reverse":
iv_seq = (co.ref_iv for co in r.cigar if co.type == "M"
and co.size > 0)
else:
iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if
co.type == "M" and co.size > 0)
else:
if r[0] is not None and r[0].aligned:
if stranded != "reverse":
#.........這裏部分代碼省略.........
示例7: count_reads_onto_prebuilt_features
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def count_reads_onto_prebuilt_features(
sam_filename, features, feature_ids, stranded, overlap_mode, quiet, minaqual, samout, umis=False
):
def write_to_samout(r, assignment):
if samoutfile is None:
return
if not pe_mode:
r = (r,)
for read in r:
if read is not None:
samoutfile.write(read.original_sam_line.rstrip() + "\tXF:Z:" + assignment + "\n")
if quiet:
warnings.filterwarnings(action="ignore", module="HTSeq")
if samout != "":
samoutfile = open(samout, "w")
else:
samoutfile = None
if umis:
umi_re = re.compile(":UMI:(\w+):")
umi_counts = {}
def count_umis(fs, read_name):
umi_seq = umi_re.search(read_name).group(1)
umi_counts[fs][umi_seq] += 1
for feature_id in feature_ids:
umi_counts[feature_id] = Counter()
else:
def count_umis(x, y):
return None
# Try to open samfile to fail early in case it is not there
if sam_filename != "-":
open(sam_filename).close()
counts = {}
for feature_id in feature_ids:
counts[feature_id] = 0
try:
if sam_filename != "-":
read_seq_file = HTSeq.SAM_Reader(sam_filename)
read_seq = read_seq_file
first_read = iter(read_seq).next()
else:
read_seq_file = HTSeq.SAM_Reader(sys.stdin)
read_seq_iter = iter(read_seq_file)
first_read = read_seq_iter.next()
read_seq = itertools.chain([first_read], read_seq_iter)
pe_mode = first_read.paired_end
except StopIteration:
raise EmptySamError(sam_filename)
try:
if pe_mode:
read_seq = HTSeq.pair_SAM_alignments(read_seq)
empty = 0
ambiguous = 0
notaligned = 0
lowqual = 0
nonunique = 0
i = 0
for r in read_seq:
i += 1
if not pe_mode:
if not r.aligned:
notaligned += 1
write_to_samout(r, "not_aligned")
continue
try:
if r.optional_field("NH") > 1:
write_to_samout(r, "alignment_not_unique")
nonunique += 1
continue
except KeyError:
pass
if r.aQual < minaqual:
lowqual += 1
write_to_samout(r, "too_low_aQual")
continue
if stranded != "reverse":
iv_seq = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0)
else:
iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0)
else:
if r[0] is not None and r[0].aligned:
if stranded != "reverse":
iv_seq = (co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0)
else:
iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0)
else:
iv_seq = tuple()
if r[1] is not None and r[1].aligned:
if stranded != "reverse":
iv_seq = itertools.chain(
iv_seq, (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M" and co.size > 0)
#.........這裏部分代碼省略.........
示例8: main
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def main():
exe_parser = argparse.ArgumentParser()
exe_parser.add_argument('infile', type=str, help='<input file> [(full path), -b/-s required]')
exe_parser.add_argument("-u", "--not_aligned",
help="output reads that were not aligned, including those that were aligned multiple times(flat file).",
type=str)
exe_parser.add_argument("-s", "--samout", help="output not aligned reads to [file path].", type=str)
exe_parser.add_argument("-b", "--ambiguous_out", help="output a fasta file of ambiguous hits [file path].",
type=str)
exe_parser.add_argument("-v", "--verbose", help="verbose. (default = TRUE).", action="store_true")
exe_parser.add_argument("gff", help="<gff file> [(full path)]", type=str)
exe_parser.add_argument("-f", "--fasta", help="output fasta file of hits (full path).", type=str)
exe_parser.add_argument("-m", "--min_read_length", help="minimal read length to consider. (default = 60b).",
type=int)
exe_parser.add_argument("-i", "--min_id", help="minimal percent id of hit to consider. (default = 80).", type=int)
exe_parser.add_argument("-z", "--min_score", help="minimal aligner score to consider. (default = 0).", type=int)
exe_parser.add_argument("-c", "--max_clip",
help="proportion of bases clipped from read for alignment. (default = 0.3).", type=float)
exe_parser.add_argument("--stranded", help="whether the data is stranded (y, n, reverse). (default = n).", type=str,
choices=["y", "n", "reverse"], default="n")
exe_parser.add_argument("--idattr", help="GFF attribute to be used as feature ID. (default = GeneID).", type=str)
exe_parser.add_argument("--type", help="feature type (3rd column in GFF file) to be used. (default = CDS).",
type=str)
exe_parser.add_argument("-a", "--minaqual", help="min. alignment quality (default = 0).", type=str)
exe_parser.add_argument("-p", "--paired_end_mode",
help="input is paired end sorted by name (n) or position (p) . (default = p).", type=str,
choices=["p", "n"], default="p")
exe_parser.add_argument("-o", "--out", help="name of counts output file.", type=str)
args = exe_parser.parse_args()
if args.paired_end_mode == 'p':
paired_end = True
pe_order = 'p'
elif args.paired_end_mode == 'n':
paired_end = True
pe_order = 'n'
if args.infile:
try:
if args.infile == '-': # get sam on a stream
seqfile = HTSeq.SAM_Reader(sys.stdin)
if args.paired_end_mode:
# read_seq_iter = iter(seqfile)
# first_read = read_seq_iter.next()
# read_seq = itertools.chain([first_read], read_seq_iter)
# reader = HTSeq.pair_SAM_alignments(read_seq)
if pe_order == 'p':
reader = HTSeq.pair_SAM_alignments_with_buffer(seqfile)
elif pe_order == 'n':
reader = HTSeq.pair_SAM_alignments(seqfile) # (read_seq)
else:
reader = seqfile
elif args.infile != '-':
seqfile = HTSeq.SAM_Reader(args.infile)
if args.paired_end_mode:
read_seq_iter = iter(seqfile)
first_read = read_seq_iter.next()
read_seq = itertools.chain([first_read], read_seq_iter)
reader = HTSeq.pair_SAM_alignments(read_seq)
if pe_order == 'p':
reader = HTSeq.pair_SAM_alignments_with_buffer(reader)
elif pe_order == 'n':
reader = HTSeq.pair_SAM_alignments(reader)
else:
reader = seqfile
# fread_seq_iter = iter(reader)
# first_read = iter(read_seq).next()
elif args.infile == '':
print "no input file type given. exiting..."
sys.exit(1)
except:
print "failed processing SAM/BAM file"
raise
elif not args.infile:
print "no input file given. exiting..."
sys.exit(1)
if args.gff:
gff_file = args.gff
else:
print "no gff file given. exiting..."
sys.exit(1)
if args.verbose:
verbose = True
else:
verbose = False
if args.min_read_length:
min_read_len = args.min_read_length
else:
min_read_len = 60 # default read length
if args.max_clip:
max_clip_ = float(args.max_clip)
else:
max_clip_ = float(0.3) # default read length
if args.min_id:
min_id = float(args.min_id)
#.........這裏部分代碼省略.........
示例9: count_reads_in_features
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
#.........這裏部分代碼省略.........
dict_gene_nonunique_counts = initialise_counts_per_feature(dict_gene_nonunique_counts, feature_name)
dict_gene_unique_counts_ambiguous = initialise_counts_per_feature(dict_gene_unique_counts_ambiguous, feature_name)
i += 1
if i % 100000 == 0 and not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
except:
sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
raise
if not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
if len( counts ) == 0 and not quiet:
sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
try:
if sam_filename != "-":
read_seq = HTSeq.SAM_Reader( sam_filename )
first_read = iter(read_seq).next()
else:
read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
first_read = read_seq.next()
read_seq = itertools.chain( [ first_read ], read_seq )
pe_mode = first_read.paired_end
#pe_mode = 1 ## Added by us
except:
sys.stderr.write( "Error occured when reading first line of sam file.\n" )
raise
###################################################################################################
try:
if pe_mode:
read_seq_pe_file = read_seq
read_seq = HTSeq.pair_SAM_alignments( read_seq )
empty = 0
ambiguous = 0
ambiguous_tag=0
notaligned = 0
lowqual = 0
nonunique = 0
nonunique_nonamb_to_be_rescued = 0
temp_read_name="NA"
previous_read_name="NA"
temp_interval_r0="NA"
temp_interval_r1="NA"
counter_fragment = 0
flag_result = 0
i = 0
pe_mode_for_SE = 0
## -- Added pe_mode on for SE files so that multireads reads will be accounted for
if not pe_mode: # real SE
pe_mode_for_SE = 1 #
read_seq_pe_file = read_seq
pe_mode=1
## -- End
index_fragment = 0
for r in read_seq:
prev_index_fragment = index_fragment
tag_nonunique_NH = 0
tag_overlapping_genes = 0
flag_aln_not_unique = 0 #
flag_ambiguous = 0 #
#-- LOOP OVER ALL READS IN INPUT BAM FILE
if pe_mode_for_SE:
r = (r, None)
counter_fragment += 1
示例10: count_reads_in_features
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def count_reads_in_features( sam_filename, gff_filename, stranded,
overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, custom_stat ):
def write_to_samout( r, assignment ):
if samoutfile is None:
return
if not pe_mode:
r = (r,)
for read in r:
if read is not None:
samoutfile.write( read.original_sam_line.rstrip() +
"\tXF:Z:" + assignment + "\n" )
if quiet:
warnings.filterwarnings( action="ignore", module="HTSeq" )
if samout != "":
samoutfile = open( samout, "w" )
else:
samoutfile = None
# MB
if custom_stat != "":
custom_stat_file=open(custom_stat,"a")
else:
custom_stat_file = None
# endMB
features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
counts = {}
# Try to open samfile to fail early in case it is not there
if sam_filename != "-":
open( sam_filename ).close()
gff = HTSeq.GFF_Reader( gff_filename )
i = 0
try:
for f in gff:
if f.type == feature_type:
try:
feature_id = f.attr[ id_attribute ]
except KeyError:
sys.exit( "Feature %s does not contain a '%s' attribute" %
( f.name, id_attribute ) )
if stranded != "no" and f.iv.strand == ".":
sys.exit( "Feature %s at %s does not have strand information but you are "
"running htseq-count in stranded mode. Use '--stranded=no'." %
( f.name, f.iv ) )
features[ f.iv ] += feature_id
counts[ f.attr[ id_attribute ] ] = 0
i += 1
if i % 100000 == 0 and not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
except:
sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
raise
if not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
if len( counts ) == 0 and not quiet:
sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
try:
if sam_filename != "-":
read_seq = HTSeq.SAM_Reader( sam_filename )
first_read = iter(read_seq).next()
else:
read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
first_read = read_seq.next()
read_seq = itertools.chain( [ first_read ], read_seq )
pe_mode = first_read.paired_end
except:
sys.stderr.write( "Error occured when reading first line of sam file.\n" )
raise
try:
if pe_mode:
read_seq_pe_file = read_seq
read_seq = HTSeq.pair_SAM_alignments( read_seq )
empty = 0
ambiguous = 0
notaligned = 0
lowqual = 0
nonunique = 0
# MB: Creating detailed stats
if custom_stat_file:
sam_lines = 0
skipped = 0
assigned_reads = 0
assigned_reads_s = 0
assigned_reads_p = 0
assigned_genes = 0
assigned_genes_s = 0
assigned_genes_p = 0
empty_s = 0
empty_p = 0
ambiguous_s = 0
ambiguous_p = 0
#.........這裏部分代碼省略.........
示例11: iter
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
#!/usr/bin/python
import HTSeq as h
from collections import defaultdict
#reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned_masked.sam")
#reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/aligned.sam")
reader = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022_masked/Aligned.out.filtered.new.1017680.sam")
reader_masked = h.SAM_Reader("/Users/dashazhernakova/Documents/UMCG/data/geuvadis/mappedData/ERR188022/Aligned.out.filtered.new1mb.sam")
it_p = iter(h.pair_SAM_alignments(reader))
it_p_m = iter(h.pair_SAM_alignments(reader_masked))
same_aligned = 0
one_same_pos = 0
both_same_pos = 0
masked_more_pos = 0
simple_more_pos = 0
#cur_read = {}
#cur_m_read = {}
not_in_simple = 0
not_in_masked = 0
n_m = defaultdict(list)
i = 0
for r1, r2 in h.pair_SAM_alignments(reader):
n_m[r1.read.name].append((r1,r2))
i += 1
if i%10000 == 0:
print i, " lines"
#for k,v in n_m.items():
示例12: count_reads_in_features
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def count_reads_in_features( sam_filename, gff_filename, samtype, order, stranded,
overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
def write_to_samout( r, assignment ):
if samoutfile is None:
return
if not pe_mode:
r = (r,)
for read in r:
if read is not None:
samoutfile.write( read.original_sam_line.rstrip() +
"\tXF:Z:" + assignment + "\n" )
if samout != "":
samoutfile = open( samout, "w" )
else:
samoutfile = None
features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
counts = {}
# Try to open samfile to fail early in case it is not there
if sam_filename != "-":
open( sam_filename ).close()
gff = HTSeq.GFF_Reader( gff_filename )
i = 0
try:
for f in gff:
if f.type == feature_type:
try:
feature_id = f.attr[ id_attribute ]
except KeyError:
raise ValueError, ( "Feature %s does not contain a '%s' attribute" %
( f.name, id_attribute ) )
if stranded != "no" and f.iv.strand == ".":
raise ValueError, ( "Feature %s at %s does not have strand information but you are "
"running htseq-count in stranded mode. Use '--stranded=no'." %
( f.name, f.iv ) )
features[ f.iv ] += feature_id
counts[ f.attr[ id_attribute ] ] = 0
i += 1
if i % 100000 == 0 and not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
except:
sys.stderr.write( "Error occured when processing GFF file (%s):\n" % gff.get_line_number_string() )
raise
if not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
if len( counts ) == 0:
sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
if samtype == "sam":
SAM_or_BAM_Reader = HTSeq.SAM_Reader
elif samtype == "bam":
SAM_or_BAM_Reader = HTSeq.BAM_Reader
else:
raise ValueError, "Unknown input format %s specified." % samtype
try:
if sam_filename != "-":
read_seq_file = SAM_or_BAM_Reader( sam_filename )
read_seq = read_seq_file
first_read = iter(read_seq).next()
else:
read_seq_file = SAM_or_BAM_Reader( sys.stdin )
read_seq_iter = iter( read_seq_file )
first_read = read_seq_iter.next()
read_seq = itertools.chain( [ first_read ], read_seq_iter )
pe_mode = first_read.paired_end
except:
sys.stderr.write( "Error occured when reading beginning of SAM/BAM file.\n" )
raise
try:
if pe_mode:
if order == "name":
read_seq = HTSeq.pair_SAM_alignments( read_seq )
elif order == "pos":
read_seq = HTSeq.pair_SAM_alignments_with_buffer( read_seq )
else:
raise ValueError, "Illegal order specified."
empty = 0
ambiguous = 0
notaligned = 0
lowqual = 0
nonunique = 0
i = 0
for r in read_seq:
if i > 0 and i % 100000 == 0 and not quiet:
sys.stderr.write( "%d SAM alignment record%s processed.\n" % ( i, "s" if not pe_mode else " pairs" ) )
i += 1
if not pe_mode:
if not r.aligned:
notaligned += 1
write_to_samout( r, "__not_aligned" )
#.........這裏部分代碼省略.........
示例13: count_reads_in_features
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def count_reads_in_features( sam_filename, gff_filename, stranded,
overlap_mode, feature_type, id_attribute, quiet, minaqual, samout, allow_ambiguous, allow_nonunique ):
def write_to_samout( r, assignment ):
if samoutfile is None:
return
if not pe_mode:
r = (r,)
for read in r:
if read is not None:
samoutfile.write( read.original_sam_line.rstrip() +
"\tXF:Z:" + assignment + "\n" )
if quiet:
warnings.filterwarnings( action="ignore", module="HTSeq" )
if samout != "":
samoutfile = open( samout, "w" )
else:
samoutfile = None
features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
features_dict = defaultdict(list)
counts = {}
# Try to open samfile to fail early in case it is not there
if sam_filename != "-":
open( sam_filename ).close()
gff = HTSeq.GFF_Reader( gff_filename )
i = 0
try:
for f in gff:
if f.type == feature_type:
try:
feature_id = f.attr[ id_attribute ]
except KeyError:
sys.exit( "Feature %s does not contain a '%s' attribute" %
( f.name, id_attribute ) )
if stranded != "no" and f.iv.strand == ".":
sys.exit( "Feature %s at %s does not have strand information but you are "
"running htseq-count in stranded mode. Use '--stranded=no'." %
( f.name, f.iv ) )
features[ f.iv ] += feature_id
counts[ f.attr[ id_attribute ] ] = 0
features_dict[ f.attr[ id_attribute ] ].append(f)
i += 1
if i % 100000 == 0 and not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
except:
sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
raise
if not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
sys.stderr.write( "Sorting exons from GFF file.\n" )
for key, value in features_dict.items():
if features_dict[key][0].iv.strand == "-":
features_dict[key] = sorted(features_dict[key], key=lambda feat: feat.iv.start, reverse=True)
else:
features_dict[key] = sorted(features_dict[key], key=lambda feat: feat.iv.start, reverse=False)
if len( counts ) == 0 and not quiet:
sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
try:
if sam_filename != "-":
read_seq = HTSeq.SAM_Reader( sam_filename )
first_read = iter(read_seq).next()
else:
read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
first_read = read_seq.next()
read_seq = itertools.chain( [ first_read ], read_seq )
pe_mode = first_read.paired_end
except:
sys.stderr.write( "Error occured when reading first line of sam file.\n" )
raise
try:
if pe_mode:
read_seq_pe_file = read_seq
read_seq = HTSeq.pair_SAM_alignments( read_seq )
empty = 0
ambiguous = 0
notaligned = 0
lowqual = 0
nonunique = 0
i = 0
for r in read_seq:
i += 1
if not pe_mode:
if not r.aligned:
notaligned += 1
write_to_samout( r, "not_aligned" )
continue
try:
if ((allow_nonunique == "no") and (r.optional_field( "NH" ) > 1)):
write_to_samout( r, "alignment_not_unique" )
nonunique += 1
#.........這裏部分代碼省略.........
示例14: count_reads_in_features
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
def count_reads_in_features( sam_filename, gff_filename, stranded,
overlap_mode, feature_type, id_attribute, quiet, minaqual, samout ):
def write_to_samout( r, assignment ):
if samoutfile is None:
return
if not pe_mode:
r = (r,)
for read in r:
if read is not None:
samoutfile.write( read.original_sam_line.rstrip() +
"\tXF:Z:" + assignment + "\n" )
if quiet:
warnings.filterwarnings( action="ignore", module="HTSeq" )
if samout != "":
samoutfile = open( samout, "w" )
else:
samoutfile = None
features = HTSeq.GenomicArrayOfSets( "auto", stranded != "no" )
counts = {}
## added by CR
dict_nonunique = {}
# Try to open samfile to fail early in case it is not there
if sam_filename != "-":
open( sam_filename ).close()
gff = HTSeq.GFF_Reader( gff_filename )
i = 0
try:
for f in gff:
if f.type == feature_type:
try:
feature_id = f.attr[ id_attribute ]
except KeyError:
sys.exit( "Feature %s does not contain a '%s' attribute" %
( f.name, id_attribute ) )
if stranded != "no" and f.iv.strand == ".":
sys.exit( "Feature %s at %s does not have strand information but you are "
"running htseq-count in stranded mode. Use '--stranded=no'." %
( f.name, f.iv ) )
features[ f.iv ] += feature_id
counts[ f.attr[ id_attribute ] ] = 0
##added by CR
dict_nonunique[ f.attr[ id_attribute ] ] = 0
i += 1
if i % 100000 == 0 and not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
except:
sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
raise
if not quiet:
sys.stderr.write( "%d GFF lines processed.\n" % i )
if len( counts ) == 0 and not quiet:
sys.stderr.write( "Warning: No features of type '%s' found.\n" % feature_type )
try:
if sam_filename != "-":
read_seq = HTSeq.SAM_Reader( sam_filename )
first_read = iter(read_seq).next()
else:
read_seq = iter( HTSeq.SAM_Reader( sys.stdin ) )
first_read = read_seq.next()
read_seq = itertools.chain( [ first_read ], read_seq )
pe_mode = first_read.paired_end
except:
sys.stderr.write( "Error occured when reading first line of sam file.\n" )
raise
try:
if pe_mode:
read_seq_pe_file = read_seq
read_seq = HTSeq.pair_SAM_alignments( read_seq )
empty = 0
ambiguous = 0
notaligned = 0
lowqual = 0
nonunique = 0
#added by SB
temp_read_name="NA"
temp_interval_r0="NA"
temp_interval_r1="NA"
## added by CR
nonunique2 = 0
#added by SB
i = 0
for r in read_seq:
i += 1
if not pe_mode:
if not r.aligned:
notaligned += 1
write_to_samout( r, "not_aligned" )
continue
try:
if r.optional_field( "NH" ) > 1:
#.........這裏部分代碼省略.........
示例15: count_reads_in_features
# 需要導入模塊: import HTSeq [as 別名]
# 或者: from HTSeq import pair_SAM_alignments [as 別名]
#.........這裏部分代碼省略.........
samoutfile = None
try:
if sam_filename == "-":
read_seq_file = SAM_or_BAM_Reader(sys.stdin)
else:
read_seq_file = SAM_or_BAM_Reader(sam_filename)
read_seq_iter = iter(read_seq_file)
# Catch empty BAM files
try:
first_read = next(read_seq_iter)
pe_mode = first_read.paired_end
except:
first_read = None
pe_mode = False
if first_read is not None:
read_seq = itertools.chain([first_read], read_seq_iter)
else:
read_seq = []
except:
sys.stderr.write(
"Error occured when reading beginning of {:} file.\n".format(
samname))
raise
try:
if pe_mode:
if ((supplementary_alignment_mode == 'ignore') and
(secondary_alignment_mode == 'ignore')):
primary_only = True
else:
primary_only = False
if order == "name":
read_seq = HTSeq.pair_SAM_alignments(
read_seq,
primary_only=primary_only)
elif order == "pos":
read_seq = HTSeq.pair_SAM_alignments_with_buffer(
read_seq,
max_buffer_size=max_buffer_size,
primary_only=primary_only)
else:
raise ValueError("Illegal order specified.")
empty = 0
ambiguous = 0
notaligned = 0
lowqual = 0
nonunique = 0
i = 0
for r in read_seq:
if i > 0 and i % 100000 == 0 and not quiet:
sys.stderr.write(
"%d %s alignment record%s processed.\n" %
(i, samname, "s" if not pe_mode else " pairs"))
sys.stderr.flush()
i += 1
if not pe_mode:
if not r.aligned:
notaligned += 1
write_to_samout(r, "__not_aligned", samoutfile)
continue
if ((secondary_alignment_mode == 'ignore') and
r.not_primary_alignment):
continue
if ((supplementary_alignment_mode == 'ignore') and