本文整理汇总了Python中pysam.Samfile.write方法的典型用法代码示例。如果您正苦于以下问题:Python Samfile.write方法的具体用法?Python Samfile.write怎么用?Python Samfile.write使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pysam.Samfile
的用法示例。
在下文中一共展示了Samfile.write方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: split_samfile
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
def split_samfile(sam_file, splits, prefix='', path=''):
"""Take a sam file and split it splits number of times.
:path: Where to put the split files.
:prefix: A prefix for the outfile names.
:returns: A tuple of job files.
"""
# Determine how many reads will be in each split sam file.
num_lines = count_reads(sam_file)
num_reads = int(int(num_lines)/splits) + 1
# Get rid of starting path
sam_name = os.path.basename(sam_file)
# Subset the SAM file into X number of jobs
cnt = 0
currjob = 1
suffix = '.split_sam_' + str(currjob).zfill(4)
run_file = os.path.join(path, prefix + sam_name + suffix)
rmode = 'rb' if sam_name.split('.')[0] == 'bam' else 'r'
wmode = 'wb'
# Actually split the file
outfiles = [run_file]
with Samfile(sam_file, rmode) as in_sam:
sam_split = Samfile(run_file, wmode, template=in_sam)
for line in in_sam:
cnt += 1
if cnt < num_reads:
sam_split.write(line)
elif cnt == num_reads:
# Check if next line is mate-pair. If so, don't split.
line2 = next(in_sam)
currjob += 1
suffix = '.split_sam_' + str(currjob).zfill(4)
run_file = os.path.join(path, prefix + sam_name + suffix)
new_sam = Samfile(run_file, wmode, template=in_sam)
outfiles.append(run_file)
if line.qname == line2.qname:
sam_split.write(line)
sam_split.write(line2)
sam_split.close()
cnt = 0
else:
sam_split.write(line)
sam_split.close()
new_sam.write(line2)
cnt = 0
sam_split = new_sam
sam_split.close()
return tuple(outfiles)
示例2: parse_barcode
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
def parse_barcode(bamfile):
"""parses a sorted and index bam file, removes all cases where rna hits more than one spot in genome
and writes to a file, create file for mutant and wildtype based on barcodes"""
samfile = Samfile(bamfile, "rb")
multi_hit_file = Samfile("MultiHit.bam","wb",template=samfile)
mutant_one = Samfile("MutantOne.bam","wb",template=samfile)
wildtype_one = Samfile("WildtypeOne.bam","wb",template=samfile)
mutant_two = Samfile("MutantTwo.bam","wb",template=samfile)
wildtype_two = Samfile("WildtypeTwo.bam","wb",template=samfile)
for line in samfile.fetch():
#if line.is_secondary:
## does this hit to more than one spot in genome
# multi_hit_file.write(line)
if "#GAGT"in line.qname:
## write to mutant file
mutant_one.write(line)
elif "#TTAG" in line.qname:
mutant_two.write(line)
elif "#ACCC" in line.qname:
### write to wildtype file
wildtype_one.write(line)
elif "#CGTA" in line.qname:
### write to wildtype file
wildtype_two.write(line)
multi_hit_file.close()
mutant_one.close()
wildtype_one.close()
mutant_two.close()
wildtype_two.close()
samfile.close()
示例3: subsample
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
def subsample(fn, ns=None):
if ns is None:
fn, ns = fn
sample = []
count = 0
outdir_base = path.join(path.dirname(fn), "subset")
sf = Samfile(fn)
try:
i_weight = float(sf.mapped) / max(ns)
print "Read out ", i_weight
except ValueError:
i_weight = 0.0
for read in sf:
i_weight += 1
print "Counted ", i_weight
i_weight /= float(max(ns))
sf = Samfile(fn)
print fn, count, i_weight
for i, read in enumerate(sf):
key = random() ** i_weight
if len(sample) < max(ns):
heappush(sample, (key, read, i + count))
else:
heappushpop(sample, (key, read, i + count))
count += i
for n in ns:
if n == min(ns):
outdir = outdir_base + "_min"
else:
outdir = outdir_base + "{:04.1f}M".format(n / 1e6)
try:
makedirs(outdir)
except OSError:
pass
sampN = sorted(sample, reverse=True)[: int(n)]
print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
print fn, "->", outdir
stdout.flush()
of = Samfile(path.join(outdir, "accepted_hits.bam"), mode="wb", template=sf)
sample.sort(key=lambda (key, read, pos): (read.tid, read.pos))
for key, read, pos in sampN:
of.write(read)
of.close()
sf.close()
return [count for key, read, count in sample]
示例4: split_reads
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
def split_reads(reads, ref_reads, alt_reads):
read_results = Counter()
ref_bam = Samfile(ref_reads, 'wb', template=reads)
alt_bam = Samfile(alt_reads, 'wb', template=reads)
prev_phase = None
prev_read = None
prev_qname = None
test = 0
for read in reads.fetch(until_eof=True):
test += 1
chrom = read.reference_name
snps_on_chrom = snp_dict[chrom]
phase = get_phase(read, snps_on_chrom)
read_qname = read.qname
if read_qname == prev_qname:
read_results["tot_read"]+=1
phase_set = set([phase, prev_phase])
phase_set.discard(None)
if len(phase_set) == 1:
read_phase = phase_set.pop()
if read_phase == -1:
ref_bam.write(read)
ref_bam.write(prev_read)
read_results["ref_read"]+=1
elif read_phase == 1:
alt_bam.write(read)
alt_bam.write(prev_read)
read_results["alt_read"]+=1
elif read_phase == 0:
read_results['misphased_read']+=1
elif len(phase_set)==0:
read_results["no_snps_read"]+=1
else:
read_results['misphased_read']+=1
prev_read = read
prev_phase = phase
prev_qname = read_qname
return(read_results)
示例5: main
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
def main():
parser = OptionParser(usage=usage)
#parser.add_option("-s", action="store_true", dest="sam_input", default=False,
#help="Input is in SAM format instead of BAM format")
(options, args) = parser.parse_args()
if len(args) != 4:
parser.print_help()
sys.exit(1)
psl_filename = args[0]
ref_filename = args[1]
contigs_filename = args[2]
bam_filename = args[3]
liftover_dir = args[1]
references, ref_chromosomes = read_fasta(ref_filename)
refname_to_id = dict([(name,i) for i,name in enumerate(ref_chromosomes)])
print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr)
contigs, contig_names = read_fasta(contigs_filename)
print('Read', len(contig_names), 'contigs.', file=sys.stderr)
bam_header = {'HD': {'VN': '1.0'}, 'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)]) for chromosome in ref_chromosomes] }
outfile = Samfile(bam_filename, 'wb', header=bam_header)
line_nr = 0
header_read = False
for line in (s.strip() for s in open(psl_filename)):
line_nr += 1
if line.startswith('------'):
header_read = True
continue
if not header_read: continue
fields = line.split()
assert len(fields) == 21, 'Error reading PSL file, offending line: %d'%line_nr
sizes = [int(x) for x in fields[18].strip(',').split(',')]
contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
strand = fields[8]
contig_name = fields[9]
ref_name = fields[13]
assert strand in ['-','+']
assert contig_name in contigs
assert ref_name in references
a = AlignedRead()
a.qname = contig_name
if strand == '+':
a.seq = str(contigs[contig_name])
else:
a.seq = str(contigs[contig_name].reverse_complement())
a.flag = (16 if strand == '+' else 0)
a.rname = refname_to_id[ref_name]
a.pos = ref_starts[0]
a.mapq = 255
qpos = contig_starts[0]
refpos = ref_starts[0]
cigar = []
# soft-clipping at the start?
if contig_starts[0] > 0:
cigar.append((4,contig_starts[0]))
longest_insertion = 0
longest_deletion = 0
total_matches = 0
total_insertion = 0
total_deletion = 0
for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
assert contig_start >= qpos
assert ref_start >= refpos
# insertion?
if contig_start > qpos:
insertion_length = contig_start - qpos
longest_insertion = max(longest_insertion, insertion_length)
total_insertion += insertion_length
append_to_cigar(cigar, 1, insertion_length)
qpos = contig_start
# deletion?
if ref_start > refpos:
deletion_length = ref_start - refpos
longest_deletion = max(longest_deletion, deletion_length)
total_deletion += deletion_length
append_to_cigar(cigar, 2, deletion_length)
refpos = ref_start
# strech of matches/mismatches
append_to_cigar(cigar, 0, length)
refpos += length
qpos += length
total_matches += length
# soft-clipping at the end?
if len(a.seq) > qpos:
cigar.append((4,len(a.seq) - qpos))
a.cigar = tuple(cigar)
# only use contigs where longest deletion is <= 10000 bp
if longest_deletion > 10000: continue
# require at least 200 matching positions
if total_matches < 200: continue
# require the matching positions to make up at least 75 percent of the contig
# (without counting parts of the contig that are insertions).
if total_matches / (len(a.seq) - total_insertion) < 0.75: continue
outfile.write(a)
outfile.close()
示例6: print_reads
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
def print_reads(reads_to_print, ref_name, header):
output_name = "{0}_{1}.bam".format(args.output_base, ref_name)
output_samfile = Samfile(output_name, "wb", header=header)
for aln in reads_to_print:
output_samfile.write(aln)
output_samfile.close()
示例7: subsample
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
def subsample(fn, ns=None, paired=False):
if ns is None:
fn, ns = fn
sample = []
count = 0
outdir_base = path.join(path.dirname(fn), 'subset')
sf = Samfile(fn)
try:
i_weight = float(sf.mapped)/max(ns)
print("Read out ", i_weight)
except ValueError:
i_weight = 0.0
for read in sf:
i_weight += 1
print("Counted ", i_weight)
i_weight /= float(max(ns))
sf = Samfile(fn)
if paired:
read_2s = {}
print(fn, count, i_weight)
for i, read in enumerate(sf):
key = random()**i_weight
if not paired or read.is_read1:
if len(sample) < max(ns):
heappush(sample, (key, i+count, read))
else:
dropped = heappushpop(sample, (key, i+count, read))
if paired:
read_2s.pop(dropped[-1].qname, None)
elif paired:
read_2s[read.qname] = read
else:
assert ValueError("I don't know how we got here")
count += i
for n in ns:
outdir = outdir_base + '{:04.1f}M'.format(n/1e6)
try:
makedirs(outdir)
except OSError:
pass
sampN = sorted(sample, reverse=True)[:int(n)]
print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count))
print(fn, '->', outdir)
stdout.flush()
of = Samfile(path.join(outdir, 'accepted_hits.bam'),
mode='wb', template=sf)
sample.sort(key=lambda heap_item: (heap_item[-1].tid, heap_item[-1].pos))
missing_mates = 0
for key, pos, read in sampN:
of.write(read)
if paired and read.is_proper_pair:
if read.qname not in read_2s:
missing_mates += 1
continue
of.write(read_2s[read.qname])
of.close()
sf.close()
print(missing_mates)
return [count for key, read, count in sample]
示例8: Samfile
# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import write [as 别名]
"""
Turns abbreviated MD into a full array
"""
ret = []
for i in re.findall("\d+|\^?[ATCGN]+", md):
if i.startswith('^'):
ret.extend(list(i[1:]))
elif i[0] in ["A","T","C","G","N"]:
ret.extend(list(i))
else:
ret.extend(['-']*int(i))
return ret
if __name__ == '__main__':
f = Samfile(sys.argv[1])
out = Samfile(sys.argv[1][:-4]+"_realign.bam",'wb', template=f)
count = 0.0
n = 0.05
for read in f:
q,t = expandAlign(read)
query, target = realign(read)
replace(read, query, target)
out.write(read)
count += 1
if (count / f.mapped) > n:
n += 0.05
print "[%s] -- parsed %d of %d reads (%.2f)" % (time.asctime(), int(count), f.mapped, count/f.mapped )
out.close()