This page collects typical usage examples of the Python method pysam.Samfile.close. If you are unsure what Samfile.close does or how to use it, the curated code examples below may help. You can also explore further usage examples of the containing class, pysam.Samfile.
The following 15 code examples of Samfile.close are listed, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
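Before the examples, here is a minimal sketch of the open/close life cycle. This is a hedged illustration, not taken from the examples below: the file name "example.bam" is hypothetical, and pysam must be installed. Note that Samfile is the legacy name of today's pysam.AlignmentFile.

from pysam import Samfile

bam = Samfile("example.bam", "rb")      # open a BAM file for reading
for read in bam.fetch(until_eof=True):  # until_eof=True avoids requiring an index
    pass                                # ... process each read ...
bam.close()                             # release the underlying file handle

Samfile also supports the context-manager protocol (used by Example 6 below), which closes the file automatically on exit:

with Samfile("example.bam", "rb") as bam:
    print(bam.header)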
Example 1: test_process_bam_mismatches
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def test_process_bam_mismatches():
    tbam = os.path.join(DATA, "tmp.bam")
    bam = os.path.join(DATA, "ordered_umi.bam")
    if os.path.exists(tbam):
        os.remove(tbam)
    with captured_output() as (out, err):
        process_bam(bam, tbam, mismatches=1)
    assert os.path.exists(tbam)
    it = iter(out.getvalue().split("\n"))
    assert it.next().strip() == "1\t9\t10\t4\t2"
    assert it.next().strip() == "1\t11\t12\t2\t1"
    assert it.next().strip() == "1\t29\t30\t2\t1"
    bam_reader = Samfile(tbam)
    it = iter(bam_reader)
    r = it.next()
    assert r.pos == 4
    assert r.qname == "read8:UMI_ATTCAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read1:UMI_AAAAAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read4:UMI_AAAGGGGG"
    r = it.next()
    assert r.pos == 11
    assert r.qname == "read5:UMI_ATTTAGGG"
    bam_reader.close()
    os.remove(tbam)
Example 2: callbase
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb')  # open your bam file
    SF = open(snpsites, 'r')     # the file containing SNP site info
    RF = open(out, 'w')          # result file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        else:
            line = ParseSNPsitesLine(i)
            vcf_pos = line.pos - 1  # convert 1-based to 0-based
            vcf_refname = line.chrom
            print 'processing: %s %s...' % (vcf_refname, str(vcf_pos))
            At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
            for i in BF.pileup(vcf_refname, vcf_pos, vcf_pos + 1):
                if i.pos == vcf_pos:
                    vcf_Rbase = line.Rbase
                    vcf_Abase = line.Abase
                    for j in i.pileups:
                        yourbase = j.alignment.seq[j.qpos]
                        if yourbase == 'A': At += 1
                        elif yourbase == 'T': Tt += 1
                        elif yourbase == 'C': Ct += 1
                        elif yourbase == 'G': Gt += 1
                        elif yourbase == 'N': Nt += 1
                        else: othert += 1
            RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                vcf_refname, str(vcf_pos + 1), vcf_Rbase, vcf_Abase, str(At),
                str(Tt), str(Ct), str(Gt), str(Nt), str(othert)))
    BF.close()
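A hedged invocation sketch for the function above (the file names are hypothetical, and the BAM must be coordinate-sorted and indexed for pileup() to work):

# Tally per-base pileup counts at each SNP site listed in snps.txt,
# writing a tab-separated count table to base_counts.txt.
callbase('sample.sorted.bam', 'snps.txt', 'base_counts.txt')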
Example 3: main
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def main(args=None):
    if args is None:
        args = sys.argv[1:]
    f = Samfile(args[0])
    header = f.header
    f.close()
    reflen = header['SQ'][0]['LN']
    BamIO.write(clip(BamIO.parse(args[0]), reflen), args[1], header=header)
    return 0
Example 4: subsample
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), "subset")
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
    sf = Samfile(fn)
    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random() ** i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))
    count += i
    for n in ns:
        if n == min(ns):
            outdir = outdir_base + "_min"
        else:
            outdir = outdir_base + "{:04.1f}M".format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, "->", outdir
        stdout.flush()
        of = Samfile(path.join(outdir, "accepted_hits.bam"), mode="wb", template=sf)
        sample.sort(key=lambda (key, read, pos): (read.tid, read.pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
Example 5: removeEdgeMismatches
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
    startTime = Helper.getTime()
    minDistance = int(minDistance)
    counter = 0; j = 0
    num_lines = len(self.variantDict)
    Helper.info(" [%s] removing mismatches within the first %s bp of read edges" % (startTime.strftime("%c"), str(minDistance)), self.logFile, self.textField)
    bamFile = Samfile(bamFile, "rb")
    for varKey in self.variantDict.keys():
        variant = self.variantDict[varKey]
        counter += 1
        if counter % 10000 == 0:
            Helper.status('%s mm parsed ' % counter, self.logFile, self.textField, "grey")
        keepSNP = False
        varPos = variant.position - 1
        iter = bamFile.pileup(variant.chromosome, variant.position - 1, variant.position)
        # walk over the pileup columns which overlap this position
        for x in iter:
            if x.pos == varPos:
                for pileupread in x.pileups:  # walk through the single reads
                    if not pileupread.is_del and not pileupread.is_refskip:
                        distance = abs(pileupread.alignment.alen - pileupread.query_position) if pileupread.alignment.is_reverse else pileupread.query_position
                        if distance >= minDistance:
                            # check read base and base quality
                            if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt and pileupread.alignment.query_qualities[pileupread.query_position] >= minBaseQual:
                                keepSNP = True
        if keepSNP == False:
            j += 1
            del self.variantDict[varKey]
    Helper.status('%s of %s variants were deleted' % (j, num_lines), self.logFile, self.textField, "black")
    Helper.printTimeDiff(startTime, self.logFile, self.textField)
    bamFile.close()
Example 6: split_samfile
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def split_samfile(sam_file, splits, prefix='', path=''):
    """Take a SAM/BAM file and split it `splits` number of times.

    :path:    Where to put the split files.
    :prefix:  A prefix for the outfile names.
    :returns: A tuple of job files.
    """
    # Determine how many reads will be in each split sam file.
    num_lines = count_reads(sam_file)
    num_reads = int(int(num_lines) / splits) + 1

    # Get rid of starting path
    sam_name = os.path.basename(sam_file)

    # Subset the SAM file into X number of jobs
    cnt = 0
    currjob = 1
    suffix = '.split_sam_' + str(currjob).zfill(4)
    run_file = os.path.join(path, prefix + sam_name + suffix)
    rmode = 'rb' if sam_name.split('.')[-1] == 'bam' else 'r'
    wmode = 'wb'

    # Actually split the file
    outfiles = [run_file]
    with Samfile(sam_file, rmode) as in_sam:
        sam_split = Samfile(run_file, wmode, template=in_sam)
        for line in in_sam:
            cnt += 1
            if cnt < num_reads:
                sam_split.write(line)
            elif cnt == num_reads:
                # Check if next line is mate-pair. If so, don't split.
                line2 = next(in_sam)
                currjob += 1
                suffix = '.split_sam_' + str(currjob).zfill(4)
                run_file = os.path.join(path, prefix + sam_name + suffix)
                new_sam = Samfile(run_file, wmode, template=in_sam)
                outfiles.append(run_file)
                if line.qname == line2.qname:
                    sam_split.write(line)
                    sam_split.write(line2)
                    sam_split.close()
                    cnt = 0
                else:
                    sam_split.write(line)
                    sam_split.close()
                    new_sam.write(line2)
                    cnt = 0
                sam_split = new_sam
    sam_split.close()
    return tuple(outfiles)
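A usage sketch under assumed inputs (the paths are hypothetical, the tmp/ directory must already exist, and the count_reads helper must also be importable):

# Split reads.bam into 4 chunks named job_reads.bam.split_sam_0001 ... 0004,
# keeping mate pairs together at chunk boundaries.
chunks = split_samfile('reads.bam', 4, prefix='job_', path='tmp/')
for c in chunks:
    print(c)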
Example 7: parse_barcode
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def parse_barcode(bamfile):
    """Parse a sorted and indexed BAM file, remove all cases where the RNA hits
    more than one spot in the genome and write them to a file; create files for
    mutant and wildtype based on barcodes."""
    samfile = Samfile(bamfile, "rb")
    multi_hit_file = Samfile("MultiHit.bam", "wb", template=samfile)
    mutant = Samfile("Mutant.bam", "wb", template=samfile)
    wildtype = Samfile("Wildtype.bam", "wb", template=samfile)
    for line in samfile.fetch():
        # if line.is_secondary:
        #     # does this hit more than one spot in the genome?
        #     multi_hit_file.write(line)
        if "#GAGT" in line.qname or "#TTAG" in line.qname:
            # write to mutant file
            mutant.write(line)
        elif "#ACCC" in line.qname or "#CGTA" in line.qname:
            # write to wildtype file
            wildtype.write(line)
    multi_hit_file.close()
    mutant.close()
    wildtype.close()
    samfile.close()
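A hedged call sketch (the path is hypothetical); fetch() without arguments requires the BAM to be coordinate-sorted and indexed:

# Writes Mutant.bam, Wildtype.bam and MultiHit.bam into the current
# working directory, splitting reads on the barcodes in their qnames.
parse_barcode('library.sorted.bam')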
Example 8: Samfile
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
    num = segment.query_name.split("|")[0]
    for etype, eset in errors.iteritems():
        if num in eset:
            errors2segments[etype][num].append(segment)
            break

additional = defaultdict(list)
for fname in args.additional:
    tsamfile = Samfile(fname)
    for segment in tsamfile.fetch(until_eof=True):
        num = segment.query_name.split("|")[0]
        additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
    tsamfile.close()

for etype, d in errors2segments.iteritems():
    with open(os.path.join(args.outdir, "%s_%s_error.txt" % etype), 'w') as f:
        for num, segments in d.iteritems():
            if segments[0].is_reverse:
                seq = reverse_complement(segments[0].seq)
            else:
                seq = segments[0].seq
            f.write("%s\nnumber of read:\t%s\n\nSequence:\t%s\n\nSegments:\n\n" % ("_" * 140, num, seq))
            for segment in segments:
                f.write("%s\t%s\t%d\t%s\t%d\t%s\n\n" % (segment.query_name.split("|")[2], samfile.getrname(segment.tid), segment.reference_start, segment.cigarstring, segment.get_tag("AS"), segment.query_name))
Example 9: snp_workflow
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile(headerfile, "wh", header=header)
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" % job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" % job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname, description=set_file_descr("vcfs_files.tar.gz", step="pileup", type="tar", view='admin'))

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall, "w") as fout:
        fout.write('#' + '\t'.join(['chromosome', 'position', 'reference'] + sample_names +
                                   ['gene', 'location_type', 'distance']) + '\n')
    with open(outexons, "w") as fout:
        fout.write('#' + '\t'.join(['chromosome', 'position', 'reference'] + sample_names +
                                   ['exon', 'strand', 'ref_aa'] +
                                   ['new_aa_' + s for s in sample_names]) + '\n')
    msa_table = dict((s, '') for s in [assembly.name] + sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall, assembly,
                           sample_names, mincov, float(minsnp), logfile, debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome, logfile, debugfile)
        for snprow in allsnps:
            for n, k in enumerate([assembly.name] + sample_names):
                msa_table[k] += snprow[3 + n][0]
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position - startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0, 0, 0, 0, 0]
# ... the rest of the code is omitted here ...
Example 10: estimate_table
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name.
    genome_file_name -- Genome to fetch genomic sequences from.

    Return:
    bias_table_F, bias_table_R -- Bias tables.
    """
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0
    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)
    # Initializing dictionaries
    obsDictF = dict(); obsDictR = dict()
    expDictF = dict(); expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0
    # Iterating on HS regions
    for region in regions:
        # Initialization
        prevPos = -1
        trueCounter = 0
        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                p1 = r.pos - (k_nb / 2) - 1 + shift
            else:
                p1 = r.aend - (k_nb / 2) + 1 - shift
            p2 = p1 + k_nb
            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue
            # Fetching k-mer
            try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception: continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)
            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_r += 1
                try: obsDictF[currStr] += 1
                except Exception: obsDictF[currStr] = 1
            else:
                ct_reads_f += 1
                try: obsDictR[currStr] += 1
                except Exception: obsDictR[currStr] = 1
        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception: continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)
        # Iterating on each sequence position
        for i in range(0, len(currStr) - k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + k_nb]
            try: expDictF[s] += 1
            except Exception: expDictF[s] = 1
            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + k_nb]
            try: expDictR[s] += 1
            except Exception: expDictR[s] = 1
    # Closing files
    bamFile.close()
    fastaFile.close()
    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
# ... the rest of the code is omitted here ...
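A hedged call sketch for the method above (the object name and file paths are hypothetical; regions is assumed to be an iterable of objects with chrom/initial/final attributes, as used in the loop):

# 6-mer bias estimation around DNase-seq cut sites, with no extra shift.
bias_f, bias_r = bias_estimator.estimate_table(
    regions=hs_regions,             # hypothetical HS-region iterable
    dnase_file_name='dnase.bam',    # must end in .bam and be indexed
    genome_file_name='genome.fa',   # FASTA with a .fai index
    k_nb=6, shift=0)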
Example 11: subsample
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def subsample(fn, ns=None, paired=False):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print("Read out ", i_weight)
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print("Counted ", i_weight)
        i_weight /= float(max(ns))
    sf = Samfile(fn)
    if paired:
        read_2s = {}
    print(fn, count, i_weight)
    for i, read in enumerate(sf):
        key = random() ** i_weight
        if not paired or read.is_read1:
            if len(sample) < max(ns):
                heappush(sample, (key, i + count, read))
            else:
                dropped = heappushpop(sample, (key, i + count, read))
                if paired:
                    read_2s.pop(dropped[-1].qname, None)
        elif paired:
            read_2s[read.qname] = read
        else:
            raise ValueError("I don't know how we got here")
    count += i
    for n in ns:
        outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count))
        print(fn, '->', outdir)
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'),
                     mode='wb', template=sf)
        sample.sort(key=lambda heap_item: (heap_item[-1].tid, heap_item[-1].pos))
        missing_mates = 0
        for key, pos, read in sampN:
            of.write(read)
            if paired and read.is_proper_pair:
                if read.qname not in read_2s:
                    missing_mates += 1
                    continue
                of.write(read_2s[read.qname])
        of.close()
    sf.close()
    print(missing_mates)
    return [count for key, read, count in sample]
Example 12: estimate_bias_kmer
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0
    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)
    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0
    # Iterating on HS regions
    for region in regions:
        # Initialization
        prevPos = -1
        trueCounter = 0
        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb
            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue
            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)
            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1
        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)
        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1
            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1
    # Closing files
    bamFile.close()
    fastaFile.close()
# ... the rest of the code is omitted here ...
Example 13: estimate_bias_pwm
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100
    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)
    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0
        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb
            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue
            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)
            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1
        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for i in range(0, len(s)):
                exp_f_pwm_dict[s[i]][i] += 1
            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for i in range(0, len(s)):
                exp_r_pwm_dict[s[i]][i] += 1
    # Closing files
    bamFile.close()
    fastaFile.close()
    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))
    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)
    for i in range(len(pwm_dict_list)):
# ... the rest of the code is omitted here ...
Example 14: main
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def main():
    parser = OptionParser(usage=usage)
    # parser.add_option("-s", action="store_true", dest="sam_input", default=False,
    #                   help="Input is in SAM format instead of BAM format")
    (options, args) = parser.parse_args()
    if len(args) != 4:
        parser.print_help()
        sys.exit(1)
    psl_filename = args[0]
    ref_filename = args[1]
    contigs_filename = args[2]
    bam_filename = args[3]
    liftover_dir = args[1]
    references, ref_chromosomes = read_fasta(ref_filename)
    refname_to_id = dict([(name, i) for i, name in enumerate(ref_chromosomes)])
    print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr)
    contigs, contig_names = read_fasta(contigs_filename)
    print('Read', len(contig_names), 'contigs.', file=sys.stderr)
    bam_header = {'HD': {'VN': '1.0'},
                  'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)]) for chromosome in ref_chromosomes]}
    outfile = Samfile(bam_filename, 'wb', header=bam_header)
    line_nr = 0
    header_read = False
    for line in (s.strip() for s in open(psl_filename)):
        line_nr += 1
        if line.startswith('------'):
            header_read = True
            continue
        if not header_read: continue
        fields = line.split()
        assert len(fields) == 21, 'Error reading PSL file, offending line: %d' % line_nr
        sizes = [int(x) for x in fields[18].strip(',').split(',')]
        contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
        ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
        assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
        strand = fields[8]
        contig_name = fields[9]
        ref_name = fields[13]
        assert strand in ['-', '+']
        assert contig_name in contigs
        assert ref_name in references
        a = AlignedRead()
        a.qname = contig_name
        if strand == '+':
            a.seq = str(contigs[contig_name])
        else:
            a.seq = str(contigs[contig_name].reverse_complement())
        a.flag = (16 if strand == '+' else 0)
        a.rname = refname_to_id[ref_name]
        a.pos = ref_starts[0]
        a.mapq = 255
        qpos = contig_starts[0]
        refpos = ref_starts[0]
        cigar = []
        # soft-clipping at the start?
        if contig_starts[0] > 0:
            cigar.append((4, contig_starts[0]))
        longest_insertion = 0
        longest_deletion = 0
        total_matches = 0
        total_insertion = 0
        total_deletion = 0
        for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
            assert contig_start >= qpos
            assert ref_start >= refpos
            # insertion?
            if contig_start > qpos:
                insertion_length = contig_start - qpos
                longest_insertion = max(longest_insertion, insertion_length)
                total_insertion += insertion_length
                append_to_cigar(cigar, 1, insertion_length)
                qpos = contig_start
            # deletion?
            if ref_start > refpos:
                deletion_length = ref_start - refpos
                longest_deletion = max(longest_deletion, deletion_length)
                total_deletion += deletion_length
                append_to_cigar(cigar, 2, deletion_length)
                refpos = ref_start
            # stretch of matches/mismatches
            append_to_cigar(cigar, 0, length)
            refpos += length
            qpos += length
            total_matches += length
        # soft-clipping at the end?
        if len(a.seq) > qpos:
            cigar.append((4, len(a.seq) - qpos))
        a.cigar = tuple(cigar)
        # only use contigs where the longest deletion is <= 10000 bp
        if longest_deletion > 10000: continue
        # require at least 200 matching positions
        if total_matches < 200: continue
        # require the matching positions to make up at least 75 percent of the contig
        # (without counting parts of the contig that are insertions).
        if total_matches / (len(a.seq) - total_insertion) < 0.75: continue
        outfile.write(a)
    outfile.close()
Example 15: print_reads
# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def print_reads(reads_to_print, ref_name, header):
    output_name = "{0}_{1}.bam".format(args.output_base, ref_name)
    output_samfile = Samfile(output_name, "wb", header=header)
    for aln in reads_to_print:
        output_samfile.write(aln)
    output_samfile.close()