

Python Samfile.close Method Code Examples

This article collects typical usage examples of the pysam.Samfile.close method in Python. If you are wondering what exactly Samfile.close does, how to use it, or simply want working examples, the curated code samples below should help. You can also explore further usage examples of the class it belongs to, pysam.Samfile.


Below are 15 code examples of Samfile.close, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
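
Every example below follows the same pattern: open a pysam.Samfile (in current pysam, a backwards-compatible alias of AlignmentFile), read or write alignments, then call close() to flush output and release the file handle. Here is a minimal sketch of that pattern; the file name, contig, and coordinates are hypothetical:

from pysam import Samfile

# Minimal sketch: "example.bam" is a hypothetical, indexed BAM file.
bam = Samfile("example.bam", "rb")
try:
    for read in bam.fetch("chr1", 0, 1000):  # fetch() requires an index
        print(read.query_name, read.pos)
finally:
    bam.close()  # flush and release the file handle

# Samfile is also a context manager (see Example 6), so close() can be
# left to a with-block, which runs even when an exception is raised:
with Samfile("example.bam", "rb") as bam:
    for read in bam.fetch("chr1", 0, 1000):
        print(read.query_name, read.pos)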

Example 1: test_process_bam_mismatches

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def test_process_bam_mismatches():
    tbam = os.path.join(DATA, "tmp.bam")
    bam = os.path.join(DATA, "ordered_umi.bam")
    if os.path.exists(tbam):
        os.remove(tbam)
    with captured_output() as (out, err):
        process_bam(bam, tbam, mismatches=1)
    assert os.path.exists(tbam)
    it = iter(out.getvalue().split("\n"))
    assert next(it).strip() == "1\t9\t10\t4\t2"
    assert next(it).strip() == "1\t11\t12\t2\t1"
    assert next(it).strip() == "1\t29\t30\t2\t1"

    bam_reader = Samfile(tbam)
    it = iter(bam_reader)
    r = next(it)
    assert r.pos == 4
    assert r.qname == "read8:UMI_ATTCAGGG"
    r = next(it)
    assert r.pos == 9
    assert r.qname == "read1:UMI_AAAAAGGG"
    r = next(it)
    assert r.pos == 9
    assert r.qname == "read4:UMI_AAAGGGGG"
    r = next(it)
    assert r.pos == 11
    assert r.qname == "read5:UMI_ATTTAGGG"
    bam_reader.close()
    os.remove(tbam)
Author: brwnj | Project: umitools | Lines: 31 | Source: test_umitools.py

Example 2: callbase

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb') # open your BAM file
    SF = open(snpsites, 'r')    # file listing the SNP sites
    RF = open(out, 'w')         # results file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        line = ParseSNPsitesLine(i)
        vcf_pos = line.pos-1  # change 1-based to 0-based
        vcf_refname = line.chrom
        vcf_Rbase = line.Rbase  # reference base at this site
        vcf_Abase = line.Abase  # alternate base at this site
        print('processing: %s %s...' % (vcf_refname, str(vcf_pos)))
        At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
        for col in BF.pileup(vcf_refname, vcf_pos, vcf_pos+1):
            if col.pos == vcf_pos:
                for j in col.pileups:  # count each aligned base
                    yourbase = j.alignment.seq[j.qpos]
                    if yourbase == 'A': At += 1
                    elif yourbase == 'T': Tt += 1
                    elif yourbase == 'C': Ct += 1
                    elif yourbase == 'G': Gt += 1
                    elif yourbase == 'N': Nt += 1
                    else: othert += 1
        RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (vcf_refname, \
str(vcf_pos+1), vcf_Rbase, vcf_Abase, str(At), str(Tt), str(Ct), str(Gt), \
str(Nt), str(othert)))
    BF.close()
    SF.close()
    RF.close()
Author: freemao | Project: call-base-each-snp-site | Lines: 32 | Source: callbase-pysam.py

Example 3: main

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    f = Samfile(args[0])
    header = f.header
    f.close()

    reflen = header['SQ'][0]['LN']

    BamIO.write(clip(BamIO.parse(args[0]), reflen), args[1], header=header)

    return 0
Author: trident01 | Project: BioExt-1 | Lines: 15 | Source: bamclip.py

Example 4: subsample

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), "subset")
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random() ** i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))

    count += i

    for n in ns:
        if n == min(ns):
            outdir = outdir_base + "_min"
        else:
            outdir = outdir_base + "{:04.1f}M".format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[: int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, "->", outdir
        stdout.flush()
        of = Samfile(path.join(outdir, "accepted_hits.bam"), mode="wb", template=sf)
        sample.sort(key=lambda item: (item[1].tid, item[1].pos))  # sort by (tid, pos) of the read
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
Author: petercombs | Project: EisenLab-Code | Lines: 50 | Source: SubSample.py

Example 5: removeEdgeMismatches

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
 def removeEdgeMismatches(self,bamFile,minDistance, minBaseQual):
     startTime=Helper.getTime()
     minDistance=int(minDistance)
     counter=0;j=0  
     num_lines = len(self.variantDict)
     Helper.info(" [%s] remove Missmatches from the first %s bp from read edges" % (startTime.strftime("%c"),str(minDistance)),self.logFile,self.textField)
     
     bamFile = Samfile(bamFile, "rb")
     
     for varKey in list(self.variantDict.keys()):  # list() allows deletion while iterating
         variant = self.variantDict[varKey]
         
         counter+=1
         if counter%10000==0:
             Helper.status('%s mm parsed ' % counter ,self.logFile, self.textField,"grey")
         
         keepSNP=False
         varPos=variant.position-1
         pileup_iter = bamFile.pileup(variant.chromosome, variant.position-1, variant.position)
         #walk through the pileup columns which overlap this position
         for x in pileup_iter:
             if x.pos == varPos:
                 for pileupread in x.pileups: #walk through the single reads
                     if not pileupread.is_del and not pileupread.is_refskip:
                         distance=abs(pileupread.alignment.alen-pileupread.query_position) if pileupread.alignment.is_reverse else pileupread.query_position
                         if distance >= minDistance:
                             #check readBase and Base Quality
                             if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt and pileupread.alignment.query_qualities[pileupread.query_position]>=minBaseQual:
                             #if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt:
                                 keepSNP=True
                                 
         if keepSNP==False:
             j+=1
             del self.variantDict[varKey]
     
     Helper.status('%s of %s variants were deleted' % (j,num_lines), self.logFile, self.textField,"black")
     Helper.printTimeDiff(startTime, self.logFile, self.textField)
     bamFile.close()
Author: djhn75 | Project: RNAEditor | Lines: 40 | Source: VariantSet.py

Example 6: split_samfile

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def split_samfile(sam_file, splits, prefix='', path=''):
    """Take a sam file and split it splits number of times.

    :path:    Where to put the split files.
    :prefix:  A prefix for the outfile names.
    :returns: A tuple of job files.
    """
    # Determine how many reads will be in each split sam file.
    num_lines = count_reads(sam_file)
    num_reads = int(int(num_lines)/splits) + 1

    # Get rid of starting path
    sam_name = os.path.basename(sam_file)

    # Subset the SAM file into X number of jobs
    cnt      = 0
    currjob  = 1
    suffix   = '.split_sam_' + str(currjob).zfill(4)
    run_file = os.path.join(path, prefix + sam_name + suffix)
    rmode    = 'rb' if sam_name.split('.')[-1] == 'bam' else 'r'  # check the extension, not the basename
    wmode    = 'wb'

    # Actually split the file
    outfiles = [run_file]
    with Samfile(sam_file, rmode) as in_sam:
        sam_split = Samfile(run_file, wmode, template=in_sam)
        for line in in_sam:
            cnt += 1
            if cnt < num_reads:
                sam_split.write(line)
            elif cnt == num_reads:
                # Check if next line is mate-pair. If so, don't split.
                line2     = next(in_sam)
                currjob  += 1
                suffix    = '.split_sam_' + str(currjob).zfill(4)
                run_file  = os.path.join(path, prefix + sam_name + suffix)
                new_sam   = Samfile(run_file, wmode, template=in_sam)
                outfiles.append(run_file)

                if line.qname == line2.qname:
                    sam_split.write(line)
                    sam_split.write(line2)
                    sam_split.close()
                    cnt = 0
                else:
                    sam_split.write(line)
                    sam_split.close()
                    new_sam.write(line2)
                    cnt = 0
                sam_split = new_sam
        sam_split.close()
    return tuple(outfiles)
Author: rmagoglia | Project: ASEr | Lines: 54 | Source: CountSNPASE.py

Example 7: parse_barcode

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def parse_barcode(bamfile):
	"""parses a sorted and index bam file, removes all cases where rna hits more than one spot in genome
	and writes to a file, create file for mutant and wildtype based on barcodes"""
	samfile = Samfile(bamfile, "rb")
	multi_hit_file = Samfile("MultiHit.bam","wb",template=samfile)
	mutant = Samfile("Mutant.bam","wb",template=samfile)
	wildtype = Samfile("Wildtype.bam","wb",template=samfile)
	for line in samfile.fetch():
		#if line.is_secondary:
		## does this hit to more than one spot in genome
		#	multi_hit_file.write(line)
		if "#GAGT"in line.qname or "#TTAG" in line.qname: 
		## write to mutant file
			mutant.write(line)
		elif "#ACCC" in line.qname or "#CGTA" in line.qname:
		### write to wildtype file
			wildtype.write(line)

	multi_hit_file.close()
	mutant.close()
	wildtype.close()
	samfile.close()
Author: gturco | Project: seqtools | Lines: 24 | Source: barcode_parser.py

Example 8: Samfile

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
	num = segment.query_name.split("|")[0]
	for etype, eset in errors.items():
		if num in eset:
			errors2segments[etype][num].append(segment)
			break

additional = defaultdict(list)
for fname in args.additional:
	tsamfile = Samfile(fname)
	for segment in tsamfile.fetch(until_eof=True):
		num = segment.query_name.split("|")[0]
		additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
	tsamfile.close()

for etype, d in errors2segments.items():
	with open(os.path.join(args.outdir, "%s_%s_error.txt" % etype), 'w') as f:
		for num, segments in d.items():
			if segments[0].is_reverse:
				seq = reverse_complement(segments[0].seq)
			else:
				seq = segments[0].seq

			f.write("%s\nnumber of read:\t%s\n\nSequence:\t%s\n\nSegments:\n\n" % ("_"*140, num, seq))
			for segment in segments:
				f.write("%s\t%s\t%d\t%s\t%d\t%s\n\n" % (segment.query_name.split("|")[2], samfile.getrname(segment.tid), segment.reference_start, segment.cigarstring, segment.get_tag("AS"), segment.query_name))
Author: afilipch | Project: nrlbio | Lines: 33 | Source: chiflex_explore_errors.py

Example 9: snp_workflow

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].values()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile( headerfile, "wh", header=header )
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.items():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom,ref in ref_genome.items():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.items():
        for gid,vcf in v.items():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.items():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        for snprow in allsnps:
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    msafile = unique_filename_in()
    with open(msafile,"w") as msa:
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0,0,0,0,0]
#......... remaining code omitted .........
Author: JoseEspinosa | Project: bbcflib | Lines: 103 | Source: snp.py

Example 10: estimate_table

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        k_nb -- Length of the k-mer used for counting.
        shift -- Shift (in bp) applied to each read's cut position.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict(); obsDictR = dict()
        expDictF = dict(); expDictR = dict()

        ct_reads_r=0
        ct_reads_f=0
        ct_kmers=0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if(not r.is_reverse): p1 = r.pos - (k_nb//2) - 1 + shift
                else: p1 = r.aend - (k_nb//2) + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if(p1 == prevPos): trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if(trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception: continue
                if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if(not r.is_reverse):
                    ct_reads_f+=1
                    try: obsDictF[currStr] += 1
                    except Exception: obsDictF[currStr] = 1
                else:
                    ct_reads_r+=1
                    try: obsDictR[currStr] += 1
                    except Exception: obsDictR[currStr] = 1


            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception: continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0,len(currStr)-k_nb):
                ct_kmers+=1
                # Counting k-mer in dictionary
                s = currStr[i:i+k_nb]
                try: expDictF[s] += 1
                except Exception: expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i+k_nb]
                try: expDictR[s] += 1
                except Exception: expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A","C","G","T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e,0.0) for e in kmerComb]) 
        bias_table_R = dict([(e,0.0) for e in kmerComb]) 
        for kmer in kmerComb:
#......... remaining code omitted .........
Author: Marvin84 | Project: reg-gen | Lines: 103 | Source: biasTable.py

Example 11: subsample

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def subsample(fn, ns=None, paired=False):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped)/max(ns)
        print("Read out ", i_weight)
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print("Counted ", i_weight)
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    if paired:
        read_2s = {}
    print(fn, count, i_weight)
    for i, read in enumerate(sf):
        key = random()**i_weight
        if not paired or read.is_read1:
            if len(sample) < max(ns):
                heappush(sample, (key, i+count, read))
            else:
                dropped = heappushpop(sample, (key, i+count, read))
                if paired:
                    read_2s.pop(dropped[-1].qname, None)
        elif paired:
            read_2s[read.qname] = read
        else:
            raise ValueError("I don't know how we got here")

    count += i

    for n in ns:
        outdir = outdir_base + '{:04.1f}M'.format(n/1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count))
        print(fn, '->', outdir)
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'),
                     mode='wb', template=sf)
        sample.sort(key=lambda heap_item: (heap_item[-1].tid, heap_item[-1].pos))
        missing_mates = 0
        for key, pos, read in sampN:
            of.write(read)
            if paired and read.is_proper_pair:
                if read.qname not in read_2s:
                    missing_mates += 1
                    continue
                of.write(read_2s[read.qname])
        of.close()
    sf.close()
    print(missing_mates)
    return [pos for key, pos, read in sample]  # heap entries are (key, index, read)
Author: petercombs | Project: HybridSliceSeq | Lines: 65 | Source: SubSample.py

Example 12: estimate_bias_kmer

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

#......... remaining code omitted .........
Author: CostaLab | Project: reg-gen | Lines: 103 | Source: Estimation.py

Example 13: estimate_bias_pwm

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
#......... remaining code omitted .........
Author: CostaLab | Project: reg-gen | Lines: 103 | Source: Estimation.py

Example 14: main

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def main():
	parser = OptionParser(usage=usage)
	#parser.add_option("-s", action="store_true", dest="sam_input", default=False,
					  #help="Input is in SAM format instead of BAM format")
	(options, args) = parser.parse_args()
	if len(args) != 4:
		parser.print_help()
		sys.exit(1)
	psl_filename = args[0]
	ref_filename = args[1]
	contigs_filename = args[2]
	bam_filename = args[3]
	liftover_dir = args[1]
	
	references, ref_chromosomes = read_fasta(ref_filename)
	refname_to_id = dict([(name,i) for i,name in enumerate(ref_chromosomes)])
	print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr)
	contigs, contig_names = read_fasta(contigs_filename)
	print('Read', len(contig_names), 'contigs.', file=sys.stderr)
	bam_header = {'HD': {'VN': '1.0'}, 'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)]) for chromosome in ref_chromosomes] }
	outfile = Samfile(bam_filename, 'wb', header=bam_header)

	line_nr = 0
	header_read = False
	for line in (s.strip() for s in open(psl_filename)):
		line_nr += 1
		if line.startswith('------'): 
			header_read = True
			continue
		if not header_read: continue
		fields = line.split()
		assert len(fields) == 21, 'Error reading PSL file, offending line: %d'%line_nr
		sizes = [int(x) for x in fields[18].strip(',').split(',')]
		contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
		ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
		assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
		strand = fields[8]
		contig_name = fields[9]
		ref_name = fields[13]
		assert strand in ['-','+']
		assert contig_name in contigs
		assert ref_name in references
		a = AlignedRead()
		a.qname = contig_name
		if strand == '+':
			a.seq = str(contigs[contig_name])
		else:
			a.seq = str(contigs[contig_name].reverse_complement())
		a.flag = (16 if strand == '+' else 0)
		a.rname = refname_to_id[ref_name]
		a.pos = ref_starts[0]
		a.mapq = 255
		qpos = contig_starts[0]
		refpos = ref_starts[0]
		cigar = []
		# soft-clipping at the start?
		if contig_starts[0] > 0:
			cigar.append((4,contig_starts[0]))
		longest_insertion = 0
		longest_deletion = 0
		total_matches = 0
		total_insertion = 0
		total_deletion = 0
		for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
			assert contig_start >= qpos
			assert ref_start >= refpos
			# insertion?
			if contig_start > qpos:
				insertion_length = contig_start - qpos
				longest_insertion = max(longest_insertion, insertion_length)
				total_insertion += insertion_length
				append_to_cigar(cigar, 1, insertion_length)
				qpos = contig_start
			# deletion?
			if ref_start > refpos:
				deletion_length = ref_start - refpos
				longest_deletion = max(longest_deletion, deletion_length)
				total_deletion += deletion_length
				append_to_cigar(cigar, 2, deletion_length)
				refpos = ref_start
			# strech of matches/mismatches
			append_to_cigar(cigar, 0, length)
			refpos += length
			qpos += length
			total_matches += length
		# soft-clipping at the end?
		if len(a.seq) > qpos:
			cigar.append((4,len(a.seq) - qpos))
		a.cigar = tuple(cigar)
		# only use contigs where longest deletion is <= 10000 bp
		if longest_deletion > 10000: continue
		# require at least 200 matching positions
		if total_matches < 200: continue
		# require the matching positions to make up at least 75 percent of the contig
		# (without counting parts of the contig that are insertions).
		if total_matches / (len(a.seq) - total_insertion) < 0.75: continue
		outfile.write(a)
	outfile.close()
Author: ALLBio | Project: allbiotc2 | Lines: 100 | Source: convert-blat-output.py

Example 15: print_reads

# Required import: from pysam import Samfile [as alias]
# Or: from pysam.Samfile import close [as alias]
def print_reads(reads_to_print, ref_name, header):
    output_name = "{0}_{1}.bam".format(args.output_base, ref_name)
    output_samfile = Samfile(output_name, "wb", header=header)
    for aln in reads_to_print:
        output_samfile.write(aln)
    output_samfile.close()
Author: alneberg | Project: bu | Lines: 8 | Source: split_bam_per_reference.py


Note: The pysam.Samfile.close examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. For redistribution and use, please refer to the corresponding project's license; do not reproduce without permission.