当前位置: 首页>>代码示例>>Python>>正文


Python Samfile.fetch方法代码示例

本文整理汇总了Python中pysam.Samfile.fetch方法的典型用法代码示例。如果您正苦于以下问题:Python Samfile.fetch方法的具体用法?Python Samfile.fetch怎么用?Python Samfile.fetch使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pysam.Samfile的用法示例。


在下文中一共展示了Samfile.fetch方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse_barcode

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def parse_barcode(bamfile):
	"""parses a sorted and index bam file, removes all cases where rna hits more than one spot in genome
	and writes to a file, create file for mutant and wildtype based on barcodes"""
	samfile = Samfile(bamfile, "rb")
	multi_hit_file = Samfile("MultiHit.bam","wb",template=samfile)
	mutant_one = Samfile("MutantOne.bam","wb",template=samfile)
	wildtype_one = Samfile("WildtypeOne.bam","wb",template=samfile)
	mutant_two = Samfile("MutantTwo.bam","wb",template=samfile)
	wildtype_two = Samfile("WildtypeTwo.bam","wb",template=samfile)
	for line in samfile.fetch():
		#if line.is_secondary:
		## does this hit to more than one spot in genome
		#	multi_hit_file.write(line)
		if "#GAGT"in line.qname: 
		## write to mutant file
			mutant_one.write(line)
		elif "#TTAG" in line.qname:
			mutant_two.write(line)
		elif "#ACCC" in line.qname:
		### write to wildtype file
			wildtype_one.write(line)
		elif "#CGTA" in line.qname:
		### write to wildtype file
			wildtype_two.write(line)

	multi_hit_file.close()
	mutant_one.close()
	wildtype_one.close()
	mutant_two.close()
	wildtype_two.close()
	samfile.close()
开发者ID:gturco,项目名称:seqtools,代码行数:33,代码来源:barcode_parser.py

示例2: main

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, "rb")

    #Iterates over each read instead of each contig
    outputs = defaultdict(list)
    #import ipdb; ipdb.set_trace()
    for aln in samfile.fetch(until_eof = True):
        ref = samfile.getrname(aln.tid)
        outputs[ref].append(aln)

    for ref, alns in outputs.iteritems():
        print_reads(alns, ref, samfile.header)
开发者ID:alneberg,项目名称:bu,代码行数:15,代码来源:split_bam_per_reference.py

示例3: get_raw_tracks

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def get_raw_tracks(args):
    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    regions = GenomicRegionSet("Interested regions")
    regions.read(args.input_files[1])
    regions.merge()
    reads_file = GenomicSignal()

    with open(output_fname, "a") as output_f:
        for region in regions:
            # Raw counts
            signal = [0.0] * (region.final - region.initial)
            for read in bam.fetch(region.chrom, region.initial, region.final):
                if not read.is_reverse:
                    cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0
                else:
                    cut_site = read.aend + args.reverse_shift - 1
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

            if args.norm:
                signal = reads_file.boyle_norm(signal)
                perc = scoreatpercentile(signal, 98)
                std = np.std(signal)
                signal = reads_file.hon_norm_atac(signal, perc, std)

            output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                           "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
    output_f.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
        os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"]))
        os.remove(output_fname)
开发者ID:CostaLab,项目名称:reg-gen,代码行数:47,代码来源:Tracks.py

示例4: main

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, "rb")
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    #Iterates over each read instead of each contig
    reads_to_print = []
    for aln in samfile.fetch(until_eof = True):
        if pair_is_aligned(aln, ref_ids):
            if args.read_pair == 1 and aln.is_read1:
                reads_to_print.append(aln)
            elif args.read_pair == 2 and aln.is_read2:
                reads_to_print.append(aln)
            elif args.read_pair == 0:
                reads_to_print.append(aln)
        if len(reads_to_print) >= 10000:
            # Flush the reads collected
            print_reads(reads_to_print)
            reads_to_print = []

    print_reads(reads_to_print)
开发者ID:alneberg,项目名称:bu,代码行数:22,代码来源:extract_reads_from_bam.py

示例5: main

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    #Iterates over each read instead of each contig
    reads_to_print_1 = []
    reads_to_print_2 = []
    reads_to_print_u = []
    for aln in samfile.fetch(until_eof = True):
        if aln.tid in ref_ids: # This read is aligned
            if aln.rnext in ref_ids: # The mate is also aligned
                if aln.is_read1:
                    reads_to_print_1.append(aln)
                    reads_to_print_1 = flush_reads(reads_to_print_1, args.R1)
                elif aln.is_read2:
                    reads_to_print_2.append(aln)
                    reads_to_print_2 = flush_reads(reads_to_print_2, args.R2)
            else:
                reads_to_print_u.append(aln)
                reads_to_print_u = flush_reads(reads_to_print_u, args.u)

    print_reads(reads_to_print_1, args.R1)
    print_reads(reads_to_print_2, args.R2)
    print_reads(reads_to_print_u, args.u)
开发者ID:alneberg,项目名称:bu,代码行数:26,代码来源:extract_reads_from_bam_and_separate.py

示例6: ace

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
        genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print >> fw, "AS {0} {1}".format(ncontigs, totalreads)
    print >> fw

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print >> fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
                nsegments)
        print >> fw, fill(str(cseq.seq))
        print >> fw

        if astat:
            astat = Astat(nbases, nreads, genomesize, totalreads)
            print >> astatfw, "{0}\t{1:.1f}".format(contig, astat)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print >> fw, "BQ\n{0}".format(text)
        print >> fw

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print >> readsfw, readname
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print >> fw, af

        print >> fw

        for a, rname in zip(mapped_reads, rnames):
#.........这里部分代码省略.........
开发者ID:arvin580,项目名称:jcvi,代码行数:103,代码来源:sam.py

示例7: get_raw_signal

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def get_raw_signal(arguments):
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size),
                ("G", [0.0] * window_size), ("T", [0.0] * window_size),
                ("N", [0.0] * window_size)])

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    for region in mpbs_regions:
        if motif_len is None:
            motif_len = region.final - region.initial

        mid = (region.final + region.initial) / 2
        p1 = mid - window_size / 2
        p2 = mid + window_size / 2

        if p1 <= 0:
            continue

        # Fetch raw signal
        for read in bam1.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_1[cut_site - p1] += 1.0

        for read in bam2.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue

            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal_2[cut_site - p1] += 1.0

        update_pwm(pwm, fasta, region, p1, p2)

    return signal_1, signal_2, motif_len, pwm, num_motif
开发者ID:CostaLab,项目名称:reg-gen,代码行数:74,代码来源:DifferentialAnalysis.py

示例8: open

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
  intFile = open(intFileName,"r")
  spr = 0.0
  counter = 0.0
  for line in intFile:

    # Fetching signal
    ll = line.strip().split("\t")
    mLen = int(ll[2]) - int(ll[1])
    mid = (int(ll[1])+int(ll[2]))/2
    p1 = max(mid - halfWindow,0)
    p2 = mid + halfWindow

    # Fetch raw signal
    pileup_region = PileupRegion(p1,p2,1)
    if(ps_version == "0.7.5"):
      bam.fetch(reference=ll[0], start=p1, end=p2, callback = pileup_region)
    else:
      iter = bam.fetch(reference=ll[0], start=p1, end=p2)
      for alignment in iter: pileup_region.__call__(alignment)
    raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
    
    # Std-based clipping
    mean = raw_signal.mean()
    std = raw_signal.std()
    clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

    # Bias Correction
    correctedSignal = bias_correction(bam, clip_signal, biasTableF, biasTableR, genomeFileName, ll[0], p1, p2)

    # Summing min value to signal
    stdzSignal = [e+minValue for e in correctedSignal]
开发者ID:CostaLab,项目名称:reg-gen,代码行数:33,代码来源:protectionScore.py

示例9: __init__

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope
    signals from a bam file.
    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.
    """

    def __init__(self, file_name=None):
        """ 
        Initializes GenomicSignal.
        """
        self.file_name = file_name
        self.sg_coefs = None
        if file_name is not None:
            self.bam = Samfile(file_name, "rb")

    def load_sg_coefs(self, slope_window_size):
        """ 
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                      initial_clip=1000):
        """
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        downstream_ext -- Number of bps to extend towards the downstream region (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand. Can be a positive number for a shift towards the downstream region (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand. Can be a positive number for a shift towards the upstream region and a negative number for a shift towards the downstream region (towards the inside of the aligned read).
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.

        Return:
        tag_count -- Total signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            iter = self.bam.fetch(reference=ref, start=start, end=end)
            for alignment in iter: pileup_region.__call__(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Tag count
        try:
            tag_count = sum(raw_signal)
        except Exception:
            tag_count = 0

        return tag_count

    def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                   initial_clip=1000, per_norm=98, per_slope=98,
                   bias_table=None, genome_file_name=None, print_raw_signal=False):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Genome to perform bias correction.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
#.........这里部分代码省略.........
开发者ID:CostaLab,项目名称:reg-gen,代码行数:103,代码来源:signalProcessing.py

示例10: defaultdict

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
if(not os.path.exists(args.outdir)):
    os.makedirs(args.outdir)
#________________________________________________________________________________________________________________
#get error dict;
#________________________________________________________________________________________________________________
errors = defaultdict(set)
with open(args.errors, 'r') as f:
	for l in f:
		a = l.strip().split("\t");
		if(not args.etype or ",".join(a[:2]) in args.etype):
			errors[tuple(a[:2])].add(a[2]);
			
errors2segments = defaultdict(lambda: defaultdict(list));
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
	num = segment.query_name.split("|")[0]
	for etype, eset in errors.iteritems():
		if(num in eset):
			errors2segments[etype][num].append(segment);
			break;
		
		
additional = defaultdict(list);
for fname in args.additional:
	tsamfile = Samfile(fname);
	for segment in tsamfile.fetch(until_eof=True):
		num = segment.query_name.split("|")[0]
		additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
	tsamfile.close();
		
开发者ID:afilipch,项目名称:nrlbio,代码行数:31,代码来源:chiflex_explore_errors.py

示例11: estimate_table

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict(); obsDictR = dict()
        expDictF = dict(); expDictR = dict()

        ct_reads_r=0
        ct_reads_f=0
        ct_kmers=0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
                else: p1 = r.aend - (k_nb/2) + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if(p1 == prevPos): trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if(trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception: continue
                if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if(not r.is_reverse):
                    ct_reads_r+=1
                    try: obsDictF[currStr] += 1
                    except Exception: obsDictF[currStr] = 1
                else:
                    ct_reads_f+=1
                    try: obsDictR[currStr] += 1
                    except Exception: obsDictR[currStr] = 1 


            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception: continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0,len(currStr)-k_nb):
                ct_kmers+=1
                # Counting k-mer in dictionary
                s = currStr[i:i+k_nb]
                try: expDictF[s] += 1
                except Exception: expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i+k_nb]
                try: expDictR[s] += 1
                except Exception: expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A","C","G","T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e,0.0) for e in kmerComb]) 
        bias_table_R = dict([(e,0.0) for e in kmerComb]) 
        for kmer in kmerComb:
#.........这里部分代码省略.........
开发者ID:Marvin84,项目名称:reg-gen,代码行数:103,代码来源:biasTable.py

示例12: estimate_bias_kmer

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

#.........这里部分代码省略.........
开发者ID:CostaLab,项目名称:reg-gen,代码行数:103,代码来源:Estimation.py

示例13: create_signal

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def create_signal(args, regions):
    def revcomp(s):
        rev_dict = dict([("A", "T"), ("T", "A"), ("C", "G"), ("G", "C"), ("N", "N")])
        return "".join([rev_dict[e] for e in s[::-1]])

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    for region in regions:
        # Fetching observed reads
        reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
        for read in reads:
            if not read.is_reverse:
                p1 = read.pos - int(floor(args.k_nb / 2)) + args.forward_shift - 1
            else:
                p1 = read.aend - int(floor(args.k_nb / 2)) + args.reverse_shift + 1
            p2 = p1 + args.k_nb
            try:
                dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if 'N' not in dna_sequence_obs:
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    r_obs_dict[dna_sequence_obs] += 1
                else:
                    f_obs_dict[dna_sequence_obs] += 1

        # Fetching whole sequence
        try:
            dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        dna_sequence_exp_rev = revcomp(dna_sequence_exp)
        for i in range(0, len(dna_sequence_exp) - args.k_nb):
            s = dna_sequence_exp[i:i + args.k_nb]
            if "N" not in s:
                f_exp_dict[s] += 1
            s = dna_sequence_exp_rev[i:i + args.k_nb]
            if "N" not in s:
                r_exp_dict[s] += 1

    output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb)))
    output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb)))
    output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb)))
    output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb)))

    output_file_f_obs = open(output_fname_f_obs, "w")
    output_file_f_exp = open(output_fname_f_exp, "w")
    output_file_r_obs = open(output_fname_r_obs, "w")
    output_file_r_exp = open(output_fname_r_exp, "w")

    for kmer in r_obs_dict.keys():
        if f_obs_dict[kmer] > 0:
            output_file_f_obs.write(kmer + "\t" + str(f_obs_dict[kmer]) + "\n")
    for kmer in r_obs_dict.keys():
        if f_exp_dict[kmer] > 0:
            output_file_f_exp.write(kmer + "\t" + str(f_exp_dict[kmer]) + "\n")
    for kmer in r_obs_dict.keys():
        if r_obs_dict[kmer] > 0:
            output_file_r_obs.write(kmer + "\t" + str(r_obs_dict[kmer]) + "\n")
    for kmer in r_obs_dict.keys():
        if r_exp_dict[kmer] > 0:
            output_file_r_exp.write(kmer + "\t" + str(r_exp_dict[kmer]) + "\n")

    output_file_f_obs.close()
    output_file_f_exp.close()
    output_file_r_obs.close()
    output_file_r_exp.close()
开发者ID:CostaLab,项目名称:reg-gen,代码行数:79,代码来源:Estimation.py

示例14: estimate_bias_pwm

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for i in range(0, len(s)):
                exp_f_pwm_dict[s[i]][i] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for i in range(0, len(s)):
                exp_r_pwm_dict[s[i]][i] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
#.........这里部分代码省略.........
开发者ID:CostaLab,项目名称:reg-gen,代码行数:103,代码来源:Estimation.py

示例15: load_sg_coefs

# 需要导入模块: from pysam import Samfile [as 别名]
# 或者: from pysam.Samfile import fetch [as 别名]
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope
    signals from a bam or bw file.
    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.

    Methods:

    load_sg_coefs(self, slope_window_size):
    Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

    get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 98, per_slope = 98)
    Gets the signal associated with self.bam or self.bw based on start, end and ext.
    initial_clip, per_norm and per_slope are used as normalization factors during the normalization
    and slope evaluation procedures.

    hon_norm(self, sequence, mean, std):
    Normalizes a sequence according to hon's criterion using mean and std.
    This represents a between-dataset normalization.

    boyle_norm(self, sequence):
    Normalizes a sequence according to Boyle's criterion.
    This represents a within-dataset normalization.

    savitzky_golay_coefficients(self, window_size, order, deriv):
    Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
    It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

    slope(self, sequence, sg_coefs):
    Evaluates the slope of sequence given the sg_coefs loaded.
    """

    def __init__(self, file_name):
        """ 
        Initializes GenomicSignal.
        """
        self.file_name = file_name
        self.bam = None
        self.bw = None
        self.sg_coefs = None
        self.is_bam = False
        self.is_bw = False
        if(self.file_name.split(".")[-1].upper() == "BAM"):
            self.is_bam = True
            self.bam = Samfile(file_name,"rb")
        elif(self.file_name.split(".")[-1].upper() == "BW" or self.file_name.split(".")[-1].upper() == "BIGWIG"):
            self.is_bw = True
            self.bw = BigWigFile(file_name)
        else: pass # TODO ERROR

    def load_sg_coefs(self, slope_window_size):
        """ 
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.
        
        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def get_tag_count(self, ref, start, end, ext, initial_clip = 1000, ext_both_directions=False):
        """ 
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extention. Eg. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        
        Return:
        tag_count -- Total signal.
        """

        # Fetch raw signal
        pileup_region = PileupRegion(start,end,ext)
        if(self.is_bam):
            if(ps_version == "0.7.5"):
                self.bam.fetch(reference=ref, start=start, end=end, callback = pileup_region)
            else:
                iter = self.bam.fetch(reference=ref, start=start, end=end)
                if(not ext_both_directions):
                    for alignment in iter: pileup_region.__call__(alignment)
                else:
                    for alignment in iter: pileup_region.__call2__(alignment)
            raw_signal = array([min(e,initial_clip) for e in pileup_region.vector])
        elif(self.is_bw):
            signal = self.bw.pileup(ref, start, end)
            raw_signal = array([min(e,initial_clip) for e in signal])

        # Std-based clipping
        mean = raw_signal.mean()
#.........这里部分代码省略.........
开发者ID:Marvin84,项目名称:reg-gen,代码行数:103,代码来源:signalProcessing.py


注:本文中的pysam.Samfile.fetch方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。