This article collects typical usage examples of the Python class CGAT.Genomics. If you are unsure how the Genomics class is used in practice, the curated code examples below may help.
Fifteen code examples of the Genomics class are shown below, sorted by popularity by default.
Example 1: updateIndels
def updateIndels(self, snp, is_negative_strand):
    contig = snp.chromosome
    lcontig = self.mFasta.getLength(contig)

    # get location of insertion/deletion. The location
    # is after position, hence get position and position + 1
    code = self.mAnnotations.getSequence(contig, "+", snp.pos, snp.pos + 2)
    self.mCode = code

    variants = snp.genotype.split("/")
    for variant in variants:
        if variant[0] == "*":
            self.mVariantType.append("W")
        elif variant[0] == "+":
            toinsert = variant[1:]
            self.mVariantType.append("I")
        elif variant[0] == "-":
            todelete = variant[1:]
            # deletions need to be looked at in a wider range
            self.mVariantType.append("D")
        else:
            raise ValueError("unknown variant sign '%s'" % variant[0])

    # ignore non-coding Indels
    if code[0] and code[1] not in 'abcABC':
        return

    if is_negative_strand:
        variants = [Genomics.complement(x) for x in variants]

    for reference_codon in self.mReferenceCodons:
        variants = snp.genotype.split("/")
        variants = [x[1:] for x in variants]

        for variant in variants:
            if len(variant) % 3 != 0:
                self.mVariantCodons.append("!")
            else:
                self.mVariantCodons.append(variant)

        self.mVariantAAs.extend(
            [Genomics.translate(x) for x in self.mVariantCodons])
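The classification above depends only on the samtools-style genotype string, so the key branches can be exercised on their own. Below is a minimal standalone sketch (hypothetical genotype value, no CGAT dependency) showing how the '+', '-' and '*' prefixes map to insertion, deletion and wildtype calls, and how the length-modulo-3 test used for the variant codons flags frameshifting indels:

# standalone illustration of the branches in updateIndels; the genotype is made up
genotype = "+AGT/-AG"

for variant in genotype.split("/"):
    if variant[0] == "*":
        vtype = "W"              # wildtype
    elif variant[0] == "+":
        vtype = "I"              # insertion
    elif variant[0] == "-":
        vtype = "D"              # deletion
    else:
        raise ValueError("unknown variant sign '%s'" % variant[0])

    bases = variant[1:]
    in_frame = len(bases) % 3 == 0
    print("%s -> type=%s, in_frame=%s" % (variant, vtype, in_frame))

# expected: +AGT is an in-frame insertion, -AG is a frameshifting deletion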
Example 2: countMotifs
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in
    sequences within fasta formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    while 1:
        try:
            seq = it.next()
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:
            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))
        positions.append((seq.title, pos))

    return positions
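A minimal way to call countMotifs might look as follows. This is a sketch only: it assumes CGAT and its FastaIterator module are importable, the file name sequences.fasta is a placeholder, and depending on the CGAT version FastaIterator may expect an open file handle rather than a path.

import re

# hypothetical motif table: (name, compiled regular expression)
motifs = [("ER6", re.compile("AGGTCA[ACGT]{6}AGGTCA")),
          ("TATA", re.compile("TATA[AT]A"))]

# "sequences.fasta" is a placeholder path; an open handle is passed here
positions = countMotifs(open("sequences.fasta"), motifs)

for title, hits in positions:
    for motif, strand, start, end in hits:
        print("%s\t%s\t%s\t%i\t%i" % (title, motif, strand, start, end))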
Example 3: updateSNPs
def updateSNPs(self, snp, is_negative_strand, pos):
    """update SNPs."""

    contig = snp.chromosome
    lcontig = self.mFasta.getLength(contig)
    reference_base = snp.reference_base

    if snp.genotype in "ACGTacgt":
        # homozygous substitution
        self.mVariantType.append("O")
    else:
        # heterozygous substitution
        self.mVariantType.append("E")

    # switch reference strand codon to correct strand
    if reference_base != "*" and is_negative_strand:
        reference_base = Genomics.complement(reference_base)

    # collect all possible variants of reference codons
    for reference_codon in self.mReferenceCodons:

        self.mReferenceAAs.append(Genomics.translate(reference_codon))

        # process single base changes
        variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)

        if reference_codon[pos] != reference_base:
            raise ValueError(
                "base mismatch at %i (codon=%s,%i): codon:%s != genome:%s; `%s`"
                % (snp.pos, reference_codon, pos, reference_codon[pos], reference_base, ";".join(map(str, snp)))
            )

        for variant_base in variant_bases:
            if is_negative_strand:
                variant_base = Genomics.complement(variant_base)

    self.mVariantAAs.extend([Genomics.translate(x) for x in self.mVariantCodons])
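updateSNPs relies on Genomics.resolveAmbiguousNA to expand a heterozygous genotype code into candidate bases before mapping them back to the transcript strand. The sketch below imitates that expansion with a plain IUPAC dictionary and an explicit complement table, so the strand handling can be followed without CGAT; the genotype value is made up.

# minimal stand-in for Genomics.resolveAmbiguousNA and Genomics.complement
IUPAC = {"A": "A", "C": "C", "G": "G", "T": "T",
         "R": "AG", "Y": "CT", "S": "CG", "W": "AT",
         "K": "GT", "M": "AC"}
COMPLEMENT = {"A": "T", "C": "G", "G": "C", "T": "A"}

genotype = "R"                # heterozygous A/G call (hypothetical)
is_negative_strand = True

for variant_base in IUPAC[genotype]:
    if is_negative_strand:
        variant_base = COMPLEMENT[variant_base]
    print("variant base on the transcript strand: %s" % variant_base)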
Example 4: main
#......... part of the code omitted here .........
        ninput = len(seqs)

        map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) )

        matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix,
                                                          samples = options.iterations )

        results = matcher.run( masked_seqs,
                               options.arrangements,
                               qvalue_threshold = options.qvalue_threshold )

        if options.combine:
            results = Nubiscan.combineMotifs( results )

        for r in results:

            if r.alternatives:
                alternatives = ",".join( [x.arrangement for x in r.alternatives ] )
            else:
                alternatives = ""

            options.stdout.write( "\t".join( (
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives) ) )

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-": s = Genomics.complement( s )
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write( "\t%s" % s )

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile( "fdr" )
            outfile.write("bin\thist\tnobserved\n" )
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()

    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run( seq.sequence,
                              options.arrangements,
                              qvalue_threshold = None )
            for m in mm:
                matches.append( m._replace( sequence = seq.title ) )

        # estimate qvalues for all matches across all sequences
Example 5: main
def main( argv = None ):

    parser = E.OptionParser( version = "%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
                             usage = globals()["__doc__"] )

    parser.add_option( "-c", "--is-cds", dest="is_cds", action="store_true",
                       help = "input are cds (nucleotide) sequences [%default]" )

    parser.set_defaults(
        is_cds = False,
    )

    (options, args) = E.Start( parser, argv = argv )

    options.stdout.write( "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n" )

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate( options.stdin ):
        identifier = entry.title

        if options.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate( cds_sequence )
            weights = []

            for pos, cds_pos in enumerate(range( 0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos+3]
                counts = collections.defaultdict(int)
                for x in range(0,3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna: continue
                        taa = Genomics.translate(codon[:x] + na + codon[x+1:])
                        counts[taa] += 1
                weights.append( counts )

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet: counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate( sequence ):
            if ref not in alphabet: continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref: continue
                snpid += 1
                options.stdout.write(
                    "%s\n" % "\t".join(
                        ( "%010i" % snpid,
                          identifier,
                          str(pos+1),
                          ref,
                          variant,
                          "%i" % w[variant],
                          "%6.4f" % (w[variant] / t),
                          )))

    E.Stop()
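The weighting loop above counts, for each codon, how many single-base substitutions yield each alternative amino acid. The fragment below isolates that inner computation for a single codon; it assumes CGAT is installed and that Genomics.translate performs standard codon translation, as in the script above.

import collections
from CGAT import Genomics

codon = "ATG"                      # example codon (methionine)
counts = collections.defaultdict(int)

for x in range(3):
    for na in "ACGT":
        if na == codon[x]:
            continue
        taa = Genomics.translate(codon[:x] + na + codon[x + 1:])
        counts[taa] += 1

# nine single-base neighbours in total, grouped by the amino acid they encode
print(dict(counts))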
Example 6: updateVariants
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use same coordinate
    system (and strand) as the transcript

    fixes 1-ness of variants
    '''
    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:

            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:

                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [x + "-" * (lvariant - len(x))
                               for x in variantseqs]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper()) for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(ExtendedVariant._make((
            start, end, reference.upper(), action, has_wildtype, variantseqs)))

    return new_variants
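The final block of updateVariants maps a half-open interval [start, end) from forward-strand coordinates to reverse-strand coordinates of a contig of length lcontig with start, end = lcontig - end, lcontig - start. A tiny standalone check of that arithmetic, using the toy numbers from the comments above (deletion of bases 2 and 3 on a contig of length 7):

lcontig = 7

# deletion of bases 2 and 3 on the forward strand, half-open interval
start, end = 2, 4

# the same bases addressed from the reverse strand
start, end = lcontig - end, lcontig - start
print(start, end)        # -> 3 5, matching the worked example in the comments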
Example 7: _alignToProfile
def _alignToProfile( infile, outfile,
                     min_score = 0 ):
    '''align sequences in *infile* against mali

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.log`.
    '''

    mali = Mali.Mali()
    mali.readFromFile( open("../data/mouse.fasta") )
    src_mali = Mali.convertMali2Alignlib( mali )

    E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0,2):
            profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )

    profile_mali = Mali.convertMali2Alignlib( profile_mali )
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile( profile_mali )

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull( alignment_mode,
                                              -5.0,
                                              -0.5 )

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal( 0, n, 0 )
    build_mali.add( src_mali, m )

    outf = open( outfile, "w" )
    outf_log = open( outfile + ".info", "w" )
    outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []
    for pid in mali.getIdentifiers():
        sequences.append( re.sub( "-", "", mali[pid] ) )
        ids.append( pid )

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator( open(infile)):

        E.debug("adding %s" % s.title )
        c.input += 1
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence( s.sequence )
        rseq = alignlib.makeSequence( rsequence )

        alignator.align( map_seq2profile, seq, profile )
        alignator.align( map_rseq2profile, rseq, profile )

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score:
            c.skipped += 1
            continue

        r = getParts( m )

        covered = 0
        for mm in r:
            build_mali.add( mm )
            sequences.append( sequence )
#......... part of the code omitted here .........
Example 8: _buildAllele
#......... part of the code omitted here .........
              ", len(exon)=", exon.end - exon.start, \
              ", offsets=%i,%i," % (start_offset, end_offset), \
              ", offset at start=", _getOffset( exon.start, offsets), \
              ", offset at end=", _getOffset(exon.end, offsets)

    for exon in transcript[1:]:

        last_exon_sequence = exon_sequence
        last_start_offset, last_end_offset = start_offset, end_offset

        # get the next intron/exon parameters
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        start_offset, end_offset = _getEndOffsets(exon_sequence)
        intron_key = (last_end, exon.start)

        if last_end == exon.start:
            # catch empty introns
            intron_sequence = []
            intron_key = None
        else:
            intron_sequence = introns[intron_key]

        intron_seq = "".join(intron_sequence)

        ###################################################
        ###################################################
        ###################################################
        # add preceding intron
        new_exon = True

        if len(intron_seq) > frameshiftsize:

            intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                intron_seq)
            if intron_name == "unknown":
                if intron_seq[:2].islower() and intron_seq[-2:].islower():
                    E.debug("%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s" %
                            (transcript_id, intron_name, intron_seq5, intron_seq3))
                    nsplice_noncanonical += 1
                else:
                    is_splice_truncated = True
                    E.debug("%s: transcript has splice truncated allele: %s: %s:%s" %
                            (transcript_id, intron_name, intron_seq5, intron_seq3))
                    break

            # start a new exon
            cds_starts.append(lcds)

        else:
            # treat as frameshifting intron
            #
            # frame-shifting introns are checked if they are
            # fixed by indels either in the intron itself or
            # the terminal exon sequence. To this end, the effective
            # size of the intron is computed:
            # effective size of intron =
            #   indels at terminal x bases at previous exon
            #   + size of intron
            #   + indels at terminal x bases at next exon
            effective_intron_size = len(intron_seq)
            previous_indels = _sumIndels(
                last_exon_sequence[max(0, -frameshiftsize):])
            next_indels = _sumIndels(exon_sequence[:frameshiftsize])
            effective_intron_size += previous_indels + next_indels

            if previous_indels + next_indels == 0 and len(intron_seq) % 3 == 0:
Example 9: Align
def Align(self, method, anchor=0, loglevel=1):
    """align a pair of sequences.
    get rid of this and use a method class instead in the future
    """

    map_a2b = alignlib_lite.py_makeAlignmentVector()
    s1 = "A" * anchor + self.mSequence1 + "A" * anchor
    s2 = "A" * anchor + self.mSequence2 + "A" * anchor

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign(self.mOptionsDialign)
        dialign.Align(s1, s2, map_a2b)
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
        blastz.Align(s1, s2, map_a2b)
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement(self.mSequence2)
    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
        dialignlgs.Align(s1, s2, map_a2b)
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align(s1, s2, map_a2b)
    elif method == "clustal":
        raise NotImplementedError("clustal wrapper needs to be updated")
        clustal = WrapperClustal.Clustal()
        clustal.Align(s1, s2, map_a2b)
    elif method == "nw":
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_GLOBAL,
                                                         gop=-12.0,
                                                         gep=-2.0)
        alignator.align(map_a2b, seq1, seq2)
    elif method == "sw":
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        alignlib_lite.py_performIterativeAlignment(
            map_a2b, seq1, seq2, alignator_sw, min_score_sw)
    else:
        # use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        map_a2b.removeRowRegion(
            anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
        map_a2b.removeRowRegion(1, anchor)
        map_a2b.removeColRegion(
            anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
        map_a2b.removeColRegion(1, anchor)
        map_a2b.moveAlignment(-anchor, -anchor)

    f = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
                                                 alignlib_lite.py_makeSequence(
                                                     self.mSequence1),
                                                 alignlib_lite.py_makeSequence(self.mSequence2))

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()
    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
Example 10: buildSequenceVariants
def buildSequenceVariants(self, seq, strand, pos, snp):
    '''build new sequence by modifying a sequence fragment in seq at
    pos with snp.

    It is assumed that seq is already oriented according to strand.
    The strand is used to revert the snp if necessary.

    Note that only sequences different from seq will be returned.

    returns is_homozygous, seqs
    '''
    is_negative_strand = Genomics.IsNegativeStrand(strand)
    reference_base = snp.reference_base

    if reference_base != "*" and is_negative_strand:
        reference_base = Genomics.complement(reference_base)

    new_sequences = []
    is_homozygous = True
    if reference_base != "*":
        if seq[pos].upper() != reference_base.upper():
            raise ValueError("base mismatch at snp %i, expected %s, got %s in %s at position %i; snp=%s" %
                             (snp.pos, reference_base, seq[pos], seq, pos,
                              ";".join(map(str, snp))))

        # single base changes
        variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
        if len(variant_bases) == 1:
            is_homozygous = True
        else:
            is_homozygous = False

        for variant_base in variant_bases:
            if is_negative_strand:
                variant_base = Genomics.complement(variant_base)

            s = list(seq)
            s[pos] = variant_base
            s = "".join(s)
            if s != seq:
                new_sequences.append(s)
    else:
        variants = snp.genotype.split("/")
        is_homozygous = False
        for variant in variants:

            s = list(seq)
            # samtools denotes insert/deletion after position
            # while python is before/at position, hence the pos+1
            if variant[0] == "+":
                toinsert = variant[1:].upper()
                if is_negative_strand:
                    toinsert = Genomics.complement(toinsert)
                    s.insert(pos, toinsert)
                else:
                    s.insert(pos + 1, toinsert)
            elif variant[0] == "-":
                # pos+1+len(x)-1 = pos+len(x)
                todelete = variant[1:].upper()

                l = len(todelete)

                if is_negative_strand:
                    # delete left of pos
                    xstart = max(0, pos - l)
                    xend = pos
                    todelete = todelete[:min(l, pos)]
                else:
                    # delete right of pos
                    xstart = pos + 1
                    xend = min(self.mSize, pos + 1 + l)
                    todelete = todelete[:self.mSize - (pos + 1)]

                deleted = "".join(s[xstart:xend])

                if is_negative_strand:
                    deleted = Genomics.complement(deleted)

                if deleted != todelete:
                    raise ValueError("base mismatch at indel %i, expected %s, got %s in %s at position %i(%i:%i); is_negative_strand=%s, snp=%s" %
                                     (snp.pos, todelete, deleted, seq, pos, xstart, xend,
                                      is_negative_strand,
                                      ";".join(map(str, snp))))
                del s[xstart:xend]

            elif variant[0] == "*":
                is_homozygous = True
            else:
                raise ValueError("unknown variant sign '%s'" % variant[0])

            s = "".join(s)
            if s != seq:
                new_sequences.append(s)

    return is_homozygous, new_sequences
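The indel branch above follows the samtools convention that insertions and deletions apply after the reported position, which is why pos + 1 appears on the positive strand. A standalone sketch of that convention on the forward strand (toy sequence and calls, no CGAT):

seq = "ACGTACGT"
pos = 2                                 # position of the call (0-based here)

# insertion "+TT": the new bases go in after seq[pos]
s = list(seq)
s.insert(pos + 1, "TT")
print("".join(s))                       # -> ACGTTTACGT

# deletion "-TA": the bases immediately after seq[pos] are removed
todelete = "TA"
s = list(seq)
xstart, xend = pos + 1, pos + 1 + len(todelete)
assert "".join(s[xstart:xend]) == todelete
del s[xstart:xend]
print("".join(s))                       # -> ACGCGT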
Example 11: getSequence
def getSequence(self, contig, strand="+", start=0, end=0, converter=None, as_array=False):
    """get a genomic fragment.

    A genomic fragment is identified by the coordinates
    contig, strand, start, end.

    The converter function supplied translated these coordinates
    into 0-based coordinates. By default, start and end are assumed
    to be pythonic coordinates and are forward/reverse coordinates.

    If as_array is set to true, return the AString object. This might
    be beneficial for large sequence chunks. If as_array is set to False,
    return a python string.
    """
    contig = self.getToken(contig)

    data = self.mIndex[contig]
    # dummy is
    # -> pos_seq for seekable streams
    # -> block_size for unseekable streams
    try:
        pos_id, dummy, lsequence = struct.unpack("QQi", data)
    except (struct.error, TypeError):
        pos_id, dummy, lsequence, points = data

    pos_seq = dummy
    block_size = dummy

    if end == 0:
        end = lsequence

    if end > lsequence:
        raise ValueError("3' coordinate on %s out of bounds: %i > %i" % (contig, end, lsequence))
    if start < 0:
        raise ValueError("5' coordinate on %s out of bounds: %i < 0" % (contig, start))

    if converter:
        first_pos, last_pos = converter(start, end, str(strand) in ("+", "1"), lsequence)
    elif self.mConverter:
        first_pos, last_pos = self.mConverter(start, end, str(strand) in ("+", "1"), lsequence)
    else:
        first_pos, last_pos = start, end
        if str(strand) in ("-", "0", "-1"):
            first_pos, last_pos = lsequence - last_pos, lsequence - first_pos

    if first_pos == last_pos:
        return ""

    assert first_pos < last_pos, "first position %i is larger than last position %i " % (first_pos, last_pos)

    p = AString()

    if self.mNoSeek:
        # read directly from position
        p.fromstring(self.mDatabaseFile.read(block_size, data[3], first_pos, last_pos))
    else:
        first_pos += pos_seq
        last_pos += pos_seq

        self.mDatabaseFile.seek(first_pos)
        p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))

    if str(strand) in ("-", "0", "-1"):
        p = AString(Genomics.complement(str(p)))

    if self.mTranslator:
        return self.mTranslator.translate(p)
    elif as_array:
        return p
    else:
        if IS_PY3:
            return p.tostring().decode("ascii")
        else:
            return p.tostring()
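A minimal call pattern for this method, assuming a genome previously indexed with CGAT's IndexedFasta tools; the database name, contig and coordinates are placeholders. Note that with the default converter, minus-strand requests are interpreted in reverse-strand coordinates and the returned fragment is complemented via Genomics.complement, as the code above shows.

from CGAT import IndexedFasta

# "hg19" is a placeholder for a previously built IndexedFasta database
fasta = IndexedFasta.IndexedFasta("hg19")

# forward-strand fragment, pythonic (0-based, half-open) coordinates
forward = fasta.getSequence("chr1", "+", 10000, 10050)

# minus-strand request: start/end are taken as reverse-strand coordinates
reverse = fasta.getSequence("chr1", "-", 10000, 10050)

print(forward)
print(reverse)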
Example 12: main
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals

            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                c = s2[x:x + tuple_size]
                for y in d[c]:
                    offsets[x - y] += 1

            # find maximum diagonal
            sorted = sorted([(y, x) for x, y in offsets.items()])
            max_count, max_offset = sorted[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write(new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
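The join method works by voting for the most frequent diagonal between 2-mers of the first read and the reverse-complemented second read, then appending the part of the second read that extends past the overlap. The sketch below re-derives that diagonal voting on two toy strings that already face the same way; it is a simplified reimplementation (no quality strings, and the merge is expressed directly through the detected offset rather than the take/max_offset bookkeeping above):

import collections

s1 = "AAACCCGGGT"        # toy read 1
s2 = "GGGTTTACGT"        # toy read 2, already reverse-complemented
tuple_size = 2

# index the 2-mers of s1 by position
index = collections.defaultdict(list)
for x in range(len(s1) - tuple_size):
    index[s1[x:x + tuple_size]].append(x)

# vote for the offset of s2 within s1 supported by shared 2-mers
offsets = collections.defaultdict(int)
for x in range(len(s2) - tuple_size):
    for y in index[s2[x:x + tuple_size]]:
        offsets[y - x] += 1

best = max(offsets, key=offsets.get)     # most supported diagonal
merged = s1[:best] + s2
print(best, merged)                      # -> 6 AAACCCGGGTTTACGT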
Example 13: main
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query", type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target", type="string",
                      help="fasta filename with target.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "full", "pileup-query", "pileup-target", "gapless"),
                      help="method to use for constructing the alignment [%default].")

    parser.add_option("--forward-query", dest="forward_query", action="store_true",
                      help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix", type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix", type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    if options.filename_query:
        query = IndexedFasta.IndexedFasta(options.filename_query)

    if options.filename_target:
        target = IndexedFasta.IndexedFasta(options.filename_target)

    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment(-min(match.mQueryBlockStarts), -
                        min(match.mSbjctBlockStarts))
        q = query.getSequence(
            match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
                             (options.query_prefix,
                              options.output_format_id % id,
                              match.mQueryId, match.mQueryFrom, match.mQueryTo,
                              query_ali,
                              options.target_prefix,
                              options.output_format_id % id,
                              match.mSbjctId, match.strand,
                              match.mSbjctFrom, match.mSbjctTo,
                              sbjct_ali))
        id += 1

    E.Stop()
Example 14: len
    genome = IndexedFasta.IndexedFasta( options.genome_file )

    assert options.filename_regions != None, "please supply a gff formatted filename with regions"

    regions = GTF.readAsIntervals( GFF.iterator( IOTools.openFile(options.filename_regions, "r" ) ) )

    # build pairs for complement
    reverse_splice_pairs = []
    forward_splice_pairs = options.splice_pairs
    left_tokens, right_tokens = {}, {}
    x = 0
    for a,b in forward_splice_pairs:
        assert len(a) == 2, "only two-residue patterns allowed"
        assert len(b) == 2, "only two-residue patterns allowed"

        ca, cb = Genomics.complement( a ), Genomics.complement( b )
        reverse_splice_pairs.append( (b,a) )
        left_tokens[a] = x
        left_tokens[cb] = x+1
        right_tokens[b] = x
        right_tokens[ca] = x+1
        x += 2

    search_area = options.search_area
    read_length = options.read_length
    joined = options.joined

    ninput, noutput = 0, 0

    if joined:
        outfile_coordinates = IOTools.openFile( options.output_filename_pattern % "coords", "w" )
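The loop above registers every splice pair twice, once as given and once through the complement of the opposite end, so that introns on either strand can be recognised by their terminal dinucleotides. A standalone sketch of the resulting token tables for the canonical GT/AG pair, with a local reverse-complement helper standing in for Genomics.complement:

def revcomp(s):
    # local helper used here in place of Genomics.complement
    return "".join({"A": "T", "C": "G", "G": "C", "T": "A"}[c] for c in reversed(s))

left_tokens, right_tokens = {}, {}
x = 0
for a, b in [("GT", "AG")]:             # canonical donor/acceptor pair
    ca, cb = revcomp(a), revcomp(b)     # -> "AC", "CT"
    left_tokens[a] = x                  # forward-strand donor
    left_tokens[cb] = x + 1             # a reverse-strand intron starts with "CT"
    right_tokens[b] = x                 # forward-strand acceptor
    right_tokens[ca] = x + 1            # a reverse-strand intron ends with "AC"
    x += 2

print(left_tokens)                      # {'GT': 0, 'CT': 1}
print(right_tokens)                     # {'AG': 0, 'AC': 1}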
Example 15: main
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tracks", dest="tracks", type="string", action="append",
                      help="tracks (tablenames) to use in sqlite database [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")
    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the alignment header [default=%default].")
    parser.add_option("-z", "--compress", dest="compress", action="store_true",
                      help="compress output with gzip [default=%default].")
    parser.add_option("-p", "--pattern-identifier", dest="pattern_track", type="string",
                      help="regular expression pattern for track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        reference_name="reference",
        pattern_track="(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.is_gtf:
        infile_gff = GTF.iterator(options.stdin)
    else:
        infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    statement = '''SELECT pos, reference, genotype
                   FROM %(track)s
                   WHERE contig = '%(contig)s' AND
                   pos BETWEEN %(extended_start)s and %(extended_end)s
                '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0] for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." % options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        if contig.startswith("chr"):
            contig = contig[3:]

        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
#......... part of the code omitted here .........