Python Genomics.complement方法代码示例

本文整理汇总了Python中CGAT.Genomics.complement方法的典型用法代码示例。如果您正苦于以下问题：Python Genomics.complement方法的具体用法？Python Genomics.complement怎么用？Python Genomics.complement使用的例子？那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类CGAT.Genomics的用法示例。

在下文中一共展示了Genomics.complement方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: countMotifs

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in
    sequences within fasta formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    while 1:
        try:
            seq = it.next()
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:

            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions

开发者ID:BioXiao，项目名称:cgat，代码行数:31，代码来源:Motifs.py

示例2: updateSNPs

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
    def updateSNPs(self, snp, is_negative_strand, pos):
        """update SNPs."""

        contig = snp.chromosome
        lcontig = self.mFasta.getLength(contig)
        reference_base = snp.reference_base

        if snp.genotype in "ACGTacgt":
            # homozygous substitution
            self.mVariantType.append("O")
        else:
            # heterozygous substitution
            self.mVariantType.append("E")

        # switch reference strand codon to correct strand
        if reference_base != "*" and is_negative_strand:
            reference_base = Genomics.complement(reference_base)

        # collect all possible variants of reference codons
        for reference_codon in self.mReferenceCodons:

            self.mReferenceAAs.append(Genomics.translate(reference_codon))

            # process single base changes
            variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)

            if reference_codon[pos] != reference_base:
                raise ValueError(
                    "base mismatch at %i (codon=%s,%i): codon:%s != genome:%s; `%s`"
                    % (snp.pos, reference_codon, pos, reference_codon[pos], reference_base, ";".join(map(str, snp)))
                )

            for variant_base in variant_bases:
                if is_negative_strand:
                    variant_base = Genomics.complement(variant_base)

        self.mVariantAAs.extend([Genomics.translate(x) for x in self.mVariantCodons])

开发者ID:nishantthakur，项目名称:cgat，代码行数:39，代码来源:snp2table.py

示例3: updateIndels

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
    def updateIndels(self, snp, is_negative_strand):

        contig = snp.chromosome
        lcontig = self.mFasta.getLength(contig)

        # get location of insertion/deletion. The location
        # is after position, hence get position and position + 1
        code = self.mAnnotations.getSequence(contig, "+", snp.pos, snp.pos + 2)
        self.mCode = code

        variants = snp.genotype.split("/")
        for variant in variants:

            if variant[0] == "*":
                self.mVariantType.append("W")

            elif variant[0] == "+":
                toinsert = variant[1:]
                self.mVariantType.append("I")

            elif variant[0] == "-":
                todelete = variant[1:]
                # deletions need to be looked at in a wider range
                self.mVariantType.append("D")

            else:
                raise ValueError("unknown variant sign '%s'" % variant[0])

        # ignore non-coding Indels
        if code[0] and code[1] not in 'abcABC':
            return

        if is_negative_strand:
            variants = [Genomics.complement(x) for x in variants]

        for reference_codon in self.mReferenceCodons:

            variants = snp.genotype.split("/")
            variants = [x[1:] for x in variants]

            for variant in variants:
                if len(variant) % 3 != 0:
                    self.mVariantCodons.append("!")
                else:
                    self.mVariantCodons.append(variant)

            self.mVariantAAs.extend(
                [Genomics.translate(x) for x in self.mVariantCodons])

开发者ID:CGATOxford，项目名称:cgat，代码行数:50，代码来源:snp2table.py

示例4: Align

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
    def Align(self, method, anchor=0, loglevel=1):
        """align a pair of sequences.
        get rid of this and use a method class instead in the future
        """

        map_a2b = alignlib_lite.py_makeAlignmentVector()
        s1 = "A" * anchor + self.mSequence1 + "A" * anchor
        s2 = "A" * anchor + self.mSequence2 + "A" * anchor

        self.strand = "+"

        if method == "dialign":
            dialign = WrapperDialign.Dialign(self.mOptionsDialign)
            dialign.Align(s1, s2, map_a2b)
        elif method == "blastz":
            blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
            blastz.Align(s1, s2, map_a2b)
            if blastz.isReverseComplement():
                self.strand = "-"
                self.mSequence2 = Genomics.complement(self.mSequence2)

        elif method == "dialignlgs":
            dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
            dialignlgs.Align(s1, s2, map_a2b)
        elif method == "dba":
            dba = WrapperDBA.DBA()
            dba.Align(s1, s2, map_a2b)
        elif method == "clustal":
            raise NotImplementedError("clustal wrapper needs to be updated")
            clustal = WrapperClustal.Clustal()
            clustal.Align(s1, s2, map_a2b)
        elif method == "nw":
            seq1 = alignlib_lite.py_makeSequence(s1)
            seq2 = alignlib_lite.py_makeSequence(s2)
            alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_GLOBAL,
                                                             gop=-12.0,
                                                             gep=-2.0)
            alignator.align(map_a2b, seq1, seq2)
        elif method == "sw":
            seq1 = alignlib_lite.py_makeSequence(s1)
            seq2 = alignlib_lite.py_makeSequence(s2)
            alignlib_lite.py_performIterativeAlignment(
                map_a2b, seq1, seq2, alignator_sw, min_score_sw)
        else:
            # use callback function
            method(s1, s2, map_a2b)

        if map_a2b.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            map_a2b.removeRowRegion(
                anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
            map_a2b.removeRowRegion(1, anchor)
            map_a2b.removeColRegion(
                anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
            map_a2b.removeColRegion(1, anchor)
            map_a2b.moveAlignment(-anchor, -anchor)

        f = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
                                                     alignlib_lite.py_makeSequence(
                                                         self.mSequence1),
                                                     alignlib_lite.py_makeSequence(self.mSequence2))

        self.mMethod = method
        self.mAlignment = map_a2b
        self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
        f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
        self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
        self.mAlignmentFrom1 = map_a2b.getRowFrom()
        self.mAlignmentTo1 = map_a2b.getRowTo()
        self.mAlignmentFrom2 = map_a2b.getColFrom()
        self.mAlignmentTo2 = map_a2b.getColTo()
        self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()

开发者ID:BioXiao，项目名称:cgat，代码行数:80，代码来源:AlignedPairs.py

示例5: main

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]

#.........这里部分代码省略.........
                "# processing %s - len=%i\n" % (id, lsequence))
            options.stdlog.flush()

        total_len += lsequence
        lvsequence = lsequence * \
            random.gauss(options.coverage_mean, options.coverage_stddev)

        covered = 0
        counts = numpy.zeros(lsequence)
        nreads = 0

        error_pids, read_lengths = [], []

        while covered < lvsequence:

            read_length = int(
                random.gauss(options.read_length_mean, options.read_length_stddev))
            positive = random.randint(0, 1)

            if positive:
                start = random.randint(0, lsequence)
                end = min(lsequence, start + read_length)
            else:
                end = random.randint(0, lsequence)
                start = max(0, end - read_length)

            read_length = end - start
            if read_length < options.min_read_length:
                continue

            segment = sequence[start:end]

            if not positive:
                segment = Genomics.complement(segment)

            noutput += 1

            if do_error:
                new_segment = getMutatedSequence(segment, options.error_mean)
                pid_calc.loadPair(segment, new_segment)
                pid = pid_calc.mPID
                error_pids.append(pid)
                segment = new_segment
            else:
                pid = 100.0

            options.stdout.write(
                ">%s\n%s\n" % (options.output_format_id % noutput, segment))

            if outfile_map:
                outfile_map.write(
                    "%s\t%s\n" % (id, options.output_format_id % noutput))

            for x in range(start, end):
                counts[x] += 1

            nreads += 1

            covered += read_length
            read_lengths.append(read_length)

        if options.loglevel >= 2:
            options.stdout.write("# transcript %s: len=%i, nreads=%i, len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" % (id,
                                                                                                                                        lsequence,
                                                                                                                                        nreads,
                                                                                                                                        numpy.mean(

开发者ID:CGATOxford，项目名称:cgat，代码行数:70，代码来源:gtf2reads.py

示例6: str

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator( options.stdin ):        
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment( -min(match.mQueryBlockStarts), -min(match.mSbjctBlockStarts) )
        q = query.getSequence( match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo )
        t = target.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo )
        query_ali, sbjct_ali = getAlignment( m, q, t, options )

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement( query_ali )
            sbjct_ali = Genomics.complement( sbjct_ali )

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \
                                 (options.query_prefix, 
                                  options.output_format_id % id,
                                  match.mQueryId, match.mQueryFrom, match.mQueryTo,
                                  query_ali,
                                  options.target_prefix,
                                  options.output_format_id % id, 
                                  match.mSbjctId, match.strand, match.mSbjctFrom, match.mSbjctTo,
                                  sbjct_ali ) )
        id += 1

    E.Stop()

开发者ID:siping，项目名称:cgat，代码行数:32，代码来源:psl2fasta.py

示例7: main

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]

#.........这里部分代码省略.........
            "pattern `%s` does not match input tracks." % options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % locals())
            all_variants.append(map(Variants.Variant._make, cc.fetchall()))
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" % (contig,
                                                                   region_start, region_end,
                                                                   sum([
                                                                       len(x) for x in all_variants]),
                                                                   len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per mali
        # column
        colcounts = numpy.ones(lseq)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq,
                                      variants,
                                      reference_start=region_start)

            alleles[track] = a
            for allele in a:
                for pos, c in enumerate(allele):
                    colcounts[pos] = max(colcounts[pos], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

        def __addGaps(sequence, colcounts):
            '''output gapped sequence.'''
            r = []
            for x, c in enumerate(sequence):
                r.append(c + "-" * (colcounts[x] - len(c)))
            return "".join(r)

        name = ".".join((options.reference, contig))
        if is_positive:
            pos = region_start
        else:
            pos = lcontig - region_start

        size = lseq
        seq = __addGaps(reference_seq, colcounts)
        outfile.write(maf_format % (locals()))

        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = __addGaps(allele, colcounts)
                if not is_positive:
                    Genomics.complement(seq)
                size = len(seq) - seq.count("-")
                name = ".".join((track + "-%i" % aid, contig))
                outfile.write(maf_format % (locals()))

        outfile.write("\n")

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.Stop()

开发者ID:Q-KIM，项目名称:cgat，代码行数:104，代码来源:snp2maf.py

示例8: main

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]

#.........这里部分代码省略.........
        ninput = len(seqs)
        map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) )
        matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix,
                                                          samples = options.iterations )
        
        results = matcher.run( masked_seqs,
                               options.arrangements,
                               qvalue_threshold = options.qvalue_threshold )

        if options.combine:
            results =  Nubiscan.combineMotifs( results )
        
        for r in results:

            if r.alternatives:
                alternatives = ",".join( [x.arrangement for x in r.alternatives ] )
            else:
                alternatives = ""

            options.stdout.write( "\t".join( ( 
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives) ) )

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-": s = Genomics.complement( s )
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write( "\t%s" % s )
                
            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile( "fdr" )
            outfile.write("bin\thist\tnobserved\n" )
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()


    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run( seq.sequence,
                              options.arrangements,
                              qvalue_threshold = None )
            for m in mm:
                matches.append( m._replace( sequence = seq.title ) )

        # estimate qvalues for all matches across all sequences

开发者ID:BioinformaticsArchive，项目名称:cgat，代码行数:70，代码来源:run_nubiscan.py

示例9: updateVariants

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use same coordinate
    system (and strand) as the transcript

    fixes 1-ness of variants
    '''

    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:

            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:
                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [x + "-" * (lvariant - len(x))
                               for x in variantseqs]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insection at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper()) for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(ExtendedVariant._make((
            start, end, reference.upper(), action, has_wildtype, variantseqs)))

    return new_variants

开发者ID:SCV，项目名称:cgat，代码行数:91，代码来源:Variants.py

示例10: main

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "apply",
                          "change-format",
                          "renumber-reads",
                          "sample",
                          "sort",
                          "trim3",
                          "trim5",
                          "unique",
                          "reverse-complement",
                          "grep"),
                      help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method is None:
        raise ValueError("no method specified, please use --method")

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
#.........这里部分代码省略.........

开发者ID:CGATOxford，项目名称:cgat，代码行数:103，代码来源:fastq2fastq.py

示例11: buildSequenceVariants

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
    def buildSequenceVariants(self, seq, strand, pos, snp):
        '''build new sequence by modifying a sequence fragment in seq at
        pos with snp.

        It is assumed that seq is already oriented according to strand.
        The strand is used to revert the snp if necessary.

        Note that only sequences different from seq will be returned.

        returns is_homozygous, seqs
        '''
        is_negative_strand = Genomics.IsNegativeStrand(strand)
        reference_base = snp.reference_base

        if reference_base != "*" and is_negative_strand:
            reference_base = Genomics.complement(reference_base)

        new_sequences = []
        is_homozygous = True
        if reference_base != "*":
            if seq[pos].upper() != reference_base.upper():
                raise ValueError("base mismatch at snp %i, expected %s, got %s in %s at position %i; snp=%s" %
                                 (snp.pos, reference_base, seq[pos], seq, pos,
                                  ";".join(map(str, snp))))

            # single base changes
            variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
            if len(variant_bases) == 1:
                is_homozygous = True
            else:
                is_homozygous = False

            for variant_base in variant_bases:
                if is_negative_strand:
                    variant_base = Genomics.complement(variant_base)

                s = list(seq)
                s[pos] = variant_base
                s = "".join(s)
                if s != seq:
                    new_sequences.append(s)
        else:
            variants = snp.genotype.split("/")
            is_homozygous = False
            for variant in variants:

                s = list(seq)
                # samtools denotes insert/deletion after position
                # while python is before/at position, hence the pos+1
                if variant[0] == "+":
                    toinsert = variant[1:].upper()
                    if is_negative_strand:
                        toinsert = Genomics.complement(toinsert)
                        s.insert(pos, toinsert)
                    else:
                        s.insert(pos + 1, toinsert)

                elif variant[0] == "-":
                    # pos+1+len(x)-1 = pos+len(x)
                    todelete = variant[1:].upper()
                    l = len(todelete)
                    if is_negative_strand:
                        # delete left of pos
                        xstart = max(0, pos - l)
                        xend = pos
                        todelete = todelete[:min(l, pos)]
                    else:
                        # delete right of pos
                        xstart = pos + 1
                        xend = min(self.mSize, pos + 1 + l)
                        todelete = todelete[:self.mSize - (pos + 1)]

                    deleted = "".join(s[xstart:xend])

                    if is_negative_strand:
                        deleted = Genomics.complement(deleted)

                    if deleted != todelete:
                        raise ValueError("base mismatch at indel %i, expected %s, got %s in %s at position %i(%i:%i); is_negative_strand=%s, snp=%s" %
                                         (snp.pos, todelete, deleted, seq, pos, xstart, xend,
                                          is_negative_strand,
                                          ";".join(map(str, snp))))
                    del s[xstart:xend]

                elif variant[0] == "*":
                    is_homozygous = True
                else:
                    raise ValueError("unknown variant sign '%s'" % variant[0])

                s = "".join(s)
                if s != seq:
                    new_sequences.append(s)

        return is_homozygous, new_sequences

开发者ID:CGATOxford，项目名称:cgat，代码行数:96，代码来源:snp2table.py

示例12: getSequence

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
    def getSequence(self, contig, strand="+", start=0, end=0, converter=None, as_array=False):
        """get a genomic fragment.

        A genomic fragment is identified by the coordinates
        contig, strand, start, end.

        The converter function supplied translated these coordinates
        into 0-based coordinates. By default, start and end are assumed
        to be pythonic coordinates and are forward/reverse coordinates.

        If as_array is set to true, return the AString object. This might
        be beneficial for large sequence chunks. If as_array is set to False,
        return a python string.
        """

        contig = self.getToken(contig)

        data = self.mIndex[contig]
        # dummy is
        # -> pos_seq for seekable streams
        # -> block_size for unseekable streams
        try:
            pos_id, dummy, lsequence = struct.unpack("QQi", data)
        except (struct.error, TypeError):
            pos_id, dummy, lsequence, points = data

        pos_seq = dummy
        block_size = dummy

        if end == 0:
            end = lsequence

        if end > lsequence:
            raise ValueError("3' coordinate on %s out of bounds: %i > %i" % (contig, end, lsequence))

        if start < 0:
            raise ValueError("5' coordinate on %s out of bounds: %i < 0" % (contig, start))

        if converter:
            first_pos, last_pos = converter(start, end, str(strand) in ("+", "1"), lsequence)
        elif self.mConverter:
            first_pos, last_pos = self.mConverter(start, end, str(strand) in ("+", "1"), lsequence)
        else:
            first_pos, last_pos = start, end
            if str(strand) in ("-", "0", "-1"):
                first_pos, last_pos = lsequence - last_pos, lsequence - first_pos

        if first_pos == last_pos:
            return ""

        assert first_pos < last_pos, "first position %i is larger than last position %i " % (first_pos, last_pos)

        p = AString()

        if self.mNoSeek:
            # read directly from position
            p.fromstring(self.mDatabaseFile.read(block_size, data[3], first_pos, last_pos))
        else:
            first_pos += pos_seq
            last_pos += pos_seq

            self.mDatabaseFile.seek(first_pos)
            p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))

        if str(strand) in ("-", "0", "-1"):
            p = AString(Genomics.complement(str(p)))

        if self.mTranslator:
            return self.mTranslator.translate(p)
        elif as_array:
            return p
        else:
            if IS_PY3:
                return p.tostring().decode("ascii")
            else:
                return p.tostring()

开发者ID:CGATOxford，项目名称:cgat，代码行数:78，代码来源:IndexedFasta.py

示例13: main

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                c = s2[x:x + tuple_size]
                for y in d[c]:
                    offsets[x - y] += 1

            # find maximum diagonal
            sorted = sorted([(y, x) for x, y in offsets.items()])
            max_count, max_offset = sorted[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write(new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()

开发者ID:SCV，项目名称:cgat，代码行数:83，代码来源:fastqs2fastq.py

示例14: main

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query", type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target", type="string",
                      help="fasta filename with target.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "full", "pileup-query", "pileup-target", "gapless"),
                      help="method to use for constructing the alignment [%default].")

    parser.add_option("--forward-query", dest="forward_query", action="store_true",
                      help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix", type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix", type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    if options.filename_query:
        query = IndexedFasta.IndexedFasta(options.filename_query)

    if options.filename_target:
        target = IndexedFasta.IndexedFasta(options.filename_target)

    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment(-min(match.mQueryBlockStarts), -
                        min(match.mSbjctBlockStarts))
        q = query.getSequence(
            match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
                             (options.query_prefix,
                              options.output_format_id % id,
                              match.mQueryId, match.mQueryFrom, match.mQueryTo,
                              query_ali,
                              options.target_prefix,
                              options.output_format_id % id,
                              match.mSbjctId, match.strand,
                              match.mSbjctFrom, match.mSbjctTo,
                              sbjct_ali))
        id += 1

    E.Stop()

开发者ID:Q-KIM，项目名称:cgat，代码行数:90，代码来源:psl2fasta.py

示例15: len

# 需要导入模块: from CGAT import Genomics [as 别名]
# 或者: from CGAT.Genomics import complement [as 别名]
    genome = IndexedFasta.IndexedFasta( options.genome_file )

    assert options.filename_regions != None, "please supply a gff formatted filename with regions"

    regions = GTF.readAsIntervals( GFF.iterator( IOTools.openFile(options.filename_regions, "r" ) ) )

    # build pairs for complement
    reverse_splice_pairs = []
    forward_splice_pairs = options.splice_pairs
    left_tokens, right_tokens = {}, {}
    x = 0
    for a,b in forward_splice_pairs:
        assert len(a) == 2, "only two-residue patterns allowed"
        assert len(b) == 2, "only two-residue patterns allowed"

        ca, cb = Genomics.complement( a ), Genomics.complement( b ) 
        reverse_splice_pairs.append( (b,a) )
        left_tokens[a] = x
        left_tokens[cb] = x+1
        right_tokens[b] = x
        right_tokens[ca] = x+1
        x += 2

    search_area = options.search_area
    read_length = options.read_length
    joined = options.joined

    ninput, noutput = 0, 0

    if joined:
        outfile_coordinates = IOTools.openFile( options.output_filename_pattern % "coords", "w" )

开发者ID:BioinformaticsArchive，项目名称:cgat，代码行数:33，代码来源:fasta2spliced.py

注：本文中的CGAT.Genomics.complement方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。