Python pyfasta.Fasta类代码示例

本文整理汇总了Python中pyfasta.Fasta类的典型用法代码示例。如果您正苦于以下问题：Python Fasta类的具体用法？Python Fasta怎么用？Python Fasta使用的例子？那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了Fasta类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: run

    def run(self, filename):
        self.openOutFiles(filename)
        f = Fasta(filename)

        count = len(f)
        self.not_found_in_kabat, self.fr4_not_found, current = (0, 0, 0)

        for name in f.keys():
            current += 1
            if current % 1000 == 0:
                print "All %d. Current: %d" % (count, current)
                # format: vName_jName{frameNumber} or vName_dName{frameNumber}_jName{frameNumber}

            vGeneName = name.split("_")[0]

            vGeneRegions = self.getVGeneRegions(vGeneName)
            if vGeneRegions is None:
                continue

            withoutMarkup = f[name][vGeneRegions[self.kabat.regions_count * 2 - 1]:]
            group = self.findFR4(name, withoutMarkup)
            if group is None:
                continue

            self.result_kabat_file.write(name)
            self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
            self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(
                [vGeneRegions[9] + i for i in [1, group.start(), group.start() + 1, len(withoutMarkup)]]))

        self.closeOutFiles()
        print "all: {}; not in kabat: {}; without fr4: {}".format(current, self.not_found_in_kabat, self.fr4_not_found)

开发者ID:biocad，项目名称:au-summer-2013，代码行数:31，代码来源:markup.py

示例2: parse_sequences

def parse_sequences(sites, size, fasta_file):
    """Adds the binding site sequences extende to 'size' per row (decoded as A=0, C=1, G=2, T=3) to each input region."""
    from pyfasta import Fasta  # Fasta package is needed to fetch sequences from genome fasta file
            
    print "INFO: Begin to fetch sequences...."
    
    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])

    for i, reg in enumerate(sites):
        
        start = reg["ext_start"]
        end = reg["ext_end"]
        
        # if motif on negativ strand, shift region by +1 to account for zero based half-open intervals
        if reg["strand"] == '-':
            start += 1
            end += 1
        
        seq = f.sequence({"chr":reg["chr"], "start":start, "stop":end}, one_based=False)

        # Note, the 'strand':reg["strand"] argument for f.sequence does not work, there seems to be a bug in the pyfasta/fasta.py code.
        seq = seq.upper()
 
        # if motif on negative strand, convert seq to reverse complement
        if reg["strand"] == '-': 
            seq = reverse_complement(seq)
        
        # add sequence to region dict
        reg["ext_seq"] = seq
        
    print "INFO: Finished sequences."
    return regions

开发者ID:ComputationalSystemsBiology，项目名称:ExoProfiler，代码行数:32，代码来源:5primeCounter.py

示例3: calc_nuc_counts

def calc_nuc_counts(fasta_filename, region_size_min,
                    region_size_max, verbose):
    ''' calculate nuc frequencies for normalization.

        Returns: dict of nucleotide frequencies.
    '''

    nuc_counts = defaultdict(Counter)

    fasta = Fasta(fasta_filename)

    for chrom, seq in fasta.items():

        for idx, pos in enumerate(seq):

            for region_size in range(region_size_min,
                                     region_size_max + 1):

                nucs = seq[idx:idx+region_size]

                if len(nucs) < region_size: continue

                nuc_counts[region_size][nucs] += 1

    return nuc_counts

开发者ID:speach，项目名称:modmap，代码行数:25，代码来源:genome_nuc_freqs.py

示例4: aa_seq

def aa_seq(options):
    """ Gets the ancestral sequence from a Fasta file

    """
    f = Fasta(options.ancestralfasta)
    keyz = (f.keys())
    match = ''
    if (options.single_chromosome):
        # Single chromosome fasta should only have one sequence.
        # that sequence should be the sequence of interest.
        keyz = list(keyz)
        key = keyz[0]
    else:
        get_chromosome_from_header = options.header
        get_chromosome_from_header = \
            get_chromosome_from_header.replace('?', options.chromosome)
        for key in keyz:
            if(re.match(get_chromosome_from_header, key) is not None):
                match = key
        if(match is ''):
            raise Exception("No match possible is something wrong with the"
                            " regex specified to the program as"
                            "--header-regex")
    aaSeq = f[key]
    return(aaSeq)

开发者ID:MMesbahU，项目名称:selectionTools，代码行数:25，代码来源:aa_annotate.py

示例5: _no_empty

    def _no_empty(self, lista, listb):
        ''' removes empty entries '''
        
        # check for empty fasta.
        tmpa = list()
        tmpb = list()
        for i in range(len(listb)):
            
            # open it.
            try:
                z = Fasta(listb[i], record_class=MemoryRecord)
            
                # check for empty.
                if len(z.keys()) == 0:
                    continue

                # add to temp.
                tmpa.append(lista[i])
                tmpb.append(listb[i])

            except:
                logging.warning("bad fasta file")
            
        # sort back.
        return tmpa, tmpb

开发者ID:jim-bo，项目名称:parabio，代码行数:25，代码来源:nucmer.py

示例6: create_fasta_flat_file

def create_fasta_flat_file(file):
    """Reads a fasta file for fast sequence retrival"""

    fasta_file = Fasta(file, key_fn=lambda key: key.split()[0])

    fasta_headers = set(fasta_file.keys());

    return fasta_file, fasta_headers

开发者ID:henrikstranneheim，项目名称:VariantUtilities，代码行数:8，代码来源:splicer.py

示例7: genome_contenct_stats

def genome_contenct_stats(fasta_path):
    f = Fasta(fasta_path)
    g_box_total = []
    for seqid in f.keys():
        seq = f[seqid][:]
        g_boxs = len(re.findall("CACGTG", seq, flags=re.IGNORECASE))
        g_box_total.append(g_boxs)
    print >> sys.stderr, "total gboxes:{0}".format(sum(g_box_total))

开发者ID:gturco，项目名称:random_bio_tools，代码行数:8，代码来源:genome_stats.py

示例8: check_keyfn2

def check_keyfn2(path, klass, inplace):
    f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda
            key: "-".join(key.split()))

    assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()

    assert f['a-extra']
    fix(path)

开发者ID:brentp，项目名称:pyfasta，代码行数:8，代码来源:test_all.py

示例9: read_fa

def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
	gj.printFuncRun('read_fa')
	gj.printFuncArgs()
	fa_dict1 = Fasta(fa, key_fn=lambda key:key.split("\t")[0])
	fa_dict = {i.split()[0]:j[0:] for i,j in fa_dict1.items()}
	print fa_dict.keys()[0:3]
	gj.printFuncRun('read_fa')
	return fa_dict

开发者ID:Tsinghua-gongjing，项目名称:test，代码行数:8，代码来源:atcg_stats.py

示例10: split

def split(args):
    parser = optparse.OptionParser("""\
   split a fasta file into separated files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
    the output will be some.0.fasta, some.1.fasta ... some.6.fasta
    the sizes will be as even as reasonable.
   """)
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
       help="""this overrides all other options. if specified, it will
               split the file into a separate file for each header. it
               will be a template specifying the file name for each new file.
               e.g.:    "%(fasta)s.%(seqid)s.fasta"
               where 'fasta' is the basename of the input fasta file and seqid
               is the header of each entry in the fasta file.""" ,default=None)

    parser.add_option("-n", "--n", type="int", dest="nsplits", 
                            help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap", 
                            help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                     help="""\
    split big files into pieces of this size in basepairs. default
    default of -1 means do not split the sequence up into k-mers, just
    split based on the headers. a reasonable value would be 10Kbp""")
    options, fasta = parser.parse_args(args)
    if not (fasta and (options.nsplits or options.header)):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = options.kmers if options.kmers != -1 else None
    overlap = options.overlap if options.overlap != 0 else None
    f = Fasta(fasta)
    if options.header:
        names = dict([(seqid, options.header % \
                      dict(fasta=f.fasta_name, seqid=seqid)) \
                                       for seqid in f.iterkeys()])
        """
        if len(names) > 0:
            assert names[0][1] != names[1][1], ("problem with header format", options.header)
        fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
        fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
        """
        return with_header_names(f, names)
    else:
        names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap, 
                     header=options.header)

        #fhs = [open(n, 'wb') for n in names]
    if options.kmers == -1:
        return without_kmers(f, names)
    else: 
        return with_kmers(f, names, options.kmers, options.overlap)

开发者ID:brentp，项目名称:pyfasta，代码行数:55，代码来源:split_fasta.py

示例11: mask_to_bed

def mask_to_bed(fasta_file, mask_bed_name):
    "creates a bed file of the start and stops of masked seqs"
    mask_bed = open(mask_bed_name,"wb")
    f= Fasta(fasta_file)
    mask_id = 1
    for seqid in f.keys():
        seq = f[seqid][:]
        for m in re.finditer("X+",seq):
            mask_id = mask_id + 1
            w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(seqid,m.start(),m.end(),"mask_id {0}".format(mask_id),(m.end()-m.start()),(m.end()-m.start()+1))
            mask_bed.write(w)

开发者ID:gturco，项目名称:random_bio_tools，代码行数:11，代码来源:cns_to_bed.py

示例12: write_c2t

def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is false, then also save a file with the forward and reverse
    without conversion.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace: d += "_colorspace"
    if not op.exists(d): os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name)) # some.fasta -> some, fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
        # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname):
        if not unconverted: return fname, unconverted_fname
        elif op.exists(unconverted_fname): return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None

    print >>sys.stderr, "writing forward and reverse c2t to: %s" % (fname,)

    try:
        for header in fasta.iterkeys():
            seq = str(fasta[header]).upper()
            assert not ">" in seq
            # c2t, prefix header with f and write
            print >>c2t_fh, ">f%s" % header
            print >>c2t_fh, seq.replace('C', 'T')
            # then r-c, c2t, prefix header with r and write
            print >>c2t_fh, ">r%s" % header
            rseq = revcomp(seq)
            print >>c2t_fh, rseq.replace('C', 'T')
            if unc_fh is not None:
                print >>unc_fh, ">f%s\n%s" % (header, seq)
                print >>unc_fh, ">r%s\n%s" % (header, rseq)

        c2t_fh.close()
    except:
        os.unlink(fname)
        os.unlink(unconverted_fname)
        raise

    return fname, unconverted_fname

开发者ID:BioinformaticsArchive，项目名称:methylcode，代码行数:52，代码来源:__init__.py

示例13: mask

def mask(fasta_file, org, cutoff, mask_value='X'):
    h5, node = get_node(org, 'r')

    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    out = open(outfile ,'w')

    fasta = Fasta(fasta_file)

    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()): 
        masked = 0
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            fasta[seqid].tostring = False
            seq = fasta[seqid][:] # a


        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue
        
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
                              , mask_value, seq).tostring() 

        l = len(masked_seq)
        print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l, 
                                   100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file .fasta.version containing
    # the svnversion (if available of this script
    # that was used to create the file.
    path = os.path.dirname(__file__)
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()

开发者ID:gturco，项目名称:find_cns，代码行数:51，代码来源:mask_genome.py

示例14: main

def main(gff_file, outdir):
    """empty docstring"""
    name = re.compile("parent=([^.;]+)", re.I)

    feats = {}
    non_cds_feats = collections.defaultdict(list)
    for line in open(gff_file):
        line = line.split("\t")
        match = re.search(name, line[-1])
        if not match:
            continue
        fname = match.groups(0)[0]
        non_cds_feats[fname].append(line)
        if line[2].upper() == "CDS":
            feats[fname] = True
            continue
        if fname in feats:
            continue
        feats[fname] = None
    i = 0
    for k, v in sorted(feats.items()):
        if not v is None:
            del non_cds_feats[k]

    seen = {}
    RNA = open(outdir + "/at_non_cds.gff", "w")
    for k, feat_list in sorted(non_cds_feats.items()):
        for feat in feat_list:
            if feat[0] in ("ChrC", "ChrM"):
                continue
            if feat[2] == "exon":
                continue
            key = (feat[0], feat[3], feat[4])
            if key in seen:
                continue
            feat[0] = feat[0].upper().replace("CHR", "")
            seen[key] = True
            feat[-1] = k
            print >> RNA, "\t".join(feat)
    RNA.close()

    gff = read_gff(outdir + "/at_non_cds.gff")
    fasta = Fasta("/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta")
    ftypes = {}
    FA = open(outdir + "/at_rnas.fasta", "w")
    for chr, feature_list in gff.iteritems():
        for fname, feature in feature_list.iteritems():
            seq = fasta.sequence(feature)
            print >> FA, ">", feature["name"]
            print >> FA, seq
    FA.close()

开发者ID:gturco，项目名称:find_cns，代码行数:51，代码来源:arabidopsis_rna.py

示例15: check_kmer_overlap

def check_kmer_overlap(f):
    chr2 = f['chr2']

    kmers = Fasta.as_kmers(chr2, 10, overlap=2)
    for i, k in enumerate(list(kmers)[:-1]):
        assert (len(k[1]) == 10)
        assert (k[0] == (i * (10 - 2)))

    kmers = Fasta.as_kmers(chr2, 10, overlap=4)
    seqs = [k[1] for k in kmers]
    paired_seqs = zip(seqs[0:-1], seqs[1:])
    for a, b in paired_seqs:
        if len(a) < 4 or len(b) < 4: continue
        assert (a[-4:] == b[:4])

开发者ID:jamescasbon，项目名称:pyfasta，代码行数:14，代码来源:test_all.py

注：本文中的pyfasta.Fasta类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。