本文整理匯總了Python中pyfasta.Fasta類的典型用法代碼示例。如果您正苦於以下問題:Python Fasta類的具體用法?Python Fasta怎麽用?Python Fasta使用的例子?那麽, 這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Fasta類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: run
def run(self, filename):
self.openOutFiles(filename)
f = Fasta(filename)
count = len(f)
self.not_found_in_kabat, self.fr4_not_found, current = (0, 0, 0)
for name in f.keys():
current += 1
if current % 1000 == 0:
print "All %d. Current: %d" % (count, current)
# format: vName_jName{frameNumber} or vName_dName{frameNumber}_jName{frameNumber}
vGeneName = name.split("_")[0]
vGeneRegions = self.getVGeneRegions(vGeneName)
if vGeneRegions is None:
continue
withoutMarkup = f[name][vGeneRegions[self.kabat.regions_count * 2 - 1]:]
group = self.findFR4(name, withoutMarkup)
if group is None:
continue
self.result_kabat_file.write(name)
self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(
[vGeneRegions[9] + i for i in [1, group.start(), group.start() + 1, len(withoutMarkup)]]))
self.closeOutFiles()
print "all: {}; not in kabat: {}; without fr4: {}".format(current, self.not_found_in_kabat, self.fr4_not_found)
示例2: parse_sequences
def parse_sequences(sites, size, fasta_file):
"""Adds the binding site sequences extende to 'size' per row (decoded as A=0, C=1, G=2, T=3) to each input region."""
from pyfasta import Fasta # Fasta package is needed to fetch sequences from genome fasta file
print "INFO: Begin to fetch sequences...."
f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])
for i, reg in enumerate(sites):
start = reg["ext_start"]
end = reg["ext_end"]
# if motif on negativ strand, shift region by +1 to account for zero based half-open intervals
if reg["strand"] == '-':
start += 1
end += 1
seq = f.sequence({"chr":reg["chr"], "start":start, "stop":end}, one_based=False)
# Note, the 'strand':reg["strand"] argument for f.sequence does not work, there seems to be a bug in the pyfasta/fasta.py code.
seq = seq.upper()
# if motif on negative strand, convert seq to reverse complement
if reg["strand"] == '-':
seq = reverse_complement(seq)
# add sequence to region dict
reg["ext_seq"] = seq
print "INFO: Finished sequences."
return regions
示例3: calc_nuc_counts
def calc_nuc_counts(fasta_filename, region_size_min,
region_size_max, verbose):
''' calculate nuc frequencies for normalization.
Returns: dict of nucleotide frequencies.
'''
nuc_counts = defaultdict(Counter)
fasta = Fasta(fasta_filename)
for chrom, seq in fasta.items():
for idx, pos in enumerate(seq):
for region_size in range(region_size_min,
region_size_max + 1):
nucs = seq[idx:idx+region_size]
if len(nucs) < region_size: continue
nuc_counts[region_size][nucs] += 1
return nuc_counts
示例4: aa_seq
def aa_seq(options):
""" Gets the ancestral sequence from a Fasta file
"""
f = Fasta(options.ancestralfasta)
keyz = (f.keys())
match = ''
if (options.single_chromosome):
# Single chromosome fasta should only have one sequence.
# that sequence should be the sequence of interest.
keyz = list(keyz)
key = keyz[0]
else:
get_chromosome_from_header = options.header
get_chromosome_from_header = \
get_chromosome_from_header.replace('?', options.chromosome)
for key in keyz:
if(re.match(get_chromosome_from_header, key) is not None):
match = key
if(match is ''):
raise Exception("No match possible is something wrong with the"
" regex specified to the program as"
"--header-regex")
aaSeq = f[key]
return(aaSeq)
示例5: _no_empty
def _no_empty(self, lista, listb):
''' removes empty entries '''
# check for empty fasta.
tmpa = list()
tmpb = list()
for i in range(len(listb)):
# open it.
try:
z = Fasta(listb[i], record_class=MemoryRecord)
# check for empty.
if len(z.keys()) == 0:
continue
# add to temp.
tmpa.append(lista[i])
tmpb.append(listb[i])
except:
logging.warning("bad fasta file")
# sort back.
return tmpa, tmpb
示例6: create_fasta_flat_file
def create_fasta_flat_file(file):
"""Reads a fasta file for fast sequence retrival"""
fasta_file = Fasta(file, key_fn=lambda key: key.split()[0])
fasta_headers = set(fasta_file.keys());
return fasta_file, fasta_headers
示例7: genome_contenct_stats
def genome_contenct_stats(fasta_path):
f = Fasta(fasta_path)
g_box_total = []
for seqid in f.keys():
seq = f[seqid][:]
g_boxs = len(re.findall("CACGTG", seq, flags=re.IGNORECASE))
g_box_total.append(g_boxs)
print >> sys.stderr, "total gboxes:{0}".format(sum(g_box_total))
示例8: check_keyfn2
def check_keyfn2(path, klass, inplace):
f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda
key: "-".join(key.split()))
assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()
assert f['a-extra']
fix(path)
示例9: read_fa
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
gj.printFuncRun('read_fa')
gj.printFuncArgs()
fa_dict1 = Fasta(fa, key_fn=lambda key:key.split("\t")[0])
fa_dict = {i.split()[0]:j[0:] for i,j in fa_dict1.items()}
print fa_dict.keys()[0:3]
gj.printFuncRun('read_fa')
return fa_dict
示例10: split
def split(args):
parser = optparse.OptionParser("""\
split a fasta file into separated files.
pyfasta split -n 6 [-k 5000 ] some.fasta
the output will be some.0.fasta, some.1.fasta ... some.6.fasta
the sizes will be as even as reasonable.
""")
parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
help="""this overrides all other options. if specified, it will
split the file into a separate file for each header. it
will be a template specifying the file name for each new file.
e.g.: "%(fasta)s.%(seqid)s.fasta"
where 'fasta' is the basename of the input fasta file and seqid
is the header of each entry in the fasta file.""" ,default=None)
parser.add_option("-n", "--n", type="int", dest="nsplits",
help="number of new files to create")
parser.add_option("-o", "--overlap", type="int", dest="overlap",
help="overlap in basepairs", default=0)
parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
help="""\
split big files into pieces of this size in basepairs. default
default of -1 means do not split the sequence up into k-mers, just
split based on the headers. a reasonable value would be 10Kbp""")
options, fasta = parser.parse_args(args)
if not (fasta and (options.nsplits or options.header)):
sys.exit(parser.print_help())
if isinstance(fasta, (tuple, list)):
assert len(fasta) == 1, fasta
fasta = fasta[0]
kmer = options.kmers if options.kmers != -1 else None
overlap = options.overlap if options.overlap != 0 else None
f = Fasta(fasta)
if options.header:
names = dict([(seqid, options.header % \
dict(fasta=f.fasta_name, seqid=seqid)) \
for seqid in f.iterkeys()])
"""
if len(names) > 0:
assert names[0][1] != names[1][1], ("problem with header format", options.header)
fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
"""
return with_header_names(f, names)
else:
names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
header=options.header)
#fhs = [open(n, 'wb') for n in names]
if options.kmers == -1:
return without_kmers(f, names)
else:
return with_kmers(f, names, options.kmers, options.overlap)
示例11: mask_to_bed
def mask_to_bed(fasta_file, mask_bed_name):
"creates a bed file of the start and stops of masked seqs"
mask_bed = open(mask_bed_name,"wb")
f= Fasta(fasta_file)
mask_id = 1
for seqid in f.keys():
seq = f[seqid][:]
for m in re.finditer("X+",seq):
mask_id = mask_id + 1
w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(seqid,m.start(),m.end(),"mask_id {0}".format(mask_id),(m.end()-m.start()),(m.end()-m.start()+1))
mask_bed.write(w)
示例12: write_c2t
def write_c2t(fasta_name, unconverted, colorspace=False):
"""
given a fasta file, write a new file:
`some.fr.c2t.fasta` which contains:
+ the same headers prefixed with 'f' with all C's converted to T
+ headers prefixed with 'r' reverse complemented with
all C's converted to T.
if unconverted is false, then also save a file with the forward and reverse
without conversion.
"""
d = op.join(op.dirname(fasta_name), "bowtie_index")
if colorspace: d += "_colorspace"
if not op.exists(d): os.mkdir(d)
p, ext = op.splitext(op.basename(fasta_name)) # some.fasta -> some, fasta
fname = "%s/%s.fr.c2t%s" % (d, p, ext)
# no conversion, just copy the file into the index dir.
unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
if op.exists(fname):
if not unconverted: return fname, unconverted_fname
elif op.exists(unconverted_fname): return fname, unconverted_fname
fasta = Fasta(fasta_name)
c2t_fh = open(fname, 'w')
unc_fh = open(unconverted_fname, 'w') if unconverted else None
print >>sys.stderr, "writing forward and reverse c2t to: %s" % (fname,)
try:
for header in fasta.iterkeys():
seq = str(fasta[header]).upper()
assert not ">" in seq
# c2t, prefix header with f and write
print >>c2t_fh, ">f%s" % header
print >>c2t_fh, seq.replace('C', 'T')
# then r-c, c2t, prefix header with r and write
print >>c2t_fh, ">r%s" % header
rseq = revcomp(seq)
print >>c2t_fh, rseq.replace('C', 'T')
if unc_fh is not None:
print >>unc_fh, ">f%s\n%s" % (header, seq)
print >>unc_fh, ">r%s\n%s" % (header, rseq)
c2t_fh.close()
except:
os.unlink(fname)
os.unlink(unconverted_fname)
raise
return fname, unconverted_fname
示例13: mask
def mask(fasta_file, org, cutoff, mask_value='X'):
h5, node = get_node(org, 'r')
outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
+ fasta_file[fasta_file.rfind("."):]
print "> masking sequence to file:", outfile
out = open(outfile ,'w')
fasta = Fasta(fasta_file)
soft_mask = mask_value.lower() == 'soft'
for seqid in sorted(fasta.iterkeys()):
masked = 0
if soft_mask:
seq = str(fasta[seqid])
# mask is the lowercase sequence.
mask_value = np.array(seq.lower(), dtype='c')
seq = np.array(seq.upper(), dtype='c')
else:
fasta[seqid].tostring = False
seq = fasta[seqid][:] # a
if not 'c' + seqid in node:
print >>sys.stderr, seqid,\
'! not found in masked, writing unchanged\n' \
' this means that no section of this sequence appeared\n' \
' more than %i times' % cutoff
out.write('>' + seqid + '\n')
out.write(seq.tostring() + '\n')
continue
hit_counts = getattr(node, 'c' + seqid)[:]
masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
, mask_value, seq).tostring()
l = len(masked_seq)
print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l,
100.0 * masked_seq.count(mask_value) / l)
assert len(seq) == l
out.write('>' + seqid + '\n')
out.write(masked_seq + '\n')
out.close()
# write out a file .fasta.version containing
# the svnversion (if available of this script
# that was used to create the file.
path = os.path.dirname(__file__)
os.system('svnversion %s > %s.version' % (path, outfile))
h5.close()
示例14: main
def main(gff_file, outdir):
"""empty docstring"""
name = re.compile("parent=([^.;]+)", re.I)
feats = {}
non_cds_feats = collections.defaultdict(list)
for line in open(gff_file):
line = line.split("\t")
match = re.search(name, line[-1])
if not match:
continue
fname = match.groups(0)[0]
non_cds_feats[fname].append(line)
if line[2].upper() == "CDS":
feats[fname] = True
continue
if fname in feats:
continue
feats[fname] = None
i = 0
for k, v in sorted(feats.items()):
if not v is None:
del non_cds_feats[k]
seen = {}
RNA = open(outdir + "/at_non_cds.gff", "w")
for k, feat_list in sorted(non_cds_feats.items()):
for feat in feat_list:
if feat[0] in ("ChrC", "ChrM"):
continue
if feat[2] == "exon":
continue
key = (feat[0], feat[3], feat[4])
if key in seen:
continue
feat[0] = feat[0].upper().replace("CHR", "")
seen[key] = True
feat[-1] = k
print >> RNA, "\t".join(feat)
RNA.close()
gff = read_gff(outdir + "/at_non_cds.gff")
fasta = Fasta("/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta")
ftypes = {}
FA = open(outdir + "/at_rnas.fasta", "w")
for chr, feature_list in gff.iteritems():
for fname, feature in feature_list.iteritems():
seq = fasta.sequence(feature)
print >> FA, ">", feature["name"]
print >> FA, seq
FA.close()
示例15: check_kmer_overlap
def check_kmer_overlap(f):
chr2 = f['chr2']
kmers = Fasta.as_kmers(chr2, 10, overlap=2)
for i, k in enumerate(list(kmers)[:-1]):
assert (len(k[1]) == 10)
assert (k[0] == (i * (10 - 2)))
kmers = Fasta.as_kmers(chr2, 10, overlap=4)
seqs = [k[1] for k in kmers]
paired_seqs = zip(seqs[0:-1], seqs[1:])
for a, b in paired_seqs:
if len(a) < 4 or len(b) < 4: continue
assert (a[-4:] == b[:4])