本文整理匯總了Python中gffutils.create_db方法的典型用法代碼示例。如果您正苦於以下問題:Python gffutils.create_db方法的具體用法?Python gffutils.create_db怎麽用?Python gffutils.create_db使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類gffutils
的用法示例。
在下文中一共展示了gffutils.create_db方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: create_feature_db_connections
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def create_feature_db_connections(g_arg, db_arg, infer_transcripts, infer_genes):
if infer_transcripts is True:
disable_transcripts = False
else:
disable_transcripts = True
if infer_genes is True:
disable_genes = False
else:
disable_genes = True
if db_arg is None:
feature_db = gffutils.create_db(g_arg, g_arg + "_db", merge_strategy="create_unique", force=True,
disable_infer_transcripts=disable_transcripts, disable_infer_genes=disable_genes, verbose=True)
feature_db_name = db_arg
else:
feature_db = gffutils.FeatureDB(db_arg)
feature_db_name = db_arg
return feature_db, feature_db_name
示例2: create_db
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def create_db(gtf_filename, db_filename=None):
db_filename = ':memory:' if db_filename is None else db_filename
db = gffutils.create_db(
gtf_filename,
db_filename,
merge_strategy='merge',
id_spec={'gene': 'gene_id', 'transcript': 'transcript_id',
'exon': 'location_id', 'CDS': 'location_id',
'start_codon': 'location_id',
'stop_codon': 'location_id', 'UTR': 'location_id'},
transform=transform,
force=True,
verbose=True,
disable_infer_genes=True,
disable_infer_transcripts=True,
force_merge_fields=['source'])
maybe_analyze(db)
return db
示例3: __init__
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def __init__(self, gtf_file, fasta_file,
disable_infer_transcripts=True,
disable_infer_genes=True):
self.gtf_file = gtf_file
self.fasta_file = fasta_file
self.fasta_extractor = None
self.db = gffutils.create_db(gtf_file, ":memory:",
disable_infer_transcripts=disable_infer_transcripts,
disable_infer_genes=disable_infer_genes)
self.start_codons = list(self.db.features_of_type("start_codon"))
self.input_transform = OneHot()
示例4: _get_introns
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def _get_introns(self):
"""
Get introns from the specified seq and annotation files.
:return: List of all detected introns as gffutils.Feature objects.
"""
# create a gffutils database
self.db = gffutils.create_db(data=self.gtf_file, dbfn=":memory:",
force=True, id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
disable_infer_transcripts=True, disable_infer_genes=True, verbose=False,
merge_strategy="merge")
if not self.create_introns:
# load introns from gtf, don't create them
introns = list(self.db.features_of_type('intron', order_by=('seqid', 'start', 'end'))) # exons are sorted start-coord. asc.
self._add_SOI(introns)
return introns
exons = list(self.db.features_of_type('exon', order_by=('seqid', 'start', 'end'))) # exons are sorted start-coord. asc.
# group exons in a dict by gene id
transcript_to_exon = self._get_tr_to_exon_dict(exons)
collected_introns = self._build_introns(transcript_to_exon)
self._add_SOI(collected_introns)
return collected_introns
示例5: add_removed_evm
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def add_removed_evm(pasa, exon, wd):
"""
here the clusters of sequence from the same locus are prepared
"""
db_evm = gffutils.create_db(pasa, ':memory:', merge_strategy='create_unique', keep_order=True)
ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]
db_gmap = gffutils.create_db(exon, ':memory:', merge_strategy='create_unique', keep_order=True)
ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("mRNA")]
uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]
uniq_gene = [gene.attributes["ID"][0] for mrna in uniq_evm for gene in db_evm.parents(mrna)]
uniq = list(set(uniq_gene))
outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
gff_out_s = gffwriter.GFFWriter(outfile.name)
for name in uniq:
for i in db_evm.children(name, order_by='start'):
gff_out_s.write_rec(i)
gff_out_s.write_rec(db_evm[name])
for name in ids_gmap_full:
for i in db_gmap.children(name, order_by='start'):
gff_out_s.write_rec(i)
gff_out_s.write_rec(db_gmap[name])
gff_out_s.close()
return outfile.name
#final_evm="/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/exonerate/exonerate_step4.final.4e4aoxwb.gff3"
#genome = "/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/split/scaffold3.fasta.masked.rename.fasta"
示例6: add_EVM
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def add_EVM(final_update, wd, consensus_mapped_gff3):
"""
"""
db_evm = gffutils.create_db(final_update, ':memory:', merge_strategy='create_unique', keep_order=True)
ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]
db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:', merge_strategy='create_unique', keep_order=True)
ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("gene")]
uniq_evm = [evm for evm in ids_evm if not evm in ids_gmap]
mRNA = []
for evm in uniq_evm:
for line in db_evm.parents(evm, order_by='start'):
mRNA.append(line.attributes["ID"][0])
mRNA_uniq = list(set(mRNA))
outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.", suffix=".gff3", dir=wd)
gff_out_s = gffwriter.GFFWriter(outfile.name)
for name in mRNA_uniq:
for i in db_evm.children(name, order_by='start'):
gff_out_s.write_rec(i)
gff_out_s.write_rec(db_evm[name])
for name in ids_gmap_full:
for i in db_gmap.children(name, order_by='start'):
gff_out_s.write_rec(i)
gff_out_s.write_rec(db_gmap[name])
gff_out_s.close()
return outfile.name
#return outfile_out.name
示例7: get_gene_coordinates_single
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def get_gene_coordinates_single(
gff_file, contig, gene, l
): #Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the 2 genes
gff = open(gff_file).read() #Import the GFF file
split = gff.split("##FASTA")
geneCoordinates = [] #Will be filled with the coordinates of the 2 genes
with StringIO(split[1]
) as temp_fasta: #Import the sequences from the GFF as fasta
sequences = list(SeqIO.parse(temp_fasta, "fasta"))
parsed_gff = gffutils.create_db(clean_gff_string(split[0]),
dbfn=":memory:",
force=True,
keep_order=True,
from_string=True)
for geneSample in parsed_gff.all_features(
featuretype=()): #Iterate through the genes
if geneSample.id == gene: #Check if the gene is one of the genes of interest
geneCoordinates.append(geneSample.start)
geneCoordinates.append(geneSample.stop)
firstPosition = min(
geneCoordinates
) - l + 1 #The first position to be extracted from the contig
endPosition = max(
geneCoordinates) + l #The end position to be extracted from the contig
if firstPosition < 0:
firstPosition = 0
if endPosition > len(sequences[int(contig)].seq):
endPosition = len(sequences[int(contig)].seq)
extractSequence = sequences[int(contig)].seq[
firstPosition:
endPosition] #Assign to the sequence in the region of interest
return extractSequence
示例8: __init__
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def __init__(self, file_path, prefered_name="transcript_name",
merge_transcripts=True):
"""
:param file_path: the path of the gtf file
:return:
"""
self.file_type = 'bed12'
# list of bed fields
self.fields = ['chromosome', 'start', 'end',
'name', 'score', 'strand',
'thick_start', 'thick_end',
'rgb', 'block_count',
'block_sizes', 'block_starts']
self.BedInterval = collections.namedtuple('BedInterval', self.fields)
# I think the name which should be written
# should be the transcript_name
# But we can change it to gene_name
self.prefered_name = prefered_name
self.merge_transcripts = merge_transcripts
# Will process the gtf to get one item per transcript:
# This will create a database:
try:
self.db = gffutils.create_db(file_path, ':memory:')
except ValueError as ve:
if "No lines parsed" in str(ve):
self.length = 0
self.all_transcripts = open(file_path, 'r')
else:
raise InputError("This is not a gtf file.")
else:
if self.merge_transcripts:
self.length = len([i for i in self.db.features_of_type("gene")])
self.all_transcripts = self.db.features_of_type("gene",
order_by='start')
else:
self.length = len([i for i in self.db.features_of_type("transcript")])
self.all_transcripts = self.db.features_of_type("transcript",
order_by='start')
示例9: generate_exons
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def generate_exons(gtf_file,
overhang=(100, 100), # overhang from the exon
gtf_db_path=":memory:",
out_file=None,
disable_infer_transcripts=True,
disable_infer_genes=True,
firstLastNoExtend=True,
source_filter=None):
"""
Build IntervalTree object from gtf file for one feature unit (e.g. gene, exon). If give out_file, pickle it.
Args:
gtf_file: gtf format file or pickled Intervaltree object.
overhang: flanking intron length to take along with exon. Corresponding to left (acceptor side) and right (donor side)
gtf_db_path: (optional) gtf database path. Database for one gtf file only need to be created once
out_file: (optional) file path to store the pickled Intervaltree obejct. Next time run it can be given to `gtf_file`
disable_infer_transcripts: option to disable infering transcripts. Can be True if the gtf file has transcripts annotated.
disable_infer_genes: option to disable infering genes. Can be True if the gtf file has genes annotated.
firstLastNoExtend: if True, overhang is not taken for 5' of the first exon, or 3' of the last exon of a gene.
source_filter: gene source filters, such as "protein_coding" filter for protein coding genes
"""
try:
gtf_db = gffutils.interface.FeatureDB(gtf_db_path)
except ValueError:
gtf_db = gffutils.create_db(
gtf_file,
gtf_db_path,
disable_infer_transcripts=disable_infer_transcripts,
disable_infer_genes=disable_infer_genes)
genes = gtf_db.features_of_type('gene')
default_overhang = overhang
for gene in genes:
if source_filter is not None:
if gene.source != source_filter:
continue
for exon in gtf_db.children(gene, featuretype='exon'):
isLast = False # track whether is last exon
if firstLastNoExtend:
if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start):
overhang = (overhang[0], 0)
isLast = True
elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end):
overhang = (0, overhang[1])
exon = ExonInterval.from_feature(exon, overhang)
exon.isLast = isLast
overhang = default_overhang
yield exon
示例10: longest_cds
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def longest_cds(gff_file, gff_filerc, verbose, wd, filename):
db = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform)
dbrc = gffutils.create_db(gff_filerc, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform)
list_mrna = [mRNA.attributes["ID"][0] for mRNA in db.features_of_type('mRNA')]
list_mrna_rc = [mRNA.attributes["ID"][0] for mRNA in dbrc.features_of_type('mRNA')]
list_all = list(set(list_mrna + list_mrna_rc))
list_db = []
list_db_rc = []
for mrna_id in list_all:
cds_len = [int(i.end) - int(i.start) for i in db.children(mrna_id, featuretype='CDS', order_by='start')]
cds_len_rc = [int(i.end) - int(i.start) for i in dbrc.children(mrna_id, featuretype='CDS', order_by='start')]
if cds_len == cds_len_rc:
list_db.append(mrna_id)
elif cds_len > cds_len_rc:
list_db.append(mrna_id)
else:
list_db_rc.append(mrna_id)
gff_out = gffwriter.GFFWriter(filename)
for evm in list_db:
if evm in list_mrna:
for i in db.children(evm, featuretype='CDS', order_by='start'):
gff_out.write_rec(i)
i = db[evm]
gff_out.write_rec(i)
for i in db.parents(evm, featuretype='gene', order_by='start'):
gff_out.write_rec(i)
for i in db.children(evm, featuretype='exon', order_by='start'):
gff_out.write_rec(i)
for evm in list_db_rc:
if evm in list_mrna_rc:
for i in dbrc.children(evm, featuretype='CDS', order_by='start'):
gff_out.write_rec(i)
i = dbrc[evm]
gff_out.write_rec(i)
for i in dbrc.parents(evm, featuretype='gene', order_by='start'):
gff_out.write_rec(i)
for i in dbrc.children(evm, featuretype='exon', order_by='start'):
gff_out.write_rec(i)
gff_out.close()
if verbose:
print(filename)
return filename
示例11: genename_last
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def genename_last(gff_filename, prefix, verbose, wd, dict_ref_name, step):
global prefix_name
prefix_name = prefix
out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
err = tempfile.NamedTemporaryFile(delete=False, mode="w")
gt_com = GT_GFF3 % gff_filename
if verbose:
sys.stderr.write('Executing: %s\n\n' % gt_com)
gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
gt_call.communicate()
db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform_name)
gene_count = 0
list_mrna = [mRNA.attributes["ID"][0] for mRNA in db1.features_of_type('mRNA')]
out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread", suffix=".gff3", dir=wd)
gff_out = gffwriter.GFFWriter(out_gff.name)
gene_name = []
for evm in list_mrna:
for i in db1.children(evm, featuretype='CDS', order_by='start'):
if i.chrom in dict_ref_name:
i.chrom = dict_ref_name[i.chrom]
gff_out.write_rec(i)
i = db1[evm]
if i.chrom in dict_ref_name:
i.chrom = dict_ref_name[i.chrom]
gff_out.write_rec(i)
for i in db1.parents(evm, featuretype='gene', order_by='start'):
if i.chrom in dict_ref_name:
i.chrom = dict_ref_name[i.chrom]
id_gene = i.attributes['ID'][0]
if not id_gene in gene_name:
gff_out.write_rec(i)
gene_name.append(id_gene)
for i in db1.children(evm, featuretype='exon', order_by='start'):
if i.chrom in dict_ref_name:
i.chrom = dict_ref_name[i.chrom]
gff_out.write_rec(i)
gff_out.close()
if "pasa" in step:
out_name = os.path.join(wd, "Final.evm.update.gff3")
with open(out_name, "w") as fh:
gt_com = GT_RETAINID % out_gff.name
if verbose:
sys.stderr.write('Executing: %s\n\n' % gt_com)
gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
gt_call.communicate()
if "lorean" in step:
out_name = os.path.join(wd, "Final.LoReAn.update.gff3")
with open(out_name, "w") as fh:
gt_com = GT_RETAINID % out_gff.name
if verbose:
sys.stderr.write('Executing: %s\n\n' % gt_com)
gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
gt_call.communicate()
if verbose:
print(out_name)
return out_name
示例12: pep_seq
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def pep_seq(myFasta, final_evm):
fasta = {}
db = gffutils.create_db(final_evm, ':memory:', merge_strategy="create_unique", keep_order=True)
gff_file = final_evm + ".gff3"
gff_out = gffwriter.GFFWriter(gff_file)
for t in db.features_of_type('mRNA', order_by='start'):
position = []
seq_combined = ''
j = 0
for i in db.children(t, featuretype='CDS', order_by='start'):
j += 1
if j == 1:
pphase = i[7]
seq = i.sequence(myFasta, use_strand=False)
seq_combined += seq
position = position + [i.start,i.stop]
seq_combined = SeqRecord(Seq(seq_combined, generic_dna))
#print(t.attributes["ID"][0])
#print(seq_combined.seq)
if t.strand == '-':
pphase = i[7]
seq_combined = seq_combined.reverse_complement()
if pphase == "0" or pphase == ".":
seq_transl = seq_combined.translate()
elif pphase == "1":
seq_transl = seq_combined[1:].translate()
elif pphase == "2":
seq_transl = seq_combined[2:].translate()
seq_transl.description = position
seq_transl.id = t.id
position = sorted(position)
t.start = position[0]
t.stop = position[-1]
fasta[str(seq_transl.seq)] = [seq_transl, t]
count = 0
for key in fasta:
for i in db.parents(fasta[key][1], featuretype='gene', order_by='start'):
gff_out.write_rec(i)
i.start = fasta[key][1].start
i.stop = fasta[key][1].stop
gff_out.write_rec(fasta[key][1])
for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
gff_out.write_rec(i)
for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
count += 1
i.featuretype="exon"
if "ID" in i.attributes:
i.attributes["ID"][0] = i.attributes["ID"][0] + "-" + str(count)
else:
i.attributes["ID"] = ["exon" + "-" + str(count)]
gff_out.write_rec(i)
return (gff_file)
示例13: get_gene_coordinates
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def get_gene_coordinates(
gff_file, contig, genes, gene1, l
): #Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the 2 genes
gff = open(gff_file).read() #Import the GFF file
split = gff.split("##FASTA")
geneCoordinates = [] #Will be filled with the coordinates of the 2 genes
with StringIO(split[1]
) as temp_fasta: #Import the sequences from the GFF as fasta
sequences = list(SeqIO.parse(temp_fasta, "fasta"))
parsed_gff = gffutils.create_db(clean_gff_string(split[0]),
dbfn=":memory:",
force=True,
keep_order=True,
from_string=True)
for geneSample in parsed_gff.all_features(
featuretype=()): #Iterate through the genes
if geneSample.id == genes[0] or geneSample.id == genes[
1]: #Check if the gene is one of the genes of interest
geneCoordinates.append(geneSample.start)
geneCoordinates.append(geneSample.stop)
if geneSample.id == gene1: #Check if the gene is gene 1
strand = geneSample.strand
firstPosition = min(
geneCoordinates
) - l + 1 #The first position to be extracted from the contig
endPosition = max(
geneCoordinates) + l #The end position to be extracted from the contig
if firstPosition < 0:
firstPosition = 0
if endPosition > len(sequences[int(contig)].seq):
endPosition = len(sequences[int(contig)].seq)
if strand == "+": #Check if the sequence needs to be reverse complemented
extractSequence = sequences[int(contig)].seq[
firstPosition:
endPosition] #Assign to the sequence in the region of interest
else:
extractSequence = sequences[int(
contig)].seq[firstPosition:endPosition].reverse_complement()
return extractSequence
示例14: post_process_fmt
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def post_process_fmt(input_files, out_dir):
# get mapping between GFF ids and ppanggolin ids
id_mapping = {}
for f in input_files:
prefix = os.path.splitext(os.path.basename(f))[0]
#Split file and parse
gff_file = open(f, 'r')
lines = gff_file.read()
split = lines.split('##FASTA')
if len(split) != 2:
print("Problem reading GFF3 file: ", gff_file.name)
raise RuntimeError("Error reading prokka input!")
parsed_gff = gff.create_db(clean_gff_string(split[0]),
dbfn=":memory:",
force=True,
keep_order=True,
from_string=True)
gene_count = 0
for entry in parsed_gff.all_features(featuretype=()):
if "CDS" not in entry.featuretype: continue
ppanggolin_id = prefix + "_CDS_" + str(gene_count).zfill(4)
id_mapping[ppanggolin_id] = entry.id
gene_count += 1
gff_file.close()
id_mapping[''] = ''
with open(out_dir + "ppanggolin_gene_presence_absence.csv", 'w') as outfile:
with open(out_dir + "matrix.csv", 'r') as infile:
outfile.write(next(infile))
for line in infile:
line = line.strip().split(",")
for i in range(14, len(line)):
line[i] = ";".join([id_mapping[g.strip('"')] for g in line[i].split()])
outfile.write(",".join(line) + "\n")
return
示例15: post_process_fmt
# 需要導入模塊: import gffutils [as 別名]
# 或者: from gffutils import create_db [as 別名]
def post_process_fmt(input_files, pirate_dir, out_dir):
file_id_maps = {}
for f in input_files:
#Split file and parse
gff_file = open(f, 'r')
lines = gff_file.read()
split = lines.split('##FASTA')
if len(split) != 2:
print("Problem reading GFF3 file: ", gff_file.name)
raise RuntimeError("Error reading prokka input!")
parsed_gff = gff.create_db(clean_gff_string(split[0]),
dbfn=":memory:",
force=True,
keep_order=True,
from_string=True)
gene_count = 1
count_to_id = {}
for entry in parsed_gff.all_features(featuretype=()):
if "CDS" not in entry.featuretype: continue
count_to_id[gene_count] = entry.id
gene_count += 1
file_id_maps[os.path.splitext(os.path.basename(f))[0].replace(
"-", "_")] = count_to_id
gff_file.close()
with open(pirate_dir + "PIRATE.gene_families.ordered.tsv", 'r') as infile:
with open(out_dir + "pirate_gene_presence_absence.csv",
'w') as outfile:
line = next(infile)
line = line.strip().split("\t")
outfile.write("\t".join([line[0]] + line[22:]) + "\n")
for line in infile:
line = line.strip().split("\t")
for i in range(22, len(line)):
if line[i] == "": continue
# currently take the first as we do in panaroo for this format. May want to redo.
line[i] = line[i].split(";")[0]
sample = "_".join(line[i].split("_")[:-1])
gene_num = int(line[i].split("_")[-1])
line[i] = file_id_maps[sample][gene_num]
outfile.write("\t".join([line[0]] + line[22:]) + "\n")
return