本文整理汇总了Python中Bio.SeqIO.index方法的典型用法代码示例。如果您正苦于以下问题:Python SeqIO.index方法的具体用法?Python SeqIO.index怎么用?Python SeqIO.index使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块Bio.SeqIO的用法示例。
在下文中一共展示了SeqIO.index方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: getBreakPoint
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def getBreakPoint(tupList, idx, direction='reverse', gap=2000):
    """Find the nearest break point starting from ``idx``.

    Walks ``tupList`` from ``idx`` in the requested direction until it reaches
    an entry whose ``diff`` is at least ``gap`` and returns the point half a
    ``diff`` past that entry's ``end``.

    Args:
        tupList: indexable collection of ``(start, end, diff)`` tuples.
        idx: position at which the search starts. A negative position is
            treated as out of range instead of wrapping around to the end
            of the list.
        direction: ``'reverse'`` walks toward index 0; any other value walks
            toward the end of the list.
        gap: minimum ``diff`` an entry needs in order to qualify.

    Returns:
        ``end + int(round(diff / 2))`` of the first qualifying entry, or
        ``False`` when the walk leaves the list without finding one.
    """
    step = -1 if direction == 'reverse' else 1

    # Stop once the walk drops below index 0: negative indices would otherwise
    # wrap around and report a break point from the far end of the list.
    while idx >= 0:
        try:
            _start, end, diff = tupList[idx]
        except IndexError:
            break
        if diff >= gap:
            # Return directly so that a legitimate break point of 0 is not
            # mistaken for "nothing found yet".
            return end + int(round(diff / 2))
        idx += step
    return False
示例2: _write_strain_functional_genes
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def _write_strain_functional_genes(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
    """Write (or re-read) the JSON file flagging which reference genes are functional in a strain.

    Args:
        strain_id: column of ``orth_matrix`` describing the strain; also used in the output file name.
        ref_functional_genes: iterable of reference gene IDs, all flagged ``True`` to begin with.
        orth_matrix: table indexed by gene ID; a null entry in the strain's column flags the gene ``False``.
        force_rerun: forwarded to ``ssbio.utils.force_rerun`` together with the output path.

    Returns:
        Tuple ``(strain_id, gene_to_func)`` where ``gene_to_func`` maps gene ID to a bool.
    """
    json_path = op.join(self.model_dir, '{}_funcgenes.json'.format(strain_id))

    if not ssbio.utils.force_rerun(flag=force_rerun, outfile=json_path):
        # No rerun requested by the helper: reuse what was written previously
        with open(json_path, 'r') as handle:
            return strain_id, json.load(handle)

    gene_to_func = dict.fromkeys(ref_functional_genes, True)

    # Genes whose entry in this strain's column is null have no ortholog there
    strain_column_is_null = pd.isnull(orth_matrix[strain_id])
    without_ortholog = orth_matrix[strain_column_is_null][strain_id].index.tolist()

    # Only genes that are part of the reference set get flagged non-functional
    for gene in set(without_ortholog) & set(ref_functional_genes):
        gene_to_func[gene] = False

    with open(json_path, 'w') as handle:
        json.dump(gene_to_func, handle)

    return strain_id, gene_to_func
示例3: _load_sequences_to_reference_gene
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def _load_sequences_to_reference_gene(self, g_id, force_rerun=False):
    """Load orthologous strain sequences to reference Protein object, save as new pickle.

    For every strain in ``self.strain_infodict`` in which ``g_id`` is flagged
    functional, the strain's gene ID is read from ``self.df_orthology_matrix``
    and the matching record of the strain's FASTA file is added to the pickled
    protein under the ID ``<g_id>_<strain>``.

    Args:
        g_id: reference gene ID; key into ``self.gene_protein_pickles`` and row label
            of the orthology matrix.
        force_rerun: forwarded to ``ssbio.utils.force_rerun`` together with the output path.

    Returns:
        Tuple ``(g_id, protein_seqs_pickle_path)``.
    """
    protein_seqs_pickle_path = op.join(self.sequences_by_gene_dir, '{}_protein_withseqs.pckl'.format(g_id))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=protein_seqs_pickle_path):
        protein_pickle_path = self.gene_protein_pickles[g_id]
        protein_pickle = ssbio.io.load_pickle(protein_pickle_path)

        for strain, info in self.strain_infodict.items():
            if not info['functional_genes'][g_id]:
                continue

            # Pull the gene ID of the strain from the orthology matrix
            strain_gene_key = self.df_orthology_matrix.at[g_id, strain]

            new_id = '{}_{}'.format(g_id, strain)
            if protein_pickle.sequences.has_id(new_id):
                continue

            # Index the genome only once a sequence is actually needed, and
            # release the underlying file handle as soon as it has been read.
            strain_sequences = SeqIO.index(info['genome_path'], 'fasta')
            try:
                protein_pickle.load_manual_sequence(seq=strain_sequences[strain_gene_key],
                                                    ident=new_id,
                                                    set_as_representative=False)
            finally:
                strain_sequences.close()

        protein_pickle.save_pickle(outfile=protein_seqs_pickle_path)

    return g_id, protein_seqs_pickle_path
示例4: split_blast_inputs_by_core
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def split_blast_inputs_by_core(blast_inputs, threads, blast_files_dir):
    """Distribute clusters over ``threads`` groups, balancing the number of sequences.

    The weight of a cluster is the number of lines in
    ``<blast_files_dir>/<cluster>_ids.txt``. Assignment is greedy: the first
    cluster goes to group 0 and every following cluster goes to the group
    whose running total is currently the smallest (lowest index on ties).

    Args:
        blast_inputs: iterable of cluster identifiers.
        threads: number of groups to distribute the clusters over.
        blast_files_dir: directory containing the ``*_ids.txt`` files.

    Returns:
        List of lists of cluster identifiers, one list per non-empty group.
    """
    groups = [[] for _ in range(threads)]
    group_totals = [0] * threads
    target = 0
    for cluster in blast_inputs:
        cluster_file = os.path.join(blast_files_dir,
                                    '{0}_ids.txt'.format(cluster))
        # Only the line count matters, so stream the file instead of loading it
        with open(cluster_file, 'r') as infile:
            num_seqs = sum(1 for _ in infile)

        groups[target].append(cluster)
        group_totals[target] += num_seqs
        # The next cluster goes to whichever group is lightest right now
        target = group_totals.index(min(group_totals))

    return [group for group in groups if group]
示例5: reverse_bed
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def reverse_bed():
    """Convert bed-file coordinates to coordinates on the reverse strand.

    Command-line entry point taking three positional arguments: the input bed
    file, the reference fasta and the output bed file. The input is read as a
    headerless, tab-separated table with columns ``chrom``, ``start``, ``stop``.
    For each row the output holds ``<chrom>_rc``, ``length - stop`` and
    ``length - start``, where ``length`` is the reference length of ``chrom``.

    Raises:
        KeyError: if the bed file names a chromosome absent from the fasta.
    """
    parser = argparse.ArgumentParser(
        prog='reverse_bed',
        description='Convert bed-file coordinates to coordinates on the reverse strand.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bed_in', help='Input bed file.')
    parser.add_argument('ref_fasta', help='Input reference fasta file.')
    parser.add_argument('bed_out', help='Output bed file.')
    args = parser.parse_args()

    # Only the reference names and lengths are needed; close the fasta right after
    with pysam.FastaFile(args.ref_fasta) as fasta:
        lengths = dict(zip(fasta.references, fasta.lengths))

    d = pd.read_csv(args.bed_in, sep='\t', names=['chrom', 'start', 'stop'])

    d['chrom_length'] = d['chrom'].map(lambda x: lengths[x])
    # Start and stop swap roles when measured from the other end of the reference
    d['rc_stop'] = d['chrom_length'] - d['start']
    d['rc_start'] = d['chrom_length'] - d['stop']
    d['chrom_rc'] = d['chrom'] + '_rc'
    d[['chrom_rc', 'rc_start', 'rc_stop']].to_csv(args.bed_out, index=False, header=False, sep='\t')
示例6: __getitem__
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def __getitem__(self, index: int):
    """Return the example stored at position ``index`` as a plain dict.

    The result has the keys ``'id'``, ``'primary'`` (the record's sequence as
    a string) and ``'protein_length'`` (the length of that sequence).

    Raises:
        IndexError: if ``index`` lies outside ``[0, self._num_examples)``.
    """
    if index < 0 or index >= self._num_examples:
        raise IndexError(index)

    # Records are read straight from self._cache
    cached = self._cache[index]
    sequence = cached.seq
    return {
        'id': cached.id,
        'primary': str(sequence),
        'protein_length': len(sequence),
    }
示例7: compress_and_send_mail
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def compress_and_send_mail(job_name, fasta_file, index_file, lock_file, mailer):
    """
    Compress the fasta file, drop the lock file, then notify the client by mail

    :param job_name: job id
    :type job_name: str
    :param fasta_file: fasta file path
    :type fasta_file: str
    :param index_file: index file path (the sample name is read from it)
    :type index_file: str
    :param lock_file: lock file path (removed once compression is done)
    :type lock_file: str
    :param mailer: mailer object (to send mail)
    :type mailer: Mailer
    """
    Functions.compress(fasta_file)
    os.remove(lock_file)

    # The first element returned by read_index is not used here
    _unused, sample_name = Functions.read_index(index_file)
    Functions.send_fasta_ready(mailer, job_name, sample_name, True)
示例8: build_list_no_assoc
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def build_list_no_assoc(self, to):
    """
    Build list of queries that match with None target, or the opposite

    Contigs are reported in the order in which they appear in the index (each
    name once), so the output is the same from one run to the next.

    :param to: "query" to list unmatched query contigs; any other value selects the target
    :return: content of the file: unmatched contig names, one per line, ending with a newline
    :rtype: str
    """
    is_query = to == "query"
    index = self.idx_q if is_query else self.idx_t
    # Only the list of contig names is needed from the index
    _name, contigs_list, _contigs, _reversed, _abs_start, _c_len = Index.load(index)

    # PAF columns: 1st is the query sequence name, 6th is the target sequence name
    name_column = 0 if is_query else 5
    matched = set()
    with open(self.paf, "r") as paf:
        for line in paf:
            matched.add(line.strip("\n").split("\t")[name_column])

    unmatched = [c_name for c_name in dict.fromkeys(contigs_list) if c_name not in matched]
    return "\n".join(unmatched) + "\n"
示例9: _load_strain_sequences
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def _load_strain_sequences(self, strain_gempro):
    """Load strain sequences from the orthology matrix into the base model for comparisons, and into the
    strain-specific model itself.

    Sequences come either directly from the strain's column of the orthology matrix (when
    ``self._orthology_matrix_has_sequences`` is set) or from the strain's genome FASTA file, in which
    case the matrix supplies the gene ID to look up. Only genes flagged ``functional`` are processed.

    Args:
        strain_gempro: strain model; its ``id``, ``genome_path`` and ``genes`` are read here.
    """
    fasta_index = None
    if self._orthology_matrix_has_sequences:  # Load directly from the orthology matrix if it contains sequences
        strain_sequences = self.df_orthology_matrix[strain_gempro.id].to_dict()
    else:  # Otherwise load from the genome file if the orthology matrix contains gene IDs
        # Load the genome FASTA file
        log.debug('{}: loading strain genome CDS file'.format(strain_gempro.genome_path))
        fasta_index = SeqIO.index(strain_gempro.genome_path, 'fasta')
        strain_sequences = fasta_index

    try:
        for strain_gene in strain_gempro.genes:
            if not strain_gene.functional:
                continue

            if self._orthology_matrix_has_sequences:
                strain_gene_key = strain_gene.id
            else:
                # Pull the gene ID of the strain from the orthology matrix
                strain_gene_key = self.df_orthology_matrix.loc[strain_gene.id, strain_gempro.id]
                log.debug('{}: original gene ID to be pulled from strain fasta file'.format(strain_gene_key))

            # Load into the base strain for comparisons
            ref_gene = self.reference_gempro.genes.get_by_id(strain_gene.id)
            new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
            if ref_gene.protein.sequences.has_id(new_id):
                # NOTE(review): this also skips loading into the strain model below -- confirm intended
                log.debug('{}: sequence already loaded into reference model'.format(new_id))
                continue
            ref_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                  set_as_representative=False)
            log.debug('{}: loaded sequence into reference model'.format(new_id))

            # Load into the strain GEM-PRO
            strain_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                     set_as_representative=True)
            log.debug('{}: loaded sequence into strain model'.format(new_id))
    finally:
        # The FASTA index keeps its file open; release it once all lookups are done
        if fasta_index is not None:
            fasta_index.close()
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def build_strain_specific_models(self, save_models=False):
    """Using the orthologous genes matrix, create and modify the strain specific models based on if orthologous
    genes exist.

    Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.

    Args:
        save_models: if true, each strain's model is written as ``<strain id>.json`` and its GEM-PRO as
            ``<strain id>_gp.pckl`` into ``self.model_dir``.

    Raises:
        RuntimeError: if the orthology matrix has no rows.
    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    for strain_gempro in tqdm(self.strains):
        log.debug('{}: building strain specific model'.format(strain_gempro.id))

        # For each genome, load the metabolic model or genes from the reference GEM-PRO.
        # Logging at WARNING and below is silenced while loading; the finally clause
        # makes sure it is switched back on even when loading fails.
        logging.disable(logging.WARNING)
        try:
            if self._empty_reference_gempro.model:
                strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
            elif self._empty_reference_gempro.genes:
                strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
        finally:
            logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

        # Mark genes non-functional
        self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

        # Load sequences into the base and strain models
        self._load_strain_sequences(strain_gempro=strain_gempro)

        if save_models:
            cobra.io.save_json_model(model=strain_gempro.model,
                                     filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
            strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))

    log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains)))
示例11: _build_strain_specific_model
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def _build_strain_specific_model(self, strain_id, ref_functional_genes, orth_matrix, force_rerun=False):
    """Create strain GEMPRO, set functional genes.

    A new ``GEMPRO`` named after the strain receives the reference gene IDs; genes whose entry in the
    strain's column of ``orth_matrix`` is null are flagged non-functional, and the result is pickled to
    ``<strain_id>_gp.pckl`` in ``self.model_dir``.

    Args:
        strain_id: strain identifier; column of ``orth_matrix`` and name of the new GEMPRO.
        ref_functional_genes: iterable of reference gene IDs to copy into the strain GEMPRO.
        orth_matrix: table indexed by gene ID with one column per strain.
        force_rerun: forwarded to ``ssbio.utils.force_rerun`` together with the output path.

    Returns:
        Tuple ``(strain_id, gp_noseqs_path)``.
    """
    gp_noseqs_path = op.join(self.model_dir, '{}_gp.pckl'.format(strain_id))

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=gp_noseqs_path):
        # Logging at WARNING and below is silenced while the GEMPRO is set up; the
        # finally clause switches it back on even when setup fails.
        logging.disable(logging.WARNING)
        try:
            strain_gp = GEMPRO(gem_name=strain_id)
            # Only the list of gene IDs is copied over (no COBRApy model is loaded here)
            strain_gp.add_gene_ids(list(ref_functional_genes))
        finally:
            logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        genes_to_remove = orth_matrix[pd.isnull(orth_matrix[strain_id])][strain_id].index.tolist()

        # Mark genes non-functional, restricted to genes present in the strain GEMPRO
        strain_genes = [x.id for x in strain_gp.genes]
        genes_to_remove = list(set(genes_to_remove).intersection(set(strain_genes)))
        for g in genes_to_remove:
            strain_gp.genes.get_by_id(g).functional = False

        strain_gp.save_pickle(outfile=gp_noseqs_path)

    return strain_id, gp_noseqs_path
示例12: __init__
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
    '''
    Store the inputs, rank the sequences by total abundance and start with no OTUs.

    seq_table: pandas.DataFrame
      Samples on the columns; sequences on the rows
    records: index of Bio.Seq
      Indexed, unaligned input sequences. This could come from BioPython's
      SeqIO.to_dict or SeqIO.index.
    max_dist: float
      genetic distance cutoff above which a sequence will not be merged into an OTU
    min_fold: float
      Multiply the sequence's abundance by this fold to get the minimum abundance
      of an OTU for merging
    threshold_pval: float
      P-value below which a sequence will not be merged into an OTU
    log: filehandle
      Log file reporting the abundance, genetic, and distribution checks.

    Raises RuntimeError when a sequence ID of the table is absent from records.
    '''
    self.seq_table, self.records = seq_table, records
    self.max_dist, self.min_fold = max_dist, min_fold
    self.threshold_pval = threshold_pval
    self.log = log

    # row totals, most abundant sequence first
    row_totals = self.seq_table.sum(axis=1)
    self.seq_abunds = row_totals.sort_values(ascending=False)

    # every sequence ID of the table has to be present among the records
    missing_ids = []
    for seq_id in self.seq_abunds.index:
        if seq_id not in self.records:
            missing_ids.append(seq_id)
    if missing_ids:
        raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

    # no OTUs and no memberships to begin with
    self.membership = {}
    self.otus = []
示例13: _process_record
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def _process_record(self, record_id):
    '''
    Handle one sequence: build a candidate OTU from its record and its row of the
    sequence table, then try the OTUs yielded by self.ga_matches in order. The
    candidate is absorbed by the first one whose distribution p-value exceeds
    self.threshold_pval; if there is none, the candidate becomes a new OTU.
    Each step is written to self.log when a log is set.
    '''
    assert record_id in self.seq_table.index

    record = self.records[record_id]
    candidate = OTU(record.id, str(record.seq), self.seq_table.loc[record.id])

    if self.log is not None:
        print('seq', candidate.name, sep='\t', file=self.log)

    for otu in self.ga_matches(candidate):
        test_pval = candidate.distribution_pval(otu)

        if self.log is not None:
            print(candidate.name, 'distribution_check', otu.name, test_pval, sep='\t', file=self.log)

        if test_pval > self.threshold_pval:
            # merge into this OTU and stop looking
            otu.absorb(candidate)
            self.membership[otu.name].append(candidate.name)
            break
    else:
        # nothing absorbed the candidate: it forms its own OTU
        self.otus.append(candidate)
        self.membership[candidate.name] = [candidate.name]
示例14: generate_otu_table
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def generate_otu_table(self):
    '''
    Run every sequence through _process_record, in the order of self.seq_abunds,
    then assemble the OTU table with the OTUs sorted by decreasing abundance.

    returns: pandas.DataFrame
      OTU table (also stored as instance.otu_table); one row per OTU, columns
      labelled like those of the sequence table
    '''
    for seq_id in self.seq_abunds.index:
        self._process_record(seq_id)

    self.otus.sort(key=lambda entry: entry.abundance, reverse=True)

    rows = [entry.counts for entry in self.otus]
    names = [entry.name for entry in self.otus]
    table = pd.DataFrame(rows, index=names)
    self.otu_table = table
    table.columns = self.seq_table.columns

    return table
示例15: read_sequence_table
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import index [as 别名]
def read_sequence_table(fn):
    '''
    Load a sequence table. The first row is the header, the first column (read
    as text) holds the sequence IDs and becomes the index, and every remaining
    column is a sample whose values are converted to int.

    fn: filename (or handle)

    returns: pandas.DataFrame
    '''
    raw = pd.read_table(fn, dtype={0: str}, header=0)
    # index by the ID column, then keep only the sample columns
    indexed = raw.set_index(raw.iloc[:, 0])
    return indexed.iloc[:, 1:].astype(int)