本文整理汇总了Python中frogsBiom.BiomIO类的典型用法代码示例。如果您正苦于以下问题:Python BiomIO类的具体用法?Python BiomIO怎么用?Python BiomIO使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了BiomIO类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: sampling_by_sample
def sampling_by_sample( input_biom, output_biom, nb_sampled=None, sampled_ratio=None ):
    """
    @summary: Writes a BIOM after a random sampling in each sample.
    @param input_biom: [str] Path to the processed BIOM.
    @param output_biom: [str] Path to outputted BIOM.
    @param nb_sampled: [int] Number of sampled sequences by sample.
    @param sampled_ratio: [float] Ratio of sampled sequences by sample. Only
                          used when nb_sampled is None: the number of draws is
                          then int(sample_count * sampled_ratio).
    @note: nb_sampled and sampled_ratio are mutually exclusive.
    @raise Exception: If a sample contains fewer sequences than the number of
                      sequences to draw from it.
    """
    initial_biom = BiomIO.from_json( input_biom )
    new_biom = Biom(
        matrix_type="sparse",
        generated_by="Sampling " + (str(nb_sampled) if nb_sampled is not None else str(sampled_ratio) + "%" ) + " elements by sample from " + input_biom
    )
    observations_already_added = set()
    for sample_name in initial_biom.get_samples_names():
        new_biom.add_sample( sample_name, initial_biom.get_sample_metadata(sample_name) )
        sample_seq = initial_biom.get_sample_count(sample_name)
        sample_nb_sampled = nb_sampled
        if nb_sampled is None:
            sample_nb_sampled = int(sample_seq * sampled_ratio)
        # The check must use the per-sample target: nb_sampled is None in ratio
        # mode, and comparing an int with None fails on Python 3.
        if sample_seq < sample_nb_sampled:
            raise Exception( str(sample_nb_sampled) + " sequences cannot be sampled in sample '" + str(sample_name) + "'. It only contains " + str(sample_seq) + " sequences." )
        for _ in range(sample_nb_sampled):
            # Take an observation in initial BIOM and decrement its count there
            selected_observation = initial_biom.random_obs_by_sample(sample_name)
            selected_observation_id = selected_observation['id']
            initial_biom.subtract_count( selected_observation_id, sample_name, 1 )
            # Put in new BIOM (each observation is declared once for all samples)
            if selected_observation_id not in observations_already_added:
                new_biom.add_observation( selected_observation_id, initial_biom.get_observation_metadata(selected_observation_id) )
                observations_already_added.add( selected_observation_id )
            new_biom.add_count( selected_observation_id, sample_name, 1 )
    BiomIO.write( output_biom, new_biom )
示例2: process
def process( in_biom, out_biom, out_metadata ):
    """
    @summary: Writes a simplified copy of a BIOM file: the 'blast_affiliations'
              metadata are moved to a TSV file, the list/tuple metadata are
              joined in ';' separated strings and a 'taxonomy' metadata is
              built from 'blast_taxonomy'.
    @param in_biom: [str] Path to the processed BIOM file.
    @param out_biom: [str] Path to the output BIOM file.
    @param out_metadata: [str] Path to the TSV file receiving one line by
                         blast affiliation.
    @note: Observations whose 'blast_taxonomy' is None get a 'taxonomy' made of
           "Unclassified" ranks. Its depth is the depth of the last classified
           observation met; the list stays empty if none is classified.
    """
    ordered_blast_keys = ["taxonomy", "subject", "evalue", "perc_identity", "perc_query_coverage", "aln_length"] # Keys in blast_affiliations metadata
    taxonomy_depth = 0
    unclassified_observations = list()
    # The context manager guarantees the metadata file is flushed and closed
    with open( out_metadata, "w" ) as FH_metadata:
        FH_metadata.write( "#OTUID\t" + "\t".join(ordered_blast_keys) + "\n" )
        biom = BiomIO.from_json( in_biom )
        for observation in biom.get_observations():
            metadata = observation["metadata"]
            # Iterate on a copy of the keys: one of them is deleted in the loop
            for metadata_key in list(metadata.keys()):
                value = metadata[metadata_key]
                if metadata_key == "blast_affiliations": # Extract blast_affiliations metadata in metadata_file
                    if value is not None:
                        for current_affi in value:
                            if isinstance(current_affi["taxonomy"], (list, tuple)):
                                current_affi["taxonomy"] = ";".join( current_affi["taxonomy"] )
                            FH_metadata.write( observation["id"] + "\t" + "\t".join([str(current_affi[item]) for item in ordered_blast_keys]) + "\n" )
                    del metadata[metadata_key]
                elif value is not None and isinstance(value, (list, tuple)): # All list are transformed in string
                    metadata[metadata_key] = ";".join( map(str, value) )
            if "blast_taxonomy" in metadata:
                if metadata["blast_taxonomy"] is None:
                    unclassified_observations.append( observation["id"] )
                    metadata["taxonomy"] = list()
                else:
                    ranks = metadata["blast_taxonomy"].split(";")
                    taxonomy_depth = len(ranks)
                    metadata["taxonomy"] = ranks
    # Add "Unclassified" ranks in unclassified observations
    if taxonomy_depth > 0:
        for observation_id in unclassified_observations:
            observation_metadata = biom.get_observation_metadata(observation_id)
            observation_metadata["taxonomy"] = ["Unclassified"] * taxonomy_depth
    BiomIO.write( out_biom, biom )
示例3: remove_observations
def remove_observations( removed_observations, input_biom, output_biom ):
    """
    @summary: Loads a BIOM, drops the listed observations and writes the result.
    @param removed_observations: [list] The names of the observations to remove.
    @param input_biom: [str] The path to the input BIOM.
    @param output_biom: [str] The path to the output BIOM.
    """
    cleaned_biom = BiomIO.from_json( input_biom )
    cleaned_biom.remove_observations( removed_observations )
    BiomIO.write( output_biom, cleaned_biom )
示例4: filter_biom
def filter_biom( removed_observations, in_biom, out_biom ):
    """
    @summary: Writes a copy of the BIOM without the specified observations.
    @param removed_observations: [dict] Each key is an observation name.
    @param in_biom: [str] Path to the processed BIOM file.
    @param out_biom: [str] Path to the cleaned BIOM file.
    """
    cleaned_biom = BiomIO.from_json( in_biom )
    cleaned_biom.remove_observations( removed_observations )
    BiomIO.write( out_biom, cleaned_biom )
示例5: to_biom
def to_biom( clusters_file, count_file, output_biom, size_separator ):
    """
    @summary: Writes a BIOM file from swarm results.
    @param clusters_file: [str] Path to the '.clstr' file. Each line lists the
                          members of one cluster; the first one is the seed.
    @param count_file: [str] Path to the count file. It contains the count of
                       sequences by sample of each precluster.
                       Line format: "Precluster_id nb_in_sampleA nb_in_sampleB"
                       The first line is the header holding the samples names.
    @param output_biom: [str] Path to the output file.
    @param size_separator: [str] The pre-cluster abundance separator.
    @note: Clusters are named "Cluster_<n>" where n is the 1-based line number
           in the clusters file.
    """
    biom = Biom( generated_by='swarm', matrix_type="sparse" )

    # Preclusters count by sample
    preclusters_count = dict()
    with open( count_file ) as count_fh:
        samples = count_fh.readline().strip().split()[1:]
        for line in count_fh:
            precluster_id, count_str = line.strip().split(None, 1)
            preclusters_count[precluster_id] = count_str # For large dataset storing counts as a string uses less RAM than a sparse count

    # Add samples
    for sample_name in samples:
        biom.add_sample( sample_name )

    # Process count
    with open( clusters_file ) as clusters_fh:
        for cluster_idx, line in enumerate(clusters_fh, 1):
            cluster_name = "Cluster_" + str(cluster_idx)
            cluster_count = {key:0 for key in samples}
            line_fields = line.strip().split()
            # Retrieve count by sample
            for seq_id in line_fields:
                real_seq_id = seq_id.rsplit(size_separator, 1)[0]
                sample_counts = preclusters_count[real_seq_id].split()
                for sample_idx, sample_name in enumerate(samples):
                    cluster_count[sample_name] += int(sample_counts[sample_idx])
                # Drop the counts string once consumed
                preclusters_count[real_seq_id] = None
            # Add cluster on biom
            biom.add_observation( cluster_name, {'seed_id':line_fields[0].rsplit(size_separator, 1)[0]} )
            observation_idx = biom.find_idx("observation", cluster_name)
            for sample_idx, sample_name in enumerate(samples):
                if cluster_count[sample_name] > 0:
                    biom.data.change( observation_idx, sample_idx, cluster_count[sample_name] )

    # Write
    BiomIO.write( output_biom, biom )
示例6: excluded_obs_on_blastMetrics
def excluded_obs_on_blastMetrics( input_biom, tag, cmp_operator, threshold, excluded_file ):
    """
    @summary: Writes the list of the observations with no affiliations with sufficient blast value.
    @param input_biom: [str] The path to the BIOM file to check.
    @param tag: [str] The metadata checked.
    @param cmp_operator: [str] The operator used in comparison (tag_value ">=" threshold or tag_value "<=" threshold).
    @param threshold: [float] The limit for the tag value.
    @param excluded_file: [str] The path to the output file (one observation ID by line).
    @raise KeyError: If cmp_operator is neither ">=" nor "<=".
    @note: An observation without any blast affiliation is written in the excluded file.
    """
    valid_operators = {
        ">=": operator.ge,
        "<=": operator.le
    }
    cmp_func = valid_operators[cmp_operator]
    biom = BiomIO.from_json( input_biom )
    # The context manager closes the output even if an observation is malformed
    with open( excluded_file, "w" ) as FH_excluded_file:
        for observation in biom.get_observations():
            alignments = observation["metadata"]["blast_affiliations"]
            # One alignment passing the threshold is enough to keep the observation
            is_kept = any( cmp_func(float(current_alignment[tag]), threshold) for current_alignment in alignments )
            if not is_kept:
                FH_excluded_file.write( str(observation["id"]) + "\n" )
示例7: __init__
def __init__( self, out_tsv, in_biom, in_fasta=None ):
    """
    @summary: Prepares the biom2tsv.py command. The exported fields depend on
              the observation metadata found in the BIOM file.
    @param out_tsv: [str] Path to output TSV file.
    @param in_biom: [str] Path to BIOM file.
    @param in_fasta: [str] Path to the sequences file. When it is provided the
                     '--input-fasta' option and the '@seed_sequence' field are
                     added to the command.
    """
    # Sequence file option
    if in_fasta is None:
        fasta_option = ""
    else:
        fasta_option = " --input-fasta " + in_fasta

    # Select the fields from the metadata available in the BIOM
    biom = BiomIO.from_json( in_biom )
    selected_fields = list()
    if biom.has_observation_metadata( 'rdp_taxonomy' ) and biom.has_observation_metadata( 'rdp_bootstrap' ):
        selected_fields.append( "'@rdp_tax_and_bootstrap'" )
    if biom.has_observation_metadata( 'blast_taxonomy' ):
        selected_fields.append( "'blast_taxonomy'" )
    if biom.has_observation_metadata( 'blast_affiliations' ):
        selected_fields.extend([
            "'@blast_subject'",
            "'@blast_perc_identity'",
            "'@blast_perc_query_coverage'",
            "'@blast_evalue'",
            "'@blast_aln_length'",
        ])
    if biom.has_observation_metadata( 'seed_id' ):
        selected_fields.append( "'seed_id'" )
    if in_fasta is not None:
        selected_fields.append( "'@seed_sequence'" )
    # Fields always exported
    selected_fields.append( "'@observation_name' '@observation_sum' '@sample_count'" )
    conversion_tags = " ".join( selected_fields )

    # Set command
    cmd_parameters = "--input-file " + in_biom + fasta_option + " --output-file " + out_tsv + " --fields " + conversion_tags
    Cmd.__init__( self,
                  'biom2tsv.py',
                  'Converts a BIOM file in TSV file.',
                  cmd_parameters,
                  '--version' )
示例8: biom_fasta_to_tsv
def biom_fasta_to_tsv( input_biom, input_fasta, output_tsv, fields, list_separator ):
    """
    @summary: Convert BIOM file to TSV file with sequence.
    @param input_biom: [str] Path to the BIOM file.
    @param input_fasta: [str] Path to the sequences of the observations.
    @param output_tsv: [str] Path to the output file (format : TSV).
    @param fields: [list] Columns and their order in output. Special columns : '@observation_name', '@observation_sum', '@sample_count', '@rdp_tax_and_bootstrap', '@seed_sequence'. The others columns must be metadata title. The list is not modified.
    @param list_separator: [str] Separator for complex metadata.
    @raise ValueError: If '@seed_sequence' is not in fields.
    @note: One line is written by record of the FASTA file, in the FASTA order.
    """
    biom = BiomIO.from_json( input_biom )
    sequence_idx = fields.index("@seed_sequence")
    with open( output_tsv, "w" ) as out_fh:
        # Header
        header_parts = header_line_parts( fields, biom )
        out_fh.write( "#" + "\t".join(header_parts) + "\n" )
        # Data
        # Work on a copy: deleting from 'fields' itself would alter the caller's list
        fields_without_seq = list(fields)
        del fields_without_seq[sequence_idx]
        # NOTE(review): the FastaIO handle is not explicitly closed, as in the
        # original code - confirm whether FastaIO exposes a close().
        FH_in = FastaIO( input_fasta )
        for record in FH_in:
            obs_idx = biom.find_idx("observation", record.id)
            count_by_sample = biom.data.get_row_array(obs_idx)
            observation_parts = observation_line_parts( biom.rows[obs_idx], count_by_sample, fields_without_seq, list_separator )
            # The sequence is put back at the position it has in 'fields'
            observation_parts.insert( sequence_idx, record.string )
            out_fh.write( "\t".join(observation_parts) + "\n" )
示例9: get_checked
def get_checked( abund_file, checked_sample, taxonomy_key, expected_by_depth ):
    """
    @summary: Returns the count of sequences of one sample by taxon, for each
              taxonomic depth.
    @param abund_file: [str] Path to the BIOM file.
    @param checked_sample: [str] Name of the sample whose counts are read.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param expected_by_depth: Expected taxonomies (ranks joined by ';') indexed
                              by depth. It is used to resolve the observations
                              tagged "Multi-affiliation".
    @return: [dict] By depth (0-based) the count by taxonomy (ranks joined by
             ';' down to this depth).
    @note: A "Multi-affiliation" is replaced by one of its blast affiliations
           only if exactly one of its distinct affiliations is expected;
           otherwise a warning is emitted and the taxonomy is kept as it is.
    """
    checked_by_depth = dict()
    biom = BiomIO.from_json(abund_file)
    for current_obs in biom.get_observations():
        obs_metadata = current_obs["metadata"]
        clean_taxonomy = getCleanedTaxonomy(obs_metadata[taxonomy_key])
        count = biom.get_count(current_obs["id"], checked_sample)
        if not count > 0:
            continue # Observation absent from the checked sample
        last_depth = len(clean_taxonomy) - 1
        if clean_taxonomy[last_depth] == "Multi-affiliation":
            nb_selected = 0
            selected = list()
            taxonomies = list()
            expected_taxonomies = expected_by_depth[last_depth]
            for affiliation in obs_metadata["blast_affiliations"]:
                affi_taxonomy = ";".join(getCleanedTaxonomy(affiliation["taxonomy"]))
                if affi_taxonomy in taxonomies:
                    continue # Taxonomy already evaluated
                taxonomies.append(affi_taxonomy)
                if affi_taxonomy in expected_taxonomies:
                    selected = getCleanedTaxonomy(affiliation["taxonomy"])
                    nb_selected += 1
            if nb_selected == 1:
                clean_taxonomy = selected
            else:
                warnings.warn( "Multi-affiliation cannot be resolved for " + str((float(count)*100)/biom.get_total_count()) + "% sequences. Possible taxonomies: '" + "', '".join(taxonomies) + "'." )
        # Add the count at each depth of the lineage
        for rank_depth in range(len(clean_taxonomy)):
            rank_taxonomy = ";".join(clean_taxonomy[:rank_depth + 1])
            depth_counts = checked_by_depth.setdefault(rank_depth, dict())
            depth_counts[rank_taxonomy] = depth_counts.get(rank_taxonomy, 0) + count
    return checked_by_depth
示例10: write_log
def write_log(in_biom, out_biom, log):
    """
    @summary: Writes the number of OTU by sample before and after a process.
    @param in_biom: [str] Path to the initial BIOM file.
    @param out_biom: [str] Path to the processed BIOM file.
    @param log: [str] Path to the output log file.
    @note: The samples listed are those of the initial BIOM; each one must also
           exist in the processed BIOM.
    """
    # NOTE(review): the header line announces tabulated columns but the records
    # are written as multi-line blocks. The format is kept unchanged so that
    # existing readers of this log are not broken.
    with open(log, "w") as FH_log:
        FH_log.write("#sample\tnb_otu_before\tnb_otu_after\n")
        initial_biom = BiomIO.from_json( in_biom )
        new_biom = BiomIO.from_json( out_biom )
        for sample_name in initial_biom.get_samples_names():
            nb_otu_before = len(initial_biom.get_sample_obs(sample_name))
            nb_otu_after = len(new_biom.get_sample_obs(sample_name))
            FH_log.write("Sample name: "+sample_name+"\n\tnb initials OTU: "+str(nb_otu_before)+"\n\tnb normalized OTU: "+str(nb_otu_after)+"\n")
        # Global summary based on the number of rows of each BIOM
        nb_initial_otu = len(initial_biom.rows)
        nb_new_otu = len(new_biom.rows)
        FH_log.write("Sample name: all samples\n\tnb initials OTU: "+str(nb_initial_otu)+"\n\tnb normalized OTU: "+str(nb_new_otu)+"\n")
示例11: getRealTaxByRefID
def getRealTaxByRefID( input_biom, taxonomy_key, duplication_groups ):
    """
    @summary: Returns the possible taxonomies of each reference.
    @param input_biom: [str] Path to BIOM file.
    @param taxonomy_key: [str] The metadata key for taxonomy.
    @param duplication_groups: [dict] By reference ID the list of references with the same sequence. None if there is no duplication to take into account.
    @return: [dict] List of taxonomies by reference ID.
             Example:
               {
                 "MVF01000012.1.1317": [
                   ["Root", "Bacteria", "Proteobacteria", "Gammaproteobacteria", "Enterobacteriales", "Enterobacteriaceae", "Cronobacter", "Escherichia coli BIDMC 73"]
                 ],
                 "JQ607252.1.1437": [
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "bacterium NLAE-zl-P471"],
                   ["Root", "Bacteria", "Firmicutes", "Bacilli", "Bacillales", "Staphylococcaceae", "Staphylococcus", "Staphylococcus aureus M17299"]
                 ]
               }
    """
    # Cleaned taxonomy of each observation of the BIOM
    cleaned_by_id = dict()
    biom = BiomIO.from_json( input_biom )
    for observation in biom.get_observations():
        cleaned_by_id[observation["id"]] = getCleanedTaxonomy(observation["metadata"][taxonomy_key])
    # By default an observation has only its own taxonomy
    taxonomy_by_obs_id = {obs_id: [taxonomy] for obs_id, taxonomy in cleaned_by_id.items()}
    # A duplicated reference takes the taxonomies of all the members of its group
    if duplication_groups is not None:
        for obs_id in duplication_groups:
            taxonomy_by_obs_id[obs_id] = [cleaned_by_id[member_id] for member_id in duplication_groups[obs_id]]
    return taxonomy_by_obs_id
示例12: get_step_size
def get_step_size(self, nb_step=35):
    """
    @summary: Returns the step size to obtain 'nb_step' steps or more in 3/4 of samples.
    @param nb_step: [int] The number of expected steps.
    @returns: [int] The step size. It is always >= 1; it is 1 when the BIOM
              does not contain any sample.
    @note: The size is computed from the number of sequences of the sample at
           the lower quartile. If this sample is empty, the first non-empty
           sample with a higher rank is used instead.
    """
    # Get the sorted number of sequences by sample
    biom = BiomIO.from_json( self.in_biom )
    counts = sorted( biom.get_sample_count(sample_name) for sample_name in biom.get_samples_names() )
    del biom
    nb_samples = len(counts)
    if nb_samples == 0:
        return 1
    # Finds the lower quartile number of sequences
    # (floor division: a list index must stay an integer on Python 3)
    lower_quartile_idx = nb_samples // 4
    nb_seq = counts[lower_quartile_idx]
    # If lower quartile sample is empty
    if nb_seq == 0:
        for candidate in counts[lower_quartile_idx + 1:]:
            if candidate != 0:
                nb_seq = candidate
                break
    step_size = int(nb_seq // nb_step)
    return max(1, step_size)
示例13: observations_depth
def observations_depth( input_biom, output_depth ):
    """
    @summary: Writes the depths of the observations in file.
    @param input_biom: [str] Path to the biom file processed.
    @param output_depth: [str] Path to the output file.
    @note: Example of one output file
             #Depth<TAB>Nb_Observ_concerned<TAB>Prct_Observ_concerned
             1<TAB>65<TAB>65.000
             2<TAB>30<TAB>30.000
             3<TAB>0<TAB>0.000
             4<TAB>5<TAB>5.000
           The observations with a count of 0 are ignored in the output and in
           the percentages.
    """
    obs_depth = list() # obs_depth[d] is the number of observations with a count of d
    nb_observ = 0 # Number of observations with a count different from 0
    # Process depth calculation
    biom = BiomIO.from_json( input_biom )
    for observation_id, observation_count in biom.get_observations_counts():
        # Extend the histogram up to the current depth
        nb_missing = observation_count + 1 - len(obs_depth)
        if nb_missing > 0:
            obs_depth.extend( [0] * nb_missing )
        obs_depth[observation_count] += 1
        if observation_count != 0:
            nb_observ += 1
    del biom
    # Write output
    with open( output_depth, 'w' ) as out_fh:
        out_fh.write( "#Depth\tNb_Observ_concerned\tPrct_Observ_concerned\n" )
        for depth in range(1, len(obs_depth)):
            prct = (float(obs_depth[depth])/ nb_observ)*100
            out_fh.write( str(depth) + "\t" + str(obs_depth[depth]) + "\t" + ("%.3f" % prct) + "\n" )
示例14: get_retrieved_by_sample
def get_retrieved_by_sample( biom_file, reference_by_obs_id, references_by_sample, uniq_id, uniq_id_by_sample ):
    """
    @summary: Returns by sample the number of detected observations and the
              number of distinct references retrieved.
    @param biom_file: [str] Path to the BIOM file.
    @param reference_by_obs_id: [dict] By observation ID the reference ID. A
                                value containing "," is treated as a chimera
                                and skipped.
    @param references_by_sample: [dict] By sample name the container of the
                                 expected reference IDs.
    @param uniq_id: [dict] By reference ID the identifier used to count the
                    retrieved references without duplication.
    @param uniq_id_by_sample: [dict] By sample name, the same kind of mapping
                              used for the expected retrieved references.
    @return: [dict] By sample name a dict with the keys "detected",
             "retrieved" and "expected_retrieved".
    """
    counts_by_sample = dict()
    biom = BiomIO.from_json( biom_file )
    for sample_name in biom.get_samples_names():
        nb_detected = 0
        retrieved_refs = set()
        expected_retrieved_refs = set()
        for obs in biom.get_observations_by_sample( sample_name ):
            nb_detected += 1
            obs_reference = reference_by_obs_id[obs['id']]
            if "," in obs_reference: # Is a chimera
                continue
            retrieved_refs.add( obs_reference )
            if obs_reference in references_by_sample[sample_name]:
                expected_retrieved_refs.add( obs_reference )
        # Uniq sequence for retrieved
        uniq_retrieved = {uniq_id[ref_id] for ref_id in retrieved_refs}
        # Uniq sequence for expected retrieved
        uniq_expected_retrieved = {uniq_id_by_sample[sample_name][ref_id] for ref_id in expected_retrieved_refs}
        # Results
        counts_by_sample[sample_name] = {
            "detected": nb_detected,
            "retrieved": len(uniq_retrieved),
            "expected_retrieved": len(uniq_expected_retrieved)
        }
    return counts_by_sample
示例15: aff_to_metadata
def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_files=None):
    """
    @summary: Add taxonomy metadata on biom file from a blast result and/or a RDPClassifier result.
    @param reference_file: [str] The path to the reference file.
    @param biom_in: [str] The path to the Biom file to process.
    @param biom_out: [str] The path to the biom output file.
    @param blast_files: [list] the list of the path to the blast results in tabular format (outfmt 6 with NCBI Blast+).
    @param rdp_files: [list] the list of path to the RDPClassifier results.
    @note: When blast_files is provided every observation receives the
           metadata 'blast_affiliations' and 'blast_taxonomy' (an empty list
           and None if the observation has no match). When rdp_files is
           provided every observation receives 'rdp_taxonomy' and
           'rdp_bootstrap' (None if the observation has no result).
    """
    # Build an hash with the taxonomy for each gene (key=gene_id ; value=gene_taxonomy)
    taxonomy_by_reference = get_tax_from_fasta( reference_file )

    # Retrieve blast clusters annotations
    cluster_blast_annot = dict()
    if blast_files is not None:
        cluster_blast_annot = get_bests_blast_affi( blast_files, taxonomy_by_reference )
    del taxonomy_by_reference # No longer needed once the affiliations are built

    # Retrieve rdp clusters annotations
    cluster_rdp_annot = dict()
    if rdp_files is not None:
        cluster_rdp_annot = get_rdp_affi( rdp_files )

    # Add metadata to biom
    biom = BiomIO.from_json(biom_in)
    for cluster in biom.get_observations():
        cluster_id = cluster["id"]
        # Blast
        if blast_files is not None:
            blast_taxonomy = None
            blast_affiliations = list()
            if cluster_id in cluster_blast_annot: # Current observation has a match
                alignments = cluster_blast_annot[cluster_id]['alignments']
                blast_taxonomy = get_tax_consensus( [alignment['taxonomy'] for alignment in alignments] )
                blast_affiliations = alignments
            biom.add_metadata( cluster_id, "blast_affiliations", blast_affiliations, "observation" )
            biom.add_metadata( cluster_id, "blast_taxonomy", blast_taxonomy, "observation" )
        # RDP
        if rdp_files is not None:
            rdp_taxonomy = None
            rdp_bootstrap = None
            if cluster_id in cluster_rdp_annot:
                rdp_taxonomy = cluster_rdp_annot[cluster_id]['taxonomy']
                rdp_bootstrap = cluster_rdp_annot[cluster_id]['bootstrap']
            biom.add_metadata(cluster_id, "rdp_taxonomy", rdp_taxonomy, "observation")
            biom.add_metadata(cluster_id, "rdp_bootstrap", rdp_bootstrap, "observation")
    BiomIO.write(biom_out, biom)