本文整理汇总了Python中qiime.format.format_distance_matrix函数的典型用法代码示例。如果您正苦于以下问题:Python format_distance_matrix函数的具体用法?Python format_distance_matrix怎么用?Python format_distance_matrix使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了format_distance_matrix函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
def main():
    """Make two distance matrices compatible and write them back out.

    Reads ``opts.input_dms`` and ``opts.output_dms`` (each a comma-separated
    pair of filepaths), parses both input matrices, passes them through
    ``make_compatible_distance_matrices`` (per the original author's note this
    keeps the intersection of the two), and writes the two resulting matrices
    to the corresponding output filepaths.

    Exits through ``option_parser.error`` unless exactly two input and two
    output filepaths are supplied.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dm_fps = opts.input_dms.split(',')
    output_dm_fps = opts.output_dms.split(',')
    if len(input_dm_fps) != 2 or len(output_dm_fps) != 2:
        option_parser.error("You must provide exactly two input and output "
                            "distance matrix filepaths.")

    # Parse each input exactly once and close its handle right away; the
    # previous version parsed both files twice and never closed any of the
    # four handles it opened.
    parsed_dms = []
    for input_fp in input_dm_fps:
        with open(input_fp, 'U') as input_f:
            parsed_dms.append(parse_distmat(input_f))

    (dm1_labels, dm1), (dm2_labels, dm2) = make_compatible_distance_matrices(
        parsed_dms[0], parsed_dms[1])

    # Internal sanity check on the helper's result, not input validation.
    assert (dm1_labels == dm2_labels), "The order of sample IDs is not the " +\
        "same for the two matrices."

    # Both outputs are opened before either is written, as before.
    with open(output_dm_fps[0], 'w') as output1_f, \
            open(output_dm_fps[1], 'w') as output2_f:
        output1_f.write(format_distance_matrix(dm1_labels, dm1))
        output2_f.write(format_distance_matrix(dm2_labels, dm2))
示例2: calc_shared_phylotypes
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count the phylotypes shared by every pair of samples.

    infile: OTU table filehandle, handed to parse_otu_table
    reference_sample: optional sample name. When set, each cell holds the
        value computed for the pair of samples together with the reference
        sample (useful e.g. when the reference is the Donor in a transplant
        study). Raises ValueError if the name is not among the sample IDs.

    Returns the symmetric count matrix as a formatted distance-matrix string
    with a trailing newline.
    """
    sample_ids, otu_ids, otu_table, lineages = parse_otu_table(infile)

    if reference_sample:
        ref_idx = sample_ids.index(reference_sample)

    # The second axis of the table is used as the sample axis.
    _, num_samples = otu_table.shape
    shared_counts = zeros((num_samples, num_samples), dtype=int)

    # Only the lower triangle (diagonal included) is computed; every value
    # is mirrored into the upper triangle.
    for row in range(num_samples):
        for col in range(row + 1):
            if reference_sample:
                count = _calc_shared_phylotypes_multiple(
                    otu_table, [row, col, ref_idx])
            else:
                count = _calc_shared_phylotypes_pairwise(otu_table, row, col)
            shared_counts[row, col] = count
            shared_counts[col, row] = count

    return format_distance_matrix(sample_ids, shared_counts) + "\n"
示例3: distance_matrix
def distance_matrix(input_path, column):
    """Build a distance matrix from one numeric column of a mapping file.

    inputs:
        input_path (file handler): mapping file, handed to
            parse_mapping_file_to_dict
        column (str): name of the column to use

    Each cell is the absolute difference between the column values of two
    samples. A message is written to stderr followed by exit(1) when the
    column is missing for a sample or holds a non-numeric value.

    Returns the matrix as a formatted distance-matrix string.
    """
    mapping, comments = parse_mapping_file_to_dict(input_path)

    values = []
    sample_ids = []
    for sample_id in mapping:
        sample_fields = mapping[sample_id]
        if column not in sample_fields:
            stderr.write("\n\nNo column: '%s' in the mapping file. Existing columns are: %s\n\n" % (column, sample_fields.keys()))
            exit(1)
        try:
            values.append(float(sample_fields[column]))
        except ValueError:
            stderr.write("\n\nall the values in the column '%s' must be numeric but '%s' has '%s'\n\n"
                         % (column, sample_id, sample_fields[column]))
            exit(1)
        sample_ids.append(sample_id)

    # Subtract the values laid out as a column from the flat vector so the
    # difference broadcasts to a full square matrix.
    vector = array(values)
    as_column = reshape(vector, (1, len(vector))).T
    pairwise_diffs = abs(vector - as_column)

    return format_distance_matrix(sample_ids, pairwise_diffs)
示例4: assemble_distance_matrix
def assemble_distance_matrix(dm_components):
    """Assemble distance matrix components into a complete dm string.

    dm_components: iterable of components, each an iterable of lines. Within
        a component the first non-blank line supplies the column IDs and
        every later non-blank line is a data row: a row ID followed by one
        value per column. Fields are separated by any whitespace.

    Returns the result of format_distance_matrix for the combined rows. The
    row IDs serve as both row and column labels, so a KeyError is raised if
    a row has no value for one of them. Cell values are passed through as
    the strings read from the input.
    """
    data = {}
    for component in dm_components:
        # Column IDs are per component, so reset them for each one.
        col_ids = []
        for line in component:
            fields = line.strip().split()
            if not fields:
                continue
            if not col_ids:
                # First non-blank line of the component: the column IDs.
                col_ids = fields
            else:
                # A row seen again in a later component replaces the
                # earlier one.
                data[fields[0]] = dict(zip(col_ids, fields[1:]))

    # Materialise the IDs once so rows and columns share one ordering.
    labels = list(data)
    dm = [[data[row_id][col_id] for col_id in labels] for row_id in labels]
    return format_distance_matrix(labels, dm)
示例5: test_format_distance_matrix
def test_format_distance_matrix(self):
    """format_distance_matrix emits a tab-delimited matrix with labels"""
    matrix = array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ids = [11, 22, 33]
    expected = "\t11\t22\t33\n11\t1\t2\t3\n22\t4\t5\t6\n33\t7\t8\t9"

    self.assertEqual(format_distance_matrix(ids, matrix), expected)

    # Fewer labels than matrix rows must be rejected.
    self.assertRaises(ValueError, format_distance_matrix, ids[:2], matrix)
示例6: calc_shared_phylotypes
def calc_shared_phylotypes(infile, reference_sample=None):
    """Count the phylotypes shared by every pair of samples in a BIOM table.

    infile: OTU table filehandle, handed to parse_biom_table
    reference_sample: optional sample name. When set, each cell holds the
        value computed for the pair of samples together with the reference
        sample (useful e.g. when the reference is the Donor in a transplant
        study).

    Returns the symmetric count matrix as a formatted distance-matrix string
    with a trailing newline.
    """
    otu_table = parse_biom_table(infile)

    if reference_sample:
        # Samples are addressed by ID here, so the reference name is passed
        # to the helper unchanged rather than converted to an index.
        ref_idx = reference_sample

    num_samples = len(otu_table.SampleIds)
    shared_counts = zeros((num_samples, num_samples), dtype=int)

    # Only the lower triangle (diagonal included) is computed; every value
    # is mirrored into the upper triangle.
    for row, row_sample in enumerate(otu_table.SampleIds):
        for col, col_sample in enumerate(otu_table.SampleIds[:row + 1]):
            if reference_sample:
                count = _calc_shared_phylotypes_multiple(
                    otu_table, [row_sample, col_sample, ref_idx])
            else:
                count = _calc_shared_phylotypes_pairwise(
                    otu_table, row_sample, col_sample)
            shared_counts[row, col] = count
            shared_counts[col, row] = count

    return format_distance_matrix(otu_table.SampleIds, shared_counts) + "\n"
示例7: main
def main():
    """Write a distance matrix computed from mapping-file metadata.

    ``opts.column`` names either a single column, whose values are passed to
    ``compute_distance_matrix_from_metadata``, or a comma-separated pair of
    columns (the first is used as latitude, the second as longitude), whose
    values are passed to ``calculate_dist_vincenty``. The formatted matrix is
    written to ``opts.output_fp``.

    Raises ValueError when a requested column is missing for a sample, when
    a value is not numeric, or when the comma-separated option does not name
    exactly two columns.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    data, comments = parse_mapping_file_to_dict(opts.input_path)
    column_headers = []

    if ',' not in opts.column:
        column_data = []
        column_name = opts.column
        for sample_id in data:
            if column_name not in data[sample_id]:
                raise ValueError(
                    "No column: '%s' in the mapping file. Existing columns are: %s" %
                    (column_name, data[sample_id].keys()))
            try:
                column_data.append(float(data[sample_id][column_name]))
            except ValueError:
                raise ValueError(
                    "All the values in the column '%s' must be numeric but '%s' has '%s'" %
                    (column_name, sample_id, data[sample_id][column_name]))
            column_headers.append(sample_id)
        dtx_mtx = compute_distance_matrix_from_metadata(column_data)
    else:
        latitudes = []
        longitudes = []
        try:
            latitude, longitude = opts.column.split(',')
        except ValueError:
            raise ValueError(
                "This script accepts a maximum of 2 colums separated by comma and you passed: %s" %
                (opts.column))
        for sample_id in data:
            if (latitude not in data[sample_id] or
                    longitude not in data[sample_id]):
                raise ValueError(
                    "One of these columns or both do not exist: '%s' or '%s' in the mapping file. Existing columns are: %s" %
                    (latitude, longitude, data[sample_id].keys()))
            try:
                latitudes.append(float(data[sample_id][latitude]))
                longitudes.append(float(data[sample_id][longitude]))
            except ValueError:
                # The old handler read ``column_name``, which is never
                # assigned in this branch, so it failed with NameError
                # instead of this message. Report both offending values.
                raise ValueError(
                    "All the values in the columns '%s' & '%s' must be numeric but '%s' has '%s' & '%s'" %
                    (latitude, longitude, sample_id,
                     data[sample_id][latitude], data[sample_id][longitude]))
            column_headers.append(sample_id)
        dtx_mtx = calculate_dist_vincenty(latitudes, longitudes)

    dtx_txt = format_distance_matrix(column_headers, dtx_mtx)

    with open(opts.output_fp, 'w') as out_f:
        out_f.write(dtx_txt)
示例8: main
def main():
    """Write a copy of a distance matrix with its labels shuffled.

    Parses ``opts.input_distance_matrix``, shuffles the label list in place
    and writes the shuffled labels together with the untouched matrix data
    to ``opts.output_distance_matrix``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Context managers close both handles even on error; the input handle
    # was previously never closed at all.
    with open(opts.input_distance_matrix, 'U') as input_f:
        labels, dm_data = parse_distmat(input_f)

    # Only the labels are permuted; the data keeps its original order.
    shuffle(labels)

    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(format_distance_matrix(labels, dm_data))
示例9: compute_procrustes
def compute_procrustes(result_tables, expected_pc_lookup, taxonomy_level=6, num_dimensions=3, random_trials=999):
    """ Compute Procrustes M2 and p-values for a set of results

     result_tables: 2d list of tables to be compared to expected tables,
      where the data in the inner list is:
       [dataset_id, reference_database_id, method_id,
        parameter_combination_id, table_fp]
     expected_pc_lookup: 2d dict of dataset_id, reference_db_id to principal
      coordinate matrix filepaths, for the expected result coordinate
      matrices
     taxonomy_level: level to compute results
     num_dimensions: passed to procrustes_monte_carlo as max_dimensions
     random_trials: passed to procrustes_monte_carlo as trials

     This is a generator. For every entry of result_tables it yields
      (dataset_id, reference_id, method_id, params, actual_m_squared,
       mc_p_value)

     Raises KeyError if no expected table is registered for an entry's
      (dataset_id, reference_id) and ValueError if an actual BIOM table
      cannot be parsed.
    """
    ### Start code copied ALMOST* directly from compute_prfs - some re-factoring for re-use is
    ### in order here. *ALMOST refers to changes to parser and variable names since expected
    ### is a pc matrix here.
    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        ## look up the expected table (unless taxonomy_level is specified, this should be
        ## collapsed on level 6 taxonomy)
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            # Call form of raise: valid on Python 2 and Python 3.
            raise KeyError("Can't find expected table for (%s, %s)." % (dataset_id, reference_id))

        ## parse the actual table and collapse it at the specified taxonomic level
        try:
            with open(actual_table_fp, "U") as actual_table_f:
                actual_table = parse_biom_table(actual_table_f)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(collapse_by_taxonomy)
        ### End code copied directly from compute_prfs.

        # Notes from the original author on why the data is shuffled through
        # strings and lists of lines below:
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa returns a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file" multiple
        #     times, so the pcs that get passed in must be lists of lines
        dm = dist_bray_curtis(asarray(list(actual_table.iterSampleData())))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")

        # Read the expected PCs into a list and close the handle right away.
        with open(expected_pc_fp, "U") as expected_pc_f:
            expected_pc = list(expected_pc_f)

        ## run Procrustes analysis with monte carlo simulation
        actual_m_squared, trial_m_squareds, count_better, mc_p_value = procrustes_monte_carlo(
            expected_pc,
            actual_pc,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=None,
            trial_output_dir=None,
        )

        yield (dataset_id, reference_id, method_id, params, actual_m_squared, mc_p_value)
示例10: main
def main():
    """Write element-wise statistics over a directory of distance matrices.

    Parses every file in ``opts.input_dir`` whose name does not start with
    '.', passes the parsed headers and matrices to ``matrix_stats``, and
    writes the returned means, medians and standard deviations to
    ``means.txt``, ``medians.txt`` and ``stdevs.txt`` in ``opts.output_dir``.
    The output directory is created if it does not exist.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input: skip hidden files
    file_names = [fname for fname in os.listdir(indir)
                  if not fname.startswith('.')]
    distmats = []
    headers_list = []
    for fname in file_names:
        # Close the handle even if parsing fails.
        with open(os.path.join(indir, fname), 'U') as dm_f:
            headers, data = parse_distmat(dm_f)
        distmats.append(data)
        headers_list.append(headers)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: one file per statistic, written in the same order as before
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, matrix in outputs:
        with open(os.path.join(outdir, out_name), 'w') as out_f:
            out_f.write(format_distance_matrix(headers, matrix))
示例11: test_single_file_nj
def test_single_file_nj(self):
    """single_file_nj runs cleanly and leaves its output path in place"""
    sample_names = ["hi", "ho", "yo"]
    distances = numpy.array(
        [[0, 0.5, 0.3],
         [0.5, 0.0, 0.9],
         [0.3, 0.9, 0.0]])

    # Write the input distance matrix to a temporary file.
    dm_path = get_tmp_filename(prefix="nj_", suffix=".txt")
    dm_file = open(dm_path, "w")
    self._paths_to_clean_up.append(dm_path)
    dm_text = format_distance_matrix(sample_names, distances)
    dm_file.write(dm_text)
    dm_file.close()

    # Pick a second temporary path for the result.
    result_path = get_tmp_filename(prefix="nj_", suffix=".txt", result_constructor=str)
    self._paths_to_clean_up.append(result_path)

    single_file_nj(dm_path, result_path)
    assert os.path.exists(result_path)
示例12: test_single_file_nj
def test_single_file_nj(self):
    """single_file_nj completes and its output path exists afterwards"""
    ids = ['hi', 'ho', 'yo']
    matrix = numpy.array([[0, .5, .3],
                          [.5, 0., .9],
                          [.3, .9, 0.]])

    # Input: a three-sample distance matrix in a temporary file.
    input_fp = get_tmp_filename(prefix='nj_', suffix='.txt')
    input_f = open(input_fp, 'w')
    self._paths_to_clean_up.append(input_fp)
    input_f.write(format_distance_matrix(ids, matrix))
    input_f.close()

    # Output: a second temporary path, requested as a plain str.
    output_fp = get_tmp_filename(prefix='nj_',
                                 suffix='.txt',
                                 result_constructor=str)
    self._paths_to_clean_up.append(output_fp)

    single_file_nj(input_fp, output_fp)
    assert(os.path.exists(output_fp))
示例13: test_single_file_upgma
def test_single_file_upgma(self):
    """single_file_upgma completes and its output path exists afterwards"""
    ids = ['hi', 'ho']
    matrix = numpy.array([[0, .5],
                          [.5, 0.]])

    # Input: mkstemp hands back an open descriptor, which is closed at once
    # so the path can be reopened as an ordinary file.
    input_fd, input_fp = mkstemp(prefix='upgma_', suffix='.txt')
    close(input_fd)
    input_f = open(input_fp, 'w')
    self._paths_to_clean_up.append(input_fp)
    input_f.write(format_distance_matrix(ids, matrix))
    input_f.close()

    # Output: a second temporary path for the result.
    output_fd, output_fp = mkstemp(prefix='upgma_', suffix='.txt')
    close(output_fd)
    self._paths_to_clean_up.append(output_fp)

    single_file_upgma(input_fp, output_fp)
    assert(os.path.exists(output_fp))
示例14: test_single_file_upgma
def test_single_file_upgma(self):
    """single_file_upgma runs cleanly and leaves its output path in place"""
    sample_names = ["hi", "ho"]
    distances = numpy.array(
        [[0, 0.5],
         [0.5, 0.0]])

    # Write the input distance matrix to a temporary file; the descriptor
    # returned by mkstemp is closed before the path is reopened.
    dm_fd, dm_path = mkstemp(prefix="upgma_", suffix=".txt")
    close(dm_fd)
    dm_file = open(dm_path, "w")
    self._paths_to_clean_up.append(dm_path)
    dm_text = format_distance_matrix(sample_names, distances)
    dm_file.write(dm_text)
    dm_file.close()

    # Reserve a second temporary path for the result.
    result_fd, result_path = mkstemp(prefix="upgma_", suffix=".txt")
    close(result_fd)
    self._paths_to_clean_up.append(result_path)

    single_file_upgma(dm_path, result_path)
    assert os.path.exists(result_path)
示例15: filter_samples_from_distance_matrix
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """ Remove specified samples from distance matrix

        dm: (sample_ids, dm_data) tuple, as returned from
         qiime.parse.parse_distmat; or a file handle (or other iterable of
         lines) that can be passed to qiime.parse.parse_distmat
        samples_to_discard: iterable of strings; only the first
         whitespace-delimited token of each entry is used as the sample id
        negate: if True, keep only the listed samples instead of removing
         them

        Returns the filtered matrix as a formatted distance-matrix string.
    """
    if hasattr(dm, 'read'):
        # A real file handle must not be probed by unpacking: unpacking
        # iterates it and consumes its leading lines before the parser
        # gets to see them.
        sample_ids, dm_data = parse_distmat(dm)
    else:
        try:
            sample_ids, dm_data = dm
        except ValueError:
            # input was provided as something parse_distmat can read
            # (e.g. a list of lines)
            sample_ids, dm_data = parse_distmat(dm)

    listed_ids = set(e.split()[0] for e in samples_to_discard)

    # Decide once per sample; the same flags filter rows and then columns.
    if negate:
        keep_flags = [s in listed_ids for s in sample_ids]
    else:
        keep_flags = [s not in listed_ids for s in sample_ids]

    new_sample_ids = [s for s, keep in zip(sample_ids, keep_flags) if keep]

    # Drop the unwanted rows, transpose, drop the unwanted columns (now
    # rows), and transpose back.
    kept_rows = [row for row, keep in zip(dm_data, keep_flags) if keep]
    transposed = array(kept_rows).transpose()
    kept_cols = [col for col, keep in zip(transposed, keep_flags) if keep]
    new_dm_data = array(kept_cols).transpose()

    return format_distance_matrix(new_sample_ids, new_dm_data)