本文整理汇总了Python中sklearn.neighbors.NearestNeighbors.get_feature_names方法的典型用法代码示例。如果您正苦于以下问题:Python NearestNeighbors.get_feature_names方法的具体用法?Python NearestNeighbors.get_feature_names怎么用?Python NearestNeighbors.get_feature_names使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.neighbors.NearestNeighbors
的用法示例。
在下文中一共展示了NearestNeighbors.get_feature_names方法的1个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。（注意：NearestNeighbors 类本身并没有 get_feature_names 方法；在下方示例代码中，get_feature_names 实际是由 CountVectorizer 实例调用的。）
示例1: plot
# 需要导入模块: from sklearn.neighbors import NearestNeighbors [as 别名]
# 或者: from sklearn.neighbors.NearestNeighbors import get_feature_names [as 别名]
#.........这里部分代码省略.........
for stratum in strata_proportions:
strata = [title for title in titles if stratum == title.split('_')[0]]
sampling_fraction = strata_proportions[stratum]
local_rand_strat_titles = random.sample(strata, sampling_fraction)
strat_titles.append(local_rand_strat_titles)
strat_titles = sum(strat_titles, [])
strat_authors = [author for author, title in zip(authors, titles) if title in strat_titles]
strat_texts = [text for title, text in zip(titles, texts) if title in strat_titles]
titles = strat_titles
authors = strat_authors
texts = strat_texts
fob_nodes = open(os.path.dirname(os.getcwd()) + "/gephi_nodes.txt", "w")
fob_edges = open(os.path.dirname(os.getcwd()) + "/gephi_edges.txt", "w")
fob_nodes.write("Id" + "\t" + "Work" + "\t" + "Author" + "\n")
fob_edges.write("Source" + "\t" + "Target" + "\t" + "Type" + "\t" + "Weight" + "\n")
# Build up consensus distances of different feature ranges and different metrics
exhsearch_data = []
for n_feats in feat_range:
# print("::: running through feature range {} ::: ".format(str(n_feats)))
tfidf_vectors, tfidf_features = Vectorizer(texts, self.invalid_words,
n_feats=n_feats,
feat_scaling='standard_scaler',
analyzer='word',
vocab=None
).tfidf(smoothing=True)
if n_feats == feat_range[-1]:
pass
# print("FEATURES: ", ", ".join(tfidf_features))
for metric in metric_dictionary:
model = NearestNeighbors(n_neighbors=n_nbrs,
algorithm='brute',
metric=metric_dictionary[metric],
).fit(tfidf_vectors)
distances, indices = model.kneighbors(tfidf_vectors)
# Distances are normalized in order for valid ground for comparison
all_distances = []
for distance_vector in distances:
for value in distance_vector:
if value != 0.0:
all_distances.append(value)
all_distances = np.array(all_distances)
highest_value = all_distances[np.argmin(all_distances)]
lowest_value = all_distances[np.argmax(all_distances)]
normalized_distances = (distances - lowest_value) / (highest_value - lowest_value)
# Distances appended to dataframe
for distance_vec, index_vec in zip(normalized_distances, indices):
data_tup = ('{} feats, {}'.format(str(n_feats), metric_dictionary[metric]),
titles[index_vec[0]],
titles[index_vec[1]], distance_vec[1],
titles[index_vec[2]], distance_vec[2],
titles[index_vec[3]], distance_vec[3])
exhsearch_data.append(data_tup)
# Entire collected dataframe
df = pd.DataFrame(exhsearch_data, columns=['exp', 'node', 'neighbor 1', 'dst 1', 'neighbor 2',
'dst 2', 'neighbor 3', 'dst 3']).sort_values(by='node', ascending=0)
final_data = []
weights= []
node_orientation = {title: idx+1 for idx, title in enumerate(titles)}
for idx, (author, title) in enumerate(zip(authors, titles)):
neighbors = []
dsts = []
# Pool all neighbors and distances together (ignore ranking of nb1, nb2, etc.)
for num in range(1, n_nbrs):
neighbors.append([neighb for neighb in df[df['node']==title]['neighbor {}'.format(str(num))]])
dsts.append([neighb for neighb in df[df['node']==title]['dst {}'.format(str(num))]])
neighbors = sum(neighbors, [])
dsts = sum(dsts, [])
# Token pattern in order for hyphenated title names not to become split up
pattern = "(?u)\\b[\\w-]+\\b"
model = CountVectorizer(lowercase=False, token_pattern=pattern)
count_dict = model.fit_transform(neighbors)
# Collect all the candidates per sample that were chosen by the algorithm as nearest neighbor at least once
candidate_dict = {neighbor: [] for neighbor in model.get_feature_names()}
for nbr, dst in zip(neighbors, dsts):
candidate_dict[nbr].append(dst)
candidate_dict = {nbr: np.mean(candidate_dict[nbr])*len(candidate_dict[nbr]) for nbr in candidate_dict}
candidate_dict = sorted(candidate_dict.items(), key=lambda x: x[1], reverse=True)
fob_nodes.write(str(idx + 1) + "\t" + str(title.split('_')[-1]) + "\t" + str(author) + "\n")
data_tup = (title,)
for candtitle, weight in candidate_dict[:8]:
data_tup = data_tup + (candtitle, weight,)
weights.append(weight)
fob_edges.write(str(idx+1) + "\t" + str(node_orientation[candtitle]) + "\t" + "Undirected" + "\t" + str(weight) + "\n")
final_data.append(data_tup)
# Prepare column names for dataframe
longest = np.int((len(final_data[np.argmax([len(i) for i in final_data])]) - 1) / 2)
columns = sum([['neighbor {}'.format(str(i)), 'dst {}'.format(str(i))] for i in range(1, longest+1)], [])
columns.insert(0, 'node')
final_df = pd.DataFrame(final_data, columns=columns).sort_values(by='node', ascending=0)