This article collects typical usage examples of the Python function sklearn.metrics.pairwise.linear_kernel. If you are unsure what linear_kernel does, how to call it, or what real-world code that uses it looks like, the curated examples below should help.
In total, 15 code examples of the linear_kernel function are shown below, ordered by popularity by default.
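For orientation, here is a minimal sketch (with made-up toy documents, not drawn from the projects below) of the pattern most of these examples follow: linear_kernel(X, Y) computes the matrix of dot products between the rows of X and Y, and because TfidfVectorizer L2-normalizes its rows by default, applying linear_kernel to tf-idf matrices yields cosine similarities while staying efficient on sparse input.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# purely illustrative toy corpus
docs = ["the sky is blue", "the sun is bright", "the sun in the sky is bright"]
tfidf = TfidfVectorizer().fit_transform(docs)   # rows are L2-normalized by default
sim = linear_kernel(tfidf, tfidf)               # dot products == cosine similarities here
print(sim.round(3))                             # 3x3 symmetric similarity matrix

Most of the examples below then rank neighbours by calling argsort() on a row of such a similarity matrix.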
Example 1: pilot_test
def pilot_test():
    """Compare each sampled user's per-image fc8 vectors against user-level tf-idf profiles built from summed fc8 vectors."""
    users_vectors = []
    vectorsums = []
    for i, user in enumerate(sample_users):
        df = pd.read_pickle('./fc8_100imgs_{}.pkl'.format(user))
        users_vectors.append(df)
        vectorsums.append(df.fc8.values.sum())
    corpus = []
    for vector in vectorsums:
        corpus.append(vector_to_document(vector))
    tfidf = TfidfVectorizer()
    tfidf_vectorized = tfidf.fit_transform(corpus)
    cosine_similarities = linear_kernel(tfidf_vectorized, tfidf_vectorized)
    new_docs = []
    for i, user in enumerate(sample_users):
        for j, img_vec in enumerate(users_vectors[i].fc8):
            doc = vector_to_document(img_vec)
            new_docs.append(doc)
            # vectorized = tfidf.transform([doc])
            # sims = linear_kernel(vectorized, tfidf_vectorized)[0]
            # most_sims = np.argsort(sims)[::-1]
            # print('{} img {} most similar to \n{}'.format(
            #     user, j, [(sample_users[i], sims[i]) for i in most_sims]))
    new_docs_vectorized = tfidf.transform(new_docs)
    cosine_similarities = linear_kernel(new_docs_vectorized, tfidf_vectorized)
    for sim in cosine_similarities:
        print('top score: {} top user: {}'.format(sim.max(), sample_users[np.argmax(sim)]))
Example 2: plot_hist_d_to_centroid
def plot_hist_d_to_centroid(self, min_w=0):
    '''
    Histograms of similarity to centroids: overall vs. each cluster.
    '''
    self.assign_cluster(min_w)
    self.cal_centroid()
    n_clusters = np.max(self.clusters)
    X2_dense = self.X2.todense()
    centroid_overall = np.mean(X2_dense, axis=0)
    sim = linear_kernel(centroid_overall, X2_dense)
    max_sim = np.max(sim)
    min_sim = np.min(sim)
    # one subplot for the overall histogram plus one per cluster
    ncols = 3
    nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
    fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
    axs = ax.flatten()
    i_plot = 0
    axs[i_plot].hist(sim.flatten(), alpha=0.2)
    axs[i_plot].set_xlim(min_sim, max_sim)
    i_plot += 1
    for i in range(n_clusters):
        cond = self.clusters == i
        arr = X2_dense[cond]
        sim = linear_kernel(self.centroids[i], arr)
        axs[i_plot].hist(sim.flatten(), alpha=0.2)
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot += 1
    fig.savefig(self.model_name + '_hist_dis_to_centroid.png')
    plt.close(fig)
Example 3: _build_similarity_matrix
def _build_similarity_matrix(self):
    """
    Partitioned similarity matrix ('s' for source nodes and 't' for target nodes):
        S = [[S_ss, S_st],
             [S_ts, S_tt]]
    """
    normalize(self.source_features, norm='l2', copy=False)
    normalize(self.target_features, norm='l2', copy=False)
    self.ss = linear_kernel(self.source_features)
    self.st = linear_kernel(self.source_features, self.target_features)
    self.ts = self.st.T
    self.tt = linear_kernel(self.target_features)
Example 4: plot_hist_d_to_centroid
def plot_hist_d_to_centroid(self, min_w=0):
    '''
    Plot histograms of similarity to centroid, overall vs. per cluster.
    - INPUT: self.X2
    '''
    self.assign_cluster(min_w)
    self.cal_centroid()
    n_clusters = np.max(self.clusters)
    # one subplot for the overall histogram plus one per cluster
    ncols = 3
    nrows = (n_clusters + 1) // ncols + (((n_clusters + 1) % ncols) > 0)
    fig, ax = plt.subplots(nrows, ncols, figsize=(30, 10))
    axs = ax.flatten()
    centroid_overall = np.mean(self.X2, axis=0)
    sim = linear_kernel(centroid_overall, self.X2)
    max_sim = np.max(sim)
    min_sim = np.min(sim)
    print('sim shape: %s X shape: %s centroid_overall shape: %s' % (sim.shape, self.X2.shape, centroid_overall.shape))
    print('min %.2f max %.2f' % (min_sim, max_sim))
    print(sorted(sim.flatten(), reverse=True)[:5])
    print(sorted(centroid_overall.getA().flatten(), reverse=True)[:5])
    max_sim = 1
    min_sim = 0
    i_plot = 0
    axs[i_plot].hist(sim.flatten(), alpha=0.2)
    axs[i_plot].set_xlim(min_sim, max_sim)
    i_plot += 1
    for i in range(n_clusters + 1):
        cond = self.clusters == i
        arr = self.X2[cond]
        sim = linear_kernel(self.centroids[i], arr)
        print('sim shape: %s arr shape: %s centroid shape: %s' % (sim.shape, arr.shape, self.centroids[i].shape))
        print(sorted(sim.flatten(), reverse=True)[:5])
        print(sorted(self.centroids[i].flatten(), reverse=True)[:5])
        axs[i_plot].hist(sim.flatten(), alpha=0.2)
        axs[i_plot].set_xlim(min_sim, max_sim)
        i_plot += 1
    plt.show()
    fig.savefig(self.model_name + '_hist_dis_to_centroid.png')
    plt.close(fig)
Example 5: main
def main():
    twenty = fetch_20newsgroups()
    tfidf = TfidfVectorizer().fit_transform(twenty.data)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print(related_docs_indices)
    print(cosine_similarities[related_docs_indices])
    # vectorizer = CountVectorizer(min_df=1)
    # corpus = [
    #     'This is the first document.',
    #     'This is the second second document.',
    #     'And the third one.',
    #     'Is this the first document?',
    # ]
    # tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    # tfs = tfidf.fit_transform(token_dict.values())
    train_set = ("The sky is blue.", "The sun is bright.")
    test_set = ("The sun in the sky is bright.",
                "We can see the shining sun, the bright sun.")
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit_transform(train_set)
    print("Vocabulary:", count_vectorizer.vocabulary_)
    # vocabulary_ maps each term to a column index, e.g. {'blue': 0, 'bright': 1, ...}
    freq_term_matrix = count_vectorizer.transform(test_set)
    print(freq_term_matrix.todense())
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    print("IDF:", tfidf.idf_)
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    print(tf_idf_matrix.todense())
Example 6: __asyncable_similarity
def __asyncable_similarity(tup):
    # tup unpacks as:
    #   bs: beer-similarity object used for db commits
    #   b_refs: beer ids for the reference vectors (all from one style)
    #   X_t_ref: tf-idf rows for the reference beers
    #   b_comps, X_t_comp: beer ids and tf-idf rows for beers in other styles
    #   top: number of highest similarities to keep per beer
    bs, b_refs, X_t_ref, b_comps, X_t_comp, top = tup
    start = dt.now()
    print("Beer ct %s vs ct %s: Compute Similarity" % (len(b_refs), len(b_comps)))
    try:
        for i in range(len(b_refs)):
            # compute similarity between b_refs[i] and all comparison beers
            lk = linear_kernel(X_t_ref.getrow(i), X_t_comp).flatten()
            # keep only the `top` largest similarities
            n = len(lk)
            kp = min(top, n)
            m_ixs = lk.argsort()[-kp:]
            sims = [(b_refs[i], b_comps[j], lk[j]) for j in m_ixs if b_refs[i] != b_comps[j]]
            # bs.smooth_similarity(sims)
            bs.add_many(sims)
        print("Comparison Complete: %s" % (dt.now() - start))
        return (b_refs, None)
    except Exception as e:
        return (b_refs, e)
Example 7: __kernel_definition__
def __kernel_definition__(self):
    if self.Kf == 'rbf':
        return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
    if self.Kf == 'poly':
        return lambda X, Y: polynomial_kernel(X, Y, degree=self.poly_deg, gamma=None, coef0=self.poly_coeff)
    if self.Kf is None or self.Kf == 'linear':
        return lambda X, Y: linear_kernel(X, Y)
Example 8: thread_diag_block
def thread_diag_block(top_nbrs, dataM, job_ranges, r_offset, c_offset,
                      n_nbr=100, verbose=False):
    '''Keep, for each row, a min-heap of (cos, idx) pairs holding its top n_nbr neighbours.
    Note that in a min-heap the first element is the smallest.
    '''
    for job_bd in job_ranges:
        crossV = linear_kernel(dataM[job_bd[0]:job_bd[1], :], dataM)
        n_doc1, n_doc2 = crossV.shape
        for i_doc in range(n_doc1):
            i_offset = i_doc + job_bd[0] + r_offset
            L = top_nbrs[i_offset]
            for j in range(n_doc2):
                if i_offset == j + c_offset:
                    continue  # skip self-similarity on the diagonal
                if len(L) < n_nbr:
                    heapq.heappush(L, (crossV[i_doc, j], j + c_offset))
                elif crossV[i_doc, j] > L[0][0]:
                    heapq.heapreplace(L, (crossV[i_doc, j], j + c_offset))
            top_nbrs[i_offset] = L
        if verbose:
            print('process range (%d,%d)' % (job_bd[0], job_bd[1]))
Example 9: get_related_news
def get_related_news(articles, base_art_index):
    if related_dict.get(base_art_index) is not None:
        return related_dict.get(base_art_index)
    corpus = []
    for art in articles:
        corpus.append(' '.join(jieba.cut(art.context)))
    ls = [w for w in WordCutLibs.stopwords.split('\n')]
    vectorizer = CountVectorizer(stop_words=ls)
    X = vectorizer.fit_transform(corpus)
    #word = vectorizer.get_feature_names()
    #stopword = vectorizer.get_stop_words()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    #weight = tfidf.toarray()
    target = base_art_index  # target article index (row order matches the SQL query)
    cosine_similarities = linear_kernel(tfidf[target], tfidf).flatten().argsort()
    max_len = len(cosine_similarities)
    bnd = -11 if max_len >= 10 else -(max_len)
    related_docs_indices = cosine_similarities[:bnd:-1]
    res = [articles[idx] for idx in related_docs_indices]
    related_dict[base_art_index] = res
    return res
Example 10: print_most_cos_sim
def print_most_cos_sim(self, thresh=0.675):
    '''
    Prints the two posts that have the highest cosine similarity.
    '''
    cos_sims = linear_kernel(self.word_vecs, self.word_vecs)
    # Start from max_cos_sim = 0 and only consider similarities below the
    # threshold, so a post compared with itself (1.0) is never recorded.
    max_cos_sim = 0.0
    thr = thresh
    # Find max_cos_sim
    for i, j in enumerate(cos_sims):
        for k, l in enumerate(j):
            if (float(l) >= max_cos_sim) and (float(l) < thr):
                max_cos_sim = float(l)
    # Find indices of max_cos_sim
    double_break = False
    for i, j in enumerate(cos_sims):
        for k, l in enumerate(j):
            if float(l) == max_cos_sim:
                ind1, ind2 = i, k
                double_break = True
                break
        if double_break:
            break
    print('Posts with highest cosine similarity ({:.3f}):\n\nPost {}:\n{}'
          '\n\nPost {}:\n{}'.format(max_cos_sim, ind1, self.posts[ind1],
                                    ind2, self.posts[ind2]))
Example 11: __init__
def __init__(self, *args, **kwargs):
    super(QUIRE, self).__init__(*args, **kwargs)
    self.Uindex = [idx for idx, _ in self.dataset.get_unlabeled_entries()]
    self.Lindex = [idx for idx in range(len(self.dataset)) if idx not in self.Uindex]
    self.lmbda = kwargs.pop("lambda", 1.0)
    X, self.y = zip(*self.dataset.get_entries())
    self.y = list(self.y)
    self.kernel = kwargs.pop("kernel", "rbf")
    if self.kernel == "rbf":
        self.K = rbf_kernel(X=X, Y=X, gamma=kwargs.pop("gamma", 1.0))
    elif self.kernel == "poly":
        self.K = polynomial_kernel(
            X=X, Y=X, coef0=kwargs.pop("coef0", 1),
            degree=kwargs.pop("degree", 3), gamma=kwargs.pop("gamma", 1.0)
        )
    elif self.kernel == "linear":
        self.K = linear_kernel(X=X, Y=X)
    elif hasattr(self.kernel, "__call__"):
        self.K = self.kernel(X=np.array(X), Y=np.array(X))
    else:
        raise NotImplementedError
    if not isinstance(self.K, np.ndarray):
        raise TypeError("K should be an ndarray")
    if self.K.shape != (len(X), len(X)):
        raise ValueError("kernel should have size (%d, %d)" % (len(X), len(X)))
    self.L = np.linalg.inv(self.K + self.lmbda * np.eye(len(X)))
Example 12: get
def get(self):
    query = self.get_argument('q', None)
    if query is None:
        return
    queryTerms = query.split()
    # Say we have N documents and M (assumed unique) terms in the query.
    # queryVector holds the log-IDF weight of each query term.
    queryVector = np.array([self._logIDF[term] for term in queryTerms])
    # docVectorDict maps docID -> M-dimensional tf-idf vector, defaulting to zeros
    # (float dtype so the tf-idf weights are not truncated to integers)
    docVectorDict = defaultdict(lambda: np.zeros(len(queryTerms)))
    for i in range(len(queryTerms)):
        term = queryTerms[i].lower()
        newList = self._postingsList[term]
        for item in newList:  # newList is [(docID, tf)]
            docVectorDict[item[0]][i] = item[1] * self._logIDF[term]
    docMatrix = np.zeros((len(docVectorDict), len(queryTerms)))
    docIx = 0
    docIxToDocID = {}
    for docID in docVectorDict.keys():
        docMatrix[docIx][:] = docVectorDict[docID][:]
        docIxToDocID[docIx] = docID
        docIx += 1
    # linear_kernel computes the dot-product similarity between the query and each document
    sims = linear_kernel(queryVector.reshape(1, -1), docMatrix).flatten()
    # argsort returns indices; reverse for descending similarity
    bestDocIxes = sims.argsort()[::-1]
    bestDocSims = sims[bestDocIxes]
    bestDocIDs = [docIxToDocID[docIx] for docIx in bestDocIxes]
    # cast to built-in float so json can serialize the scores
    postings = [(docID, float(s)) for docID, s in zip(bestDocIDs, bestDocSims)]
    self.write(json.dumps({"postings": postings}))
Example 13: predict
def predict(data, vect, user_list, tweet_list, word_counts):
    vector = vect.transform(data)
    result_matrix = linear_kernel(vector, word_counts)
    indices_of_tweets = []
    # For each tweet by the client, keep the most similar tweets
    # (positions 2-50 of the sorted list; this may still include tweets by the client).
    for row in result_matrix:
        indices = row.argsort()[::-1]
        indices_of_tweets.append(indices[2:51])
    # Look up the person who wrote each of those similar tweets
    user_array = np.array(user_list)
    persons_per_tweet = []
    for row in indices_of_tweets:
        persons_per_tweet.append(user_array[row])
    # Count how many times each person shows up. The same weight is given to
    # people with many tweets similar to one client tweet and to people whose
    # single tweet matches a high number of client tweets.
    persons_counter = Counter()
    for row in persons_per_tweet:
        persons_counter.update(row)
    # Return the top 25 people in this list
    top_people_and_count = persons_counter.most_common(25)
    top_people = [tup[0] for tup in top_people_and_count]
    return top_people
Example 14: getRelevantPassages
def getRelevantPassages(query, k):
    queryVector = allTextVectorizer.transform([query])
    queryIndices = numpy.array([allTextVectorizer.vocabulary_.get(word) for word in allTextAnalyzer(query)])
    queryIndices = [i for i in queryIndices if i is not None]
    querySimilarityScores = linear_kernel(queryVector[:, queryIndices], allTextIndex[:, queryIndices]).flatten()
    relatedDocIndices = querySimilarityScores.argsort()[:-k:-1]
    return [allTextLines[i] for i in relatedDocIndices]
Example 15: get_results
def get_results(query):
    response = tfidf.transform([query])
    print('response: ', response)
    RESULTS_ARRAY = []
    cosine_similarities = linear_kernel(response, tfs).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-10:-1]
    file_keys = list(token_dict.keys())  # dict keys are not indexable in Python 3
    for i in related_docs_indices:
        if cosine_similarities[i] > 0:
            candidate = file_keys[i].split('.')[0]
            file_name = candidate + '.pdf.html.json'
            data = summary_dict[file_name]
            data.update({"candidate": candidate,
                         "cosine": cosine_similarities[i]})
            RESULTS_ARRAY.append(data)
    return RESULTS_ARRAY