本文整理汇总了Python中sklearn.neighbors.LSHForest类的典型用法代码示例。如果您正苦于以下问题：Python LSHForest类的具体用法？Python LSHForest怎么用？Python LSHForest使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了LSHForest类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_fit
def test_fit():
    """Verify that ``fit`` populates every fitted attribute as expected."""
    sample_count, feature_count, tree_count = 12, 2, 5
    data = np.random.RandomState(42).rand(sample_count, feature_count)

    forest = LSHForest(n_estimators=tree_count)
    forest.fit(data)

    # The training matrix is kept as ``_fit_X``.
    assert_array_equal(data, forest._fit_X)

    # One hash function g(p) per tree, each of hash length 32.
    assert_equal(tree_count, len(forest.hash_functions_))
    first_hash = forest.hash_functions_[0]
    assert_equal(32, first_hash.components_.shape[0])

    # One tree per estimator, each holding an entry for every data point.
    assert_equal(tree_count, len(forest.trees_))
    assert_equal(sample_count, len(forest.trees_[0]))

    # One array of original (pre-sort) indices per tree, again with an
    # entry for every data point.
    assert_equal(tree_count, len(forest.original_indices_))
    assert_equal(sample_count, len(forest.original_indices_[0]))
示例2: single_batch
def single_batch(self, tweets):
    """Return the positions of tweets in this batch that have near neighbours.

    The batch is turned into a bag-of-words matrix (with
    ``self.custom_stop_words`` as stop words), an ``LSHForest`` is fitted on
    that matrix, and each tweet is queried against the forest with a radius
    of ``self.sensitivity``.

    Parameters
    ----------
    tweets : list of str
        Text of the tweets in the batch (text only).

    Returns
    -------
    list of int
        Indices into ``tweets`` whose radius query returned more than two
        neighbours (i.e. the tweets treated as spam). Tweets whose vector
        has two or fewer stored entries are never reported.
    """
    # Vectorize and fit tree:
    vectorizer = CountVectorizer(stop_words=self.custom_stop_words)
    term_counts = vectorizer.fit_transform(tweets)
    forest = LSHForest()
    forest.fit(term_counts)

    working_batch_size = len(tweets)
    flagged = []
    # The fitted matrix already has one row per tweet, so it is iterated
    # directly instead of vectorizing the whole batch a second time.
    for position, row in enumerate(term_counts):
        if position % 100 == 0:
            # Single parenthesised argument: prints the same text under
            # both Python 2 and Python 3.
            print("%r tweets analyzed out of %r for this batch"
                  % (position, working_batch_size))
        # Very short tweets (two or fewer stored terms) are skipped before
        # the comparatively expensive radius query is issued.
        if row.getnnz() <= 2:
            continue
        neighbors = forest.radius_neighbors(row, radius=self.sensitivity)[1]
        if len(neighbors[0]) > 2:
            flagged.append(position)
    return flagged
示例3: __init__
class EmbeddingNetworkBuilder:
    """Thin wrapper around scikit-learn's ``LSHForest``.

    The builder fits the forest on an embedding's matrix (``embedding.m``)
    and extracts a k-nearest-neighbour graph from it.
    """

    def __init__(self, lsh_init=None):
        """Create the builder.

        Parameters
        ----------
        lsh_init : object, optional
            Forest to wrap. When omitted, an
            ``LSHForest(n_estimators=25, n_candidates=1000)`` is created.
        """
        # Identity check: ``== None`` would dispatch to a custom ``__eq__``
        # of whatever object was supplied.
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        self.iw = None
        self.m = None
        # Set by ``fit_lsh_forest``; initialised here so the attribute
        # always exists.
        self._embedding = None

    def fit_lsh_forest(self, embedding):
        """Fit the wrapped forest on ``embedding.m`` and keep the embedding."""
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        """Return the forest's neighbour graph for the fitted embedding.

        ``nn + 1`` neighbours are requested per row.
        NOTE(review): presumably the extra neighbour accounts for each point
        matching itself -- confirm.
        """
        dir_graph_mat = self._lsh_forest.kneighbors_graph(
            X=self._embedding.m, n_neighbors=nn + 1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        """Convert a neighbour matrix into node and edge sets.

        Parameters
        ----------
        dir_graph_mat : 2-D matrix exposing ``shape`` and ``nonzero()``
            Directed graph, e.g. the result of ``extract_nn_network``.

        Returns
        -------
        (set of int, set of (int, int))
            All row indices, and one ``(row, column)`` pair per non-zero
            entry of the matrix.
        """
        # The previous implementation iterated over ``shape[0]`` itself (an
        # int), which raises TypeError; a single ``nonzero()`` call yields
        # the same (row, column) pairs without per-row slicing.
        nodes = set(range(dir_graph_mat.shape[0]))
        rows, cols = dir_graph_mat.nonzero()
        edges = set(zip(rows.tolist(), cols.tolist()))
        return nodes, edges

    def get_forest(self):
        """Return the wrapped forest."""
        return self._lsh_forest

    def get_node_to_word(self):
        """Return ``self.iw`` (``None`` unless assigned externally)."""
        return self.iw
示例4: single_batch
def single_batch(self, tweets):
    """Return the positions of tweets in this batch that have near neighbours.

    The batch is turned into a bag-of-words matrix (with
    ``self.common_twitter_handles`` as stop words), an ``LSHForest`` is
    fitted on that matrix, and each tweet is queried against the forest with
    a fixed radius of 0.4.

    Parameters
    ----------
    tweets : list of str
        Text of the tweets in the batch (text only).

    Returns
    -------
    list of int
        Indices into ``tweets`` whose radius query returned more than two
        neighbours (i.e. the tweets treated as spam).
    """
    # Vectorize and fit tree:
    vectorizer = CountVectorizer(stop_words=self.common_twitter_handles)
    term_counts = vectorizer.fit_transform(tweets)
    forest = LSHForest()
    forest.fit(term_counts)

    flagged = []
    # The fitted matrix already has one row per tweet, so it is iterated
    # directly instead of vectorizing the whole batch a second time.
    for position, row in enumerate(term_counts):
        if position % 100 == 0:
            # Single parenthesised argument: prints the same text under
            # both Python 2 and Python 3.
            print("%r tweets analyzed out of %r for this batch"
                  % (position, self.batch_size))
        neighbors = forest.radius_neighbors(row, radius=.4)[1]
        if len(neighbors[0]) > 2:
            flagged.append(position)
    return flagged
示例5: get_nearest_neighbor_iterable
def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):
    """Yield each start graph together with a close graph from ``graphlist``.

    Both collections are vectorized with ``self.vectorizer.transform_single``
    (on deep copies, so the graphs that are yielded are the caller's own
    objects), an ``LSHForest`` is fitted on the ``graphlist`` vectors and the
    two nearest entries are looked up for every start graph.

    When ``start_is_subset`` is true the second-nearest entry is used
    instead of the nearest one.
    NOTE(review): presumably because the nearest hit would be the start
    graph itself -- confirm.

    Yields
    ------
    tuple
        ``(start_graph, matched_graph, matched_vector)``, ordered by the
        index of the matched graph within ``graphlist``.
    """
    # Vectorize everything.
    graphlist = list(graphlist)
    X = self.vectorizer.transform_single(copy.deepcopy(graphlist))
    start_graphs = list(start_graphs)
    Y = self.vectorizer.transform_single(copy.deepcopy(start_graphs))

    forest = LSHForest()
    forest.fit(X)
    # See http://scikit-learn.org/stable/modules/neighbors.html
    distances, indices = forest.kneighbors(Y, n_neighbors=2)

    # Column 0 is the nearest neighbour, column 1 the runner-up.
    column = 1 if start_is_subset else 0

    # Each match is (X_index, Y_index, distance).
    matches = sorted(
        (indices[row, column], row, distances[row, column])
        for row in range(len(indices))
    )
    for x_index, y_index, _ in matches:
        yield (start_graphs[y_index], graphlist[x_index], X[x_index])
示例6: test_hash_functions
def test_hash_functions():
    """Check that the hash functions are built randomly.

    The variance and the mean of every single hash function (projection
    vector) must differ from those of the array of all hash functions
    taken together. Were the hash functions all seeded with the same
    value, those statistics would coincide.
    """
    sample_count, feature_count, tree_count = 12, 2, 5
    rng = np.random.RandomState(42)
    data = rng.rand(sample_count, feature_count)
    seed = rng.randint(0, np.iinfo(np.int32).max)

    forest = LSHForest(n_estimators=tree_count, random_state=seed)
    forest.fit(data)

    all_components = [
        forest.hash_functions_[tree].components_ for tree in range(tree_count)
    ]

    for tree in range(tree_count):
        own = forest.hash_functions_[tree].components_
        assert_not_equal(np.var(all_components), np.var(own))

    for tree in range(tree_count):
        own = forest.hash_functions_[tree].components_
        assert_not_equal(np.mean(all_components), np.mean(own))
示例7: search_neighbors
def search_neighbors(request):
    """Return the stored designs whose images are closest to a query image.

    Query parameters (read from ``request.GET``):

    * ``num``   -- number of neighbours to return (integer).
    * ``input`` -- file name of the query image inside
      ``settings.DESIGN_PATH``; it must be a bare file name.

    Every design image is loaded, flattened and scaled by 1/255, an
    ``LSHForest`` is fitted on those vectors, and the ``num`` nearest designs
    to the query image are reported.

    Returns
    -------
    JsonResponse
        ``{"results": [...]}`` with one dict (``image``, ``text``, ``like``,
        ``filtered``) per neighbour, or a status-400 response with an
        ``error`` message when the parameters are missing or malformed.

    NOTE(review): the index is rebuilt from all images on every request;
    consider caching it if this endpoint is called frequently.
    """
    import os.path

    try:
        num = int(request.GET['num'])
        input_fname = str(request.GET['input'])
    except (KeyError, ValueError):
        return JsonResponse(
            {"error": "'num' (integer) and 'input' (file name) are required"},
            status=400)

    # ``input`` comes straight from the client and is appended to a
    # directory path: refuse anything that is not a plain file name so it
    # cannot reach outside ``settings.DESIGN_PATH``.
    if not input_fname or os.path.basename(input_fname) != input_fname:
        return JsonResponse({"error": "invalid 'input' file name"}, status=400)

    # Materialise once so results can be indexed with plain list indexing.
    designs = list(Design.objects.all())
    if not designs:
        return JsonResponse({"results": []})

    d_geometry = settings.D_GEOMETRY
    flat_size = d_geometry[0] * d_geometry[1] * 3
    designed_images = np.empty((len(designs), flat_size), dtype="float32")
    for row, design in enumerate(designs):
        image_path = settings.DESIGN_PATH + str(design.uid) + ".png"
        designed_images[row] = img2numpy_arr(image_path).reshape(flat_size)
    designed_images /= 255

    lshf = LSHForest(random_state=42)
    lshf.fit(designed_images)

    input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
    # Float divisor: avoids integer division should the loader return an
    # integer array under Python 2.
    input_image = input_image.reshape(1, -1) / 255.0

    _, indices = lshf.kneighbors(input_image, n_neighbors=num)

    similar_images = []
    # ``tolist()`` yields built-in ints rather than numpy integer scalars.
    for i in indices.ravel().tolist():
        design = designs[i]
        similar_images.append({
            "image": str(design.uid) + ".png",
            "text": str(design.history_text),
            "like": int(design.like),
            "filtered": str(design.filtered)
        })
    return JsonResponse({
        "results": similar_images
    })
示例8: build_index
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    """Fit an ``LSHForest`` on ``data`` and measure how long fitting took.

    Parameters
    ----------
    data : array-like
        Samples passed to ``LSHForest.fit``.
    n_estimators, n_candidates, n_neighbors : int
        Forwarded unchanged to the ``LSHForest`` constructor.
    seed : int
        Forwarded as ``random_state``.

    Returns
    -------
    (LSHForest, number)
        The fitted forest and the difference between the ``time()`` readings
        taken immediately after and before ``fit``.
    """
    forest = LSHForest(
        n_estimators=n_estimators,
        n_candidates=n_candidates,
        n_neighbors=n_neighbors,
        random_state=seed,
    )
    started = time()
    forest.fit(data)
    elapsed = time() - started
    return forest, elapsed
示例9: get_heap_and_forest
def get_heap_and_forest(self, griter, k):
    '''
    Build the scoring heap and the nearest-neighbour forest for some graphs.

    Parameters
    ----------
    griter : iterable of graphs
        The start graphs.
    k : int
        Every heap entry gets the counter ``k + 1`` so that the start graphs
        are not output at the end.

    Returns
    -------
    (heap, forest, avg_dist)
        heap: list maintained by ``heapq`` with ``(score, count, graph)``
        entries, where ``score`` is ``predict_proba(...)[0][1]`` of
        ``self.sampler.estimator`` for the graph.
        forest: ``LSHForest`` fitted on the vectorized graphs.
        avg_dist: distance of the middle graph (by position) to its second
        nearest neighbour.

    NOTE(review): ``avg_dist`` is picked from the *unsorted* distance list,
    so it is neither the mean nor the median -- confirm that is intended.
    NOTE(review): heap entries with equal scores fall back to comparing the
    graph objects themselves -- confirm that graphs are orderable.
    '''
    graphs = list(griter)
    # ``transform`` messes up the graph objects it is given, so untouched
    # copies are kept for the heap.
    graphs2 = copy.deepcopy(graphs)
    X = self.vectorizer.transform(graphs)
    forest = LSHForest()
    forest.fit(X)
    print('got forest')

    heap = []
    for graph in graphs2:
        # Score a throw-away copy so that ``graph`` itself stays untouched.
        graph2 = nx.Graph(graph)
        score = self.sampler.estimator.predict_proba(
            self.sampler.vectorizer.transform_single(graph2))[0][1]
        heapq.heappush(heap, (score, k + 1, graph))
    print('got heap')

    distances, unused = forest.kneighbors(X, n_neighbors=2)
    # The second column should be the distance we want.
    distances = [a[1] for a in distances]
    # Floor division: ``/`` yields a float on Python 3, which cannot be used
    # as a list index.
    avg_dist = distances[len(distances) // 2]
    print('got dist')
    return heap, forest, avg_dist
示例10: test_neighbors_accuracy_with_n_estimators
def test_neighbors_accuracy_with_n_estimators():
    # Accuracy of the approximate search must not drop when the number of
    # estimators grows.
    estimator_counts = np.array([1, 10, 100])
    n_samples, n_features = 100, 10
    n_iter, n_points = 10, 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(estimator_counts.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for slot, tree_count in enumerate(estimator_counts):
        forest = LSHForest(n_candidates=500, n_estimators=tree_count)
        ignore_warnings(forest.fit)(X)
        for _ in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            approx = forest.kneighbors(query, n_neighbors=n_points,
                                       return_distance=False)
            # Exact ranking by cosine distance, for comparison.
            exact_order = np.argsort(
                pairwise_distances(query, X, metric='cosine'))
            exact = exact_order[0, :n_points]
            overlap = np.intersect1d(exact, approx).shape[0]
            accuracies[slot] += overlap / float(n_points)
        accuracies[slot] /= float(n_iter)

    # The accuracies must already be in non-decreasing order ...
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # ... and the best one must be strictly better than the worst one.
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
示例11: text_hist
def text_hist():
    """
    Build combined text/SIFT histograms, index them and pickle the results.

    Reads the image names and SIFT histograms from ``data/sift_names.pkl``
    and ``data/sift_hist.pkl``, counts the words of each image's description
    file under ``shopping/images/``, stacks the normalized word counts with
    the SIFT histograms (each weighted 0.5), fits an ``LSHForest`` on the
    normalized result and writes ``data/text_hist.pkl``,
    ``data/vectorizer.pkl`` and ``data/lshforest_combine.pkl``.
    """
    # Pickle streams are binary data, so every file is opened in binary mode.
    with open('data/sift_names.pkl', 'rb') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'rb') as f:
        sift_hists = cPickle.load(f)

    # Map every image name onto the path of its description text file.
    filenames = [
        'shopping/images/' + name.replace('img', 'descr').replace('.jpg', '.txt')
        for name in names
    ]

    vectorizer = CountVectorizer(input='filename', token_pattern=r"(?u)\w+",
                                 ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    # Keep the returned matrix: the vectorizer produces integer counts, and
    # ``normalize`` has to convert those to floats, so ``copy=False`` alone
    # cannot be relied on to update the input in place.
    xall_transformed = preprocessing.normalize(xall_transformed, copy=False)

    lamb = .5
    hists = scipy.sparse.hstack(
        [xall_transformed * lamb, sift_hists * (1 - lamb)]).toarray()
    hists = preprocessing.normalize(hists, copy=False)

    model = LSHForest()
    model.fit(hists)

    with open('data/text_hist.pkl', 'wb') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'wb') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'wb') as f:
        cPickle.dump(model, f)
示例12: test_distances
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for _ in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        # Queries are passed as a single-row 2-D array, consistent with the
        # other LSHForest tests; 1-D query input is not reliably accepted by
        # the estimator and distance helpers.
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        # Repeat the ordering check for a radius query whose radius is the
        # mean cosine distance from the query to all samples.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
示例13: create_tree
def create_tree(self, listNames, variableName):
    """Fit an ``LSHForest`` on the names' TF-IDF vectors and pickle it.

    Meant to be run only once, for the main database. The names are
    tokenized with ``self.tokenize`` and vectorized with
    ``self.create_TDIDF``; the forest is fitted on the resulting vectors.

    Three files are written into ``self.folderSaveData``:
    ``<variableName>_lshf.dump`` (the fitted forest),
    ``<variableName>_listNames.dump`` (``listNames`` as given) and
    ``<variableName>_TF.dump`` (the first value returned by
    ``create_TDIDF``).

    Parameters
    ----------
    listNames : list
        Names to index.
    variableName : str
        Prefix used for the three dump file names.
    """
    lshf = LSHForest(n_estimators=50, n_candidates=500)
    TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
    lshf.fit(tfidfs)

    artifacts = (("lshf", lshf), ("listNames", listNames), ("TF", TF))
    for suffix, payload in artifacts:
        path = "{0}/{1}_{2}.dump".format(self.folderSaveData, variableName, suffix)
        # ``with`` closes (and flushes) each file as soon as it is written;
        # the handles used to be left open until garbage collection.
        with open(path, "wb") as handle:
            pickle.dump(payload, handle)
示例14: test_radius_neighbors_boundary_handling
def test_radius_neighbors_boundary_handling():
    points = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    point_count = len(points)

    # Exact brute-force model: the reference against which the approximate
    # model is compared.
    exact_model = NearestNeighbors(algorithm='brute',
                                   metric='cosine').fit(points)

    # LSHForest with hyperparameter values chosen so that its results are
    # always exact on this toy dataset.
    approx_model = LSHForest(min_hash_match=0, n_candidates=point_count,
                             random_state=42).fit(points)

    # The query is aligned with the first axis.
    query = [[1., 0.]]

    # Exact cosine distances from the query to the four points.
    dists = pairwise_distances(query, points, metric='cosine').ravel()

    # Point 0 is almost aligned with the query (very small angle), so its
    # cosine distance is almost zero.
    assert_almost_equal(dists[0], 0, decimal=5)
    # Point 1 forms an angle of 45 degrees with the query vector.
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))
    # Point 2 is orthogonal to the query, hence at a distance of exactly 1.
    assert_almost_equal(dists[2], 1)
    # Point 3 is almost collinear with the query but of opposite sign, so
    # its cosine 'distance' is very close to the maximum possible value, 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    def check_radius(radius, expected_idx, expected_dists):
        # Run the same radius query on both models and compare each result
        # with the expected indices and distances.
        exact_dists, exact_idx = exact_model.radius_neighbors(
            query, radius=radius)
        approx_dists, approx_idx = approx_model.radius_neighbors(
            query, radius=radius)
        assert_array_equal(np.sort(exact_idx[0]), expected_idx)
        assert_array_equal(np.sort(approx_idx[0]), expected_idx)
        assert_array_almost_equal(np.sort(exact_dists[0]), expected_dists)
        assert_array_almost_equal(np.sort(approx_dists[0]), expected_dists)

    # With a radius of one, every sample but the last is returned; the
    # third sample lies exactly on the boundary of the query.
    check_radius(1, [0, 1, 2], dists[:-1])

    # With a marginally smaller radius, the third sample -- previously on
    # the boundary -- is rejected as well.
    eps = np.finfo(np.float64).eps
    check_radius(1 - eps, [0, 1], dists[:-2])
示例15: __init__
class LHSForestEngine:
    """Distance engine that delegates to an ``LSHForest``."""

    def __init__(self):
        """Create the engine with a forest seeded with ``random_state=42``."""
        self.name = "LHS"
        self.engine = LSHForest(random_state=42)

    def fit(self, data):
        """Fit the underlying forest on ``data``."""
        self.engine.fit(data)

    def dist(self, data):
        """Return, flattened, the distance from each row of ``data`` to its
        single nearest neighbour in the fitted forest."""
        nearest_distances, _ = self.engine.kneighbors(data, n_neighbors=1)
        return nearest_distances.ravel()