本文整理汇总了Python中sklearn.neighbors.LSHForest.fit方法的典型用法代码示例。如果您正苦于以下问题：Python LSHForest.fit方法的具体用法？Python LSHForest.fit怎么用？Python LSHForest.fit使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.neighbors.LSHForest的用法示例。
在下文中一共展示了LSHForest.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_heap_and_forest
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def get_heap_and_forest(self, griter, k):
    """Build the scored graph heap and the nearest-neighbour forest.

    Args:
        griter: iterable of graphs; it is consumed into a list.
        k: counter base. Every heap entry is stored with ``k + 1`` so the
            start graphs are not emitted at the end.

    Returns:
        A ``(heap, forest, avg_dist)`` tuple:

        * ``heap`` -- a ``heapq`` list of ``(score, k + 1, graph)`` entries,
          where ``score`` is ``predict_proba(...)[0][1]`` of the sampler's
          estimator for that graph;
        * ``forest`` -- the ``LSHForest`` fitted on the vectorized graphs;
        * ``avg_dist`` -- the second-neighbour distance of the graph sitting
          at the middle index of the input order.
    """
    graphs = list(griter)
    # transform() messes up the graph objects, so score pristine copies.
    graphs2 = copy.deepcopy(graphs)
    X = self.vectorizer.transform(graphs)
    forest = LSHForest()
    forest.fit(X)
    print('got forest')

    heap = []
    estimator = self.sampler.estimator
    vectorizer = self.sampler.vectorizer
    for graph in graphs2:
        graph2 = nx.Graph(graph)
        # score ~ distance from the hyperplane
        score = estimator.predict_proba(vectorizer.transform_single(graph2))[0][1]
        # The counter is kept high so the start graphs are not output at the end.
        heapq.heappush(heap, (score, k + 1, graph))
    print('got heap')

    distances, unused = forest.kneighbors(X, n_neighbors=2)
    # Column 1 is the distance we want; column 0 is presumably the query
    # point itself -- TODO confirm.
    distances = [row[1] for row in distances]
    # NOTE(review): this picks the middle element of the *unsorted* list, not
    # a mean or a true median (the original left
    # ``sum(distances)/len(distances)`` as a commented alternative).
    # Floor division keeps the index an int on Python 3 as well.
    avg_dist = distances[len(distances) // 2]
    print('got dist')
    return heap, forest, avg_dist
示例2: Main
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def Main():
    """Run the interactive diagnosis loop until the input is exhausted.

    Fits one ``LSHForest`` on the chart feature vectors and one on the people
    records. Each iteration reads a new input, shows warnings, the diagnosis
    and the closest matches, then passes the result to ``Learn``. The loop
    ends on ``EOFError``, at which point the database client is closed.
    A ``NoSuchRecordException`` is printed and the loop continues.
    """
    trainingSet, people = LoadTrainingSet()
    # Uncomment when running from console:
    # colorama.init()
    if loadPreviousResults:
        previouslyLearnedVectors, previouslyLearnedPeople = LoadPreviouslyLearnedResults()
        trainingSet.extend(previouslyLearnedVectors)
        people.extend(previouslyLearnedPeople)
    else:
        # Starting fresh: discard whatever was learned in earlier runs.
        client.drop_database(Constants.PreviousResultsDb)

    chartsForest = LSHForest(n_neighbors=ChartsNeighbors,
                             n_estimators=ChartsEstimators,
                             n_candidates=ChartsCandidates)
    chartsForest.fit(trainingSet)
    peopleForest = LSHForest(n_neighbors=PeopleNeighbors,
                             n_estimators=PeopleEstimators,
                             n_candidates=PeopleCandidates)
    peopleForest.fit(people)

    while True:
        try:
            featureVector, person = GetNewInput()
            ShowCurrentPatient(person)
            warnings = DumbDiagnoser.GetDumbDiagnosis(featureVector, person)
            diagnosis, closestChartsPeople = Diagnose(chartsForest, featureVector)
            closestPeople = GetClosestPeople(peopleForest, person)
            ShowWarnings(warnings)
            ShowResults(diagnosis, closestChartsPeople, closestPeople)
            Learn(chartsForest, featureVector, peopleForest, person, diagnosis)
        except EOFError:
            print('Exiting')
            client.close()
            break
        except NoSuchRecordException as details:
            print(details)
        finally:
            # Blank separator line after every iteration. A bare ``print`` is
            # a no-op expression on Python 3; ``print('')`` emits the empty
            # line on both Python 2 and Python 3.
            print('')
示例3: __init__
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
class EmbeddingNetworkBuilder:
    """Thin wrapper around scikit-learn's LSH forest for embedding graphs."""

    def __init__(self, lsh_init=None):
        """Create the builder.

        Args:
            lsh_init: an existing forest object to wrap. When ``None`` a new
                ``LSHForest(n_estimators=25, n_candidates=1000)`` is created.
        """
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        self.iw = None
        self.m = None
        # Set by fit_lsh_forest(); initialised here so the attribute always exists.
        self._embedding = None

    def fit_lsh_forest(self, embedding):
        """Fit the forest on ``embedding.m`` and remember the embedding."""
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        """Return the k-neighbours graph of the fitted embedding.

        ``nn + 1`` neighbours are requested, presumably because every point
        is returned as its own nearest neighbour -- TODO confirm.
        Requires ``fit_lsh_forest`` to have been called first.
        """
        dir_graph_mat = self._lsh_forest.kneighbors_graph(
            X=self._embedding.m, n_neighbors=nn + 1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        """Collect the node set and edge set of a sparse adjacency matrix.

        Args:
            dir_graph_mat: 2-D sparse matrix whose non-zero entry ``(i, j)``
                denotes an edge from node ``i`` to node ``j``.

        Returns:
            ``(nodes, edges)`` where ``nodes`` is the set of row indices and
            ``edges`` is a set of ``(i, j)`` integer tuples, one per non-zero
            entry. NOTE(review): reciprocal pairs ``(i, j)`` / ``(j, i)`` are
            both kept; confirm whether callers expect them collapsed.
        """
        n_nodes = dir_graph_mat.shape[0]
        nodes = set(range(n_nodes))
        edges = set()
        # The original iterated over the int ``shape[0]`` itself, which
        # raises TypeError; iterate over the row indices instead.
        for node_i in range(n_nodes):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, int(node_j)))
        return nodes, edges

    def get_forest(self):
        """Return the wrapped forest object."""
        return self._lsh_forest

    def get_node_to_word(self):
        """Return ``self.iw`` (``None`` unless assigned by the caller)."""
        return self.iw
示例4: text_hist
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def text_hist():
    """Build a combined text + SIFT histogram index and persist it.

    Reads the image names and SIFT histograms from ``data/``, maps every
    image name to its description file under ``shopping/images/``, builds an
    L2-normalised bag-of-words matrix from those files, stacks it with the
    SIFT histograms (both weighted by 0.5), fits an ``LSHForest`` on the
    normalised result and pickles the text matrix, the vectorizer and the
    forest back into ``data/``.
    """
    # Pickle streams are binary: open in 'rb'/'wb' so the data is not mangled
    # by newline translation (and so the code works with Python 3 pickles).
    with open('data/sift_names.pkl', 'rb') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'rb') as f:
        sift_hists = cPickle.load(f)

    # img*.jpg -> descr*.txt, located under shopping/images/
    filenames = [
        'shopping/images/' + name.replace('img', 'descr').replace('.jpg', '.txt')
        for name in names
    ]

    # Raw string: same pattern as before, without the invalid '\w' escape.
    vectorizer = CountVectorizer(input='filename', token_pattern=r"(?u)\w+",
                                 ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    preprocessing.normalize(xall_transformed, copy=False)

    lamb = .5  # weight of the text histogram; SIFT gets 1 - lamb
    hists = scipy.sparse.hstack(
        [xall_transformed * lamb, sift_hists * (1 - lamb)]).toarray()
    preprocessing.normalize(hists, copy=False)

    model = LSHForest()
    model.fit(hists)

    with open('data/text_hist.pkl', 'wb') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'wb') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'wb') as f:
        cPickle.dump(model, f)
示例5: get_nearest_neighbor_iterable
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):
    """Pair every start graph with a neighbour taken from ``graphlist``.

    Both collections are vectorized (on deep copies), an ``LSHForest`` is
    fitted on the ``graphlist`` vectors and queried with the start-graph
    vectors for two neighbours each. When ``start_is_subset`` is true the
    second neighbour is used, otherwise the first.

    Yields:
        ``(start_graph, neighbour_graph, neighbour_vector)`` tuples, ordered
        by the sorted ``(X_index, Y_index, distance)`` match triples.
    """
    # vectorize everything, working on copies of the graphs
    graphlist = list(graphlist)
    X = self.vectorizer.transform_single(copy.deepcopy(graphlist))
    start_graphs = list(start_graphs)
    Y = self.vectorizer.transform_single(copy.deepcopy(start_graphs))

    forest = LSHForest()
    forest.fit(X)
    # http://scikit-learn.org/stable/modules/neighbors.html
    distances, indices = forest.kneighbors(Y, n_neighbors=2)

    # column 1 when the start graphs are part of graphlist, else column 0
    column = 1 if start_is_subset else 0

    # matches = (X_index, Y_index, distance), sorted ascending
    matches = sorted(
        (indices[row, column], row, distances[row, column])
        for row in range(len(indices))
    )
    for x_idx, y_idx, _ in matches:
        yield (start_graphs[y_idx], graphlist[x_idx], X[x_idx])
示例6: search_neighbors
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def search_neighbors(request):
    """Return the designs whose images are closest to a query image.

    Query parameters (``request.GET``):
        num: number of neighbours to return (parsed with ``int``).
        input: file name of the query image, relative to
            ``settings.DESIGN_PATH``.

    Returns:
        ``JsonResponse`` with a ``"results"`` list; every entry holds the
        ``image``, ``text``, ``like`` and ``filtered`` fields of one design.

    NOTE(review): ``input`` comes straight from the request and is
    concatenated into a file path -- confirm it is validated upstream.
    NOTE(review): the index is rebuilt from every design image on each call.
    """
    # Evaluate the queryset once: indexing a lazy QuerySet issues a query per
    # access and rejects numpy integer indices.
    designs = list(Design.objects.all())
    image_list = [str(design.uid) + ".png" for design in designs]

    d_geometry = settings.D_GEOMETRY
    flat_size = d_geometry[0] * d_geometry[1] * 3
    designed_images = np.empty((len(image_list), flat_size), dtype="float32")
    for i, fname in enumerate(image_list):
        designed_images[i] = img2numpy_arr(settings.DESIGN_PATH + fname).reshape(flat_size)
    designed_images /= 255

    lshf = LSHForest(random_state=42)
    lshf.fit(designed_images)

    num = int(request.GET['num'])
    input_fname = str(request.GET['input'])
    input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
    # 255.0 forces true division even for integer pixel arrays on Python 2.
    input_image = input_image.reshape(1, -1) / 255.0

    _, indices = lshf.kneighbors(input_image, n_neighbors=num)

    similar_images = []
    for i in indices.reshape(-1):
        design = designs[int(i)]
        similar_images.append({
            "image": str(design.uid) + ".png",
            "text": str(design.history_text),
            "like": int(design.like),
            "filtered": str(design.filtered)
        })
    return JsonResponse({
        "results": similar_images
    })
示例7: test_fit
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_fit():
    """Verify that ``fit`` populates every fitted attribute as expected."""
    n_samples, n_features, n_estimators = 12, 2, 5
    X = np.random.RandomState(42).rand(n_samples, n_features)

    forest = LSHForest(n_estimators=n_estimators)
    forest.fit(X)

    # The training data is stored unchanged.
    assert_array_equal(X, forest._fit_X)
    # One hash function g(p) per tree ...
    assert_equal(n_estimators, len(forest.hash_functions_))
    # ... each producing a 32-bit hash.
    assert_equal(32, forest.hash_functions_[0].components_.shape[0])
    # One tree per estimator, with an entry for every data point.
    assert_equal(n_estimators, len(forest.trees_))
    assert_equal(n_samples, len(forest.trees_[0]))
    # One set of original indices (after sorting the hashes) per tree,
    # again covering every data point.
    assert_equal(n_estimators, len(forest.original_indices_))
    assert_equal(n_samples, len(forest.original_indices_[0]))
示例8: single_batch
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def single_batch(self, tweets):
    """Flag tweets in a batch that have several near-duplicates.

    Performs an approximate nearest-neighbours search over ``tweets`` (a list
    of tweet texts). Tweets are vectorized with a ``CountVectorizer`` using
    ``self.custom_stop_words``; an ``LSHForest`` fitted on the batch is then
    queried with ``radius=self.sensitivity``.

    Only tweets with more than two distinct vocabulary terms are queried;
    shorter ones are never flagged.

    Returns:
        List of positions within ``tweets`` whose radius query returned more
        than two neighbours (i.e. the likely spam tweets).
    """
    # Vectorize and fit tree:
    vect2 = CountVectorizer(stop_words=self.custom_stop_words)
    X2 = vect2.fit_transform(tweets)
    tree2 = LSHForest()
    tree2.fit(X2)

    working_batch_size = len(tweets)
    spam_indices = []
    # Reuse the matrix from fit_transform instead of vectorizing the batch again.
    for position, x in enumerate(X2):
        if position % 100 == 0:
            # Single-argument print() works on both Python 2 and Python 3.
            print("%r tweets analyzed out of %r for this batch" % (position, working_batch_size))
        # Skip short tweets before paying for the radius query.
        if x.getnnz() <= 2:
            continue
        neighbors = tree2.radius_neighbors(x, radius=self.sensitivity)[1]
        if len(neighbors[0]) > 2:
            spam_indices.append(position)
    return spam_indices
示例9: single_batch
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def single_batch(self, tweets, radius=.4):
    """Flag tweets in a batch that have several near-duplicates.

    Performs an approximate nearest-neighbours search over ``tweets`` (a list
    of tweet texts). Tweets are vectorized with a ``CountVectorizer`` using
    ``self.common_twitter_handles`` as stop words, and an ``LSHForest``
    fitted on the batch is queried once per tweet.

    Args:
        tweets: list of tweet texts forming one batch.
        radius: radius passed to ``radius_neighbors``; defaults to the
            previously hard-coded ``.4``.

    Returns:
        List of positions within ``tweets`` whose radius query returned more
        than two neighbours (i.e. the likely spam tweets).
    """
    # Vectorize and fit tree:
    vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
    X2 = vect2.fit_transform(tweets)
    tree2 = LSHForest()
    tree2.fit(X2)

    # Report progress against the real batch length; the last batch may be
    # shorter than the configured batch size.
    total = len(tweets)
    spam_indices = []
    # Reuse the matrix from fit_transform instead of vectorizing the batch again.
    for position, x in enumerate(X2):
        if position % 100 == 0:
            # Single-argument print() works on both Python 2 and Python 3.
            print("%r tweets analyzed out of %r for this batch" % (position, total))
        neighbors = tree2.radius_neighbors(x, radius=radius)[1]
        if len(neighbors[0]) > 2:
            spam_indices.append(position)
    return spam_indices
示例10: test_hash_functions
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_hash_functions():
    """Check that the hash functions are built randomly.

    The variance and the mean of every single hash function (projection
    vector) must differ from those of all hash functions taken together.
    Were the hash functions seeded identically, all variances and means
    would be equal.
    """
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    seed = rng.randint(0, np.iinfo(np.int32).max)
    lshf = LSHForest(n_estimators=n_estimators, random_state=seed)
    lshf.fit(X)

    hash_functions = [lshf.hash_functions_[i].components_
                      for i in range(n_estimators)]

    # Variances first, then means -- same assertion order as before.
    overall_var = np.var(hash_functions)
    for i in range(n_estimators):
        assert_not_equal(overall_var,
                         np.var(lshf.hash_functions_[i].components_))

    overall_mean = np.mean(hash_functions)
    for i in range(n_estimators):
        assert_not_equal(overall_mean,
                         np.mean(lshf.hash_functions_[i].components_))
示例11: test_distances
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_distances():
    """Check that neighbors are returned from closest to farthest.

    Runs ``n_iter`` rounds; each picks a random neighbor count and a random
    training sample as query, then verifies that the distances returned by
    both ``kneighbors`` and ``radius_neighbors`` are non-decreasing.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for _ in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        # Query as a single-row 2-D array: scikit-learn deprecated (and later
        # rejected) 1-D samples for estimators and pairwise_distances.
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        # Use the mean cosine distance to the data as the search radius.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
示例12: test_neighbors_accuracy_with_n_estimators
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_neighbors_accuracy_with_n_estimators():
    """Check that accuracy does not drop as ``n_estimators`` increases.

    For each forest size the accuracy is the average, over ``n_iter`` random
    queries, of the fraction of the ``n_points`` exact cosine neighbors that
    the forest also returns.
    """
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for _ in range(n_iter):
            # Query as a single-row 2-D array: scikit-learn deprecated (and
            # later rejected) 1-D samples.
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            # Exact neighbors: the n_points smallest cosine distances.
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            accuracies[i] += intersection / float(n_points)
        accuracies[i] = accuracies[i] / float(n_iter)

    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
示例13: build_index
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    """Fit an ``LSHForest`` on ``data`` and time the fit.

    Returns:
        ``(forest, seconds)`` -- the fitted forest and the elapsed time of the
        ``fit`` call as measured with ``time()``.
    """
    forest = LSHForest(
        n_estimators=n_estimators,
        n_candidates=n_candidates,
        n_neighbors=n_neighbors,
        random_state=seed,
    )
    started = time()
    forest.fit(data)
    elapsed = time() - started
    return forest, elapsed
示例14: test_partial_fit
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_partial_fit():
    """Check that inserted samples are consistent with the fitted data.

    ``partial_fit`` must work on an unfitted estimator, reject data with the
    wrong number of features, and grow every fitted attribute by the number
    of inserted samples.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    expected_total = n_samples + n_samples_partial_fit

    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # partial_fit on an unfitted estimator stores the data as-is.
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Inserting data with one feature too few must fail.
    wrong_shape = np.random.randn(n_samples_partial_fit, n_features - 1)
    assert_raises(ValueError, lshf.partial_fit, wrong_shape)

    lshf.partial_fit(X_partial_fit)

    # Stored data, the first index set and the second tree all hold the
    # original samples plus the inserted ones.
    assert_equal(lshf._fit_X.shape[0], expected_total)
    assert_equal(len(lshf.original_indices_[0]), expected_total)
    assert_equal(len(lshf.trees_[1]), expected_total)
示例15: create_tree
# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def create_tree(self, listNames, variableName):
    """Fit an LSH forest on the names' TF-IDF vectors and pickle the results.

    Intended to run only once, for the main database.

    Args:
        listNames: names to index; tokenized with ``self.tokenize`` and
            vectorized with ``self.create_TDIDF``.
        variableName: prefix of the dump files written to
            ``self.folderSaveData``.

    Writes ``<variableName>_lshf.dump``, ``<variableName>_listNames.dump``
    and ``<variableName>_TF.dump``.
    """
    # LSHForest. only once for the main database
    lshf = LSHForest(n_estimators=50, n_candidates=500)
    TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
    lshf.fit(tfidfs)

    artifacts = (("lshf", lshf), ("listNames", listNames), ("TF", TF))
    for suffix, obj in artifacts:
        path = "{0}/{1}_{2}.dump".format(self.folderSaveData, variableName, suffix)
        # Context manager closes (and flushes) every dump file; the original
        # passed bare open() handles that were never closed.
        with open(path, "wb+") as handle:
            pickle.dump(obj, handle)