Python neighbors.LSHForest类代码示例

本文整理汇总了Python中sklearn.neighbors.LSHForest类的典型用法代码示例。如果您正苦于以下问题：Python LSHForest类的具体用法？Python LSHForest怎么用？Python LSHForest使用的例子？那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了LSHForest类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_fit

def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))

开发者ID:cnspica，项目名称:scikit-learn，代码行数:25，代码来源:test_approximate.py

示例2: single_batch

    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        working_batch_size = len(tweets)
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), working_batch_size)
            # Only deal with tweets that are longer than 3 words.
            neighbors = tree2.radius_neighbors(x, radius = self.sensitivity)[1]
            if x.getnnz() > 2:
                n_neighbors.append(len(neighbors[0]))
                neighbors_indices.append(neighbors)
            else:
                n_neighbors.append(1)
                neighbors_indices.append(np.array([np.array([0])]))

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices

开发者ID:ilyaaltshteyn，项目名称:danger_tweets，代码行数:32，代码来源:tweetPreprocessor.py

示例3: init

class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """

    def __init__(self, lsh_init=None):
        if lsh_init == None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init 
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m, n_neighbors=nn+1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        nodes = set(range(dir_graph_mat.shape[0]))
        edges = set([])
        for node_i in dir_graph_mat.shape[0]:
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        return self._lsh_forest
    
    def get_node_to_word(self):
        return self.iw

开发者ID:viveksck，项目名称:langchange，代码行数:32，代码来源:networkinducer.py

示例4: single_batch

    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius = .4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices

开发者ID:ilyaaltshteyn，项目名称:tweet_pre_processor，代码行数:26，代码来源:tweet_processor_bigdist.py

示例5: get_nearest_neighbor_iterable

    def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):

        # vectorize all
        graphlist= list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)


        start_graphs= list(start_graphs)
        graphlist_= copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)
        
        
        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1
        
        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index]) for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        #for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #    yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so i wrote this:,,, you may even get rid of the matches variable i think.. and use indices directly
        for Xi,Yi,dist in matches:
            yield ((start_graphs[Yi],graphlist[Xi],X[Xi]))

开发者ID:antworteffekt，项目名称:GraphLearn，代码行数:33，代码来源:directedsampler.py

示例6: test_hash_functions

def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0, np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))

开发者ID:cnspica，项目名称:scikit-learn，代码行数:29，代码来源:test_approximate.py

示例7: search_neighbors

def search_neighbors(request):
	designs = Design.objects.all()

	image_list = []
	for design in designs:
		image_list.append(str(design.uid) + ".png")

	d_geometry = settings.D_GEOMETRY
	designed_images = np.empty((len(image_list), d_geometry[0]*d_geometry[1]*3), dtype="float32")
	for i in range(len(image_list)):
		designed_images[i] = img2numpy_arr(settings.DESIGN_PATH + image_list[i]).reshape(d_geometry[0]*d_geometry[1]*3)
	designed_images /= 255
	
	lshf = LSHForest(random_state=42)
	lshf.fit(designed_images) 

	num = int(request.GET['num'])
	input_fname = str(request.GET['input'])
	input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
	input_image = input_image.reshape(1, -1)/255
	_, indices = lshf.kneighbors(input_image, n_neighbors=num)

	similar_images = []
	for i in list(indices.reshape(-1)):
		similar_images.append({ 
			"image": str(designs[i].uid) + ".png", 
			"text": str(designs[i].history_text), 
			"like": int(designs[i].like),
			"filtered": str(designs[i].filtered)
		})

	return JsonResponse({
		"results": similar_images
	})

开发者ID:Soma2-HighFashion，项目名称:Design_Studio，代码行数:34，代码来源:views.py

示例8: build_index

def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    lshf = LSHForest(n_estimators=n_estimators, n_candidates=n_candidates,
                     n_neighbors=n_neighbors, random_state=seed)
    t0 = time()
    lshf.fit(data)
    duration = time() - t0
    return lshf, duration

开发者ID:ogrisel，项目名称:lsh_glove，代码行数:7，代码来源:index_embedding.py

示例9: get_heap_and_forest

    def get_heap_and_forest(self, griter, k):
        '''
        so we create the heap and the forest...
        heap is (dist to hyperplane, count, graph)
        and the forest ist just a nearest neighbor from sklearn
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform doess mess up the graph objects
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print 'got forest'

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(heap, (
                self.sampler.estimator.predict_proba(self.sampler.vectorizer.transform_single(graph2))[0][1],
                # score ~ dist from hyperplane
                k + 1,  # making sure that the counter is high so we dont output the startgraphz at the end
                graph))  # at last the actual graph

        print 'got heap'
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        distances = [a[1] for a in distances]  # the second element should be the dist we want
        avg_dist = distances[len(distances) / 2]  # sum(distances)/len(distances)
        print 'got dist'

        return heap, forest, avg_dist

开发者ID:smautner，项目名称:GraphLearn，代码行数:32，代码来源:discsampler.py

示例10: test_neighbors_accuracy_with_n_estimators

def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")

开发者ID:AlexandreAbraham，项目名称:scikit-learn，代码行数:32，代码来源:test_approximate.py

示例11: text_hist

def text_hist():
    """
    Calculate histogram of text of images
    """
    with open('data/sift_names.pkl', 'r') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'r') as f:
        sift_hists = cPickle.load(f)
    filenames = []
    for name in names:
        name = name.replace('img', 'descr')
        name = name.replace('.jpg', '.txt')
        filenames.append('shopping/images/' + name)
    vectorizer = CountVectorizer(input='filename', token_pattern="(?u)"+'\w+', ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    preprocessing.normalize(xall_transformed, copy=False)

    lamb = .5
    hists = scipy.sparse.hstack([xall_transformed * lamb, sift_hists * (1-lamb)]).toarray()
    preprocessing.normalize(hists, copy=False)
    model = LSHForest()
    model.fit(hists)
    with open('data/text_hist.pkl', 'w') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'w') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'w') as f:
        cPickle.dump(model, f)

开发者ID:bangnk，项目名称:tu_anh，代码行数:28，代码来源:tiny.py

示例12: test_distances

def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))

开发者ID:CC-Fu-CC，项目名称:scikit-learn，代码行数:25，代码来源:test_approximate.py

示例13: create_tree

 def create_tree(self,listNames,variableName):
     #LSHForest. only once for the main database
     lshf = LSHForest(n_estimators=50,n_candidates=500)
     TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
     lshf.fit(tfidfs)        
     pickle.dump(lshf,open("{0}/{1}_lshf.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(listNames,open("{0}/{1}_listNames.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(TF,open("{0}/{1}_TF.dump".format(self.folderSaveData,variableName),"wb+"))

开发者ID:uvacorpnet，项目名称:name_matching，代码行数:8，代码来源:matcher.py

示例14: test_radius_neighbors_boundary_handling

def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points,
                     random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point form an angle of 45 degrees to the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal from the query vector hence at a distance
    # exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost colinear but with opposite sign to the query
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])

开发者ID:AlexandreAbraham，项目名称:scikit-learn，代码行数:58，代码来源:test_approximate.py

示例15: init

class LHSForestEngine:

    def __init__(self):
        self.engine = LSHForest(random_state=42)
        self.name = "LHS"

    def fit(self, data):
        self.engine.fit(data)

    def dist(self, data):
        distances, indices = self.engine.kneighbors(data, n_neighbors=1)
        return distances.ravel()

开发者ID:enoonIT，项目名称:nbnn-nbnl，代码行数:12，代码来源:nbnn.py

注：本文中的sklearn.neighbors.LSHForest类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。