当前位置: 首页>>代码示例>>Python>>正文


Python LSHForest.fit方法代码示例

本文整理汇总了Python中sklearn.neighbors.LSHForest.fit方法的典型用法代码示例。如果您正苦于以下问题：Python LSHForest.fit方法的具体用法？Python LSHForest.fit怎么用？Python LSHForest.fit使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.neighbors.LSHForest的用法示例。


在下文中一共展示了LSHForest.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: get_heap_and_forest

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
    def get_heap_and_forest(self, griter, k):
        """Build the start heap, the neighbour forest and a reference distance.

        Args:
            griter: iterable of graphs; it is fully consumed here.
            k: counter base. Every start graph is pushed with ``k + 1`` so
                that the start graphs are not output at the end.

        Returns:
            A ``(heap, forest, avg_dist)`` tuple where ``heap`` holds
            ``(score, k + 1, graph)`` entries, ``forest`` is an ``LSHForest``
            fitted on the vectorized graphs and ``avg_dist`` is one
            representative second-neighbour distance.
        """
        graphs = list(griter)
        # transform() messes up the graph objects, so keep untouched copies
        # for scoring and for the heap entries.
        pristine_graphs = copy.deepcopy(graphs)
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print('got forest')

        heap = []
        for graph in pristine_graphs:
            scoring_copy = nx.Graph(graph)
            # score ~ distance from the hyperplane (probability of class 1)
            score = self.sampler.estimator.predict_proba(
                self.sampler.vectorizer.transform_single(scoring_copy))[0][1]
            heapq.heappush(heap, (score, k + 1, graph))

        print('got heap')
        distances, _ = forest.kneighbors(X, n_neighbors=2)
        # Column 1 is the second-closest hit; presumably column 0 is the query
        # point itself -- TODO confirm.
        distances = [row[1] for row in distances]
        # NOTE(review): the list is not sorted, so this picks the middle
        # element in input order rather than a true median or mean -- confirm
        # that this is intended.  Floor division keeps the index an int on
        # Python 3 as well.
        avg_dist = distances[len(distances) // 2]
        print('got dist')

        return heap, forest, avg_dist
开发者ID:smautner,项目名称:GraphLearn,代码行数:34,代码来源:discsampler.py

示例2: Main

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def Main():
    """Run the interactive diagnosis loop until the input is exhausted.

    Loads the training set (optionally extended with previously learned
    results, otherwise the previous-results database is dropped), fits one
    LSH forest on the chart feature vectors and one on the people, and then
    processes new inputs one at a time.  The loop ends when ``GetNewInput``
    raises ``EOFError``.
    """
    trainingSet, people = LoadTrainingSet()
    # Uncomment when running from console:
    # colorama.init()
    if loadPreviousResults:
        previouslyLearnedVectors, previouslyLearnedPeople = LoadPreviouslyLearnedResults()
        trainingSet.extend(previouslyLearnedVectors)
        people.extend(previouslyLearnedPeople)
    else:
        client.drop_database(Constants.PreviousResultsDb)

    chartsForest = LSHForest(n_neighbors = ChartsNeighbors, n_estimators = ChartsEstimators, n_candidates = ChartsCandidates)
    chartsForest.fit(trainingSet)

    peopleForest = LSHForest(n_neighbors = PeopleNeighbors, n_estimators = PeopleEstimators, n_candidates = PeopleCandidates)
    peopleForest.fit(people)

    while True:
        try:
            featureVector, person = GetNewInput()
            ShowCurrentPatient(person)
            warnings = DumbDiagnoser.GetDumbDiagnosis(featureVector, person)
            diagnosis, closestChartsPeople = Diagnose(chartsForest, featureVector)
            closestPeople = GetClosestPeople(peopleForest, person)
            ShowWarnings(warnings)
            ShowResults(diagnosis, closestChartsPeople, closestPeople)
            Learn(chartsForest, featureVector, peopleForest, person, diagnosis)
        except EOFError:
            print('Exiting')
            client.close()
            break
        except NoSuchRecordException as details:
            # A missing record is reported and the loop moves on to the next input.
            print(details)
        finally:
            # Blank separator line after every iteration.  A bare `print` is a
            # no-op expression on Python 3; print('') emits the empty line on
            # both Python 2 and Python 3.
            print('')
开发者ID:urialon,项目名称:ECG,代码行数:37,代码来源:Program.py

示例3: __init__

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
class EmbeddingNetworkBuilder:
    """Thin wrapper around scikit-learn's LSH forest for building neighbour graphs."""

    def __init__(self, lsh_init=None):
        """Store the forest to use.

        Args:
            lsh_init: forest instance to wrap.  When ``None`` a default
                ``LSHForest(n_estimators=25, n_candidates=1000)`` is created.
        """
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        # Set by fit_lsh_forest(); initialised here so the attribute always exists.
        self._embedding = None
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        """Fit the forest on ``embedding.m`` and remember the embedding."""
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        """Return the forest's k-neighbours graph of the fitted embedding.

        ``nn + 1`` neighbours are requested per row; presumably the extra one
        accounts for each point being its own nearest neighbour -- TODO confirm.
        """
        dir_graph_mat = self._lsh_forest.kneighbors_graph(
            X=self._embedding.m, n_neighbors=nn + 1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        """Collect the node and edge sets of a neighbour-graph matrix.

        Args:
            dir_graph_mat: square matrix whose non-zero entry ``(i, j)`` marks
                an edge from node ``i`` to node ``j``.  Row indexing followed by
                ``nonzero()[1]`` must yield column indices (as scipy sparse
                matrices do).

        Returns:
            ``(nodes, edges)``: the set of row indices and the set of
            ``(i, j)`` index pairs exactly as they appear in the matrix; no
            symmetrisation is performed here.
        """
        n_nodes = dir_graph_mat.shape[0]
        nodes = set(range(n_nodes))
        edges = set()
        # Iterate over the row indices; shape[0] itself is an int and cannot
        # be iterated directly.
        for node_i in range(n_nodes):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        """Return the wrapped LSH forest."""
        return self._lsh_forest

    def get_node_to_word(self):
        """Return ``self.iw`` (``None`` unless assigned from outside)."""
        return self.iw
开发者ID:viveksck,项目名称:langchange,代码行数:34,代码来源:networkinducer.py

示例4: text_hist

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def text_hist():
    """Combine text and SIFT histograms and index them with an LSH forest.

    Reads the image names and SIFT histograms from ``data/``, builds a
    normalized bag-of-words matrix from the matching description files,
    stacks it with the SIFT histograms (equal weights), fits an ``LSHForest``
    on the normalized result and pickles the text matrix, the vectorizer and
    the fitted model back into ``data/``.
    """
    # Pickle streams are binary data: always open them in binary mode.
    with open('data/sift_names.pkl', 'rb') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'rb') as f:
        sift_hists = cPickle.load(f)

    # Each image "img*.jpg" has its description in "descr*.txt".
    filenames = [
        'shopping/images/' + name.replace('img', 'descr').replace('.jpg', '.txt')
        for name in names
    ]

    # Raw string so that \w reaches the regex engine as written.
    vectorizer = CountVectorizer(input='filename', token_pattern=r"(?u)\w+",
                                 ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    preprocessing.normalize(xall_transformed, copy=False)

    lamb = .5  # weight of the text part; the SIFT part gets 1 - lamb
    hists = scipy.sparse.hstack(
        [xall_transformed * lamb, sift_hists * (1 - lamb)]).toarray()
    preprocessing.normalize(hists, copy=False)

    model = LSHForest()
    model.fit(hists)

    with open('data/text_hist.pkl', 'wb') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'wb') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'wb') as f:
        cPickle.dump(model, f)
开发者ID:bangnk,项目名称:tu_anh,代码行数:30,代码来源:tiny.py

示例5: get_nearest_neighbor_iterable

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
    def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):
        """Pair every start graph with a near neighbour from ``graphlist``.

        Both collections are vectorized (on deep copies), an ``LSHForest`` is
        fitted on the vectors of ``graphlist`` and queried with the vectors of
        ``start_graphs``.  When ``start_is_subset`` is true the second hit is
        used instead of the first one.

        Yields:
            ``(start_graph, matched_graph, matched_vector)`` tuples, ordered by
            the index of the matched graph in ``graphlist``.
        """
        # Vectorize the candidate graphs; copies are handed to the vectorizer.
        graphlist = list(graphlist)
        candidate_vectors = self.vectorizer.transform_single(copy.deepcopy(graphlist))

        # Same for the start graphs.
        start_graphs = list(start_graphs)
        start_vectors = self.vectorizer.transform_single(copy.deepcopy(start_graphs))

        # See http://scikit-learn.org/stable/modules/neighbors.html
        forest = LSHForest()
        forest.fit(candidate_vectors)
        distances, indices = forest.kneighbors(start_vectors, n_neighbors=2)

        # Column 0 is the closest hit, column 1 the runner-up.
        column = 1 if start_is_subset else 0

        # Each match is (index into graphlist, index into start_graphs, distance).
        matches = sorted(
            (indices[row, column], row, distances[row, column])
            for row in range(len(indices))
        )

        for candidate_pos, start_pos, _ in matches:
            yield ((start_graphs[start_pos],
                    graphlist[candidate_pos],
                    candidate_vectors[candidate_pos]))
开发者ID:antworteffekt,项目名称:GraphLearn,代码行数:35,代码来源:directedsampler.py

示例6: search_neighbors

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def search_neighbors(request):
	"""Return the stored designs whose images are closest to a query image.

	Query parameters (read from ``request.GET``):
		num: number of neighbours to return.
		input: file name of the query image, relative to ``settings.DESIGN_PATH``.

	Returns:
		A ``JsonResponse`` with a ``"results"`` list; each entry holds the
		image name, history text, like count and filtered flag of one design.
	"""
	designs = Design.objects.all()

	# One "<uid>.png" per design, in queryset order.
	image_list = [str(design.uid) + ".png" for design in designs]

	d_geometry = settings.D_GEOMETRY
	n_values = d_geometry[0]*d_geometry[1]*3
	designed_images = np.empty((len(image_list), n_values), dtype="float32")
	for row, image_name in enumerate(image_list):
		designed_images[row] = img2numpy_arr(settings.DESIGN_PATH + image_name).reshape(n_values)
	designed_images /= 255

	lshf = LSHForest(random_state=42)
	lshf.fit(designed_images)

	num = int(request.GET['num'])
	input_fname = str(request.GET['input'])
	input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
	# Divide by a float so the scaling is never integer division, whatever
	# dtype the image helper returns.
	input_image = input_image.reshape(1, -1) / 255.0
	_, indices = lshf.kneighbors(input_image, n_neighbors=num)

	similar_images = []
	for i in indices.reshape(-1):
		# kneighbors yields numpy integers; a QuerySet only accepts plain
		# ints (or slices) as indices, so convert explicitly.
		design = designs[int(i)]
		similar_images.append({
			"image": str(design.uid) + ".png",
			"text": str(design.history_text),
			"like": int(design.like),
			"filtered": str(design.filtered)
		})

	return JsonResponse({
		"results": similar_images
	})
开发者ID:Soma2-HighFashion,项目名称:Design_Studio,代码行数:36,代码来源:views.py

示例7: test_fit

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_fit():
    """Verify that ``fit`` populates every fitted attribute of the forest."""
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    forest = LSHForest(n_estimators=n_estimators)
    forest.fit(X)

    # The training data is kept as given.
    assert_array_equal(X, forest._fit_X)

    # There is one hash function g(p) per tree, and each has 32 components.
    hashers = forest.hash_functions_
    assert_equal(n_estimators, len(hashers))
    assert_equal(32, hashers[0].components_.shape[0])

    # Both the trees and the original indices (recorded after sorting the
    # hashes) have one entry per estimator, each covering every data point.
    for per_tree in (forest.trees_, forest.original_indices_):
        assert_equal(n_estimators, len(per_tree))
        assert_equal(n_samples, len(per_tree[0]))
开发者ID:cnspica,项目名称:scikit-learn,代码行数:27,代码来源:test_approximate.py

示例8: single_batch

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
    def single_batch(self, tweets):
        """Find tweets in a batch that have several close neighbours (spam).

        An approximate nearest-neighbour search is run over the batch itself:
        the tweets are vectorized with ``CountVectorizer`` (using
        ``self.custom_stop_words``), an ``LSHForest`` is fitted on them and
        every sufficiently long tweet is queried with radius
        ``self.sensitivity``.

        Args:
            tweets: list of tweet texts.

        Returns:
            List of positions within ``tweets`` whose radius query returned
            more than two neighbours.
        """
        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words=self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        working_batch_size = len(tweets)
        spam_indices = []
        for position, x in enumerate(vect2.transform(tweets)):
            if position % 100 == 0:
                # Single parenthesised argument: prints identically on
                # Python 2 and Python 3.
                print("%r tweets analyzed out of %r for this batch" % (position, working_batch_size))
            # Only tweets with more than two non-zero vocabulary counts are
            # queried; shorter ones can never be flagged, so the (expensive)
            # neighbour search is skipped for them.
            if x.getnnz() <= 2:
                continue
            neighbors = tree2.radius_neighbors(x, radius=self.sensitivity)[1]
            if len(neighbors[0]) > 2:
                spam_indices.append(position)

        return spam_indices
开发者ID:ilyaaltshteyn,项目名称:danger_tweets,代码行数:34,代码来源:tweetPreprocessor.py

示例9: single_batch

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
    def single_batch(self, tweets):
        """Find tweets in a batch that have several close neighbours (spam).

        An approximate nearest-neighbour search is run over the batch itself:
        the tweets are vectorized with ``CountVectorizer`` (using
        ``self.common_twitter_handles`` as stop words), an ``LSHForest`` is
        fitted on them and every tweet is queried with a radius of 0.4.

        Args:
            tweets: list of tweet texts.

        Returns:
            List of positions within ``tweets`` whose radius query returned
            more than two neighbours.
        """
        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Query every tweet against the fitted tree:
        neighbors_indices = []
        for position, x in enumerate(vect2.transform(tweets)):
            if position % 100 == 0:
                # Single parenthesised argument: prints identically on
                # Python 2 and Python 3.
                print("%r tweets analyzed out of %r for this batch" % (position, self.batch_size))
            neighbors_indices.append(tree2.radius_neighbors(x, radius=.4)[1])

        return [position
                for position, neighbors in enumerate(neighbors_indices)
                if len(neighbors[0]) > 2]
开发者ID:ilyaaltshteyn,项目名称:tweet_pre_processor,代码行数:28,代码来源:tweet_processor_bigdist.py

示例10: test_hash_functions

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_hash_functions():
    """Check that the hash functions are built randomly.

    The variance and the mean of each individual hash function (projection
    vector) must differ from those of all hash functions taken together.
    Were every function seeded with the same value, all of them would share
    the same variance and mean.
    """
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    seed = rng.randint(0, np.iinfo(np.int32).max)
    forest = LSHForest(n_estimators=n_estimators, random_state=seed)
    forest.fit(X)

    # Projection matrices of all estimators, pooled together.
    hash_functions = [forest.hash_functions_[idx].components_
                      for idx in range(n_estimators)]

    for idx in range(n_estimators):
        single = forest.hash_functions_[idx].components_
        assert_not_equal(np.var(hash_functions), np.var(single))

    for idx in range(n_estimators):
        single = forest.hash_functions_[idx].components_
        assert_not_equal(np.mean(hash_functions), np.mean(single))
开发者ID:cnspica,项目名称:scikit-learn,代码行数:31,代码来源:test_approximate.py

示例11: test_distances

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_distances():
    """Check that neighbours come back ordered from closest to farthest."""
    n_samples, n_features, n_iter = 12, 2, 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    forest = LSHForest()
    forest.fit(X)

    def is_non_decreasing(values):
        # True when no element is smaller than its predecessor.
        return np.all(np.diff(values) >= 0)

    for _ in range(n_iter):
        # Draw the neighbour count first, then the query row, from the same RNG.
        k = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]

        distances, neighbors = forest.kneighbors(
            query, n_neighbors=k, return_distance=True)
        # k-neighbours result must be sorted by distance.
        assert_true(is_non_decreasing(distances[0]))

        # Same requirement for a radius query using the mean cosine distance.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = forest.radius_neighbors(
            query, radius=mean_dist, return_distance=True)
        assert_true(is_non_decreasing(distances[0]))
开发者ID:CC-Fu-CC,项目名称:scikit-learn,代码行数:27,代码来源:test_approximate.py

示例12: test_neighbors_accuracy_with_n_estimators

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_neighbors_accuracy_with_n_estimators():
    """Check that accuracy does not drop as ``n_estimators`` grows."""
    n_estimators = np.array([1, 10, 100])
    n_samples, n_features = 100, 10
    n_iter, n_points = 10, 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for slot, n_trees in enumerate(n_estimators):
        forest = LSHForest(n_candidates=500, n_estimators=n_trees)
        forest.fit(X)

        hit_ratio_sum = 0.0
        for _ in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            approx = forest.kneighbors(query, n_neighbors=n_points,
                                       return_distance=False)
            # Exact answer: the n_points smallest cosine distances.
            exact = np.argsort(
                pairwise_distances(query, X, metric='cosine'))[0, :n_points]

            n_hits = np.intersect1d(exact, approx).shape[0]
            hit_ratio_sum = hit_ratio_sum + n_hits / float(n_points)

        # Mean fraction of true neighbours recovered for this forest size.
        accuracies[slot] = hit_ratio_sum / float(n_iter)

    # Accuracies must already be in non-decreasing order.
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # The best accuracy must be strictly better than the worst.
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
开发者ID:cnspica,项目名称:scikit-learn,代码行数:34,代码来源:test_approximate.py

示例13: build_index

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    """Fit an ``LSHForest`` on ``data`` and measure how long fitting takes.

    Returns:
        ``(forest, seconds)``: the fitted forest and the time spent in
        ``fit`` as measured with ``time()``.
    """
    forest_params = dict(
        n_estimators=n_estimators,
        n_candidates=n_candidates,
        n_neighbors=n_neighbors,
        random_state=seed,
    )
    forest = LSHForest(**forest_params)

    # Only the fit call is timed, not the construction above.
    started = time()
    forest.fit(data)
    elapsed = time() - started
    return forest, elapsed
开发者ID:ogrisel,项目名称:lsh_glove,代码行数:9,代码来源:index_embedding.py

示例14: test_partial_fit

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
def test_partial_fit():
    """Check that inserted arrays are consistent with the fitted data.

    ``partial_fit`` must work on an unfitted estimator, reject input with the
    wrong number of features, and grow every per-sample attribute by the
    number of inserted samples.
    """
    n_samples, n_samples_partial_fit, n_features = 12, 3, 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    forest = LSHForest()

    # partial_fit on an unfitted estimator stores the data.
    forest.partial_fit(X)
    assert_array_equal(X, forest._fit_X)

    forest.fit(X)

    # Data with one feature too few must be rejected.
    wrong_width = np.random.randn(n_samples_partial_fit, n_features - 1)
    assert_raises(ValueError, forest.partial_fit, wrong_width)

    forest.partial_fit(X_partial_fit)

    # After the insertion every per-sample structure holds the original
    # samples plus the inserted ones.
    expected_total = n_samples + n_samples_partial_fit
    assert_equal(forest._fit_X.shape[0], expected_total)
    assert_equal(len(forest.original_indices_[0]), expected_total)
    assert_equal(len(forest.trees_[1]), expected_total)
开发者ID:cnspica,项目名称:scikit-learn,代码行数:37,代码来源:test_approximate.py

示例15: create_tree

# 需要导入模块: from sklearn.neighbors import LSHForest [as 别名]
# 或者: from sklearn.neighbors.LSHForest import fit [as 别名]
 def create_tree(self,listNames,variableName):
     """Fit an LSH forest on the TF-IDF vectors of ``listNames`` and save it.

     Intended to be run only once for the main database.  Three pickle files
     are written to ``self.folderSaveData``, each prefixed with
     ``variableName``: the fitted forest (``_lshf.dump``), the names
     (``_listNames.dump``) and the first value returned by
     ``self.create_TDIDF`` (``_TF.dump``).

     Args:
         listNames: names to index; they are tokenized with ``self.tokenize``.
         variableName: prefix used for the output file names.
     """
     lshf = LSHForest(n_estimators=50,n_candidates=500)
     TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
     lshf.fit(tfidfs)

     artifacts = (("lshf", lshf), ("listNames", listNames), ("TF", TF))
     for suffix, payload in artifacts:
         path = "{0}/{1}_{2}.dump".format(self.folderSaveData, variableName, suffix)
         # Context manager guarantees the handle is flushed and closed.
         with open(path, "wb+") as handle:
             pickle.dump(payload, handle)
开发者ID:uvacorpnet,项目名称:name_matching,代码行数:10,代码来源:matcher.py


注:本文中的sklearn.neighbors.LSHForest.fit方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。