本文整理汇总了Python中nmslib.init方法的典型用法代码示例。如果您正苦于以下问题:Python nmslib.init方法的具体用法?Python nmslib.init怎么用?Python nmslib.init使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块nmslib的用法示例。
在下文中一共展示了nmslib.init方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: nearest_neighbors
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def nearest_neighbors(X, neighbors=16, threads=1):
    """Build a directed k-nearest-neighbour graph over the rows of ``X``.

    An HNSW index using cosine similarity is built over ``X`` and then queried
    with ``X`` itself, asking for ``neighbors + 1`` hits per row. The first hit
    of every result is dropped (presumably the query row itself) and every
    remaining hit becomes one edge ``row -> hit``.

    Parameters
    ----------
    X : array-like
        Data handed to ``addDataPointBatch``; must expose ``shape[0]``.
    neighbors : int
        Number of neighbours kept per row after dropping the first hit.
    threads : int
        Forwarded to ``knnQueryBatch`` as ``num_threads``.

    Returns
    -------
    igraph.Graph
        Directed graph with ``X.shape[0]`` vertices and one edge per kept hit.
    """
    # initialize HNSW index on Cosine Similarity
    nn_index = nmslib.init(method='hnsw', space='cosinesimil')
    nn_index.addDataPointBatch(X)
    nn_index.createIndex({'post': 2}, print_progress=True)

    # get nearest neighbours
    knn_results = nn_index.knnQueryBatch(
        X, k=(neighbors + 1), num_threads=threads)

    # Pair every kept hit with its own query row. Building the pairs directly
    # (instead of two flat lists of different per-row lengths) guarantees that
    # sources and targets can never drift out of step, even if a query returns
    # fewer hits than requested.
    edges = [
        (row, int(hit))
        for row, result in enumerate(knn_results)
        for hit in result[0][1:]
    ]

    # construct igraph
    nn_graph = igraph.Graph(directed=True)
    nn_graph.add_vertices(X.shape[0])
    nn_graph.add_edges(edges)
    return nn_graph
示例2: fit
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def fit(self, X):
    """Create the nmslib index for ``X``, loading it from disk if present.

    The data points are always added to a fresh index. If a file exists at
    ``self._index_name`` the index structure is loaded from it; otherwise it
    is built with ``self._index_param`` and optionally saved. Query-time
    parameters are applied last when ``self._query_param`` is set.
    """
    if self._method_name == 'vptree':
        # nmslib aborts the whole process ("The data size is too small or the
        # bucket size is too big") unless <total # of records> is NOT less
        # than <bucket size> * 1000, so derive the bucket size from the data.
        bucket_size = min(int(X.shape[0] * 0.0005), 1000)
        self._index_param.append('bucketSize=%d' % bucket_size)

    self._index = nmslib.init(
        space=self._nmslib_metric, method=self._method_name)
    self._index.addDataPointBatch(X)

    if not os.path.exists(self._index_name):
        self._index.createIndex(self._index_param)
        if self._save_index:
            self._index.saveIndex(self._index_name)
    else:
        print('Loading index from file')
        self._index.loadIndex(self._index_name)

    if self._query_param is not None:
        self._index.setQueryTimeParams(self._query_param)
示例3: fit
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def fit(self, Ciu, show_progress=True):
    """Train the base model, then build the enabled nmslib indexes.

    Depending on ``self.approximate_similar_items`` and
    ``self.approximate_recommend``, a cosine index over the item factors
    and/or an index over the augmented item factors is created.
    """
    # Quieten nmslib's logger before importing it: the first import is chatty.
    logging.getLogger('nmslib').setLevel(logging.WARNING)
    import nmslib

    # Fit the underlying model first; the indexes are built from its factors.
    super(NMSLibAlternatingLeastSquares, self).fit(Ciu, show_progress)

    if self.approximate_similar_items:
        log.debug("Building nmslib similar items index")
        self.similar_items_index = nmslib.init(
            method=self.method, space='cosinesimil')

        # Building a cosine index over vectors with zero norm is numerically
        # unstable, so those rows are simply left out of the index.
        norms = numpy.linalg.norm(self.item_factors, axis=1)
        ids = numpy.arange(self.item_factors.shape[0])
        zero_norm_rows = ids[norms == 0]
        item_factors = numpy.delete(self.item_factors, zero_norm_rows, axis=0)
        ids = ids[norms != 0]

        similar_index = self.similar_items_index
        similar_index.addDataPointBatch(item_factors, ids=ids)
        similar_index.createIndex(self.index_params,
                                  print_progress=show_progress)
        similar_index.setQueryTimeParams(self.query_params)

    if self.approximate_recommend:
        # The recommend methods need a separate index for the inner product,
        # built over the augmented item-factor matrix.
        log.debug("Building nmslib recommendation index")
        self.max_norm, extra = augment_inner_product_matrix(
            self.item_factors)
        self.recommend_index = nmslib.init(
            method='hnsw', space='cosinesimil')

        recommend_index = self.recommend_index
        recommend_index.addDataPointBatch(extra)
        recommend_index.createIndex(self.index_params,
                                    print_progress=show_progress)
        recommend_index.setQueryTimeParams(self.query_params)
示例4: __init__
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def __init__(self, M, efC, efS, num_neighbours, num_threads,
             space='cosine'):
    """Create an empty HNSW index and remember the tuning parameters.

    ``space`` is a user-facing name that is translated to the nmslib space
    name; only ``'cosine'`` is known, any other value raises ``KeyError``.
    """
    nmslib_space = {'cosine': 'cosinesimil'}[space]
    self.index = nmslib.init(method='hnsw', space=nmslib_space)
    self.M, self.efC, self.efS = M, efC, efS
    self.num_threads = num_threads
    self.num_neighbours = num_neighbours
示例5: knn_nmslib
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def knn_nmslib(feats, k, space='cosinesimil'):
    """Return the ``k`` nearest neighbours of every row of ``feats``.

    An HNSW index is built over ``feats`` in the given nmslib ``space`` and
    then queried with ``feats`` itself, using one query thread per CPU.
    The raw result of ``knnQueryBatch`` is returned.
    """
    hnsw = nmslib.init(method='hnsw', space=space)
    hnsw.addDataPointBatch(feats)
    hnsw.createIndex({'post': 2}, print_progress=True)
    n_threads = multiprocessing.cpu_count()
    return hnsw.knnQueryBatch(feats, k=k, num_threads=n_threads)
示例6: build_index
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def build_index(self,
                data: np.ndarray):
    """Build a dense-vector nmslib index over ``data`` and store it on self.

    The index is created with ``self._index_time_params``, configured with
    ``self._query_time_params``, and the query counter is reset to zero.
    """
    built = nmslib.init(
        method=self.method,
        space=self.space,
        data_type=nmslib.DataType.DENSE_VECTOR,
    )
    built.addDataPointBatch(data)
    built.createIndex(self._index_time_params, print_progress=False)
    built.setQueryTimeParams(self._query_time_params)

    self.index = built
    self.times_queried = 0
示例7: load_approximate_nearest_neighbours_index
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def load_approximate_nearest_neighbours_index(
    linker_paths: LinkerPaths, ef_search: int = 200,
) -> FloatIndex:
    """
    Load an approximate nearest neighbours index from disk.

    The tf-idf vectors are loaded as a float32 sparse matrix and added to a
    fresh sparse-cosine HNSW index before the stored index is loaded on top.

    Parameters
    ----------
    linker_paths: LinkerPaths, required.
        Contains the paths to the data required for the entity linker.
    ef_search: int, optional (default = 200)
        Controls speed performance at query time. Max value is 2000,
        but reducing to around ~100 will increase query speed by an order
        of magnitude for a small performance hit.
    """
    tfidf_file = cached_path(linker_paths.tfidf_vectors)
    concept_alias_tfidfs = scipy.sparse.load_npz(tfidf_file).astype(numpy.float32)

    ann_index = nmslib.init(
        method="hnsw",
        space="cosinesimil_sparse",
        data_type=nmslib.DataType.SPARSE_VECTOR,
    )
    ann_index.addDataPointBatch(concept_alias_tfidfs)

    index_file = cached_path(linker_paths.ann_index)
    ann_index.loadIndex(index_file)
    ann_index.setQueryTimeParams({"efSearch": ef_search})
    return ann_index
示例8: __init__
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def __init__(self,
             rank,
             fdim,
             sample_num,
             num_output,
             bias=False,
             interval=100,
             start_iter=0,
             midw='0',
             midb='1'):
    """Set up the sampler: parameter-client matrices and HNSW settings.

    A ``[num_output, fdim]`` matrix is registered under ``midw`` and, when
    ``bias`` is true, a ``[num_output, 1]`` matrix under ``midb``. The
    iteration counters all start at ``start_iter``.
    """
    super(HNSWSampler, self).__init__()
    self.rank, self.fdim = rank, fdim
    self.sample_num, self.num_output = sample_num, num_output
    self.full_cls = np.arange(self.num_output)

    # init param client
    self.client = ParameterClient(rank)
    self.midw, self.midb = midw, midb
    self.is_bias = bias
    weight_shape = [self.num_output, self.fdim]
    self.client.add_matrix(self.midw, weight_shape)
    if self.is_bias:
        bias_shape = [self.num_output, 1]
        self.client.add_matrix(self.midb, bias_shape)

    # init hnsw
    self.space = 'cosinesimil'
    # higher ef leads to better accuracy, but slower search;
    # higher M leads to higher accuracy/run_time at fixed ef, but consumes
    # more memory
    self.space_params = {'ef': 100, 'M': 16}

    self.interval = interval
    self.start_iter = start_iter
    self.iter = start_iter
    self.test_iter = start_iter
示例9: _update_hf
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def _update_hf(self):
    """Rebuild the HNSW index from the current weight rows when due.

    A rebuild happens when ``self.iter`` is a multiple of ``self.interval``
    or equals ``self.start_iter``; otherwise the method returns immediately.
    """
    on_interval = self.iter % self.interval == 0
    if not (on_interval or self.iter == self.start_iter):
        return

    weights = self.client.get_value_by_rows(self.midw, self.full_cls)
    self.hnsw = nmslib.init(method='hnsw',
                            space=self.space,
                            space_params=self.space_params)
    self.hnsw.addDataPointBatch(weights)
    # `post` represents postprocessing applied to the constructed graph.
    # The default value is 0, which means no postprocessing.
    # Additional options are 1 and 2 (2 means more postprocessing).
    build_params = {'post': 2}
    self.hnsw.createIndex(build_params, print_progress=True)
示例10: tsne
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def tsne(x, n_components=2, perplexity=30.0, early_exaggeration=12.0,
         learning_rate=200.0, n_iter=1000, n_iter_without_progress=300,
         min_grad_norm=1e-07, metric="euclidean", init="random", verbose=0,
         random_state=None, method="barnes_hut", angle=0.5):
    """Embed ``x`` with ``sklearn.manifold.TSNE`` and return the result.

    Every keyword argument is forwarded unchanged to the ``TSNE``
    constructor; the return value is that of ``fit_transform(x)``.
    """
    tsne_options = dict(
        n_components=n_components,
        perplexity=perplexity,
        early_exaggeration=early_exaggeration,
        learning_rate=learning_rate,
        n_iter=n_iter,
        n_iter_without_progress=n_iter_without_progress,
        min_grad_norm=min_grad_norm,
        metric=metric,
        init=init,
        verbose=verbose,
        random_state=random_state,
        method=method,
        angle=angle,
    )
    embedder = sklearn.manifold.TSNE(**tsne_options)
    return embedder.fit_transform(x)
示例11: _load_index
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def _load_index(self):
    """Create the three HNSW (L2, dense) indexes and fill them from disk.

    For each of ``primary``, ``secondary`` and ``bitmap`` a fresh index is
    created. If the index file exists, the matching frame is read from it
    with ``pd.read_hdf`` and fed through ``self._add_data``, which yields the
    index and its counter. Otherwise the frames are ``None`` and the
    counters are zero.
    """
    index_file = get_index_path(self.index_name)
    kinds = ('primary', 'secondary', 'bitmap')

    for kind in kinds:
        fresh_index = nmslib.init(
            method='hnsw', space='l2',
            data_type=nmslib.DataType.DENSE_VECTOR)
        setattr(self, kind, fresh_index)

    if not os.path.exists(index_file):
        # Nothing stored yet: no frames, empty counters.
        for kind in kinds:
            setattr(self, kind + '_df', None)
            setattr(self, kind + '_c', 0)
        return

    for kind in kinds:
        setattr(self, kind + '_df', pd.read_hdf(index_file, kind))
        filled_index, count = self._add_data(
            getattr(self, kind), getattr(self, kind + '_df'))
        setattr(self, kind, filled_index)
        setattr(self, kind + '_c', count)
示例12: search_hnsw_jaccard_topk
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def search_hnsw_jaccard_topk(index_data, query_data, index_params, k):
    """Run top-k Jaccard search over sets with an nmslib HNSW index.

    ``index_data`` and ``query_data`` are ``(sets, keys)`` pairs. Each set is
    passed to nmslib as a space-joined string of its elements. The candidates
    returned for a query are re-scored with ``compute_jaccard`` and sorted by
    that score, highest first.

    Returns ``(results, times)``: ``results`` holds one
    ``(query_key, [[index_key, jaccard], ...])`` entry per query and
    ``times`` the per-query duration in seconds, which covers the index
    lookup, the re-scoring and the sort.
    """
    index_sets, index_keys = index_data
    query_sets, query_keys = query_data

    def _encode(values):
        # nmslib's OBJECT_AS_STRING data type takes one string per object.
        return " ".join(str(v) for v in values)

    print("Building HNSW Index.")
    build_started = time.perf_counter()
    index = nmslib.init(method="hnsw", space="jaccard_sparse",
                        data_type=nmslib.DataType.OBJECT_AS_STRING)
    encoded_sets = [_encode(s) for s in index_sets]
    index.addDataPointBatch(encoded_sets, range(len(index_keys)))
    index.createIndex(index_params)
    build_elapsed = time.perf_counter() - build_started
    print("Indexing time: {:.3f}.".format(build_elapsed))

    print("Querying.")
    times = []
    results = []
    # The search-time ef is set to the same value as the construction-time ef.
    index.setQueryTimeParams({"efSearch": index_params["efConstruction"]})
    for query_set, query_key in zip(query_sets, query_keys):
        query_started = time.perf_counter()
        hits, _ = index.knnQuery(_encode(query_set), k)
        scored = sorted(
            ([index_keys[i], compute_jaccard(query_set, index_sets[i])]
             for i in hits),
            key=lambda pair: pair[1],
            reverse=True,
        )
        times.append(time.perf_counter() - query_started)
        results.append((query_key, scored))
        sys.stdout.write(f"\rQueried {len(results)} sets")
    sys.stdout.write("\n")
    return (results, times)
示例13: __init__
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def __init__(self, feats, k, index_path='', verbose=True, **kwargs):
    """Build or load an HNSW cosine index, then compute top-``k`` neighbours.

    If ``index_path`` names an existing file the index is loaded from it;
    otherwise it is built from ``feats`` and, when ``index_path`` is
    non-empty, saved there. The neighbours are read from
    ``index_path + '.npz'`` if that file exists, else queried from the index.
    The result is stored in ``self.knns``.
    """
    import nmslib
    self.verbose = verbose

    with Timer('[hnsw] build index', verbose):
        # higher ef leads to better accuracy, but slower search;
        # higher M leads to higher accuracy/run_time at fixed ef,
        # but consumes more memory
        index = nmslib.init(method='hnsw', space='cosinesimil')
        has_saved_index = index_path != '' and os.path.isfile(index_path)
        if has_saved_index:
            index.loadIndex(index_path)
        else:
            index.addDataPointBatch(feats)
            build_params = {'post': 2, 'indexThreadQty': 1}
            index.createIndex(build_params, print_progress=verbose)
            if index_path:
                print('[hnsw] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                index.saveIndex(index_path)

    with Timer('[hnsw] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        if not os.path.exists(knn_ofn):
            self.knns = index.knnQueryBatch(feats, k=k)
        else:
            print('[hnsw] read knns from {}'.format(knn_ofn))
            self.knns = np.load(knn_ofn)['data']
示例14: fit
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def fit(self, X, y=None) -> HNSW:
    """ Build the HNSW index over the training data.

    ``self.metric`` is normalised in place for the Euclidean family
    (to ``'euclidean'`` or ``'sqeuclidean'``) and mapped to the nmslib
    space stored in ``self.space``.

    Parameters
    ----------
    X: np.array
        Data to be indexed
    y: any
        Ignored

    Returns
    -------
    self: HNSW
        An instance of HNSW with a built graph

    Raises
    ------
    ValueError
        If ``self.metric`` is not one of the supported metric names.
    """
    X = check_array(X)

    requested_metric = self.metric
    if requested_metric in ('squared_euclidean', 'sqeuclidean'):
        self.metric, self.space = 'sqeuclidean', 'l2'
    elif requested_metric in ('euclidean', 'l2', 'minkowski'):
        self.metric, self.space = 'euclidean', 'l2'
    elif requested_metric in ('cosine', 'cosinesimil'):
        self.space = 'cosinesimil'
    else:
        raise ValueError(f'Invalid metric "{self.metric}". Please try "euclidean" or "cosine".')

    build_params = {
        'post': self.post_processing,
        'indexThreadQty': self.n_jobs,
    }
    hnsw_index = nmslib.init(method=self.method, space=self.space)
    hnsw_index.addDataPointBatch(X)
    hnsw_index.createIndex(build_params, print_progress=(self.verbose >= 2))

    self.index_ = hnsw_index
    self.n_samples_fit_ = len(self.index_)

    assert self.space in ['l2', 'cosinesimil'], f'Internal: self.space={self.space} not allowed'
    return self
示例15: nmslib_knn_with_zero_vectors
# 需要导入模块: import nmslib [as 别名]
# 或者: from nmslib import init [as 别名]
def nmslib_knn_with_zero_vectors(
    self, vectors: numpy.ndarray, k: int
) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """
    Query ``self.ann_index`` for neighbours while skipping empty vectors.

    ann_index.knnQueryBatch crashes if any of the vectors is all zeros.
    This function is a wrapper around `ann_index.knnQueryBatch` that solves
    this problem. It works as follows:
    - find the rows of `vectors` whose entries sum to zero ("empty" rows).
    - call `ann_index.knnQueryBatch` with the non-empty rows only.
    - write each result back at the position of its original row, leaving
      `None` in place of the empty rows.

    Parameters
    ----------
    vectors:
        2-D matrix of query vectors, one per row. Must support
        `sum(axis=1)`, `shape` and boolean-mask row selection.
    k:
        Number of neighbours requested per query.

    Returns
    -------
    (neighbors, distances): two 1-D object arrays of length
    `vectors.shape[0]`. Entry `i` is a list for non-empty rows and `None`
    for empty rows.
    """
    # True for rows that are NOT empty. Emptiness is judged by the row sum,
    # so a row whose entries cancel out to zero is treated as empty as well.
    non_empty_mask = numpy.array(vectors.sum(axis=1) != 0).reshape(-1)
    empty_vectors_count = vectors.shape[0] - int(non_empty_mask.sum())
    if self.verbose:
        print(f"Number of empty vectors: {empty_vectors_count}")

    # numpy.empty with dtype=object yields arrays pre-filled with None.
    extended_neighbors = numpy.empty((len(non_empty_mask),), dtype=object)
    extended_distances = numpy.empty((len(non_empty_mask),), dtype=object)

    if vectors.shape[0] - empty_vectors_count == 0:
        return extended_neighbors, extended_distances

    # remove empty vectors before calling `ann_index.knnQueryBatch`
    vectors = vectors[non_empty_mask]
    original_neighbours = self.ann_index.knnQueryBatch(vectors, k=k)

    # Store each result list directly in its original row. Assigning element
    # by element keeps every entry a plain list and avoids building a ragged
    # numpy array, which newer NumPy versions reject with a ValueError.
    non_empty_rows = numpy.flatnonzero(non_empty_mask)
    for row, hit in zip(non_empty_rows, original_neighbours):
        extended_neighbors[row] = hit[0].tolist()
        extended_distances[row] = hit[1].tolist()

    return extended_neighbors, extended_distances