This article collects typical usage examples of the faiss.read_index method in Python. If you have been wondering what exactly faiss.read_index does, how to call it, or where to find working examples, the curated code samples below should help. You can also explore further usage examples from the faiss module itself.
The 14 code examples of faiss.read_index shown below are ordered by popularity by default.
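Before diving into the examples, here is a minimal, self-contained round-trip sketch of writing an index to disk and reading it back with faiss.read_index (the index type, dimension, and file name are arbitrary choices for illustration):

import faiss
import numpy as np

d = 64                                           # vector dimension (arbitrary)
xb = np.random.rand(1000, d).astype('float32')   # toy database vectors
index = faiss.IndexFlatL2(d)                     # exact L2 index
index.add(xb)
faiss.write_index(index, 'toy.index')            # persist the index to disk
index2 = faiss.read_index('toy.index')           # load it back
D, I = index2.search(xb[:5], 3)                  # sanity-check query: top-3 neighbors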
Example 1: _load_index
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def _load_index(ctx, op, index, device_id):
    return_index_type = _get_index_type(op.return_index_type, ctx)

    if return_index_type == 'object':
        # local
        return index
    elif return_index_type == 'filename':
        # local cluster
        return faiss.read_index(index)
    else:
        # distributed
        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        with open(fn, 'wb') as f:
            f.write(index)

        index = faiss.read_index(f.name)
        if device_id >= 0:  # pragma: no cover
            index = _index_to_gpu(index, device_id)
        return index
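As an aside (not part of the original example), newer faiss releases also expose faiss.serialize_index and faiss.deserialize_index, which can pass an index around as an in-memory buffer instead of going through a temporary file; a minimal sketch:

import faiss
import numpy as np

d = 32
index = faiss.IndexFlatL2(d)
index.add(np.random.rand(10, d).astype('float32'))
buf = faiss.serialize_index(index)        # the index as a numpy uint8 buffer
restored = faiss.deserialize_index(buf)   # rebuild an in-memory index from the buffer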
Example 2: __init__
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def __init__(self, phrase_dump_dir, start_index_path, idx2id_path, max_answer_length, para=False,
             num_dummy_zeros=0, cuda=False):
    if os.path.isdir(phrase_dump_dir):
        self.phrase_dump_paths = sorted(
            [os.path.join(phrase_dump_dir, name) for name in os.listdir(phrase_dump_dir) if 'hdf5' in name])
        dump_names = [os.path.splitext(os.path.basename(path))[0] for path in self.phrase_dump_paths]
        self.dump_ranges = [list(map(int, name.split('-'))) for name in dump_names]
    else:
        self.phrase_dump_paths = [phrase_dump_dir]
    self.phrase_dumps = [h5py.File(path, 'r') for path in self.phrase_dump_paths]
    self.max_answer_length = max_answer_length
    self.para = para

    print('reading %s' % start_index_path)
    self.start_index = faiss.read_index(start_index_path, faiss.IO_FLAG_ONDISK_SAME_DIR)
    self.idx_f = self.load_idx_f(idx2id_path)
    self.has_offset = 'doc' not in self.idx_f
    # with h5py.File(idx2id_path, 'r') as f:
    #     self.idx2doc_id = f['doc'][:]
    #     self.idx2para_id = f['para'][:]
    #     self.idx2word_id = f['word'][:]
    self.num_dummy_zeros = num_dummy_zeros
    self.cuda = cuda
Example 3: train_index
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal, faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
            trained_index.train(data)  # fall back to training on the CPU
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)

    faiss.write_index(trained_index, trained_index_path)
Example 4: do_indexing
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        index.add(word2vec_model.wv.syn0norm)
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH)
Example 5: __init__
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def __init__(self, file_name, dim_vector=500, sentence_max_len=50):
    self.u = faiss.read_index(file_name)  # type: faiss.Index
    self.sentence_max_length = sentence_max_len
Example 6: deserialize_from
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def deserialize_from(self, index_file: str):
    logger.info("Loading index from %s", index_file)
    self.index = faiss.read_index(index_file)
    logger.info(
        "Loaded index of type %s and size %d", type(self.index), self.index.ntotal
    )

# DenseFlatIndexer does exact search
Example 7: __init_indexes
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def __init_indexes(self):
    for fname in self.base_dir.glob(self.pattern):
        print(fname)
        idx = fname.stem.split('_')[-1]
        self.indexes[int(idx)] = faiss.read_index(str(fname))
Example 8: load
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def load(self, path: str, device: Optional[str] = None) -> None:
    r"""Load the index and meta data from ``path`` directory.

    Args:
        path (str): A path to the directory to load the index from.
        device (optional str): Device to load the index into. If None,
            value will be picked from hyperparameters.
    """
    if not os.path.exists(path):
        raise ValueError(f"Failed to load the index. {path} "
                         f"does not exist.")

    cpu_index = faiss.read_index(f"{path}/index.faiss")

    if device is None:
        device = self._config.device

    if device.lower().startswith("gpu"):
        gpu_resource = faiss.StandardGpuResources()
        gpu_id = int(device[3:])
        if faiss.get_num_gpus() < gpu_id:
            gpu_id = 0
            logging.warning("Cannot create the index on device %s. "
                            "Total number of GPUs on this machine is "
                            "%s. Using the gpu0 for the index.",
                            device, faiss.get_num_gpus())
        self._index = faiss.index_cpu_to_gpu(gpu_resource, gpu_id, cpu_index)
    else:
        self._index = cpu_index

    with open(f"{path}/index.meta_data", "rb") as f:
        self._meta_data = pickle.load(f)
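For context, the directory layout that load expects could be produced by a save step along these lines (a hedged sketch; the directory name, toy vectors, and meta-data contents are made up for illustration):

import os
import pickle

import faiss
import numpy as np

path = "./index_dir"                                   # assumed output directory
os.makedirs(path, exist_ok=True)
index = faiss.IndexFlatL2(16)
index.add(np.random.rand(100, 16).astype("float32"))
faiss.write_index(index, f"{path}/index.faiss")        # file name expected by load()
with open(f"{path}/index.meta_data", "wb") as f:
    pickle.dump({i: f"passage-{i}" for i in range(100)}, f)   # placeholder meta data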
Example 9: calculate
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def calculate(self, images):
    predicted = []
    index = faiss.read_index(self.index_path)
    with open(self.id_path) as f:
        id_json = json.load(f)
    logging.info('database load')
    imgs = self.feature.get_feature(images)
    D, I = index.search(imgs, k=1)
    for p in I:
        predicted.append(id_json[str(p[0])])
    return predicted
Example 10: main
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def main(args):
    index = read_index(str(args.index.with_suffix(".idx")))
    index.nprobe = args.num_probes

    with args.index.with_suffix(".json").open() as fp:
        metadata = json.load(fp)

    def query(batch, n):
        feats = np.frombuffer(batch.data, dtype=np.float32)
        feats = rearrange(feats, "(n d) -> n d", d=args.dimension)

        assert len(feats.shape) == 2
        assert feats.shape[1] == args.dimension
        assert feats.dtype == np.float32

        dists, indices = index.search(feats, n)
        meta = [[metadata[i] for i in batch] for batch in indices]
        return dists.tolist(), indices.tolist(), meta

    with SimpleXMLRPCServer((args.host, args.port), logRequests=False) as server:
        server.register_function(query)

        try:
            print("⏳ Waiting for similarity calls on {}:{}".format(args.host, args.port), file=sys.stderr)
            server.serve_forever()
        except KeyboardInterrupt:
            print("\n⌛ Done", file=sys.stderr)
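A hedged client-side sketch for the XML-RPC server above (host, port, feature dimension, and the number of neighbors are assumptions and must match the server's arguments):

import xmlrpc.client

import numpy as np

dimension = 128                                           # must equal args.dimension on the server
feats = np.random.rand(4, dimension).astype(np.float32)   # toy query features
proxy = xmlrpc.client.ServerProxy("http://localhost:8080")
dists, indices, meta = proxy.query(xmlrpc.client.Binary(feats.tobytes()), 5)
print(indices)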
Example 11: remove_doc_ids
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def remove_doc_ids(args):
    if os.path.isdir(args.subindex_dir):
        names = os.listdir(args.subindex_dir)
        index_names = [name for name in names if name.endswith('.faiss')]
        index_paths = [os.path.join(args.subindex_dir, name) for name in index_names]
        target_paths = [os.path.join(args.target_dir, name) for name in index_names]
        idx2id_paths = [path.replace('.faiss', '.hdf5') for path in index_paths]
        if not os.path.exists(args.target_dir):
            os.makedirs(args.target_dir)
        with open(args.ignore_path, 'r') as fp:
            ignore_counter = json.load(fp)
        count = sum(ignore_counter.values())
        th = count * args.ratio
        ignores = [int(key) for key, val in ignore_counter.items() if val > th]
        print('thresholding at %.1f, removing following document ids:' % th)
        for ignore in ignores:
            print(ignore)
        for idx2id_path, index_path, target_path in zip(idx2id_paths, tqdm(index_paths), target_paths):
            with h5py.File(idx2id_path, 'r') as f:
                doc_ids = f['doc'][:]
                offset = f.attrs['offset']
            idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
            if len(idxs) > 0:
                idxs = idxs + offset
                print('found %d ids to remove' % len(idxs))
                index = faiss.read_index(index_path)
                index.remove_ids(idxs)
                faiss.write_index(index, target_path)
            else:
                print('no ignore list found at %s' % index_path)
    else:
        index_path = args.subindex_dir
        target_path = args.target_dir
        idx2id_path = args.subindex_dir.replace('index.faiss', 'idx2id.hdf5')
        with open(args.ignore_path, 'r') as fp:
            ignores = np.array(list(map(int, json.load(fp))))
        with h5py.File(idx2id_path, 'r') as f:
            for offset, group in f.items():
                doc_ids = group['doc'][:]
                offset = int(offset)
                idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
                if len(idxs) > 0:
                    idxs = idxs + offset
                    print(idxs)
                    index = faiss.read_index(index_path)
                    index.remove_ids(idxs)
                    faiss.write_index(index, target_path)
                else:
                    print('no ignore list found at %d' % offset)
Example 12: merge_indexes
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=in_['doc'])
                    group.create_dataset('para', data=in_['para'])
                    group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # IO_FLAG_MMAP avoids actually loading the data, so the total size
        # of the inverted lists can exceed the available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        # prevent the invlists from being deallocated together with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists; they will be written to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists" % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
Example 13: run_index
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio,
                                         vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf,
                                         num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio,
                                      vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros, norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path, fine_quant=args.fine_quant,
                        cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para, num_dummy_zeros=args.num_dummy_zeros,
                         cuda=args.cuda, num_docs_per_add=args.num_docs_per_add, offset=args.offset,
                         norm_th=args.norm_th, fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path, args.idx2id_path,
                          args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
Example 14: __init__
# Required import: import faiss [as alias]
# Or: from faiss import read_index [as alias]
def __init__(self,
             feats,
             k,
             index_path='',
             index_key='',
             nprobe=128,
             omp_num_threads=None,
             rebuild_index=True,
             verbose=True,
             **kwargs):
    import faiss
    if omp_num_threads is not None:
        faiss.omp_set_num_threads(omp_num_threads)
    self.verbose = verbose

    with Timer('[faiss] build index', verbose):
        if index_path != '' and not rebuild_index and os.path.exists(index_path):
            print('[faiss] read index from {}'.format(index_path))
            index = faiss.read_index(index_path)
        else:
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            if index_key != '':
                assert index_key.find('HNSW') < 0, 'HNSW returns distances instead of sims'
                metric = faiss.METRIC_INNER_PRODUCT
                nlist = min(4096, 8 * round(math.sqrt(size)))
                if index_key == 'IVF':
                    quantizer = index
                    index = faiss.IndexIVFFlat(quantizer, dim, nlist, metric)
                else:
                    index = faiss.index_factory(dim, index_key, metric)
                if index_key.find('Flat') < 0:
                    assert not index.is_trained
                index.train(feats)
                index.nprobe = min(nprobe, nlist)
                assert index.is_trained
                print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
            index.add(feats)
            if index_path != '':
                print('[faiss] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                faiss.write_index(index, index_path)

    with Timer('[faiss] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        if os.path.exists(knn_ofn):
            print('[faiss] read knns from {}'.format(knn_ofn))
            self.knns = np.load(knn_ofn)['data']
        else:
            sims, nbrs = index.search(feats, k=k)
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]