This article collects typical usage examples of the Python method faiss.write_index. If you have been wondering what faiss.write_index does, how to call it, and what idiomatic usage looks like, the curated examples below should help. They also illustrate the surrounding faiss module.
The following shows 13 code examples of faiss.write_index, ordered by popularity by default.
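Before the examples, here is a minimal round-trip sketch of what faiss.write_index does: it serializes an index (a CPU index; GPU indexes must be converted first) to a file that faiss.read_index can load back later. The file name demo.index and the toy data are illustrative only.

import faiss
import numpy as np

d = 64                                      # toy dimensionality
xb = np.random.random((1000, d)).astype('float32')

index = faiss.IndexFlatL2(d)                # exact L2 index
index.add(xb)                               # store the vectors
faiss.write_index(index, "demo.index")      # serialize to disk

index2 = faiss.read_index("demo.index")     # load it back
D, I = index2.search(xb[:5], 4)             # distances and ids of 4 nearest neighbors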
Example 1: main
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def main():
    f = h5py.File(opt.states, "r")
    data = f[opt.data]
    seqs, slens, hid = data.shape
    print("Processing {} Sequences".format(seqs))
    print("with {} tokens each".format(slens))
    print("and {} states".format(hid))
    # Initialize a new index
    index = faiss.IndexFlatIP(hid)
    # Fill it in chunks of opt.stepsize sequences; the slice clips at the end,
    # so the final partial chunk is included as well
    for ix in tqdm(range(0, seqs, opt.stepsize)):
        cdata = np.array(data[ix:ix + opt.stepsize].reshape(-1, hid), dtype="float32")
        index.add(cdata)
    f.close()
    faiss.write_index(index, opt.output)
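One caveat worth noting with Example 1: IndexFlatIP ranks by raw inner product. If cosine similarity over the hidden states is intended, the vectors should be L2-normalized before being added (and before each query). A hedged sketch with toy shapes, using faiss.normalize_L2:

import faiss
import numpy as np

x = np.random.random((100, 512)).astype('float32')
faiss.normalize_L2(x)               # in-place, row-wise normalization
index = faiss.IndexFlatIP(512)
index.add(x)                        # inner product of unit vectors == cosine similarity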
Example 2: train_coarse_quantizer
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def train_coarse_quantizer(data, quantizer_path, num_clusters, hnsw=False, niter=10, cuda=False):
    d = data.shape[1]
    index_flat = faiss.IndexFlatL2(d)
    # make it into a gpu index
    if cuda:
        res = faiss.StandardGpuResources()
        index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, index_flat)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(num_clusters, d)

    if hnsw:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
        quantizer.add(centroids)
    else:
        quantizer = faiss.IndexFlatL2(d)
        quantizer.add(centroids)

    faiss.write_index(quantizer, quantizer_path)
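The faiss.Clustering object above gives fine-grained control, but for the common case faiss also ships a higher-level faiss.Kmeans wrapper that manages the flat index internally. A sketch of equivalent coarse-quantizer training; the data shape, cluster count, and file name are illustrative:

import faiss
import numpy as np

data = np.random.random((20000, 128)).astype('float32')
kmeans = faiss.Kmeans(d=128, k=1024, niter=10, verbose=True, gpu=False)
kmeans.train(data)

quantizer = faiss.IndexFlatL2(128)
quantizer.add(kmeans.centroids)             # centroids already have shape (k, d)
faiss.write_index(quantizer, "quantizer.index")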
Example 3: train_index
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        # QT_8bit: one 8-bit scalar quantizer per dimension
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal,
                                                      faiss.ScalarQuantizer.QT_8bit,
                                                      faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; training on CPU.')
            trained_index.train(data)
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)

    faiss.write_index(trained_index, trained_index_path)
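After train_index runs, the on-disk index is trained but still empty; vectors are added in a later stage. A hedged usage sketch for populating and querying it (paths, sizes, and the nprobe value are illustrative):

import faiss
import numpy as np

index = faiss.read_index("trained.index")       # output of train_index above
xb = np.random.random((50000, index.d)).astype('float32')
index.add(xb)                                   # vectors get assigned to the trained IVF lists
index.nprobe = 64                               # lists visited per query (speed/recall knob)
D, I = index.search(xb[:10], 5)
faiss.write_index(index, "populated.index")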
Example 4: save
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def save(self, path: str) -> None:
    r"""Save the index and the meta data in the ``path`` directory. They
    will be saved as ``index.faiss`` and ``index.meta_data`` respectively
    inside the ``path`` directory.

    Args:
        path (str): Path to the directory where the index will be saved.
    """
    if os.path.exists(path):
        logging.warning("%s directory already exists. Index will be "
                        "saved into the existing directory", path)
    else:
        os.makedirs(path)

    cpu_index = faiss.index_gpu_to_cpu(self._index) \
        if self._index.__class__.__name__.startswith("Gpu") else self._index
    faiss.write_index(cpu_index, f"{path}/index.faiss")
    with open(f"{path}/index.meta_data", "wb") as f:
        pickle.dump(self._meta_data, f)
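A matching loader is not shown in the snippet; a minimal sketch of what it might look like follows (moving the index back to GPU, if desired, would go through faiss.index_cpu_to_gpu again):

import os
import pickle
import faiss

def load(path: str):
    """Counterpart sketch to save(): returns (index, meta_data)."""
    index = faiss.read_index(os.path.join(path, "index.faiss"))
    with open(os.path.join(path, "index.meta_data"), "rb") as f:
        meta_data = pickle.load(f)
    return index, meta_data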
Example 5: faiss_train
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def faiss_train(fn_feature, root_path, index_path='train.index', id_path='data.json'):
    folder_names = os.listdir(root_path)
    logging.info('directory %s ', folder_names)
    ids = None
    vals = None
    id_json = {}
    print(folder_names)
    for idx, folder_name in enumerate(folder_names):
        id_json[str(idx)] = folder_name
        now_path = os.path.join(root_path, folder_name)
        feature_val = fn_feature(now_path)
        vals = np.concatenate((feature_val, vals), axis=0) if vals is not None else feature_val
        id_np = np.asarray([idx] * feature_val.shape[0])
        ids = np.concatenate((id_np, ids), axis=0) if ids is not None else id_np

    N, dim = vals.shape
    x = int(2 * math.sqrt(N))  # rule-of-thumb number of IVF lists
    index_description = "IVF{x},Flat".format(x=x)
    # hard-coded feature dimension (7*7*512); must match what fn_feature returns
    index = faiss.index_factory(7 * 7 * 512, index_description, faiss.METRIC_INNER_PRODUCT)
    index.train(vals)
    index.add_with_ids(vals, ids)
    faiss.write_index(index, index_path)
    with open(id_path, 'w', encoding='utf-8') as f:
        json.dump(id_json, f, ensure_ascii=False, indent=4)
    print(id_json)
    return index, id_json
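Because the index stores an explicit id per folder, search results can be mapped back to folder names through data.json. A hedged query sketch; the random query vector stands in for a real 7*7*512 feature:

import json
import faiss
import numpy as np

index = faiss.read_index('train.index')
with open('data.json', encoding='utf-8') as f:
    id_json = json.load(f)

query = np.random.random((1, 7 * 7 * 512)).astype('float32')
sims, labels = index.search(query, 5)
names = [id_json[str(l)] for l in labels[0] if l != -1]   # -1 marks empty result slots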
Example 6: _store_index
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def _store_index(ctx, op, index, device_id):
    return_index_type = _get_index_type(op.return_index_type, ctx)

    if return_index_type == 'object':
        # no need to serialize
        return index
    elif return_index_type == 'filename':
        # save to file, then return the filename
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)

        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        faiss.write_index(index, fn)
        atexit.register(lambda: os.remove(fn))
        return fn
    else:
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)

        # distributed: save to file, then return the in-memory bytes
        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        faiss.write_index(index, fn)
        try:
            with open(fn, 'rb') as f:
                return f.read()
        finally:
            os.remove(fn)
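Recent faiss releases also expose faiss.serialize_index / faiss.deserialize_index, which would avoid the temp-file round trip in the bytes branch above. A sketch, assuming a faiss version that provides these helpers:

import faiss
import numpy as np

index = faiss.IndexFlatL2(32)
index.add(np.random.random((10, 32)).astype('float32'))

buf = faiss.serialize_index(index)        # numpy uint8 array
raw = buf.tobytes()                       # plain bytes, ready to ship
restored = faiss.deserialize_index(np.frombuffer(raw, dtype=np.uint8))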
Example 7: do_indexing
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        index.add(word2vec_model.wv.syn0norm)
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH)
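A query against this index goes through the same normalized vectors. A sketch using the older gensim 3.x API that the snippet itself relies on (syn0norm); model stands for the word2vec model passed in above:

import numpy as np

index = do_indexing(word2vec_model=model)
vec = model.wv.word_vec('king', use_norm=True).astype('float32')
sims, idxs = index.search(np.expand_dims(vec, 0), 10)
neighbors = [model.wv.index2word[i] for i in idxs[0]]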
Example 8: serialize
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def serialize(self, index_file: str):
    logger.info("Serializing index to %s", index_file)
    faiss.write_index(self.index, index_file)
Example 9: save_indexes
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def save_indexes(idxs, outdir, base_name=LAYER_TEMPLATE):
    """Save each faiss index in idxs to its own file."""
    base_dir = Path(outdir)
    base_dir.mkdir(exist_ok=True, parents=True)
    out_name = str(base_dir / base_name)
    for i, idx in enumerate(idxs):
        name = out_name.format(i)
        print(f"Saving to {name}")
        faiss.write_index(idx, name)
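A counterpart loader sketch, assuming the same LAYER_TEMPLATE naming and contiguous numbering from 0:

from pathlib import Path
import faiss

def load_indexes(outdir, base_name=LAYER_TEMPLATE):
    """Read the per-layer indexes back, in the order they were saved."""
    out_name = str(Path(outdir) / base_name)
    idxs = []
    i = 0
    while Path(out_name.format(i)).exists():
        idxs.append(faiss.read_index(out_name.format(i)))
        i += 1
    return idxs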
Example 10: remove_doc_ids
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def remove_doc_ids(args):
    if os.path.isdir(args.subindex_dir):
        names = os.listdir(args.subindex_dir)
        index_names = [name for name in names if name.endswith('.faiss')]
        index_paths = [os.path.join(args.subindex_dir, name) for name in index_names]
        target_paths = [os.path.join(args.target_dir, name) for name in index_names]
        idx2id_paths = [path.replace('.faiss', '.hdf5') for path in index_paths]
        if not os.path.exists(args.target_dir):
            os.makedirs(args.target_dir)

        with open(args.ignore_path, 'r') as fp:
            ignore_counter = json.load(fp)
        count = sum(ignore_counter.values())
        th = count * args.ratio
        ignores = [int(key) for key, val in ignore_counter.items() if val > th]
        print('thresholding at %.1f, removing following document ids:' % th)
        for ignore in ignores:
            print(ignore)

        for idx2id_path, index_path, target_path in zip(idx2id_paths, tqdm(index_paths), target_paths):
            with h5py.File(idx2id_path, 'r') as f:
                doc_ids = f['doc'][:]
                offset = f.attrs['offset']
            idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
            if len(idxs) > 0:
                idxs = idxs + offset
                print('found %d ids to remove' % len(idxs))
                index = faiss.read_index(index_path)
                index.remove_ids(idxs)
                faiss.write_index(index, target_path)
            else:
                print('no ids to remove in %s' % index_path)
    else:
        index_path = args.subindex_dir
        target_path = args.target_dir
        idx2id_path = args.subindex_dir.replace('index.faiss', 'idx2id.hdf5')
        with open(args.ignore_path, 'r') as fp:
            ignores = np.array(list(map(int, json.load(fp))))
        with h5py.File(idx2id_path, 'r') as f:
            for offset, group in f.items():
                doc_ids = group['doc'][:]
                offset = int(offset)
                idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
                if len(idxs) > 0:
                    idxs = idxs + offset
                    print(idxs)
                    index = faiss.read_index(index_path)
                    index.remove_ids(idxs)
                    faiss.write_index(index, target_path)
                else:
                    print('no ids to remove at offset %d' % offset)
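The remove_ids call is the core of this example. A minimal self-contained demo of its semantics, on toy data, using IndexIDMap so that external ids are meaningful:

import faiss
import numpy as np

index = faiss.IndexIDMap(faiss.IndexFlatL2(16))
vecs = np.random.random((5, 16)).astype('float32')
index.add_with_ids(vecs, np.arange(100, 105).astype(np.int64))
index.remove_ids(np.array([101, 103], dtype=np.int64))   # drop two ids
print(index.ntotal)                                      # 3 vectors remain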
Example 11: merge_indexes
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=g['doc'])
                    group.create_dataset('para', data=g['para'])
                    group.create_dataset('word', data=g['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # IO_FLAG_MMAP avoids actually loading the data, so the total size
        # of the inverted lists can exceed the available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        # prevent the invlists from being deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists; they will be written to target_inv_path
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merging %d inverted lists" % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
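The merged index references the inverted lists in the .ivfdata file rather than embedding them, so it can be opened with memory mapping to keep RAM usage flat. A hedged usage sketch for the output of merge_indexes (the nprobe value is illustrative):

import faiss

index = faiss.read_index(target_index_path, faiss.IO_FLAG_MMAP)
index.nprobe = 64
# searches now stream list data from disk on demand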
Example 12: run_index
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names
                      if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio,
                                         vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf,
                                         num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio,
                                      vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros,
                                      norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path,
                        fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para, num_dummy_zeros=args.num_dummy_zeros,
                         cuda=args.cuda, num_docs_per_add=args.num_docs_per_add,
                         offset=args.offset, norm_th=args.norm_th, fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path,
                          args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
Example 13: __init__
# Required import: import faiss [as alias]
# Or: from faiss import write_index [as alias]
def __init__(self,
             feats,
             k,
             index_path='',
             index_key='',
             nprobe=128,
             omp_num_threads=None,
             rebuild_index=True,
             verbose=True,
             **kwargs):
    import faiss
    if omp_num_threads is not None:
        faiss.omp_set_num_threads(omp_num_threads)
    self.verbose = verbose

    with Timer('[faiss] build index', verbose):
        if index_path != '' and not rebuild_index and os.path.exists(index_path):
            print('[faiss] read index from {}'.format(index_path))
            index = faiss.read_index(index_path)
        else:
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            if index_key != '':
                assert index_key.find('HNSW') < 0, 'HNSW returns distances instead of sims'
                metric = faiss.METRIC_INNER_PRODUCT
                nlist = min(4096, 8 * round(math.sqrt(size)))
                if index_key == 'IVF':
                    quantizer = index
                    index = faiss.IndexIVFFlat(quantizer, dim, nlist, metric)
                else:
                    index = faiss.index_factory(dim, index_key, metric)
                if index_key.find('Flat') < 0:
                    assert not index.is_trained
                index.train(feats)
                index.nprobe = min(nprobe, nlist)
                assert index.is_trained
                print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
            index.add(feats)
            if index_path != '':
                print('[faiss] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                faiss.write_index(index, index_path)

    with Timer('[faiss] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        if os.path.exists(knn_ofn):
            print('[faiss] read knns from {}'.format(knn_ofn))
            self.knns = np.load(knn_ofn)['data']
        else:
            sims, nbrs = index.search(feats, k=k)
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]
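The constructor looks for a cached index_path + '.npz' before searching, but the snippet itself never writes that cache. A sketch of the missing save step, placed inside the same __init__ after the search branch; it assumes every row returns exactly k neighbors so the list stacks into a regular (N, 2, k) array that np.load can read back without pickling:

import os
import numpy as np

# hypothetical addition inside __init__, after self.knns is computed
knn_ofn = index_path + '.npz'
if not os.path.exists(knn_ofn):
    np.savez(knn_ofn, data=np.array(self.knns))   # later runs hit the np.load branch above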