本文整理汇总了 Python 中 datasketch.MinHash 类的典型用法代码示例。如果您正苦于以下问题：Python datasketch.MinHash 的具体用法？Python datasketch.MinHash 怎么用？Python datasketch.MinHash 使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 datasketch 的用法示例。
在下文中一共展示了 datasketch.MinHash 类的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: similarity
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def similarity(self, other_doc, metric='jaccard', hash_method='minhash'):
    """
    Return the similarity between this document and ``other_doc``.

    Only ``metric='jaccard'`` combined with ``hash_method='minhash'`` is
    supported. The score is obtained by wrapping the ``minhash`` attribute of
    each document in a ``MinHash`` and calling ``jaccard`` on the pair.

    >>> from textpipe.doc import Doc
    >>> doc1 = Doc('Sentence for computing the minhash')
    >>> doc2 = Doc('Sentence for computing the similarity')
    >>> doc1.similarity(doc2)
    0.7265625

    Raises:
        NotImplementedError: for any other metric / hash method combination.
    """
    # Guard clause: reject unsupported combinations before touching any hashes.
    if (metric, hash_method) != ('jaccard', 'minhash'):
        raise NotImplementedError(f'Metric/hash method combination {metric}'
                                  f'/{hash_method} is not implemented as similarity metric')
    own_signature = MinHash(hashvalues=self.minhash)
    other_signature = MinHash(hashvalues=other_doc.minhash)
    return own_signature.jaccard(other_signature)
示例2: priv_build_content_sim
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def priv_build_content_sim(self, threshold):
    """
    Build the MinHash-based content-similarity index and store it on ``self``.

    Every ``(nid, signature)`` pair returned by
    ``self.store_client.get_all_mh_text_signatures()`` is wrapped in a
    512-permutation ``MinHash`` and inserted into a ``MinHashLSH`` index.
    The finished index is assigned to ``self.content_sim_index``. Timing
    information is printed to stdout.

    :param threshold: value forwarded to ``MinHashLSH(threshold=...)``.
    """
    start_text_sig_sim = time.time()

    # Time the signature fetch from the store on its own.
    st = time.time()
    mh_signatures = self.store_client.get_all_mh_text_signatures()
    et = time.time()
    print("Time to extract minhash signatures from store: {0}".format(str(et - st)))
    print("!!3 " + str(et - st))

    content_index = MinHashLSH(threshold=threshold, num_perm=512)
    # Rebuild a MinHash around each stored signature and index it. The objects
    # are not kept anywhere else: only the LSH index is needed afterwards.
    for nid, mh_sig in mh_signatures:
        mh_obj = MinHash(num_perm=512)
        mh_obj.hashvalues = np.asarray(mh_sig, dtype=int)
        content_index.insert(nid, mh_obj)

    end_text_sig_sim = time.time()
    print("Total text-sig-sim (minhash): {0}".format(str(end_text_sig_sim - start_text_sig_sim)))
    print("!!4 " + str(end_text_sig_sim - start_text_sig_sim))
    self.content_sim_index = content_index
示例3: test_hash
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def test_hash(self):
    """LeanMinHash snapshots of the same state hash equally and work as dict keys."""
    m = MinHash(hashfunc=fake_hash_func)
    for value in (11, 123, 92, 98, 123218, 32):
        m.update(value)

    # Two snapshots of the same MinHash state must produce the same hash.
    first = LeanMinHash(m)
    second = LeanMinHash(m)
    self.assertEqual(hash(first), hash(second))

    # After one more update, a new snapshot must hash differently.
    m.update(444)
    third = LeanMinHash(m)
    self.assertNotEqual(hash(first), hash(third))

    # An equal snapshot must find the entry stored under the first one.
    lookup = {first: True}
    self.assertTrue(lookup[second])
示例4: find_minhash
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def find_minhash(self, num_perm=128):
    """
    Build the MinHash signature of the document from ``self.words``.

    Each item of ``self.words`` is unpacked as a pair; only its first element
    (the word) is UTF-8 encoded and fed into the hash. The digest is returned
    as a plain list.

    NOTE(review): the previous docstring said the result is "cached"; no
    caching is visible in this block -- presumably a decorator outside it
    provides that. Confirm.
    """
    # The generator binds ``self.words`` here, before the MinHash is created.
    encoded_words = (word.encode('utf8') for word, _ in self.words)
    signature = MinHash(num_perm=num_perm)
    for token in encoded_words:
        signature.update(token)
    digest = signature.digest()
    return list(digest)
示例5: build_content_sim_mh_text
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def build_content_sim_mh_text(network, mh_signatures):
    """
    Index MinHash text signatures and link similar nodes in ``network``.

    Each ``(nid, signature)`` pair in ``mh_signatures`` is turned into a
    512-permutation ``MinHash`` and inserted into a ``MinHashLSH`` index with
    threshold 0.7. Once everything is indexed, each signature is queried
    against the index. For every returned id other than the querying node, a
    ``Relation.CONTENT_SIM`` relation with score 1 is added to ``network``.

    Returns the populated ``MinHashLSH`` index.
    """
    lsh_index = MinHashLSH(threshold=0.7, num_perm=512)

    # First pass: materialize the MinHash objects and index all of them.
    indexed = []
    for node_id, signature in mh_signatures:
        sketch = MinHash(num_perm=512)
        sketch.hashvalues = np.asarray(signature, dtype=int)
        lsh_index.insert(node_id, sketch)
        indexed.append((node_id, sketch))

    # Second pass: query each signature and connect it to every other match.
    for node_id, sketch in indexed:
        for match_id in lsh_index.query(sketch):
            if match_id == node_id:
                continue
            network.add_relation(node_id, match_id, Relation.CONTENT_SIM, 1)
    return lsh_index
示例6: compare_content_signatures
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def compare_content_signatures(self, kr_name, signatures):
    """
    Match class signatures against the content-similarity index.

    For each ``(class_name, signature)`` pair, a 512-permutation ``MinHash``
    is built and queried against ``self.content_sim_index``. Every hit is
    resolved through ``self.network.get_info_for`` and recorded as
    ``((db_name, source_name, field_name), (kr_name, class_name))``.

    Returns the list of all such matchings.
    """
    matches = []
    for class_name, mh_sig in signatures:
        probe = MinHash(num_perm=512)
        probe.hashvalues = np.asarray(mh_sig, dtype=int)
        for hit_nid in self.content_sim_index.query(probe):
            info = self.network.get_info_for([hit_nid])[0]
            # The first field (the node id) is not needed for the matching.
            _, db_name, source_name, field_name = info
            # Matching goes from the db attribute to the class name.
            matches.append(((db_name, source_name, field_name), (kr_name, class_name)))
    return matches
示例7: get_mh
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def get_mh(values, permutations=512):
    """
    Return a ``MinHash`` built from ``values``.

    Each element is converted with ``str()``, UTF-8 encoded and added to a
    MinHash created with ``num_perm=permutations``.
    """
    sketch = MinHash(num_perm=permutations)
    encoded = (str(item).encode('utf8') for item in values)
    for chunk in encoded:
        sketch.update(chunk)
    return sketch
示例8: get_min_hash
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def get_min_hash(text, too_common, num_perm=128):
    """
    Return a ``MinHash`` over the shingle digests of ``text``.

    The digest of every hash object yielded by ``shingle_hashes(text)`` is
    added to the MinHash, unless that digest is contained in ``too_common``.
    """
    sketch = MinHash(num_perm=num_perm)
    for shingle in shingle_hashes(text):
        shingle_digest = shingle.digest()
        if shingle_digest in too_common:
            # Skip digests the caller flagged as too common.
            continue
        sketch.update(shingle_digest)
    return sketch
示例9: test_minhash_from_text
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def test_minhash_from_text(self):
    """``similarity.minhash_from_text`` must return a ``MinHash`` instance."""
    result = similarity.minhash_from_text(
        self.test_text,
        similarity.DEFAULT_PERMUTATIONS,
        self.delimiters,
    )
    self.assertIsInstance(result, MinHash)
示例10: fit
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def fit(self, X):
    """
    Build the ``MinHashLSHForest`` index over the rows of ``X``.

    A forest is created with ``num_perm=self._n_perm`` and ``l=self._n_rep``
    and stored in ``self._index``. Each row becomes a MinHash of the UTF-8
    encoded ``str()`` of its elements, added under the key ``str(position)``.
    ``index()`` is called once after all rows are added.
    """
    forest = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    self._index = forest
    for position, row in enumerate(X):
        sketch = MinHash(num_perm=self._n_perm)
        for element in row:
            sketch.update(str(element).encode('utf8'))
        forest.add(str(position), sketch)
    forest.index()
示例11: query
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def query(self, v, n):
    """
    Query ``self._index`` with the MinHash of ``v``.

    ``v`` is hashed element by element (UTF-8 encoded ``str()`` of each one)
    and the resulting sketch is passed to ``self._index.query`` together with
    ``n``. The returned keys are converted with ``int``.

    Returns ``map(int, ...)``, which is a lazy single-pass iterator on
    Python 3.
    """
    sketch = MinHash(num_perm=self._n_perm)
    for element in v:
        sketch.update(str(element).encode('utf8'))
    neighbours = self._index.query(sketch, n)
    return map(int, neighbours)
示例12: build_index
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def build_index(self):
    """
    Build the MinHash LSH blocking index for a single database.

    Records are read from ``self._file_iter`` as ``(rid, json_data)`` pairs
    and accumulated in batches of ``self._batch_size``. Each full batch, and
    the final partial batch, is handed to ``self._index_records``. Progress
    is logged after every full batch.

    Args:
        None

    Returns:
        None

    NOTE(review): the previous docstring said the indexer is written to disk;
    that is not visible here -- presumably done by ``_index_records``. Confirm.
    """
    records = {}
    run_count = 0
    run_iteration = 1
    # Pre-parse every value path once so it is not re-parsed per record.
    parse_dict = {k: parse(k) for k in self.value_path}
    s = time.time()
    for rid, json_data in self._file_iter:
        extracted_data = utils.extract(json_data, self.value_path, parse_dict)
        # Flush and reset the batch when we hit the batch size; the current
        # record then starts the next batch.
        if run_count >= self._batch_size:
            self._index_records(records)
            msg = "Finished indexing {val} records. Time = {time}".format(val=run_count * run_iteration,
                                                                          time=(time.time() - s))
            self._logger.info('{0} {1}'.format("[minhash-lsh-blocking]", msg))
            run_iteration += 1
            records = {}
            run_count = 0
        # ``values()`` is a non-subscriptable view on Python 3, so materialize
        # it first; an empty mapping still raises IndexError as before.
        records[rid] = set(list(extracted_data.values())[0])
        run_count += 1
    # Index the final remaining records
    self._index_records(records)
示例13: create_minhashes_from_sets
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def create_minhashes_from_sets(sets, num_perms, hashfunc, pad_for_asym=False):
    """
    Create one MinHash per input set for every requested ``num_perm``.

    :param sets: re-iterable collection of sized iterables. It is traversed
        several times. An empty collection yields empty result lists.
    :param num_perms: iterable of permutation counts. It is traversed twice
        when ``pad_for_asym`` is true, so it must be re-iterable.
    :param hashfunc: hash function forwarded to ``MinHash``. ``update`` is
        called with ``str`` values, so it must accept ``str`` input.
    :param pad_for_asym: when true, each MinHash is merged with a padding
        MinHash built from ``max_size - len(s)`` synthetic tokens.
    :return: dict mapping each ``num_perm`` to the list of MinHash objects,
        in the order of ``sets``.

    Progress is written to stdout.
    """
    # default=0 keeps an empty ``sets`` from raising ValueError in max().
    max_size = max((len(s) for s in sets), default=0)

    # Generate paddings for asym.
    paddings = {}
    if pad_for_asym:
        padding_sizes = sorted({max_size - len(s) for s in sets})
        for num_perm in num_perms:
            paddings[num_perm] = {}
            for i, padding_size in enumerate(padding_sizes):
                if i == 0:
                    prev_size = 0
                    pad = MinHash(num_perm, hashfunc=hashfunc)
                else:
                    # Extend a copy of the next-smaller padding rather than
                    # re-hashing its tokens from scratch.
                    prev_size = padding_sizes[i - 1]
                    pad = paddings[num_perm][prev_size].copy()
                for w in range(prev_size, padding_size):
                    pad.update(str(w) + "_tmZZRe8DE23s")
                paddings[num_perm][padding_size] = pad

    # Generate minhash
    minhashes = {}
    for num_perm in num_perms:
        print("Using num_perm = {}".format(num_perm))
        ms = []
        for s in sets:
            m = MinHash(num_perm, hashfunc=hashfunc)
            for word in s:
                m.update(str(word))
            if pad_for_asym:
                # Add padding to the minhash
                m.merge(paddings[num_perm][max_size - len(s)])
            ms.append(m)
            sys.stdout.write("\rMinhashed {} sets".format(len(ms)))
        sys.stdout.write("\n")
        minhashes[num_perm] = ms
    return minhashes
示例14: insertion_session
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def insertion_session(self, batch_size=10000):
    """
    Create an asynchronous context manager for fast insertion into the index.

    :param int batch_size: value forwarded to the session as ``batch_size``
        (default=10000).
    :return: an ``AsyncMinHashLSHInsertionSession`` bound to this index.

    Example:
        .. code-block:: python

            import asyncio
            import random
            import string
            from itertools import chain, islice

            from datasketch.experimental.aio.lsh import AsyncMinHashLSH
            from datasketch import MinHash

            def chunk(it, size):
                it = iter(it)
                return iter(lambda: tuple(islice(it, size)), ())

            _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
            seq = frozenset(chain((''.join(s) for s in _chunked_str),
                                  ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')))
            objs = [MinHash(16) for _ in range(len(seq))]
            for e, obj in zip(seq, objs):
                for i in e:
                    obj.update(i.encode('utf-8'))
            data = [(e, m) for e, m in zip(seq, objs)]

            _storage_config_mongo = {'type': 'aiomongo', 'mongo': {'host': 'localhost', 'port': 27017}}

            async def func():
                async with AsyncMinHashLSH(storage_config=_storage_config_mongo, threshold=0.5, num_perm=16) as lsh:
                    async with lsh.insertion_session(batch_size=1000) as session:
                        fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
                        await asyncio.gather(*fs)
    """
    session = AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
    return session
示例15: delete_session
# 需要导入模块: import datasketch [as 别名]
# 或者: from datasketch import MinHash [as 别名]
def delete_session(self, batch_size=10000):
    """
    Create an asynchronous context manager for fast removal of keys from
    the index.

    :param int batch_size: value forwarded to the session as ``batch_size``
        (default=10000).
    :return: an ``AsyncMinHashLSHDeleteSession`` bound to this index.

    Example:
        .. code-block:: python

            import asyncio
            import random
            import string
            from itertools import chain, islice

            from datasketch.experimental.aio.lsh import AsyncMinHashLSH
            from datasketch import MinHash

            def chunk(it, size):
                it = iter(it)
                return iter(lambda: tuple(islice(it, size)), ())

            _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
            seq = frozenset(chain((''.join(s) for s in _chunked_str),
                                  ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')))
            objs = [MinHash(16) for _ in range(len(seq))]
            for e, obj in zip(seq, objs):
                for i in e:
                    obj.update(i.encode('utf-8'))
            data = [(e, m) for e, m in zip(seq, objs)]
            keys_to_remove = ('aahhb', 'aahh', 'aahhc')

            _storage_config_mongo = {'type': 'aiomongo', 'mongo': {'host': 'localhost', 'port': 27017}}

            async def func():
                async with AsyncMinHashLSH(storage_config=_storage_config_mongo, threshold=0.5, num_perm=16) as lsh:
                    async with lsh.insertion_session(batch_size=1000) as session:
                        fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
                        await asyncio.gather(*fs)

                    async with lsh.delete_session(batch_size=3) as session:
                        fs = (session.remove(key) for key in keys_to_remove)
                        await asyncio.gather(*fs)
    """
    session = AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
    return session