This article collects typical usage examples of the Levenshtein.distance method in Python. If you are wondering what Levenshtein.distance does and how to use it, the curated code examples below may help. You can also explore the other usage examples of the Levenshtein module it belongs to.
The following 15 code examples of Levenshtein.distance are sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
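Before the examples, a minimal sketch of the basic call: Levenshtein.distance returns the minimum number of single-character insertions, deletions, and substitutions needed to turn one string into the other.

import Levenshtein

print(Levenshtein.distance("kitten", "sitting"))  # 3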
Example 1: align
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import Levenshtein
from rdkit import Chem  # get_leaves is a helper from the surrounding project

def align(xy_tuple):
    x, y = xy_tuple
    xmol, ymol = Chem.MolFromSmiles(x), Chem.MolFromSmiles(y)
    # re-canonicalize x without stereochemistry
    x = Chem.MolToSmiles(xmol, isomericSmiles=False)
    xmol = Chem.MolFromSmiles(x)
    xleaf = get_leaves(xmol)
    yleaf = get_leaves(ymol)
    best_i, best_j = 0, 0
    best = 1000000
    for i in xleaf:
        for j in yleaf:
            new_x = Chem.MolToSmiles(xmol, rootedAtAtom=i, isomericSmiles=False)
            new_y = Chem.MolToSmiles(ymol, rootedAtAtom=j, isomericSmiles=False)
            # compare only the first halves of the rooted SMILES
            le = min(len(new_x), len(new_y)) // 2
            dist = Levenshtein.distance(new_x[:le], new_y[:le])
            if dist < best:
                best_i, best_j = i, j
                best = dist
    return Chem.MolToSmiles(xmol, rootedAtAtom=best_i, isomericSmiles=False), \
           Chem.MolToSmiles(ymol, rootedAtAtom=best_j, isomericSmiles=False)
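A hypothetical invocation, assuming RDKit and the project's get_leaves helper are importable; the SMILES strings here are illustrative:

pair = ("CCO", "CCN")  # ethanol vs. ethylamine
x_rooted, y_rooted = align(pair)
# both SMILES are now rooted at the atoms that minimize their edit distance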
Example 2: distanceDomain
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import sys
import Levenshtein  # extractLevelDomain is a helper from the surrounding project

def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    similarDomain = ""
    minDistance = sys.maxsize
    level = domain.split(".")
    if len(level) <= 1:
        return ("not a domain", sys.maxsize)
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(domain, ccTldDict, tldDict)
    for popularDomain in DomainDict:
        distance = Levenshtein.distance(domain2LD, popularDomain)
        if distance < minDistance:
            minDistance = distance
            similarDomain = popularDomain
    # debug
    # sys.stdout.write("subdomain: %s, similarDomain: %s, minDistance: %d\n" % (subdomain, similarDomain, minDistance))
    if len(similarDomain) > 0:
        # normalize the distance by the length of the closest popular domain
        return (similarDomain, minDistance / float(len(similarDomain)))
    else:
        return (domain2LD, 0)

# check whether a domain contains invalid TLD
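A hypothetical call; ccTldDict and tldDict stand in for the project's TLD lookup tables and are not defined here:

popular = {"google.com", "facebook.com", "amazon.com"}
name, score = distanceDomain("gooogle.com", popular, ccTldDict, tldDict)
# name is the closest popular domain; score is the edit distance
# normalized by that domain's length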
Example 3: compute_edit_distance
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import tensorflow as tf

def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.
    Args:
        session: a TensorFlow session
        labels_true_st: A `SparseTensor` of ground truth
        labels_pred_st: A `SparseTensor` of prediction
    Returns:
        edit_distances: list of edit distance of each utterance
    """
    # build each placeholder from its matching SparseTensor tuple
    indices, values, dense_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(indices, values, dense_shape)
    indices, values, dense_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(indices, values, dense_shape)
    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)
    return edit_distances
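For reference, each *_st argument is a plain (indices, values, dense_shape) triple; a minimal sketch of building one for a batch holding a single label sequence:

import numpy as np

# one sequence [3, 1, 4] occupying time steps 0..2 of batch row 0
indices = np.array([[0, 0], [0, 1], [0, 2]], dtype=np.int64)
values = np.array([3, 1, 4], dtype=np.int32)
dense_shape = np.array([1, 3], dtype=np.int64)
labels_true_st = (indices, values, dense_shape)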
Example 4: compute_per
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import Levenshtein as lev

def compute_per(ref, hyp, normalize=True):
    """Compute Phone Error Rate.
    Args:
        ref (list): phones in the reference transcript
        hyp (list): phones in the predicted transcript
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        per (float): Phone Error Rate between ref and hyp
    """
    # Build mapping of phone to index
    phone_set = set(ref + hyp)
    phone2char = dict(zip(phone_set, range(len(phone_set))))
    # Map phones to a single char array
    # NOTE: the Levenshtein package only accepts strings
    phones_ref = [chr(phone2char[p]) for p in ref]
    phones_hyp = [chr(phone2char[p]) for p in hyp]
    per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
    if normalize:
        per /= len(ref)
    return per
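A quick check with a three-phone reference and one substitution (the phone labels are illustrative):

ref = ['AH', 'B', 'IY']
hyp = ['AH', 'P', 'IY']
print(compute_per(ref, hyp))  # 1 edit / 3 phones ≈ 0.333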
Example 5: wer
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import Levenshtein as Lev

def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))
    # map the words to a char array (the Levenshtein package only
    # accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]
    return Lev.distance(''.join(w1), ''.join(w2))
Example 6: _random_common_char_pairs
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import numpy as np

def _random_common_char_pairs(n_pairs=50, seed=1):
    """
    Return string pairs with a common char at random positions, in order to
    distinguish different thresholds for matching characters in Jaro
    distance.
    """
    # Make strings with random length and common char at index 0
    rng = np.random.RandomState(seed=seed)
    list1 = ['a' + 'b' * rng.randint(2, 20) for k in range(n_pairs)]
    list2 = ['a' + 'c' * rng.randint(2, 20) for k in range(n_pairs)]
    # Shuffle strings
    list1 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list1]
    list2 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list2]
    pairs = zip(list1, list2)
    return pairs

# TODO: some factorization of what is common for distances;
# check results for same examples on all distances
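These pairs are meant for exercising Jaro's matching window; a sketch of consuming them with the Jaro similarity from the same Levenshtein package:

import Levenshtein

for s1, s2 in _random_common_char_pairs(n_pairs=3):
    print(s1, s2, Levenshtein.jaro(s1, s2))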
Example 7: wer
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import Levenshtein as Lev

def wer(self, s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = {ss: ii for ii, ss in enumerate(b)}
    # map the words to a char array (the Levenshtein package only
    # accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]
    return Lev.distance(''.join(w1), ''.join(w2))
Example 8: calculate_wer
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import Levenshtein as Lev

def calculate_wer(s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))
    # map the words to a char array (the Levenshtein package only
    # accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]
    return Lev.distance(''.join(w1), ''.join(w2))
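Examples 5, 7, 8, and 10 all share the same trick: every distinct word collapses to a single character, so the string edit distance counts whole-word edits. One quick check suffices:

print(calculate_wer("the cat sat", "the cat sits"))  # one word substituted -> 1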
Example 9: _get_compare_data
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import os
import Levenshtein as L  # run_main, _normalize_input, _make_html_diff are project helpers

def _get_compare_data(tif_txt_pair):
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] == txt[:-4]:  # This should always be true
        # ocr = run_main(tif, conf=Config(path='/home/zr/letters/conf/443cf9ec-76c7-44bc-95ad-593138d2d5fc.conf'), text=True)
        # ocr = run_main(tif, conf=Config(segmenter='stochastic', recognizer='hmm', break_width=3.6), text=True)
        ocr = run_main(tif, text=True)
        # ocr = run_all_confs_for_page(tif, text=True)
        ocr = ocr.strip()
        with open(txt, 'r') as f:
            txt = f.read()
        txt = _normalize_input(txt)
        edit_dist = L.distance(txt, ocr)
        edit_ratio = L.ratio(txt, ocr)
        html = _make_html_diff(txt, ocr)
        # sys.exit()
        data = {'edit_distance': edit_dist,
                'edit_ratio': edit_ratio,
                'filename': os.path.basename(tif),
                'html': html}
    return data
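The two metrics stored here point in opposite directions: distance counts edits, while ratio is a normalized similarity in [0, 1] (substitutions are weighted double in its formula). A minimal illustration:

import Levenshtein as L

print(L.distance("abcd", "abce"))  # 1 (one substitution)
print(L.ratio("abcd", "abce"))     # 0.75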
Example 10: wer
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import Levenshtein as Lev

def wer(s1, s2):
    """
    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.
    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    # build mapping of words to integers
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))
    # map the words to a char array (the Levenshtein package only
    # accepts strings)
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]
    return Lev.distance(''.join(w1), ''.join(w2))
Example 11: compute_cer
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
def compute_cer(ref, hyp, normalize=False):
    """Compute Character Error Rate.
    Args:
        ref (str): a sentence without spaces
        hyp (str): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        cer (float): Character Error Rate between ref and hyp
    """
    import Levenshtein as lev
    # TODO(hirofumi): install
    cer = lev.distance(hyp, ref)
    if normalize:
        cer /= len(ref)
    return cer * 100
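A quick check, with one substituted character in a nine-character reference:

print(compute_cer("recognise", "recognize", normalize=True))  # 100/9 ≈ 11.1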
Example 12: validate
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
import torch
from tqdm import tqdm  # params and logger are project-level objects

def validate(self, data_loader):
    "validate with label error rate by the edit distance between hyps and refs"
    self.model.eval()
    with torch.no_grad():
        N, D = 0, 0
        t = tqdm(enumerate(data_loader), total=len(data_loader), desc="validating", ncols=params.NCOLS)
        for i, data in t:
            hyps, refs = self.unit_validate(data)
            # calculate LER
            N += self.edit_distance(refs, hyps)
            D += sum(len(r) for r in refs)
            ler = N * 100. / D
            t.set_description(f"validating (LER: {ler:.2f} %)")
            t.refresh()
        logger.info(f"validating at epoch {self.epoch:03d}: LER {ler:.2f} %")
        title = "validate"
        x = self.epoch - 1 + i / len(data_loader)
        if logger.visdom is not None:
            opts = {'xlabel': 'epoch', 'ylabel': 'LER'}
            logger.visdom.add_point(title=title, x=x, y=ler, **opts)
        if logger.tensorboard is not None:
            logger.tensorboard.add_scalars(title, self.global_step, {'LER': ler})
Example 13: levenshtein
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
from collections import defaultdict
from Levenshtein import distance

def levenshtein(string, candidates):
    """
    Compare a string's Levenshtein distance to each candidate in a dictionary.
    Returns the name of the closest match.
    """
    distances = defaultdict(int)
    num_lines = len(string)
    for k, v in candidates.items():
        expanded = False
        # Expand each candidate to match the length of the compared string
        if len(v) != len(string):
            v = (v * (num_lines // len(v) + 1))[:num_lines]
            expanded = True
        edit_distance = distance(string, v)
        # If we expanded the candidate, then it is a worse match than what we have already
        if edit_distance in distances and expanded:
            continue
        distances[edit_distance] = k
    return distances[min(distances)]
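A hypothetical call, matching a scansion string against candidate metres; the candidate dictionary here is illustrative, not the module's real POSSIBLE_METRES:

candidates = {'iambic': '01', 'trochaic': '10'}
print(levenshtein('0101010101', candidates))  # 'iambic'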
Example 14: guess_metre
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
def guess_metre(tokenized_poem):
    """
    Guess a poem's metre via Levenshtein distance from candidates.
    """
    joined_lines = [''.join(line) for line in scanscion(tokenized_poem) if line]
    line_lengths = [len(line) for line in joined_lines]
    num_lines = len(joined_lines)
    metres = []
    for line in joined_lines:
        metres.append(levenshtein(line, POSSIBLE_METRES))
    # pick the metre that occurs most often across lines
    guessed_metre = max(zip((metres.count(item) for item in set(metres)), set(metres)))[1]
    return joined_lines, num_lines, line_lengths, guessed_metre
Example 15: _get_distance
# Required module: import Levenshtein [as alias]
# Or: from Levenshtein import distance [as alias]
def _get_distance(self, targets, y_hats):
    """
    Provides total character distance between targets & y_hats.
    Args:
        targets (torch.Tensor): set of ground truth
        y_hats (torch.Tensor): predicted y values (y_hat) by the model
    Returns: total_dist, total_length
        - **total_dist**: total distance between targets & y_hats
        - **total_length**: total length of targets sequence
    """
    total_dist = 0
    total_length = 0
    for target, y_hat in zip(targets, y_hats):
        s1 = label_to_string(target, self.id2char, self.eos_id)
        s2 = label_to_string(y_hat, self.id2char, self.eos_id)
        dist, length = self.metric(s1, s2)
        total_dist += dist
        total_length += length
    return total_dist, total_length
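A sketch of a self.metric compatible with this loop, assuming it should return a (distance, reference-length) pair per sentence; the name char_distance is illustrative, not necessarily the project's own:

import Levenshtein as Lev

def char_distance(s1, s2):
    # compare characters only, ignoring spaces
    s1 = s1.replace(' ', '')
    s2 = s2.replace(' ', '')
    dist = Lev.distance(s1, s2)
    return dist, len(s1)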