本文整理汇总了Python中distance.levenshtein方法的典型用法代码示例。如果您正苦于以下问题：Python distance.levenshtein方法的具体用法？Python distance.levenshtein怎么用？Python distance.levenshtein使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块distance的用法示例。
在下文中一共展示了distance.levenshtein方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: edit_distance
# 需要导入模块: import distance [as 别名]
# 或者: from distance import levenshtein [as 别名]
def edit_distance(references, hypotheses):
    """Compute a length-normalized Levenshtein similarity over paired sequences.

    Args:
        references: list of token lists, one per reference.
        hypotheses: list of token lists, one per hypothesis, paired
            positionally with ``references`` (extra items on the longer
            side are ignored by ``zip``).

    Returns:
        ``1 - total_distance / total_length`` as a float, where
        ``total_length`` sums, for each pair, the length of the longer
        sequence. Higher is better and 1 is perfect. When there is nothing
        to compare (no pairs, or every pair is empty on both sides) the
        sequences are trivially identical and 1.0 is returned.
    """
    d_leven, len_tot = 0, 0
    for ref, hypo in zip(references, hypotheses):
        d_leven += distance.levenshtein(ref, hypo)
        len_tot += max(len(ref), len(hypo))
    if not len_tot:
        # Avoid 0/0: a zero total length implies a zero total distance.
        return 1.
    return 1. - d_leven / float(len_tot)
示例2: __init__
# 需要导入模块: import distance [as 别名]
# 或者: from distance import levenshtein [as 别名]
def __init__(self, col_name, distance_fcn=distance.levenshtein,
             threshold=None):
    """
    Constructing column information

    :param col_name: name of column
    :param distance_fcn: distance function; defaults to
        ``distance.levenshtein``
    :param threshold: threshold for clustering. When omitted, 3 is used
        for the default levenshtein distance; any other distance function
        must come with an explicit threshold.
    :raises ValueError: if ``threshold`` is None and ``distance_fcn`` is
        not ``distance.levenshtein``
    """
    self.col_name = col_name
    self.distance_fcn = distance_fcn
    if threshold is not None:
        self.threshold = threshold
    elif distance_fcn == distance.levenshtein:
        self.threshold = 3
    else:
        # Trailing space matters: adjacent literals are concatenated as-is.
        raise ValueError("Requires a threshold if not using levenshtein "
                         "distance")
示例3: cluster_similar_responses
# 需要导入模块: import distance [as 别名]
# 或者: from distance import levenshtein [as 别名]
def cluster_similar_responses(output_path):
    """Cluster logged message descriptions by edit distance and print the clusters.

    Reads every file matching ``output_path + '*-<max_count>.log'``, where
    ``max_count`` comes from ``get_max_socket_message_count(output_path)``,
    reduces each file's content with ``extract_description_from_message``,
    and clusters the results with affinity propagation using the negated
    pairwise Levenshtein distance as the precomputed similarity.

    Args:
        output_path: path prefix of the log files (concatenated directly
            with the glob pattern, so include any trailing separator).

    Returns:
        None. The exemplar and members of each cluster are printed.
    """
    max_count = get_max_socket_message_count(output_path)
    listing = glob.glob(output_path + '*-%s.log' % max_count)

    # `open` in a context manager replaces the Python 2-only `file()` builtin
    # and guarantees each handle is closed.
    messages = []
    for filename in listing:
        with open(filename) as log_file:
            messages.append(log_file.read())
    messages = [extract_description_from_message(m) for m in messages]
    messages = np.asarray(messages)

    print()
    print('Clustering %s responses...(this might take a while)' % len(messages))
    print()

    # Affinity propagation expects similarities, so negate the distances.
    lev_similarity = -1 * np.array(
        [[distance.levenshtein(m1, m2) for m1 in messages] for m2 in messages])
    affprop = sklearn.cluster.AffinityPropagation(affinity='precomputed',
                                                  damping=0.5)
    affprop.fit(lev_similarity)

    print('Generated clusters:')
    print()
    for cluster_id in np.unique(affprop.labels_):
        exemplar = messages[affprop.cluster_centers_indices_[cluster_id]]
        cluster = np.unique(messages[np.nonzero(affprop.labels_ == cluster_id)])
        cluster_str = ', '.join(cluster)
        print('-' * 80)
        print(' - *%s:* %s' % (exemplar, cluster_str))
        print('-' * 80)
        print()
示例4: eval
# 需要导入模块: import distance [as 别名]
# 或者: from distance import levenshtein [as 别名]
def eval():
    """Run inference on the test data and write edit-distance results to a file.

    Each test item is decoded into an int32 id sequence used as the expected
    output; a shuffled copy of the same sequence is used as the model input.
    Predictions are generated position by position, and for every sentence
    the input, expected text, predicted text and their word-level
    Levenshtein distance are written to ``results/<model name>``, followed
    by the total distance divided by the total number of expected words.

    Relies on module-level ``Graph``, ``load_data``, ``hp``, ``tf`` and
    ``distance``. Returns None.
    """
    # Load graph
    g = Graph(mode="test")
    print("Graph loaded")

    # Load batch
    _Y = load_data(mode="test")
    X = np.zeros((len(_Y), hp.maxlen))
    Y = np.zeros((len(_Y), hp.maxlen))
    for i, y in enumerate(_Y):
        # np.fromstring's binary mode is deprecated; frombuffer returns a
        # read-only view, so copy it to allow the in-place shuffle below.
        y = np.frombuffer(y, np.int32).copy()
        Y[i][:len(y)] = y
        np.random.shuffle(y)
        X[i][:len(y)] = y

    idx2word = g.idx2word

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            # Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))

            # Get model name: the first double-quoted token of the checkpoint file
            with open(hp.logdir + '/checkpoint', 'r') as ckpt_file:
                mname = ckpt_file.read().split('"')[1]

            # inference
            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                num_words, total_edit_distance = 0, 0
                for i in range(0, len(Y), hp.batch_size):
                    ### Get mini-batches
                    x = X[i:i+hp.batch_size]
                    y = Y[i:i+hp.batch_size]

                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    for xx, yy, pred in zip(x, y, preds):  # sentence-wise
                        inputs = " ".join(idx2word[idx] for idx in xx).replace("_", "").strip()
                        expected = " ".join(idx2word[idx] for idx in yy).replace("_", "").strip()
                        # Keep only as many predicted tokens as there are input words.
                        got = " ".join(idx2word[idx] for idx in pred[:len(inputs.split())])

                        edit_distance = distance.levenshtein(expected.split(), got.split())
                        total_edit_distance += edit_distance
                        num_words += len(expected.split())

                        fout.write(u"Inputs : {}\n".format(inputs))
                        fout.write(u"Expected: {}\n".format(expected))
                        fout.write(u"Got : {}\n".format(got))
                        # Per-sentence value is the raw edit distance, not a ratio.
                        fout.write(u"WER : {}\n\n".format(edit_distance))
                fout.write(u"Total WER: {}/{}={}\n".format(total_edit_distance,
                                                           num_words,
                                                           round(float(total_edit_distance) / num_words, 2)))
示例5: img_edit_distance
# 需要导入模块: import distance [as 别名]
# 或者: from distance import levenshtein [as 别名]
def img_edit_distance(img1, img2):
    """Column-wise Levenshtein distance between two images.

    (From Harvard's NLP github)
    Each image is binarized (pixel <= 128 becomes 1, otherwise 0) and every
    pixel column is treated as one "character": its bits are read as a
    binary number. When the heights differ, columns of the shorter image
    are padded with trailing zero bits so both use the same number of bits.

    Args:
        img1, img2: np arrays indexed as (H, W, channel); only channel 0
            is used.

    Returns:
        (l_dist, length): the Levenshtein distance between the two column
        sequences, and the length of the longer sequence as a float.
    """
    def _binary_columns(img):
        # (H, W, C) -> (W, H) array of 0/1, one row per pixel column.
        return (np.transpose(img[:, :, 0]) <= 128).astype(np.uint8)

    def _encode(columns, pad):
        # Read each column's bits (plus `pad` trailing zeros) as a base-2 int.
        suffix = '0' * pad
        return [int(''.join(str(bit) for bit in column) + suffix, 2)
                for column in columns]

    cols1 = _binary_columns(img1)
    cols2 = _binary_columns(img2)

    height1, height2 = cols1.shape[1], cols2.shape[1]
    tallest = max(height1, height2)

    seq1_int = _encode(cols1, tallest - height1)
    seq2_int = _encode(cols2, tallest - height2)

    l_dist = distance.levenshtein(seq1_int, seq2_int)
    length = float(max(len(seq1_int), len(seq2_int)))
    return l_dist, length
示例6: val
# 需要导入模块: import distance [as 别名]
# 或者: from distance import levenshtein [as 别名]
def val(net, test_dataset, criterion, max_iter=2):
    """Evaluate ``net`` on up to ``max_iter`` batches of ``test_dataset``.

    Relies on module-level ``opt``, ``dataset``, ``utils``, ``converter``,
    ``image``, ``text``, ``length``, ``ifUnicode``, ``clean_txt`` and
    ``distance`` being set up by the surrounding script.

    Args:
        net: the model to evaluate; its parameters are frozen and it is
            switched to eval mode.
        test_dataset: dataset wrapped in a ``DataLoader`` for evaluation.
        criterion: loss called as
            ``criterion(preds, text, preds_size, length)``.
        max_iter: maximum number of batches to evaluate (capped at the
            number of batches in the loader).

    Returns:
        ``(testLoss, accuracy, test_distance)``: the averaged loss, the
        fraction of exactly matching predictions, and the mean normalized
        Levenshtein distance, the latter two averaged over the samples
        actually evaluated.
    """
    print('Start val')

    # Use the model that was passed in rather than a module-level global.
    for p in net.parameters():
        p.requires_grad = False
    net.eval()

    data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=opt.batchSize, num_workers=int(opt.workers),
        sampler=dataset.randomSequentialSampler(test_dataset, opt.batchSize),
        collate_fn=dataset.alignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio=opt.keep_ratio))
    val_iter = iter(data_loader)

    n_correct = 0
    n_samples = 0
    loss_avg = utils.averager()
    test_distance = 0

    max_iter = min(max_iter, len(data_loader))
    for _ in range(max_iter):
        # Builtin next() works on Python 2 and 3; iterator.next() is 2-only.
        cpu_images, cpu_texts = next(val_iter)
        batch_size = cpu_images.size(0)
        n_samples += batch_size
        utils.loadData(image, cpu_images)
        if ifUnicode:
            cpu_texts = [clean_txt(tx.decode('utf-8')) for tx in cpu_texts]
        t, l = converter.encode(cpu_texts)
        utils.loadData(text, t)
        utils.loadData(length, l)

        preds = net(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)

        _, preds = preds.max(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred.strip() == target.strip():
                n_correct += 1
            test_distance += distance.nlevenshtein(pred.strip(), target.strip(), method=2)

    # Show raw vs. decoded predictions for the last evaluated batch.
    raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))

    # Normalize by the samples actually seen: a batch may be smaller than
    # opt.batchSize, which would otherwise deflate both metrics.
    accuracy = n_correct / float(n_samples)
    test_distance = test_distance / float(n_samples)
    testLoss = loss_avg.val()
    return testLoss, accuracy, test_distance