当前位置: 首页>>代码示例>>Python>>正文

Python Levenshtein.distance方法代码示例

本文整理汇总了 Python 中 Levenshtein.distance 方法的典型用法代码示例。如果您正苦于以下问题:Python Levenshtein.distance 方法的具体用法?Python Levenshtein.distance 怎么用?Python Levenshtein.distance 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 Levenshtein 的用法示例。


示例1: align

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def align(xy_tuple):
    """Re-root two SMILES strings so that their leading halves are most similar.

    Every pair ``(i, j)`` of atom indices returned by ``get_leaves`` for the
    two molecules is tried as a pair of roots. For each pair, the rooted
    SMILES are truncated to half the length of the shorter one and compared
    with ``Levenshtein.distance``; the first pair with the smallest distance
    wins.

    Args:
        xy_tuple: pair ``(x, y)`` of SMILES strings.

    Returns:
        Tuple ``(x_smiles, y_smiles)`` of SMILES written with
        ``isomericSmiles=False`` and rooted at the best pair of atoms. When
        no pair is evaluated (one of the leaf collections is empty), both are
        rooted at atom 0.
    """
    x, y = xy_tuple
    xmol, ymol = Chem.MolFromSmiles(x), Chem.MolFromSmiles(y)
    # Re-parse x from its own non-isomeric SMILES before looking for leaves.
    x = Chem.MolToSmiles(xmol, isomericSmiles=False)
    xmol = Chem.MolFromSmiles(x)

    xleaf = get_leaves(xmol)
    yleaf = get_leaves(ymol)

    # The rooted SMILES of y depends only on j, so build each one once rather
    # than once per (i, j) pair.
    y_rooted = [
        (j, Chem.MolToSmiles(ymol, rootedAtAtom=j, isomericSmiles=False))
        for j in yleaf
    ]

    best_i, best_j = 0, 0
    best = 1000000
    for i in xleaf:
        # Likewise, the rooted SMILES of x depends only on i.
        new_x = Chem.MolToSmiles(xmol, rootedAtAtom=i, isomericSmiles=False)
        for j, new_y in y_rooted:
            # Only the first half of the shorter string is compared.
            le = min(len(new_x), len(new_y)) // 2
            dist = Levenshtein.distance(new_x[:le], new_y[:le])
            if dist < best:
                best_i, best_j = i, j
                best = dist

    return (
        Chem.MolToSmiles(xmol, rootedAtAtom=best_i, isomericSmiles=False),
        Chem.MolToSmiles(ymol, rootedAtAtom=best_j, isomericSmiles=False),
    )

示例2: distanceDomain

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def distanceDomain(domain, DomainDict, ccTldDict, tldDict):
    """Find the entry of ``DomainDict`` closest to ``domain`` by edit distance.

    NOTE: this function uses Python 2 names (``sys.maxint`` and ``decode`` on
    byte strings).

    Args:
        domain: domain name to check; it must contain at least one ``"."``.
        DomainDict: iterable of popular domain names (UTF-8 encoded) that the
            extracted domain part is compared against.
        ccTldDict: passed through to ``extractLevelDomain``.
        tldDict: passed through to ``extractLevelDomain``.

    Returns:
        A 2-tuple:
        * ``("not a domain", sys.maxint)`` when ``domain`` contains no dot;
        * ``(similarDomain, ratio)`` where ``ratio`` is the smallest
          Levenshtein distance divided by ``len(similarDomain)``;
        * ``(domain2LD, 0)`` when no non-empty similar domain was found
          (for example when ``DomainDict`` is empty).
    """
    similarDomain = ""
    minDistance = sys.maxint
    level = domain.split(".")
    if len(level) <= 1:
        return ("not a domain", sys.maxint)

    # Only the first element is used below; presumably it is the
    # second-level-domain part of ``domain`` -- verify in extractLevelDomain.
    (domain2LD, domain3LD, domain2LDs, domain3LDs) = extractLevelDomain(
        domain, ccTldDict, tldDict)

    # Decode once: the left-hand operand is the same for every comparison.
    decoded2LD = domain2LD.decode('utf-8')
    for popularDomain in DomainDict:
        distance = Levenshtein.distance(decoded2LD, popularDomain.decode('utf-8'))
        if distance < minDistance:
            minDistance = distance
            similarDomain = popularDomain

    if len(similarDomain) > 0:
        return (similarDomain, minDistance / float(len(similarDomain)))
    # Previously this fallback was nested under the return above and could
    # never run, so the function returned None when nothing matched.
    return (domain2LD, 0)

# check whether a domain contains invalid TLD 

示例3: compute_edit_distance

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def compute_edit_distance(session, labels_true_st, labels_pred_st):
    """Compute edit distance per mini-batch.

    Args:
        session: object whose ``run`` method evaluates the edit-distance op
        labels_true_st: ground truth, given as an
            ``(indices, values, dense_shape)`` triple from which a
            `SparseTensor` is built
        labels_pred_st: prediction, given as an
            ``(indices, values, dense_shape)`` triple from which a
            `SparseTensor` is built
    Returns:
        edit_distances: result of running ``tf.edit_distance`` with
            ``normalize=True``, one edit distance per utterance
    """
    # Bind each tensor to the name matching its role. The two names used to be
    # swapped, so the ground truth was passed as the hypothesis and the
    # normalization used the wrong sequence length.
    true_indices, true_values, true_shape = labels_true_st
    labels_true_pl = tf.SparseTensor(true_indices, true_values, true_shape)
    pred_indices, pred_values, pred_shape = labels_pred_st
    labels_pred_pl = tf.SparseTensor(pred_indices, pred_values, pred_shape)

    # Argument order of tf.edit_distance is (hypothesis, truth).
    edit_op = tf.edit_distance(labels_pred_pl, labels_true_pl, normalize=True)
    edit_distances = session.run(edit_op)

    return edit_distances

示例4: compute_per

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def compute_per(ref, hyp, normalize=True):
    """Compute Phone Error Rate.

    Args:
        ref (list): phones in the reference transcript
        hyp (list): phones in the predicted transcript
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        per (float): Phone Error Rate between ref and hyp (the raw edit
            distance when ``normalize`` is False)
    Raises:
        ZeroDivisionError: if ``normalize`` is True and ``ref`` is empty
    """
    # Build mapping of phone to index
    phone_set = set(ref) | set(hyp)
    phone2char = dict(zip(phone_set, range(len(phone_set))))

    # Map phones to a single char array
    # NOTE: Levenshtein packages only accepts strings
    phones_ref = [chr(phone2char[p]) for p in ref]
    phones_hyp = [chr(phone2char[p]) for p in hyp]

    per = lev.distance(''.join(phones_ref), ''.join(phones_hyp))
    if normalize:
        per /= len(ref)
    return per

示例5: wer

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def wer(self, s1, s2):
    """Compute the word-level edit distance between two sentences.

    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.

    Args:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence

    Returns:
        The value of ``Lev.distance`` on the two encoded word sequences;
        it is not divided by the sentence length.
    """
    words1 = s1.split()
    words2 = s2.split()

    # build mapping of words to integers
    b = set(words1 + words2)
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in words1]
    w2 = [chr(word2char[w]) for w in words2]

    return Lev.distance(''.join(w1), ''.join(w2))

示例6: _random_common_char_pairs

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def _random_common_char_pairs(n_pairs=50, seed=1):
    Return string pairs with a common char at random positions, in order to
    distinguish different thresholds for matching chararacters in Jaro
    # Make strings with random length and common char at index 0
    rng = np.random.RandomState(seed=seed)
    list1 = ['a' + 'b' * rng.randint(2, 20) for k in range(n_pairs)]
    list2 = ['a' + 'c' * rng.randint(2, 20) for k in range(n_pairs)]
    # Shuffle strings
    list1 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list1]
    list2 = [''.join(rng.choice(
        list(s), size=len(s), replace=False)) for s in list2]
    pairs = zip(list1, list2)
    return pairs

# TODO: some factorization of what is common for distances;
# check results for same examples on all distances 

示例7: wer

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def wer(self, s1, s2):
    """Compute the word-level edit distance between two sentences.

    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.

    Args:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence

    Returns:
        The value of ``Lev.distance`` on the two encoded word sequences;
        it is not divided by the sentence length.
    """
    words1 = s1.split()
    words2 = s2.split()

    # build mapping of words to integers
    b = set(words1 + words2)
    word2char = {ss: ii for ii, ss in enumerate(b)}

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in words1]
    w2 = [chr(word2char[w]) for w in words2]

    return Lev.distance(''.join(w1), ''.join(w2))

示例8: calculate_wer

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def calculate_wer(s1, s2):
    """Compute the word-level edit distance between two sentences.

    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.

    Args:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence

    Returns:
        The value of ``Lev.distance`` on the two encoded word sequences;
        it is not divided by the sentence length.
    """
    words1 = s1.split()
    words2 = s2.split()

    # build mapping of words to integers
    b = set(words1 + words2)
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in words1]
    w2 = [chr(word2char[w]) for w in words2]

    return Lev.distance(''.join(w1), ''.join(w2))

示例9: _get_compare_data

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def _get_compare_data(tif_txt_pair):
    """Compare the text produced by ``run_main`` for a page with a reference text.

    Args:
        tif_txt_pair: sequence whose first item is the path handed to
            ``run_main`` (presumably a TIFF page image -- confirm with the
            caller) and whose second item is the path of the reference text
            file. Both paths must be equal once their last four characters
            are dropped.

    Returns:
        dict with the keys ``'edit_distance'`` (``L.distance``),
        ``'edit_ratio'`` (``L.ratio``), ``'filename'`` (base name of the
        first path) and ``'html'`` (output of ``_make_html_diff``).

    Raises:
        ValueError: if the two paths do not share the same stem. Previously
            this case failed with an unbound local variable.
    """
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] != txt[:-4]:  # The stems should always match
        raise ValueError(
            'mismatched pair: %r and %r do not share a stem' % (tif, txt))

    ocr = run_main(tif, text=True)
    ocr = ocr.strip()
    # Use a context manager so the file handle is always closed.
    with open(txt, 'r') as txt_file:
        txt = txt_file.read()
    txt = _normalize_input(txt)
    edit_dist = L.distance(txt, ocr)
    edit_ratio = L.ratio(txt, ocr)
    html = _make_html_diff(txt, ocr)
    data = {'edit_distance': edit_dist,
            'edit_ratio': edit_ratio,
            'filename': os.path.basename(tif),
            'html': html
            }
    return data

示例10: wer

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def wer(s1, s2):
    """Compute the word-level edit distance between two sentences.

    Computes the Word Error Rate, defined as the edit distance between the
    two provided sentences after tokenizing to words.

    Args:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence

    Returns:
        The value of ``Lev.distance`` on the two encoded word sequences;
        it is not divided by the sentence length.
    """
    words1 = s1.split()
    words2 = s2.split()

    # build mapping of words to integers
    b = set(words1 + words2)
    word2char = dict(zip(b, range(len(b))))

    # map the words to a char array (Levenshtein packages only accepts
    # strings)
    w1 = [chr(word2char[w]) for w in words1]
    w2 = [chr(word2char[w]) for w in words2]

    return Lev.distance(''.join(w1), ''.join(w2))

示例11: compute_cer

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def compute_cer(ref, hyp, normalize=False):
    """Compute Character Error Rate.

    Args:
        ref (str): a sentence without spaces
        hyp (str): a sentence without spaces
        normalize (bool, optional): if True, divide by the length of ref
    Returns:
        cer (float): Character Error Rate between ref and hyp. The value is
            multiplied by 100 whether or not ``normalize`` is set.
    Raises:
        ZeroDivisionError: if ``normalize`` is True and ``ref`` is empty
    """
    import Levenshtein as lev
    # TODO(hirofumi): install
    cer = lev.distance(hyp, ref)
    if normalize:
        cer /= len(ref)
    return cer * 100

示例12: validate

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def validate(self, data_loader):
    """Validate with label error rate by the edit distance between hyps and refs.

    Runs ``self.unit_validate`` on every item of ``data_loader`` under
    ``torch.no_grad()``, accumulates the edit distance and the total
    reference length, and reports the running label error rate (LER, in
    percent) on the progress bar, the logger and, when configured, visdom
    and tensorboard.

    Args:
        data_loader: sized iterable of batches accepted by
            ``self.unit_validate``.
    """
    with torch.no_grad():
        N, D = 0, 0
        # Defaults so that an empty loader, or batches whose references are
        # all empty, report 0 instead of failing with an unbound name or a
        # division by zero.
        ler = 0.0
        i = 0
        num_batches = len(data_loader)
        t = tqdm(enumerate(data_loader), total=num_batches, desc="validating", ncols=params.NCOLS)
        for i, data in t:
            hyps, refs = self.unit_validate(data)
            # calculate ler
            N += self.edit_distance(refs, hyps)
            D += sum(len(r) for r in refs)
            if D > 0:
                ler = N * 100. / D
            t.set_description(f"validating (LER: {ler:.2f} %)")
        logger.info(f"validating at epoch {self.epoch:03d}: LER {ler:.2f} %")

        title = "validate"
        # Fractional epoch position of the last processed batch.
        progress = i / num_batches if num_batches else 0
        x = self.epoch - 1 + progress
        if logger.visdom is not None:
            opts = { 'xlabel': 'epoch', 'ylabel': 'LER', }
            logger.visdom.add_point(title=title, x=x, y=ler, **opts)
        if logger.tensorboard is not None:
            logger.tensorboard.add_scalars(title, self.global_step, { 'LER': ler, })

示例13: levenshtein

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def levenshtein(string, candidates):
    """Compare a string's Levenshtein distance to each candidate in a dictionary.

    Returns the name of the closest match.

    Args:
        string: the string to classify.
        candidates: dict mapping a name to its candidate pattern string.

    Returns:
        The key of ``candidates`` recorded for the smallest edit distance.

    Raises:
        ValueError: if ``candidates`` is empty (``min`` of an empty mapping).
    """
    distances = {}
    num_lines = len(string)

    for k, v in candidates.items():
        expanded = False
        # Repeats and/or truncates each candidate to match the length of the
        # compared string
        if len(v) != num_lines:
            v = (v * (num_lines // len(v) + 1))[:num_lines]
            expanded = True

        edit_distance = distance(string, v)

        # If we expanded the candidate, then it is a worse match than what we
        # have already, so keep the existing entry for this distance
        if edit_distance in distances and expanded:
            continue

        distances[edit_distance] = k

    return distances[min(distances)]

示例14: guess_metre

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def guess_metre(tokenized_poem):
    """Guess a poem's metre via Levenshtein distance from candidates.

    Each non-empty line returned by ``scanscion`` is joined into one string
    and matched against ``POSSIBLE_METRES`` with ``levenshtein``; the metre
    matched most often wins. Ties are broken in favour of the metre name
    that compares greatest.

    Args:
        tokenized_poem: the poem, in the form accepted by ``scanscion``.

    Returns:
        Tuple ``(joined_lines, num_lines, line_lengths, guessed_metre)``.

    Raises:
        ValueError: if the poem yields no non-empty lines (``max`` of an
            empty sequence).
    """
    joined_lines = [''.join(line) for line in scanscion(tokenized_poem) if line]
    line_lengths = [len(line) for line in joined_lines]
    num_lines = len(joined_lines)

    metres = [levenshtein(line, POSSIBLE_METRES) for line in joined_lines]

    # Pair every distinct metre with its count from a single pass over one
    # set, rather than zipping two separately built sets and relying on them
    # iterating in the same order.
    guessed_metre = max((metres.count(item), item) for item in set(metres))[1]

    return joined_lines, num_lines, line_lengths, guessed_metre

示例15: _get_distance

# 需要导入模块: import Levenshtein [as 别名]
# 或者: from Levenshtein import distance [as 别名]
def _get_distance(self, targets, y_hats):
    """Provides total character distance between targets & y_hats

    Args:
        targets (torch.Tensor): set of ground truth
        y_hats (torch.Tensor): predicted y values (y_hat) by the model

    Returns: total_dist, total_length
        - **total_dist**: total distance between targets & y_hats
        - **total_length**: total length of targets sequence
    """
    total_dist = 0
    total_length = 0

    for (target, y_hat) in zip(targets, y_hats):
        # Convert both label sequences to strings before measuring them.
        s1 = label_to_string(target, self.id2char, self.eos_id)
        s2 = label_to_string(y_hat, self.id2char, self.eos_id)

        # self.metric returns a (distance, length) pair for one sample.
        dist, length = self.metric(s1, s2)

        total_dist += dist
        total_length += length

    return total_dist, total_length
