当前位置: 首页>>代码示例>>Python>>正文


Python fuzz.token_set_ratio方法代码示例

本文整理汇总了Python中fuzzywuzzy.fuzz.token_set_ratio方法的典型用法代码示例。如果您正苦于以下问题:Python fuzz.token_set_ratio方法的具体用法?Python fuzz.token_set_ratio怎么用?Python fuzz.token_set_ratio使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在fuzzywuzzy.fuzz的用法示例。


在下文中一共展示了fuzz.token_set_ratio方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: extract_features

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def extract_features(df):
    """Add token-level and fuzzy-matching feature columns for each question pair.

    Missing values in "question1" / "question2" are replaced with empty
    strings and both columns are passed through ``preprocess``. The feature
    columns are then written onto ``df`` itself, and that same frame is
    returned.
    """
    for text_column in ("question1", "question2"):
        df[text_column] = df[text_column].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda row: get_token_features(row["question1"], row["question2"]),
        axis=1,
    )
    # Position i of every per-row result feeds the i-th column listed here.
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pair_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in pair_scorers:
        # Bind the scorer as a default so each lambda keeps its own function.
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["question1"], row["question2"]),
            axis=1,
        )
    return df
开发者ID:aerdem4,项目名称:kaggle-quora-dup,代码行数:26,代码来源:nlp_feature_extraction.py

示例2: similar_string_fast

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def similar_string_fast(first_string, second_string):
    """Report whether two strings are similar, using two quick scorers.

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: bool) True when the higher of fuzz.ratio and
      fuzz.token_set_ratio reaches SCORE_THRESHOLD_FAST.
    """
    best_score = max(
        fuzz.ratio(first_string, second_string),
        fuzz.token_set_ratio(first_string, second_string),
    )
    return best_score >= SCORE_THRESHOLD_FAST
开发者ID:phage-nz,项目名称:ph0neutria,代码行数:19,代码来源:string_utils.py

示例3: fuzzy_fuzzywuzzy

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
    '''Edit-distance based matching of user_input against collection.

    Slower than plain matching, but copes with inputs whose characters
    differ. Returns None when no entry of collection shares any character
    with user_input; otherwise returns whatever process.extract yields for
    the (at most 500) entries sharing the most characters, limited to 20.
    '''
    # Keep only the entries that contain at least one character of the input.
    candidates = [
        entry for entry in collection
        if any(char in entry for char in user_input)
    ]
    if not candidates:
        return None
    unique_candidates = list(set(candidates))

    # Rank the entries by how many characters they share with the input.
    ranked = sorted(
        ((entry, count_same_char(user_input, entry)) for entry in unique_candidates),
        key=lambda pair: pair[1],
        reverse=True,
    )
    ranked = ranked[:500]

    # NOTE(review): the choices handed to process.extract are (entry, count)
    # tuples rather than bare entries -- confirm this is what the scorer expects.
    return process.extract(user_input, ranked, scorer=fuzz.token_set_ratio, limit=20)
开发者ID:yongzhuo,项目名称:nlp_xiaojiang,代码行数:24,代码来源:chatbot_fuzzy.py

示例4: _create_fuzzy_wuzzy_features

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def _create_fuzzy_wuzzy_features(self, df):
    """Append fuzzywuzzy scores and a longest-common-substring ratio to df.

    Every feature is computed row by row from the 'spn_1' and 'spn_2'
    columns and stored as a new column on df. Nothing is returned.
    """
    def _longest_substr_ratio(a, b):
        # Length of the first substring reported by distance.lcsubstrings,
        # divided by the shorter input's length plus one (never zero).
        common = list(distance.lcsubstrings(a, b))
        if not common:
            return 0
        return len(common[0]) / (min(len(a), len(b)) + 1)

    pair_features = (
        ('fuzzy_ratio', fuzz.ratio),
        ('fuzzy_set_ratio', fuzz.token_set_ratio),
        ('fuzzy_partial_ratio', fuzz.partial_ratio),
        ('fuzzy_token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzzy_qratio', fuzz.QRatio),
        ('fuzzy_WRatio', fuzz.WRatio),
        ('longest_substr_ratio', _longest_substr_ratio),
    )
    for column, scorer in pair_features:
        # Bind the scorer as a default so each lambda keeps its own function.
        df[column] = df[['spn_1', 'spn_2']].apply(
            lambda row, scorer=scorer: scorer(row['spn_1'], row['spn_2']),
            axis=1,
        )
开发者ID:zake7749,项目名称:CIKM-AnalytiCup-2018,代码行数:18,代码来源:feature_engineering.py

示例5: extract_stat_features

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def extract_stat_features(self, df):
    """Build the hand-crafted feature matrix for the title pairs in df.

    The "title1_zh" / "title2_zh" columns are filled (missing -> "") and
    run through self.__preprocess__; token and fuzzy features are then added
    as new columns on df. The return value is ``.values`` of df without the
    two title columns and, when present, without the "label" column.
    """
    title_columns = ["title1_zh", "title2_zh"]
    for title_column in title_columns:
        df[title_column] = df[title_column].fillna("").apply(self.__preprocess__)

    print("token features...")
    token_features = df.apply(
        lambda row: self.__get_token_features__(row["title1_zh"], row["title2_zh"]),
        axis=1,
    )
    # Position i of every per-row result feeds the i-th column listed here.
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in token_features]

    print("fuzzy features..")
    pair_scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", self.__get_longest_substr_ratio__),
    )
    for column, scorer in pair_scorers:
        # Bind the scorer as a default so each lambda keeps its own function.
        df[column] = df.apply(
            lambda row, scorer=scorer: scorer(row["title1_zh"], row["title2_zh"]),
            axis=1,
        )

    # The raw text (and the label, when the frame carries one) is not a feature.
    dropped = list(title_columns)
    if 'label' in df.columns.tolist():
        dropped.append("label")
    return df.drop(dropped, axis=1).values
开发者ID:lampts,项目名称:wsdm19cup,代码行数:30,代码来源:make_handcrafted_33_features.py

示例6: worker

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def worker(num,total,foodStrings):
  """Process worker: score one share of the foodList keys and pickle the hits.

  Keys of the module-level foodList are dealt out by enumeration index, so
  this worker only handles the keys whose index i satisfies i % total == num.
  Every (key, foodString) pair whose Levenshtein.ratio exceeds 0.5 is kept as
  a tuple (key, foodList[key], token_set_ratio, levenshtein_ratio).

  Params:
  - num: index of this worker.
  - total: number of workers sharing the key set.
  - foodStrings: iterable of strings to compare against the keys.

  Side effect: writes the list of hits to the file '<num>.p' with pickle.
  Returns None.
  """
  # The share of keys does not depend on the food string, so select it once.
  assignedKeys = [key for (i,key) in enumerate(foodList.keys()) if i%total==num]

  stringMatches = []
  for foodString in foodStrings:
    for key in assignedKeys:
      leven2 = Levenshtein.ratio(foodString,key)
      if leven2>0.5:
        # Only pay for the token-set score on pairs that are actually kept.
        leven1 = fuzz.token_set_ratio(key,foodString)
        stringMatches.append((key,foodList[key],leven1,leven2))

  # Context manager guarantees the result file is flushed and closed.
  with open(str(num)+'.p','wb') as resultFile:
    pickle.dump(stringMatches,resultFile)
  return
开发者ID:schollz,项目名称:extract_recipe,代码行数:15,代码来源:food_string_matching.py

示例7: set_ratio_fuzzywuzzy

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def set_ratio_fuzzywuzzy(str1, str2):
    """Return the fuzz.token_set_ratio score of str1 against str2."""
    score = fuzz.token_set_ratio(str1, str2)
    return score
开发者ID:yongzhuo,项目名称:nlp_xiaojiang,代码行数:4,代码来源:distance_text_or_vec.py

示例8: extract_string_similarity_vector

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def extract_string_similarity_vector(instance: dict):
    """
    Encode a variety of lexical comparison metrics for a sentence pair.

    :param instance: dictionary containing the keys sentence_1 and sentence_2
    :return: a torch tensor with one entry per metric, in the order of the
             ``metrics`` table below (some entries come from ``similarity``
             methods, others from ``distance`` methods)
    """
    first, second = instance['sentence_1'], instance['sentence_2']

    metrics = (
        normalized_levenshtein.similarity,
        jarowinkler.similarity,
        metric_lcs.distance,
        qgram2.distance,
        qgram3.distance,
        qgram4.distance,
        jaccard.similarity,
        cosine.similarity,
        fuzz.partial_token_set_ratio,
        fuzz.partial_token_sort_ratio,
        fuzz.token_set_ratio,
        fuzz.token_sort_ratio,
        fuzz.QRatio,
        fuzz.UQRatio,
        fuzz.UWRatio,
        fuzz.WRatio,
    )
    return torch.tensor([metric(first, second) for metric in metrics])
开发者ID:AndriyMulyar,项目名称:semantic-text-similarity,代码行数:30,代码来源:lexical_similarity_metrics.py

示例9: _scholar_score

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def _scholar_score(txt, bib):
    """Sum token_set_ratio of txt against bib's title, author and abstract.

    Fields missing from bib are skipped. A high score means high similarity.
    """
    from fuzzywuzzy.fuzz import token_set_ratio
    total = 0
    for field in ('title', 'author', 'abstract'):
        if field in bib:
            total += token_set_ratio(bib[field], txt)
    return total
开发者ID:perrette,项目名称:papers,代码行数:6,代码来源:extract.py

示例10: _crossref_score

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def _crossref_score(txt, r):
    """Score how well the text txt matches the record r.

    Adds up fuzzywuzzy token_set_ratio scores of txt against up to three
    parts of r: the space-joined 'family' names found under 'author', the
    first element of 'title', and 'abstract'. Parts missing from r are
    skipped. A high score means high similarity.

    :param txt: text to compare against the record
    :param r: dict-like record, accessed by the keys named above
    :return: sum of the individual scores (0 when no part is present)
    """
    from fuzzywuzzy.fuzz import token_set_ratio
    score = 0
    if 'author' in r:
        author = ' '.join(p['family'] for p in r['author'] if 'family' in p)
        score += token_set_ratio(author, txt)
    # 'title' is indexed as a sequence; a present-but-empty value would raise
    # IndexError on [0], so it is treated the same as a missing title.
    if r.get('title'):
        score += token_set_ratio(r['title'][0], txt)
    if 'abstract' in r:
        score += token_set_ratio(r['abstract'], txt)
    return score
开发者ID:perrette,项目名称:papers,代码行数:14,代码来源:extract.py

示例11: compare_entries

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def compare_entries(e1, e2, fuzzy=False):
    """Assess how similar two entries are and return a duplicate score.

    Equal entries score EXACT_DUPLICATES and equal entry ids score
    GOOD_DUPLICATES. Otherwise the ids are compared position by position,
    looking only at positions where both sides are truthy: FAIR_DUPLICATES
    when all of those agree, PARTIAL_DUPLICATES when some do. If none
    agree, the result is 0 unless ``fuzzy`` is set, in which case it is the
    token_set_ratio of the second id components.
    """
    if e1 == e2:
        return EXACT_DUPLICATES

    id1, id2 = entry_id(e1), entry_id(e2)
    logger.debug('{} ?= {}'.format(id1, id2))

    if id1 == id2:
        return GOOD_DUPLICATES

    # One flag per position where both ids carry a value.
    agreements = [f1 == f2 for f1, f2 in zip(id1, id2) if f1 and f2]

    # all([]) is True, so ids with no jointly defined field also land here.
    if all(agreements):
        return FAIR_DUPLICATES
    if any(agreements):
        return PARTIAL_DUPLICATES
    if not fuzzy:
        return 0

    from fuzzywuzzy.fuzz import token_set_ratio
    _, tag1 = id1
    _, tag2 = id2
    return token_set_ratio(tag1, tag2)
开发者ID:perrette,项目名称:papers,代码行数:32,代码来源:bib.py

示例12: detect_xss

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def detect_xss(self, payload, browser_object, user_screenshot_name,
                   injected_link):
        """Check the HTML source to determine if XSS payload was reflected."""
        # If fuzzy detection chosen, evaluate partial reflection of XSS
        # by tokenizing the HTML source and detecting parts of the payload
        # and source common to both.
        #
        # Other methods of scoring include fuzz.ratio(), fuzz.partial_ratio()
        # and fuzz.token_sort_ratio()
        partial_score = fuzz.token_set_ratio(
            payload.lower(), browser_object.html.lower())
        # Set the level of detection asked for by the user, e.g. Only detect
        # matches with score higher than 50% fuzzy detection
        fuzzy_level = self.user_args.FUZZY_DETECTION

        if payload.lower() in browser_object.html.lower():
            print Color.GREEN + "\n[+] XSS vulnerability found:" + \
                Color.END

            # If user set the --screen flag to target, capture screenshot of
            # payload
            if user_screenshot_name is not None:
                self.take_screenshot(user_screenshot_name,
                                     browser_object, self.screen_index)

            # Add link to list of all positive XSS hits
            self.xss_links.append(injected_link)
            print Color.BLUE + injected_link + Color.END
        # If user enabled fuzzy detection and partial score was larger than
        # fuzz level, add it to partials list and print results
        elif fuzzy_level and (partial_score >= fuzzy_level):
            print Color.YELLOW + \
                "\n[-] Partial XSS vulnerability found:" + Color.END
            print Color.BLUE + injected_link + Color.END
            self.xss_partials.append(injected_link)
            print "Detection score: %s" % partial_score
        else:
            print Color.RED + "\n[+] No XSS detected at: \n" + \
                Color.BLUE + injected_link + Color.END
            if (fuzzy_level):
                print "Detection score: %s" % partial_score 
开发者ID:shogunlab,项目名称:shuriken,代码行数:43,代码来源:shuriken_xss.py

示例13: getStringMatches

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def getStringMatches(foodString):
  """Find matches for foodString by fanning the search out to worker processes.

  The input is printed, commas are replaced by spaces and the text is
  lower-cased. The search terms are the normalized string itself plus every
  2-word combination (when it has more than 2 words) and every 3-word
  combination (when it has more than 3 words).

  NUM_PROCESSORS processes are started with ``worker`` as target and joined.
  Afterwards the pickle file '<index>.p' of each worker index is loaded into
  the combined result and then deleted.

  Params:
  - foodString: the text to look up.

  Returns:
  - list of the collected match tuples, sorted in descending order on
    tuple positions 2 and 3.
  """
  print(foodString)
  foodString = foodString.replace(',',' ').lower()
  foodWords = foodString.split()

  foodStrings = [foodString]
  # A combination size is only used when the phrase has more words than that.
  for size in (2,3):
    if len(foodWords)>size:
      foodStrings.extend(' '.join(words) for words in combinations(foodWords,size))

  totalProcesses = NUM_PROCESSORS
  processes = [
    Process(target=worker, args=(i,totalProcesses,foodStrings,))
    for i in range(totalProcesses)
  ]
  for t in processes:
    t.start()
  for t in processes:
    t.join()

  stringMatches = []
  for i in range(totalProcesses):
    resultPath = str(i)+'.p'
    # Close the handle before deleting the file.
    with open(resultPath,'rb') as resultFile:
      stringMatches.extend(pickle.load(resultFile))
    # os.remove avoids spawning a shell and does not depend on an `rm` binary.
    os.remove(resultPath)

  return sorted(stringMatches, key=operator.itemgetter(2, 3), reverse=True)
开发者ID:schollz,项目名称:extract_recipe,代码行数:49,代码来源:food_string_matching.py

示例14: fuzzy_score_string

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    When the first string is shorter than the second, it is slid across the
    second and the best window score is kept; otherwise the two strings are
    compared whole. That score competes with four fuzzywuzzy scores, and
    the best one is zeroed if below 75, then scaled by 0.85.

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: float) scaled score.
    """
    def blended(left, right, ratio):
        # Mean of a distance-based value (15 points off per edit) and the
        # ratio, with the ratio counted twice.
        return statistics.mean(
            [100 - Levenshtein.distance(left, right) * 15, ratio, ratio])

    if len(first_string) < len(second_string):
        shorter, longer = first_string, second_string
        width = len(shorter)
        score = 0

        # Compare the shorter string with every same-length slice of the longer.
        for start in range(len(longer) - width + 1):
            window = longer[start:start + width]
            l_ratio = Levenshtein.ratio(window, shorter) * 100

            # Weak windows keep their raw ratio; only promising ones are blended.
            result = blended(window, shorter, l_ratio) if l_ratio > 60 else l_ratio

            if result > score:
                score = result
    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = blended(first_string, second_string, l_ratio)

    candidates = [
        score,
        fuzz.ratio(first_string, second_string),
        fuzz.partial_ratio(first_string, second_string),
        fuzz.token_sort_ratio(first_string, second_string),
        fuzz.token_set_ratio(first_string, second_string),
    ]
    best = max(candidates)

    if best < 75:
        best = 0

    return best * 0.85
开发者ID:phage-nz,项目名称:ph0neutria,代码行数:50,代码来源:string_utils.py

示例15: fuzzy_fuzzywuzzy_list

# 需要导入模块: from fuzzywuzzy import fuzz [as 别名]
# 或者: from fuzzywuzzy.fuzz import token_set_ratio [as 别名]
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
    '''Edit-distance based matching of user_input against a shortlist of qa_list.

    Slower than plain matching, but copes with inputs whose characters differ.

    For every entry of collection the number of items of user_input that
    occur in it is counted (repeated items count each time). While scanning,
    each count that exceeds all earlier counts is remembered as a "record".
    The shortlist takes, from the highest record downwards, the qa_list
    entries at the positions whose count equals that record, and stops once
    it holds 5000 entries. The shortlist is then ranked with process.extract.

    Params:
    - fuzz: object providing the token_set_ratio scorer.
    - user_input: the query; iterated item by item.
    - qa_list: entries to shortlist, indexed by position in collection.
    - collection: entries whose shared-item counts drive the shortlist.
    - topn: limit passed to process.extract.

    Returns:
    - whatever process.extract returns for the shortlist.
    '''
    input_chars = list(user_input)

    positions_by_count = {}  # shared-item count -> positions, in scan order
    record_counts = []       # strictly increasing running maxima
    highest = 0
    for position, candidate in enumerate(collection):
        shared = sum(1 for char in input_chars if char in candidate)
        if shared > 0:
            positions_by_count.setdefault(shared, []).append(position)
        if shared > highest:
            record_counts.append(shared)
            highest = shared

    # NOTE(review): positions whose count never was a running maximum are not
    # shortlisted at all -- confirm this is the intended ranking.
    shortlist = []
    for count in reversed(record_counts):
        # Entries for the answer shortlist are taken from qa_list here.
        shortlist.extend(qa_list[position] for position in positions_by_count[count])
        if len(shortlist) >= 5000:
            shortlist = shortlist[:5000]
            break

    # Scorers listed by the original author next to this call: fuzz.WRatio,
    # fuzz.QRatio, fuzz.token_set_ratio, fuzz.token_sort_ratio,
    # fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
    # fuzz.UWRatio, fuzz.UQRatio.
    return process.extract(user_input, shortlist, scorer=fuzz.token_set_ratio, limit=topn)
开发者ID:yongzhuo,项目名称:nlp_xiaojiang,代码行数:59,代码来源:chatbot_fuzzy.py


注:本文中的fuzzywuzzy.fuzz.token_set_ratio方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。