本文整理汇总了Python中py_stringmatching.simfunctions.tfidf函数的典型用法代码示例。如果您正苦于以下问题：Python tfidf函数的具体用法？Python tfidf怎么用？Python tfidf使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了tfidf函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_valid_input
def test_valid_input(self):
    """Check tfidf scores for representative token lists.

    Covers calls with an explicit corpus, with the fourth positional flag
    set (presumably ``dampen`` -- confirm against the tfidf signature),
    without a corpus, with a corpus that shares no tokens with the inputs,
    with identical inputs, and with an empty first input.

    The scores are floats, so they are compared with ``assertAlmostEqual``
    instead of exact equality; last-digit differences between platforms or
    math libraries should not fail the test.
    """
    # Explicit corpus plus the fourth positional flag.
    self.assertAlmostEqual(
        tfidf(['a', 'b', 'a'], ['a', 'c'],
              [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True),
        0.11166746710505392)

    # Explicit corpus only.
    self.assertAlmostEqual(
        tfidf(['a', 'b', 'a'], ['a', 'c'],
              [['a', 'b', 'a'], ['a', 'c'], ['a']]),
        0.17541160386140586)
    self.assertAlmostEqual(
        tfidf(['a', 'b', 'a'], ['a'],
              [['a', 'b', 'a'], ['a', 'c'], ['a']]),
        0.5547001962252291)

    # No corpus supplied.
    self.assertAlmostEqual(tfidf(['a', 'b', 'a'], ['a']),
                           0.7071067811865475)

    # Corpus that contains none of the input tokens.
    self.assertAlmostEqual(
        tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]),
        0.0)

    # Identical inputs, and an empty first input.
    self.assertAlmostEqual(tfidf(['a', 'b', 'a'], ['a', 'b', 'a']), 1.0)
    self.assertAlmostEqual(tfidf([], ['a', 'b', 'a']), 0.0)
示例2: test_invalid_input4
def test_invalid_input4(self):
    """Call tfidf with ``None`` as the second token list.

    Nothing is asserted here; whatever tfidf raises propagates to the test
    runner.
    NOTE(review): presumably an expected-exception check is attached
    outside this excerpt -- confirm.
    """
    first_tokens = ['a']
    second_tokens = None
    tfidf(first_tokens, second_tokens)
示例3: test_invalid_input1
def test_invalid_input1(self):
    """Call tfidf with integers in place of both token lists.

    Nothing is asserted here; whatever tfidf raises propagates to the test
    runner.
    NOTE(review): presumably an expected-exception check is attached
    outside this excerpt -- confirm.
    """
    not_a_token_list = 1
    tfidf(not_a_token_list, not_a_token_list)
示例4: Score
# Fragment of a pairwise feature-extraction routine. The enclosing function/loop
# header is not part of this excerpt, so id1, id2, pair, product_dict, match_dict,
# classlabels, id and productName_courpus are all defined outside the visible code.
attribute_id1 = product_dict[id1]
attribute_id2 = product_dict[id2]
# NOTE(review): `id` is a list defined outside this excerpt; the name shadows the builtin id().
id.append([id1,id2])
# Class label: 1 when the pair is labelled 'MATCH', otherwise 0.
if (match_dict[pair] == 'MATCH'):
classlabels.append(1)
else:
classlabels.append(0)
#### Feature: Product Name ---- Jaccard score (word boundary, 3-gram), edit distance, tf/idf
# Scores are only computed when both records carry the attribute; otherwise every score is 0.
if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]))
jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3))
# tf/idf over delimiter tokens, weighted against the product-name corpus.
tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus)
edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
# Turn the edit distance into a similarity by dividing by the longer name's length.
# NOTE(review): if levenshtein returns an int and `/` is integer division in this module,
# the quotient truncates; the division also fails when both names are empty -- confirm.
edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0]))
else:
jaccard_productName = 0
jaccard3gram_productName = 0
tfidf_productName = 0
edit_productName = 0
#### Feature: Manufacturer
if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
# Unlike the Product Name tf/idf above, no corpus is passed here.
tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
else:
jaccard_manufacturer = 0
示例5: generate_feature
# Build similarity features for every record pair listed in `filename`.
# Pass 1 reads the file and collects tokenised "Product Name" and "Brand" values into two
# corpora; pass 2 re-reads the file and, per pair, appends per-attribute similarity scores
# to `instance`.
# NOTE(review): the excerpt is truncated (see the marker on the last line), so the return
# value and any use of brand_courpus / feature_matrix are not visible here.
def generate_feature(filename):
productName_courpus = []
brand_courpus = []
# Pass 1: collect the corpora.
with open(filename, 'r') as f:
for line in f:
# Each line is split on '?'; fields 2 and 4 are parsed as JSON and then used as
# mappings from attribute name to an indexable value (only element [0] is read).
list_line = line.split('?')
# NOTE(review): the `encoding` keyword of json.loads was removed in Python 3.9.
attribute_id1 = json.loads(list_line[2], encoding = 'latin-1')
attribute_id2 = json.loads(list_line[4], encoding = 'latin-1')
if "Product Name" in attribute_id1:
productName_courpus.append(tokenizers.delimiter(attribute_id1["Product Name"][0]))
if "Product Name" in attribute_id2:
productName_courpus.append(tokenizers.delimiter(attribute_id2["Product Name"][0]))
if "Brand" in attribute_id1:
brand_courpus.append(tokenizers.delimiter(attribute_id1["Brand"][0]))
if "Brand" in attribute_id2:
brand_courpus.append(tokenizers.delimiter(attribute_id2["Brand"][0]))
feature_matrix = []
# Pass 2: compute the features for each pair.
with open(filename, 'r') as f:
# 1-based pair counter, used only for the progress message.
i = 1
for line in f:
list_line = line.split('?')
attribute_id1 = json.loads(list_line[2], encoding = 'latin-1')
attribute_id2 = json.loads(list_line[4], encoding = 'latin-1')
# Progress output (Python 2 print statement).
print 'Generate features for pair', i
i = i+1
instance = []
# Product Name: 4 features (delimiter-token Jaccard, 3-gram Jaccard, tf/idf with corpus, edit similarity).
# For every attribute below, scores default to 0 unless both records carry the attribute.
if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]))
jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3))
tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus)
edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
# Turn the edit distance into a similarity by dividing by the longer name's length.
# NOTE(review): under Python 2 (see the print statement above) `/` on two ints floors; if
# levenshtein returns an int and true division is not enabled, this collapses to 0 or 1.
# The division also fails when both names are empty -- confirm.
edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0]))
else:
jaccard_productName = 0
jaccard3gram_productName = 0
tfidf_productName = 0
edit_productName = 0
instance += [jaccard_productName, jaccard3gram_productName, tfidf_productName, edit_productName]
# Manufacturer: 3 features (delimiter-token Jaccard, 3-gram Jaccard, tf/idf without a corpus).
if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
else:
jaccard_manufacturer = 0
jaccard3gram_manufacturer = 0
tfidf_manufacturer = 0
instance += [jaccard_manufacturer, jaccard3gram_manufacturer, tfidf_manufacturer]
# Color: 3 features (delimiter-token Jaccard, 3-gram Jaccard, tf/idf without a corpus).
if ("Color" in attribute_id1 and "Color" in attribute_id2):
jaccard_color = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0]))
jaccard3gram_color = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Color"][0], 3), tokenizers.qgram(attribute_id2["Color"][0], 3))
tfidf_color = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0]))
else:
jaccard_color = 0
jaccard3gram_color = 0
tfidf_color = 0
instance += [jaccard_color, jaccard3gram_color, tfidf_color]
# Product Type: 3 features (delimiter-token Jaccard, 3-gram Jaccard, tf/idf without a corpus).
if ("Product Type" in attribute_id1 and "Product Type" in attribute_id2):
jaccard_productType = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0]))
jaccard3gram_productType = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Type"][0], 3),tokenizers.qgram(attribute_id2["Product Type"][0], 3))
tfidf_productType = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0]))
else:
jaccard_productType = 0
jaccard3gram_productType = 0
tfidf_productType = 0
instance += [jaccard_productType, jaccard3gram_productType, tfidf_productType]
# Product Segment: 3 features (exact-match flag, delimiter-token Jaccard, 3-gram Jaccard).
if "Product Segment" in attribute_id1 and "Product Segment" in attribute_id2:
jaccard_productSegment = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Segment"][0]),tokenizers.delimiter(attribute_id2["Product Segment"][0]))
jaccard3gram_productSegment= simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Segment"][0], 3),tokenizers.qgram(attribute_id2["Product Segment"][0], 3))
# Exact-match flag on the first value of the attribute.
if (attribute_id1["Product Segment"][0] == attribute_id2["Product Segment"][0]):
exactMatch_productSegment = 1
else:
exactMatch_productSegment = 0
else:
exactMatch_productSegment = 0
jaccard_productSegment = 0
jaccard3gram_productSegment = 0
instance += [exactMatch_productSegment, jaccard_productSegment, jaccard3gram_productSegment]
# Brand: 4 features (the computation itself is cut off in this excerpt).
if ("Brand" in attribute_id1 and "Brand" in attribute_id2):
# ......... part of the code is omitted here .........
示例6: time_medium_large_wi_rep
def time_medium_large_wi_rep(self):
    """Time tfidf on the medium and large ``wi_rep`` token lists.

    The call passes the shared corpus (``self.corpus_list``) and enables
    dampening. The ``wi_rep`` inputs are used so that the measurement
    matches the benchmark's name; the ``wo_rep`` pair with a corpus is timed
    by ``time_medium_large_wo_rep_no_dampen``.
    """
    simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep,
                       corpus_list=self.corpus_list, dampen=True)
示例7: time_medium_large_wo_rep_no_dampen
def time_medium_large_wo_rep_no_dampen(self):
    """Time tfidf on the medium and large ``wo_rep`` token lists.

    The shared corpus is supplied; ``dampen`` is left at its default.
    """
    shared_corpus = self.corpus_list
    simfunctions.tfidf(
        _med_num_tokens_wo_rep,
        _large_num_tokens_wo_rep,
        corpus_list=shared_corpus
    )
示例8: time_small_large_wo_rep_no_corpus_no_dampen
def time_small_large_wo_rep_no_corpus_no_dampen(self):
    """Time tfidf on the small and large ``wo_rep`` token lists.

    Neither a corpus nor the ``dampen`` flag is passed, so both keep their
    defaults.
    """
    simfunctions.tfidf(
        _small_num_tokens_wo_rep,
        _large_num_tokens_wo_rep
    )
示例9: time_small_medium_wo_rep_no_dampen
def time_small_medium_wo_rep_no_dampen(self):
    """Time tfidf on the small and medium ``wo_rep`` token lists.

    The shared corpus is supplied; ``dampen`` is left at its default.
    """
    shared_corpus = self.corpus_list
    simfunctions.tfidf(
        _small_num_tokens_wo_rep,
        _med_num_tokens_wo_rep,
        corpus_list=shared_corpus
    )
示例10: time_small_large_wi_rep_no_dampen
def time_small_large_wi_rep_no_dampen(self):
    """Time tfidf on the small and large ``wi_rep`` token lists.

    The shared corpus is supplied; ``dampen`` is left at its default.
    """
    shared_corpus = self.corpus_list
    simfunctions.tfidf(
        _small_num_tokens_wi_rep,
        _large_num_tokens_wi_rep,
        corpus_list=shared_corpus
    )
示例11: time_medium_large_wi_rep_no_corpus
def time_medium_large_wi_rep_no_corpus(self):
    """Time tfidf on the medium and large ``wi_rep`` token lists, no corpus.

    Dampening is enabled and no corpus is passed. The ``wi_rep`` inputs are
    used so that the measurement matches the benchmark's name.
    """
    simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep,
                       dampen=True)
示例12: time_small_large_wi_rep_no_corpus
def time_small_large_wi_rep_no_corpus(self):
    """Time tfidf on the small and large ``wi_rep`` token lists, no corpus.

    Dampening is enabled; no corpus is passed.
    """
    simfunctions.tfidf(
        _small_num_tokens_wi_rep,
        _large_num_tokens_wi_rep,
        dampen=True
    )
示例13: time_small_medium_wo_rep_no_corpus
def time_small_medium_wo_rep_no_corpus(self):
    """Time tfidf on the small and medium ``wo_rep`` token lists, no corpus.

    Dampening is enabled; no corpus is passed.
    """
    simfunctions.tfidf(
        _small_num_tokens_wo_rep,
        _med_num_tokens_wo_rep,
        dampen=True
    )
示例14: test_invalid_input2
def test_invalid_input2(self):
    """Call tfidf with ``None`` as the first token list.

    Nothing is asserted here; whatever tfidf raises propagates to the test
    runner.
    NOTE(review): presumably an expected-exception check is attached
    outside this excerpt -- confirm.
    """
    first_tokens = None
    second_tokens = ['b']
    tfidf(first_tokens, second_tokens)
示例15: time_small_medium_wi_rep
def time_small_medium_wi_rep(self):
    """Time tfidf on the small and medium ``wi_rep`` token lists.

    The shared corpus is supplied and dampening is enabled.
    """
    shared_corpus = self.corpus_list
    simfunctions.tfidf(
        _small_num_tokens_wi_rep,
        _med_num_tokens_wi_rep,
        corpus_list=shared_corpus,
        dampen=True
    )