This page collects typical usage examples of the Python function py_entitymatching.io.parsers.read_csv_metadata. If you are wondering how read_csv_metadata is called in practice, or what real code that uses it looks like, the hand-picked examples below may help.
A total of 15 code examples of the read_csv_metadata function are shown below, sorted by popularity by default.
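Before the examples, here is a minimal sketch of the three call patterns that recur throughout them: loading a table and letting the key be picked up from an accompanying metadata file, setting the key explicitly, and loading a candidate set whose foreign keys point back to two previously loaded tables. The file names tableA.csv, tableB.csv, and candset.csv are hypothetical placeholders, not files from the test suite below.

import py_entitymatching as em

# Load a table; if a matching .metadata file sits next to the CSV,
# the key (and other properties) are read from it.
A = em.read_csv_metadata('tableA.csv')

# Load a table and set its key column explicitly.
B = em.read_csv_metadata('tableB.csv', key='ID')

# Load a candidate set and register its key plus the foreign keys
# that reference the two base tables A and B.
C = em.read_csv_metadata('candset.csv', key='_id',
                         ltable=A, rtable=B,
                         fk_ltable='ltable_ID', fk_rtable='rtable_ID')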
Example 1: test_eval_matches_valid_3
def test_eval_matches_valid_3(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = len(C1)
    num_zeros = len(C1) - num_ones
    gold = [0]*num_ones
    # gold.extend([1]*num_zeros)
    predicted = [1]* (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln+1, 'predicted', predicted)
    D = pd.DataFrame(columns=C1.columns)
    cm.copy_properties(C, D)
    result = eval_matches(D, 'gold', 'predicted')
    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 0)
    self.assertEqual(result['prec_denominator'], 0)
    self.assertAlmostEqual(result['precision'], 0)
    self.assertEqual(result['recall_numerator'], 0)
    self.assertEqual(result['recall_denominator'], 0)
    self.assertEqual(result['recall'], 0)
    self.assertEqual(result['f1'], 0)
    self.assertEqual(result['pred_pos_num'], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_neg_ls']), 0)
Example 2: test_select_matcher_valid_2
def test_select_matcher_valid_2(self):
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    # fk_rtable='rtable.id', key='_id')
    # labels = [0] * 7
    # labels.extend([1] * 8)
    # C['labels'] = labels
    # feature_table = get_features_for_matching(A, B)
    # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
    # feature_vectors.fillna(0, inplace=True)
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]
    col_list = list(feature_vectors.columns)
    l = list_diff(col_list, [cm.get_key(feature_vectors), cm.get_fk_ltable(feature_vectors),
                             cm.get_fk_rtable(feature_vectors),
                             'gold'])
    X = feature_vectors[l]
    Y = feature_vectors['gold']
    result = select_matcher(matchers, x=X, y=Y)
    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['drill_down_cv_stats']['precision']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
    d = result_df.set_index('Name')
    p_max = d.ix[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
Example 3: test_feature_fn_valid_nosim_tok
def test_feature_fn_valid_nosim_tok(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B, validate_inferred_attr_types=False)
    len1 = len(feature_table)
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    f_dict = get_feature_fn(feature_string, dict(), dict())
Example 4: test_valid_path_df_metadata_split_betn_file_kw
def test_valid_path_df_metadata_split_betn_file_kw(self):
    cm.del_catalog()
    del_files_in_dir(sndbx_path)
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    path_c = os.sep.join([io_datasets_path, 'C_partialmeta.csv'])
    C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable_ID')
Example 5: test_select_matcher_valid_1
def test_select_matcher_valid_1(self):
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    # fk_rtable='rtable.id', key='_id')
    # C['labels'] = labels
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    # xgmatcher = XGBoostMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher,
                logregmatcher]
    result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                            target_attr='gold', k=7)
    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['drill_down_cv_stats']['precision']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
    d = result_df.set_index('Name')
    p_max = d.ix[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
Example 6: test_assemble_topk_table_2
def test_assemble_topk_table_2(self):
    A = read_csv_metadata(path_a, key='ID')
    B = read_csv_metadata(path_b, key='ID')
    A_key = em.get_key(A)
    B_key = em.get_key(B)
    topk_heap = [(0.2727272727272727, 1, 0), (0.23076923076923078, 0, 4),
                 (0.16666666666666666, 0, 3)]
    ret_dataframe = db._assemble_topk_table(topk_heap, A, B, A_key, B_key)
    expected_columns = ['_id', 'ltable_ID', 'rtable_ID',
                        'ltable_name', 'ltable_birth_year',
                        'ltable_hourly_wage',
                        'ltable_address', 'ltable_zipcode', 'rtable_name',
                        'rtable_birth_year', 'rtable_hourly_wage',
                        'rtable_address', 'rtable_zipcode']
    self.assertEqual(len(ret_dataframe), 3)
    self.assertEqual(list(ret_dataframe.columns), expected_columns)
    expected_recs = [[0, 'a2', 'b1', 'Michael Franklin',
                      1988, 27.5, '1652 Stockton St, San Francisco',
                      94122, 'Mark Levene', 1987, 29.5,
                      '108 Clement St, San Francisco', 94107],
                     [1, 'a1', 'b5', 'Kevin Smith',
                      1989, 30.0, '607 From St, San Francisco', 94107,
                      'Alfons Kemper', 1984, 35.0,
                      '170 Post St, Apt 4, San Francisco', 94122],
                     [2, 'a1', 'b4', 'Kevin Smith',
                      1989, 30.0, '607 From St, San Francisco', 94107,
                      'Joseph Kuan', 1982, 26.0,
                      '108 South Park, San Francisco', 94122]]
    self.assertEqual(list(ret_dataframe.ix[0]), expected_recs[0])
    self.assertEqual(list(ret_dataframe.ix[1]), expected_recs[1])
    self.assertEqual(list(ret_dataframe.ix[2]), expected_recs[2])
Example 7: test_select_matcher_valid_multiple_metrics
def test_select_matcher_valid_multiple_metrics(self):
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]
    result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                            target_attr='gold', k=7)
    header = ['Name', 'Matcher', 'Num folds']
    result_df_p = result['drill_down_cv_stats']['precision']
    result_df_f = result['drill_down_cv_stats']['f1']
    result_df_r = result['drill_down_cv_stats']['recall']
    # Check header of precision dataframe
    self.assertEqual(set(header) == set(list(result_df_p.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df_p.columns[len(result_df_p.columns) - 1])
    # Check header of f1 dataframe
    self.assertEqual(set(header) == set(list(result_df_f.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df_f.columns[len(result_df_f.columns) - 1])
    # Check header of recall dataframe
    self.assertEqual(set(header) == set(list(result_df_r.columns[[0, 1, 2]])), True)
    self.assertEqual('Mean score', result_df_p.columns[len(result_df_r.columns) - 1])
    d = result_df_p.set_index('Name')
    p_max = d.ix[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
Example 8: test_check_table_order_invalid_df2
def test_check_table_order_invalid_df2(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    l_attr_types = au.get_attr_types(A)
    r_attr_types = au.get_attr_types(B)
    attr_corres = au.get_attr_corres(A, B)
    status = afg._check_table_order(A, None, l_attr_types, r_attr_types, attr_corres)
Example 9: test_select_matcher_target_attr_not_present
def test_select_matcher_target_attr_not_present(self):
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    # fk_rtable='rtable.id', key='_id')
    # labels = [0] * 7
    # labels.extend([1] * 8)
    # C['labels'] = labels
    # feature_table = get_features_for_matching(A, B)
    # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
    # feature_vectors.fillna(0, inplace=True)
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]
    col_list = list(feature_vectors.columns)
    l = list_diff(col_list, [cm.get_fk_ltable(feature_vectors),
                             cm.get_fk_rtable(feature_vectors)
                             ])
    feature_vectors = feature_vectors[l]
    result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                            exclude_attrs='_id',
                            target_attr='labels1', k=2)
Example 10: test_eval_matches_valid_2
def test_eval_matches_valid_2(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = 1
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)
    result = eval_matches(C1, 'predicted', 'gold')
    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 14)
    self.assertEqual(result['prec_denominator'], 14)
    self.assertAlmostEqual(result['precision'], 1)
    self.assertEqual(result['recall_numerator'], 14)
    self.assertEqual(result['recall_denominator'], 15)
    self.assertEqual(result['recall'], 0.9333333333333333)
    self.assertEqual(result['f1'], 0.9655172413793104)
    self.assertEqual(result['pred_pos_num'], 14)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 1)
    self.assertEqual(result['false_neg_num'], 1.0)
    self.assertEqual(len(result['false_neg_ls']), 1)
    t = result['false_neg_ls'][0]
    self.assertEqual(t[0], 'a1')
    self.assertEqual(t[1], 'b1')
Example 11: test_ml_matcher_return_probs_true_predict_diff_colname
def test_ml_matcher_return_probs_true_predict_diff_colname(self):
    A = read_csv_metadata(fpath_a, key='id')
    B = read_csv_metadata(fpath_b, key='id')
    feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B)
    train_test = mu.split_train_test(feature_vectors)
    train, test = train_test['train'], train_test['test']
    dt = DTMatcher(name='DecisionTree')
    train.drop('ltable.id', axis=1, inplace=True)
    train.drop('rtable.id', axis=1, inplace=True)
    test.drop('ltable.id', axis=1, inplace=True)
    test.drop('rtable.id', axis=1, inplace=True)
    test.drop('gold', axis=1, inplace=True)
    dt.fit(table=train, exclude_attrs='_id', target_attr='gold')
    predictions = dt.predict(table=test, exclude_attrs='_id',
                             target_attr='predicted', probs_attr='probas',
                             inplace=False, append=True, return_probs=True)
    self.assertNotEqual(id(predictions), id(test))
    self.assertEqual(len(predictions), len(test))
    self.assertEqual(set(list(test.columns)).issubset(list(predictions.columns)), True)
    p_col = predictions.columns[len(predictions.columns) - 2]
    self.assertEqual(p_col, 'predicted')
    r_col = predictions.columns[len(predictions.columns) - 1]
    self.assertEqual(r_col, 'probas')
    self.assertEqual(sum((predictions[r_col] >= 0.0) & (predictions[r_col] <= 1.0)),
                     len(predictions))
Example 12: test_debugblocker_14
def test_debugblocker_14(self):
    path_ltable = os.sep.join([debugblocker_datasets_path,
                               'test_debugblocker_ltable.csv'])
    path_rtable = os.sep.join([debugblocker_datasets_path,
                               'test_debugblocker_rtable.csv'])
    path_cand = os.sep.join([debugblocker_datasets_path,
                             'test_debugblocker_cand.csv'])
    ltable = read_csv_metadata(path_ltable, key='ID')
    rtable = read_csv_metadata(path_rtable, key='book_id')
    cand_set = read_csv_metadata(path_cand, ltable=ltable, rtable=rtable,
                                 fk_ltable='ltable_ID',
                                 fk_rtable='rtable_book_id',
                                 key='_id')
    attr_corres = [('title', 'book_title'), ('price', 'price'),
                   ('desc', 'description'), ('genre', 'book_genre'),
                   ('year', 'pub_year'), ('lang', 'language'),
                   ('author', 'author'), ('publisher', 'publisher')]
    output_size = 1
    ret_dataframe = db.debug_blocker(cand_set, ltable, rtable,
                                     output_size, attr_corres)
    expected_columns = ['_id', 'ltable_ID', 'rtable_book_id',
                        'ltable_title', 'ltable_desc', 'ltable_year',
                        'ltable_lang', 'ltable_author', 'ltable_publisher',
                        'rtable_book_title', 'rtable_description',
                        'rtable_pub_year', 'rtable_language',
                        'rtable_author', 'rtable_publisher']
    self.assertEqual(list(ret_dataframe.columns), expected_columns)
    ret_record = list(ret_dataframe.ix[0])
    expected_record = [0, 1, 'B001', 'data analysis', 'introduction to data analysis',
                       2015, 'ENG', 'Jane Doe', 'BCD publisher', 'introduction to data analysis',
                       float('nan'), 'English', 'introduction to data analysis', 'John Doe', 'ABC publisher10.00']
    print(ret_record)
    print(expected_record)
    self.assertEqual(expected_record[2], ret_record[2])
    self.assertEqual(expected_record[3], ret_record[3])
Example 13: test_select_matcher_valid_cv_stats_3
def test_select_matcher_valid_cv_stats_3(self):
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]
    result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                            exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                            metric_to_select_matcher='recall',
                            metrics_to_display='recall',
                            target_attr='gold', k=7)
    header = ['Matcher', 'Average recall']
    result_df = result['cv_stats']
    result_df_r = result['drill_down_cv_stats']['recall']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1]])), True)
    d = result_df.set_index('Matcher')
    p_max = d.ix[result['selected_matcher'].name, 'Average recall']
    a_max = pd.np.max(result_df_r['Mean score'])
    self.assertEqual(p_max, a_max)
Example 14: test_validate_attr_types_invalid_corres
def test_validate_attr_types_invalid_corres(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    l_attr_types = au.get_attr_types(A)
    r_attr_types = au.get_attr_types(B)
    # attr_corres = au.get_attr_corres(A, B)
    response = afg.validate_attr_types(l_attr_types, r_attr_types, None)
Example 15: test_blocker_combiner_valid_8
def test_blocker_combiner_valid_8(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_1.csv']), ltable=A, rtable=B)
    C1.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
    C1.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C1, 'ltable_ID')
    cm.set_fk_rtable(C1, 'rtable_ID')
    C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_2.csv']), ltable=A, rtable=B)
    C2.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
    C2.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C2, 'ltable_ID')
    cm.set_fk_rtable(C2, 'rtable_ID')
    C = combine_blocker_outputs_via_union([C1, C2], 'l_', 'r_')
    C_exp = read_csv_metadata(os.sep.join([bc_datasets_path, 'C_ex_4.csv']), ltable=A, rtable=B)
    C_exp.rename(columns={'l_ID':'ltable_ID'}, inplace=True)
    C_exp.rename(columns={'r_ID':'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C_exp, 'ltable_ID')
    cm.set_fk_rtable(C_exp, 'rtable_ID')
    # C_exp.sort_values(['l_ID', 'r_ID'], inplace=True)
    # C_exp.reset_index(inplace=True, drop=True)
    # C_exp['_id'] = six.moves.range(0, len(C_exp))
    # C_exp.drop('r_address', axis=1, inplace=True)
    if os.name != 'nt':
        self.assertEqual(C.equals(C_exp), True)
    p1 = cm.get_all_properties(C)
    p2 = cm.get_all_properties(C_exp)
    self.assertEqual(p1, p2)