This article collects typical usage examples of the Python class etl.ETLUtils. If you have been wondering what ETLUtils is for or how to use it, the curated examples below may help.
The following 15 code examples of the ETLUtils class are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
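Note: the etl.ETLUtils implementation itself is not reproduced on this page. As a reading aid, here is a minimal sketch of the most frequently used helpers, inferred purely from how the examples below call them; the real signatures and behavior in the etl package may differ (for instance, load_json_file is assumed here to read JSON-lines files, since the Yelp dataset ships one record per line).

import csv
import json


class ETLUtils(object):
    """Minimal sketch of the helpers used below; not the library's real code."""

    @staticmethod
    def load_json_file(file_path):
        # Assumes one JSON record per line (the Yelp dataset format)
        with open(file_path) as json_file:
            return [json.loads(line) for line in json_file]

    @staticmethod
    def save_json_file(file_path, records):
        with open(file_path, 'w') as json_file:
            for record in records:
                json_file.write(json.dumps(record) + '\n')

    @staticmethod
    def save_csv_file(file_path, records, headers, delimiter=','):
        with open(file_path, 'w', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, headers, delimiter=delimiter)
            writer.writeheader()
            writer.writerows(records)

    @staticmethod
    def drop_fields(fields, dictionary_list):
        # Mutates the dictionaries in place
        for dictionary in dictionary_list:
            for field in fields:
                dictionary.pop(field, None)

    @staticmethod
    def select_fields(fields, dictionary_list):
        return [{field: d[field] for field in fields} for d in dictionary_list]

    @staticmethod
    def filter_records(records, field, values):
        return [record for record in records if record[field] in values]

    @staticmethod
    def filter_out_records(records, field, values):
        return [record for record in records if record[field] not in values]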
Example 1: main
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)
    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
Example 2: export_without_context
def export_without_context(self):
    print('%s: exporting to CARSKit binary ratings format without context' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    new_records = []
    numpy.random.seed(0)

    for record in self.records:
        context_na_value = 1
        new_records.append({
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            'context:na': context_na_value,
        })

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na'
    ]

    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
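Assuming the Constants field names resolve to 'user_id', 'item_id' and 'rating' (hypothetical values; the actual constants live elsewhere in the project), the exported file follows the CARSKit binary ratings layout, roughly:

user_id,item_id,rating,context:na
101,35,4.0,1
101,48,2.0,1

The constant context:na = 1 column marks every rating as having no applicable context, which is what "without context" means here.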
Example 3: update_labeled_reviews_records
def update_labeled_reviews_records():
    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
    #     print(record)

    print('number of records before: %d' % len(classifier_records))
    print(reviews_label_map)
    print(non_agreed_review_ids)

    review_type_map = {'s': 'yes', 'g': 'no'}

    # Remove from the classifier records the ones that don't have an
    # agreed label
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Finally, update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
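The labels 's' and 'g' come from the manual annotations being reconciled; given the Constants.SPECIFIC field they are mapped into, they presumably stand for "specific" and "generic" reviews.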
Example 4: run
def run(self, dataset, output_folder, train_records, test_records, train_reviews=None, test_reviews=None):
    contextual_train_set, contextual_test_set = self.full_cycle(
        train_records, test_records, train_reviews, test_reviews
    )
    print("Prepared data: %s" % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # json_train_file = output_folder + 'yelp_' + dataset + '_context_shuffled_train5.json'
    csv_train_file = output_folder + "yelp_" + dataset + "_context_shuffled_train5.csv"
    # json_test_file = output_folder + 'yelp_' + dataset + '_context_shuffled_test5.json'
    csv_test_file = output_folder + "yelp_" + dataset + "_context_shuffled_test5.csv"

    # ETLUtils.save_json_file(json_train_file, contextual_train_set)
    ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)
    # ETLUtils.save_json_file(json_test_file, contextual_test_set)
    ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)
    print("Exported CSV and JSON files: %s" % time.strftime("%Y/%m/%d-%H:%M:%S"))

    csv_files = [csv_train_file, csv_test_file]
    num_cols = len(self.headers)
    context_cols = num_cols
    print("num_cols", num_cols)
    # print('context_cols', context_cols)

    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], range(3, context_cols), ",", has_header=True, suffix=".no_context.libfm"
    )
    libfm_converter.csv_to_libfm(csv_files, 0, [1, 2], [], ",", has_header=True, suffix=".context.libfm")
    print("Exported LibFM files: %s" % time.strftime("%Y/%m/%d-%H:%M:%S"))
Example 5: drop_unwanted_fields
def drop_unwanted_fields(dictionary_list):
"""
Drops fields that are not useful for data analysis in the business
data set
:rtype : void
:param dictionary_list: the list of dictionaries containing the data
"""
unwanted_fields = [
'attributes',
'business_id',
'categories',
'city',
'full_address',
'latitude',
'longitude',
'hours',
'name',
'neighborhoods',
'open',
'review_count',
'stars',
'state',
'type'
]
ETLUtils.drop_fields(unwanted_fields, dictionary_list)
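A typical call site might look like this (hypothetical; assumes the records come from the Yelp Challenge business file):

records = ETLUtils.load_json_file('yelp_academic_dataset_business.json')
BusinessETL.drop_unwanted_fields(records)
# each record now keeps only the fields relevant to the analysis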
Example 6: multiple_lineal_regression
def multiple_lineal_regression(file_path):
    records = ReviewETL.load_file(file_path)
    ratings = np.array([record['stars'] for record in records])
    ETLUtils.drop_fields(['stars'], records)
    # list() so the remaining dict values form a proper feature row in Python 3
    data = np.array([list(record.values()) for record in records])

    # Create the linear regression object and train it on the full data set
    model = linear_model.LinearRegression(fit_intercept=True)
    model.fit(data, ratings)

    p = model.predict(data)
    e = p - ratings
    total_error = np.dot(e, e)
    rmse_train = np.sqrt(total_error / len(p))

    # 10-fold cross-validation (sklearn < 0.18 KFold API)
    kf = KFold(len(data), n_folds=10)
    err = 0
    for train, test in kf:
        model.fit(data[train], ratings[train])
        p = model.predict(data[test])
        e = p - ratings[test]
        err += np.dot(e, e)
    rmse_10cv = np.sqrt(err / len(data))

    print('RMSE on training: {}'.format(rmse_train))
    print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
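Both printed figures are root-mean-square errors, RMSE = sqrt((1/N) * Σ(pᵢ − yᵢ)²). The first is computed on the same data the model was fitted to, so the 10-fold cross-validation figure is the more realistic estimate of prediction error.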
Example 7: export_records
def export_records(self):
    print('%s: export records' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    self.dictionary.save(Constants.DICTIONARY_FILE)
    ETLUtils.save_json_file(
        Constants.FULL_PROCESSED_RECORDS_FILE, self.records)
    self.drop_unnecessary_fields()
    ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)
Example 8: main_evaluate
def main_evaluate():
    I = my_i
    records = ETLUtils.load_json_file(RECORDS_FILE)
    # print('num_records', len(records))

    test_file = RECORDS_FILE + '_test'
    test_records = ETLUtils.load_json_file(test_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.find_important_records()
    # top_n_evaluator.initialize()

    # records_to_predict_file = DATASET_FOLDER + 'generated/records_to_predict_' + DATASET + '.json'
    top_n_evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions_file = GENERATED_FOLDER + 'predictions_' + DATASET + '.txt'
    predictions = rmse_calculator.read_targets_from_txt(predictions_file)
    # print('total predictions', len(predictions))

    top_n_evaluator.evaluate(predictions)
    # print('precision', top_n_evaluator.precision)
    print('recall', top_n_evaluator.recall)

    return top_n_evaluator.recall
Example 9: export_records_to_predict
def export_records_to_predict(self, records_file):
    if self.records_to_predict is None:
        self.records_to_predict = self.get_records_to_predict()
    ETLUtils.save_json_file(records_file, self.records_to_predict)
    with open(records_file + '.pkl', 'wb') as write_file:
        pickle.dump(
            self.items_to_predict, write_file, pickle.HIGHEST_PROTOCOL)
Example 10: analyze_context_records
def analyze_context_records():
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    records = ETLUtils.filter_records(records, 'context_type', ['context'])
    print('num records: %d' % len(records))

    for record in records:
        print(record[Constants.TEXT_FIELD])
Example 11: get_categories
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)
    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)
    return records[0].keys()
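ETLUtils.add_transpose_list_column is not shown on this page. Since the call above makes each distinct category appear as a key of records[0], a plausible, purely hypothetical reconstruction would expand the list-valued column into one 0/1 indicator column per distinct value:

@staticmethod
def add_transpose_list_column(field, dictionary_list):
    # Hypothetical sketch: add one 0/1 indicator column per distinct value
    # found in the list-valued field (e.g. one column per business category)
    all_values = set()
    for dictionary in dictionary_list:
        all_values.update(dictionary[field])
    for dictionary in dictionary_list:
        for value in all_values:
            dictionary[value] = 1 if value in dictionary[field] else 0
    return dictionary_list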
Example 12: parallel_run_topn_test
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)

    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    args = itertools.product(
        [records],
        recommenders,
        [top_n],
        [num_folds],
        [split],
        [min_like_score],
        [binary_reviews],
        [reviews_type]
    )

    print('Total recommenders: %d' % len(recommenders))

    pool = Pool()
    print('Total CPUs: %d' % pool._processes)
    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp
    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
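Note the itertools.product call: every argument except recommenders is wrapped in a single-element list, so the product yields exactly one argument tuple per recommender. This is the usual trick for feeding a multi-argument function through Pool.map, which only accepts a single iterable; run_topn_test_wrapper presumably unpacks each tuple before calling the real test function.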
Example 13: main
def main():
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    document_levels = ['review', 'sentence', 1]

    num_cycles = len(my_resamplers) * len(my_classifiers) * len(document_levels)
    index = 1
    results_list = []

    for document_level in document_levels:
        Constants.DOCUMENT_LEVEL = document_level
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)
        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(my_resamplers, my_classifiers):
            print('Cycle %d/%d' % (index, num_cycles))
            classification_results =\
                test_classifier(x_matrix, y_vector, resampler, classifier)
            results_list.append(classification_results)
            index += 1

    for results in results_list:
        print(results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
        '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
Example 14: test_select_fields
def test_select_fields(self):
    select_fields = ['user_id', 'offering_id', 'overall_rating']
    result = ETLUtils.select_fields(select_fields, reviews_matrix_5)
    self.assertEqual(result, reviews_matrix_5_short)

    select_fields = ['user_id']
    result = ETLUtils.select_fields(select_fields, reviews_matrix_5_short)
    self.assertEqual(result, reviews_matrix_5_users)
Example 15: drop_unnecessary_fields
def drop_unnecessary_fields(self):
    print('%s: drop unnecessary fields' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    unnecessary_fields = [
        Constants.TEXT_FIELD,
        Constants.POS_TAGS_FIELD,
        Constants.VOTES_FIELD,
        Constants.BOW_FIELD
    ]

    ETLUtils.drop_fields(unnecessary_fields, self.records)