本文整理汇总了 Python 中 etl.ETLUtils.filter_records 方法的典型用法代码示例。如果您正苦于以下问题:Python ETLUtils.filter_records 方法的具体用法?Python ETLUtils.filter_records 怎么用?Python ETLUtils.filter_records 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 etl.ETLUtils 的用法示例。
在下文中一共展示了ETLUtils.filter_records方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: initialize_cluster_users
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def initialize_cluster_users(reviews, significant_criteria_ranges=None):
    """
    Create one User object for every distinct user ID found in the reviews.

    Each user receives its average overall rating, its criteria weights, the
    cluster (second value returned by get_significant_criteria), and its
    per-item overall and multi-criteria ratings.

    :param reviews: the list of reviews
    :param significant_criteria_ranges: passed through unchanged to
    get_significant_criteria (defaults to None)
    :return: a dictionary whose keys are the user IDs and whose values are
    the initialized User objects
    """
    users_by_id = {}

    for uid in get_groupby_list(reviews, 'user_id'):
        new_user = User(uid)

        # Narrow the reviews down to this user once; the helpers below are
        # then told not to filter again (apply_filter=False).
        own_reviews = ETLUtils.filter_records(reviews, 'user_id', [uid])

        new_user.average_overall_rating = get_user_average_overall_rating(
            own_reviews, uid, apply_filter=False)
        new_user.criteria_weights = get_criteria_weights(
            own_reviews, uid, apply_filter=False)
        _unused, new_user.cluster = get_significant_criteria(
            new_user.criteria_weights, significant_criteria_ranges)
        new_user.item_ratings = get_user_item_ratings(own_reviews, uid)
        new_user.item_multi_ratings = get_user_item_multi_ratings(
            own_reviews, uid)

        users_by_id[uid] = new_user

    return users_by_id
示例2: get_user_item_ratings
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def get_user_item_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary with the items rated by the given user, where each
    key is the ID of an item and each value is the mean of the overall
    ratings found for that item.

    :param reviews: a list of reviews (dictionaries with at least the keys
    'offering_id' and 'overall_rating')
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates whether the reviews have to
    be filtered by user_id. Pass True when the list may contain reviews from
    other users, so that those are removed first
    :return: a dictionary mapping item ID to mean overall rating; an empty
    dictionary when there are no reviews
    """
    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    else:
        user_reviews = reviews

    if not user_reviews:
        return {}

    # Select the rating column BEFORE aggregating: averaging every column
    # fails on recent pandas versions when non-numeric columns (e.g. the
    # user ID or the review text) are present.
    mean_ratings = (
        DataFrame(user_reviews).groupby('offering_id')['overall_rating'].mean()
    )

    return dict(zip(mean_ratings.index.tolist(), mean_ratings))
示例3: calculate_sparsity
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def calculate_sparsity(self):
    """
    Returns the fraction of missing ratings in the list of reviews of this
    ReviewsDatasetAnalyzer

    :return: the rate of missing ratings
    (i.e. number of missing ratings / (number of items * number of users))
    :raise ValueError: in case the list of reviews is empty
    """
    if not self.reviews:
        raise ValueError("Can not determine the sparsity for an empty list")

    user_ids = extractor.get_groupby_list(self.reviews, "user_id")
    item_ids = extractor.get_groupby_list(self.reviews, "offering_id")
    total_expected_reviews = len(user_ids) * len(item_ids)

    # The set of all items is the same for every user, so build it once
    # instead of once per loop iteration.
    all_items = set(item_ids)

    # Kept as a float so the final division is a true division.
    non_missing_reviews = 0.0
    for user in user_ids:
        user_reviews = ETLUtils.filter_records(self.reviews, "user_id", [user])
        user_items = extractor.get_groupby_list(user_reviews, "offering_id")
        non_missing_reviews += len(all_items.intersection(user_items))

    return 1 - non_missing_reviews / total_expected_reviews
示例4: initialize_users
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def initialize_users(reviews, is_multi_criteria):
    """
    Create one User object for every distinct user ID found in the reviews.

    Each user receives its average overall rating and its per-item overall
    ratings; the per-item multi-criteria ratings are added only on request.

    :param reviews: the list of reviews
    :param is_multi_criteria: when true, each user's item_multi_ratings
    attribute is filled in as well
    :return: a dictionary whose keys are the user IDs and whose values are
    the initialized User objects
    """
    users_by_id = {}

    for uid in get_groupby_list(reviews, 'user_id'):
        new_user = User(uid)

        # Filter once here; the helpers are told not to filter again.
        own_reviews = ETLUtils.filter_records(reviews, 'user_id', [uid])

        new_user.average_overall_rating = get_user_average_overall_rating(
            own_reviews, uid, apply_filter=False)
        new_user.item_ratings = get_user_item_ratings(own_reviews, uid)
        users_by_id[uid] = new_user

        if is_multi_criteria:
            new_user.item_multi_ratings = get_user_item_multi_ratings(
                own_reviews, uid)

    return users_by_id
示例5: get_item_average_overall_rating
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def get_item_average_overall_rating(reviews, item_id, apply_filter=True):
    """
    Returns the average of the overall ratings given to an item.

    :param reviews: a list of reviews
    :param item_id: the ID of the item
    :param apply_filter: when true, only the reviews whose 'offering_id' is
    item_id are averaged; when false, every review in the list is used as is
    :return: the average (or mean) of the 'overall_rating' values of the
    reviews taken into account
    :raise ZeroDivisionError: when there are no reviews to average
    """
    item_reviews = (
        ETLUtils.filter_records(reviews, 'offering_id', [item_id])
        if apply_filter else reviews
    )

    # Ratings are converted with float() one by one, so numeric strings are
    # accepted too.
    total = sum((float(review['overall_rating']) for review in item_reviews), 0.)

    return float(total) / float(len(item_reviews))
示例6: analyze_context_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def analyze_context_records():
    """
    Load the classified records file, keep the records whose 'context_type'
    is 'context', and print how many there are followed by the text field of
    each one.
    """
    all_records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    context_records = ETLUtils.filter_records(
        all_records, 'context_type', ['context'])

    print('num records: %d' % len(context_records))

    for context_record in context_records:
        print(context_record[Constants.TEXT_FIELD])
示例7: remove_items_with_low_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def remove_items_with_low_reviews(reviews, min_reviews):
    """
    Returns the reviews that belong to the items selected by
    get_item_list(reviews, min_reviews); all other reviews are left out.

    :param reviews: a list of reviews
    :param min_reviews: threshold forwarded to get_item_list (presumably the
    minimum number of reviews an item needs in order to be kept; verify
    against get_item_list)
    :return: the reviews whose 'offering_id' is among the selected items
    """
    kept_item_ids = get_item_list(reviews, min_reviews)
    filtered_reviews = ETLUtils.filter_records(
        reviews, 'offering_id', kept_item_ids)
    return filtered_reviews
示例8: export
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def export(self):
    """
    Prepare the top-N evaluation data of this object.

    Prints a timestamp, optionally restricts the train and test records to
    the configured REVIEW_TYPE, loads the pickled user-item map and uses it
    to initialize a TopNEvaluator, whose records to predict and important
    records are then stored on this object.
    """
    print('export: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))
    evaluator_i = my_i

    if REVIEW_TYPE:
        # Keep only the records whose predicted class is REVIEW_TYPE.
        class_field = constants.PREDICTED_CLASS_FIELD
        self.records = ETLUtils.filter_records(
            self.records, class_field, [REVIEW_TYPE])
        self.test_records = ETLUtils.filter_records(
            self.test_records, class_field, [REVIEW_TYPE])

    # NOTE(review): pickle.load must only be used on trusted files.
    with open(USER_ITEM_MAP_FILE, 'rb') as map_file:
        user_item_map = pickle.load(map_file)

    evaluator = TopNEvaluator(
        self.records, self.test_records, DATASET, 10, evaluator_i)
    self.top_n_evaluator = evaluator
    evaluator.initialize(user_item_map)

    self.records_to_predict = evaluator.get_records_to_predict()
    self.important_records = evaluator.important_records
示例9: get_unknown_items
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def get_unknown_items(reviews, user_id, num_unknown=1000):
    """
    Returns the items that appear in the reviews but that the given user has
    not reviewed.

    :param reviews: a list of reviews
    :param user_id: the ID of the user
    :param num_unknown: the maximum number of items to return
    :return: a list with at most num_unknown item IDs, in the order in which
    get_groupby_list lists them
    """
    all_item_ids = extractor.get_groupby_list(reviews, 'offering_id')
    own_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])

    # A set makes the "already rated?" check cheap for every candidate.
    rated_items = set(extractor.get_groupby_list(own_reviews, 'offering_id'))

    candidates = []
    for item_id in all_item_ids:
        if item_id not in rated_items:
            candidates.append(item_id)

    # TODO: the items have to be shuffled before they are truncated
    return candidates[:num_unknown]
示例10: remove_users_with_low_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def remove_users_with_low_reviews(reviews, min_reviews):
    """
    Returns the reviews written by the users selected by
    get_user_list(reviews, min_reviews); all other reviews are left out.

    :param reviews: a list of reviews
    :param min_reviews: the minimum number of reviews a user must have in
    order not to be removed from the reviews list
    :return: the reviews whose 'user_id' is among the selected users
    """
    kept_user_ids = get_user_list(reviews, min_reviews)
    filtered_reviews = ETLUtils.filter_records(
        reviews, 'user_id', kept_user_ids)
    return filtered_reviews
示例11: create_topic_model_with_context_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def create_topic_model_with_context_records():
# Loads the classified, processed records, reports how many of them are
# context records and context-specific records, then creates topic models
# and exports per-topic data to an Excel file.
#
# NOTE(review): the indentation of this snippet has been lost, so the exact
# extent of each loop body below cannot be determined from the text alone;
# restore it from the original project before running this code.
#
# Name of the JSON file (in the cache folder) with the processed records.
processed_records_file = Constants.generate_file_name(
'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
None, False, True)
records = ETLUtils.load_json_file(processed_records_file)
print('records length: %d' % len(records))
# First keep the records whose 'context_type' is 'context' ...
context_records = ETLUtils.filter_records(records, 'context_type', ['context'])
print('context records length: %d' % len(context_records))
# ... and, among those, the ones whose 'predicted_class' is 'specific'.
context_specific_records = ETLUtils.filter_records(context_records, 'predicted_class', ['specific'])
print('context specific records length: %d' % len(context_specific_records))
# Print the 'bow' field of every context-specific record with its index.
for i in range(len(context_specific_records)):
# print('%d:\t%s' % (i, context_records[i]['text']))
print('%d:\t%s' % (i, context_specific_records[i]['bow']))
# Try every number of topics from 1 up to the number of context records.
for i in range(1, len(context_records)+1):
Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
# NOTE(review): the model is created from all `records`, not from the
# filtered context records -- confirm that this is intended.
context_extractor = \
topic_model_creator.create_topic_model(records, None, None)
# One dictionary per topic: its ID, the values produced by split_topic for
# the printed topic, its ratio and its weighted frequency.
topic_data = []
for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
result = {}
result['topic_id'] = topic
result.update(split_topic(context_extractor.print_topic_model(
num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
result['ratio'] = context_extractor.topic_ratio_map[topic]
result['weighted_frequency'] = \
context_extractor.topic_weighted_frequency_map[topic]
topic_data.append(result)
# Write the collected topic data to an .xlsx file in the dataset folder.
file_name = Constants.generate_file_name(
'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None, True)
generate_excel_file(topic_data, file_name)
示例12: test_filter_records
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def test_filter_records(self):
    """
    Checks that filtering reviews_matrix_5_short by 'offering_id' with the
    values 1, 3 and 5 returns exactly the expected list of reviews.
    """
    wanted_offerings = [1, 3, 5]
    reviews_u1 = [
        {'user_id': 'U1', 'offering_id': 1, 'overall_rating': 5.0},
        {'user_id': 'U1', 'offering_id': 3, 'overall_rating': 5.0},
    ]
    reviews_u2 = [
        {'user_id': 'U2', 'offering_id': 1, 'overall_rating': 5.0},
        {'user_id': 'U2', 'offering_id': 3, 'overall_rating': 5.0},
        {'user_id': 'U2', 'offering_id': 5, 'overall_rating': 9.0},
    ]
    expected = reviews_u1 + reviews_u2

    filtered = ETLUtils.filter_records(
        reviews_matrix_5_short, 'offering_id', wanted_offerings)

    self.assertEqual(expected, filtered)
示例13: get_user_item_reviews
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def get_user_item_reviews(records, user_id, apply_filter=False):
    """
    Returns a dictionary that maps the ID of each reviewed item to the text
    of its review. When an item appears more than once, the last record wins.

    :param records: a list of records with the keys 'offering_id' and 'text'
    :param user_id: the ID of the user
    :param apply_filter: when true, only the records whose 'user_id' is
    user_id are used; when false, the records are used as given
    :return: a dictionary item ID -> review text ({} when there are no
    records)
    """
    user_records = (
        ETLUtils.filter_records(records, 'user_id', [user_id])
        if apply_filter else records
    )

    if not user_records:
        return {}

    return {entry['offering_id']: entry['text'] for entry in user_records}
示例14: get_user_item_contexts
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def get_user_item_contexts(records, lda_model, user_id, apply_filter=False):
    """
    Returns a dictionary that maps the ID of each reviewed item to the
    result of get_topic_distribution for the text of its review.

    :param records: a list of records with the keys 'offering_id' and 'text'
    :param lda_model: the model handed to get_topic_distribution
    :param user_id: the ID of the user
    :param apply_filter: when true, only the records whose 'user_id' is
    user_id are used; when false, the records are used as given
    :return: a dictionary item ID -> topic distribution ({} when there are
    no records)
    """
    user_records = records
    if apply_filter:
        user_records = ETLUtils.filter_records(records, 'user_id', [user_id])

    if not user_records:
        return {}

    contexts_by_item = {}
    for entry in user_records:
        topic_distribution = get_topic_distribution(entry['text'], lda_model)
        contexts_by_item[entry['offering_id']] = topic_distribution

    return contexts_by_item
示例15: create_user_item_map
# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import filter_records [as 别名]
def create_user_item_map(records):
    """
    Builds a dictionary that maps every user ID found in the records to the
    list of item IDs obtained from that user's records.

    A progress counter is written to standard output while the map is being
    built; it is overwritten in place by means of a carriage return and
    finished with a newline.

    :param records: a list of records containing the user ID and item ID
    fields named by constants.USER_ID_FIELD and constants.ITEM_ID_FIELD
    :return: a dictionary user ID -> list of item IDs
    """
    # Local import: only needed for the progress output below.
    import sys

    user_ids = extractor.get_groupby_list(records, constants.USER_ID_FIELD)
    user_item_map = {}

    for user_count, user_id in enumerate(user_ids, start=1):
        user_records = ETLUtils.filter_records(
            records, constants.USER_ID_FIELD, [user_id])
        user_item_map[user_id] = extractor.get_groupby_list(
            user_records, constants.ITEM_ID_FIELD)

        # sys.stdout.write replaces the Python 2 print statement, which is a
        # syntax error on Python 3; it emits the same text on both versions.
        sys.stdout.write('user count: {0}\r'.format(user_count))

    sys.stdout.write('\n')

    return user_item_map