当前位置: 首页>>代码示例>>Python>>正文


Python ETLUtils.load_json_file方法代码示例

本文整理汇总了Python中etl.ETLUtils.load_json_file方法的典型用法代码示例。如果您正苦于以下问题:Python ETLUtils.load_json_file方法的具体用法?Python ETLUtils.load_json_file怎么用?Python ETLUtils.load_json_file使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在etl.ETLUtils的用法示例。


在下文中一共展示了ETLUtils.load_json_file方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main_evaluate

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main_evaluate():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    # print('num_records', len(records))

    test_file = RECORDS_FILE + '_test'
    test_records = ETLUtils.load_json_file(test_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.find_important_records()
    # top_n_evaluator.initialize()

    # records_to_predict_file = DATASET_FOLDER + 'generated/records_to_predict_' + DATASET + '.json'
    top_n_evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions_file = GENERATED_FOLDER + 'predictions_' + DATASET + '.txt'
    predictions = rmse_calculator.read_targets_from_txt(predictions_file)

    # print('total predictions', len(predictions))
    top_n_evaluator.evaluate(predictions)
    # print('precision', top_n_evaluator.precision)
    print('recall', top_n_evaluator.recall)

    return top_n_evaluator.recall
开发者ID:bachlog,项目名称:yelp,代码行数:27,代码来源:top_n_runner.py

示例2: main

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
开发者ID:bachlog,项目名称:yelp,代码行数:35,代码来源:reviews_classifier.py

示例3: load

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
开发者ID:antoine-tran,项目名称:yelp,代码行数:14,代码来源:context_top_n_runner.py

示例4: classify_reviews

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        dataset = Constants.ITEM_TYPE
        folder = Constants.DATASET_FOLDER
        file_name_suffix =\
            '' if Constants.MAX_SENTENCES is None else '_sentences'
        training_records_file = folder +\
            'classified_' + dataset + '_reviews' + file_name_suffix + '.json'
        training_records = ETLUtils.load_json_file(training_records_file)

        if Constants.MAX_SENTENCES is not None:
            training_records = [
                record for record in training_records
                if record['sentence_index'] < Constants.MAX_SENTENCES
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
开发者ID:neostoic,项目名称:yelp-1,代码行数:27,代码来源:yelp_reviews_preprocessor.py

示例5: main

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            records = records[:num_records / 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
开发者ID:melqkiades,项目名称:yelp,代码行数:34,代码来源:topic_model_creator.py

示例6: load

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        with open(Constants.REVIEWS_FILE, 'rb') as read_file:
            self.original_reviews = pickle.load(read_file)
        print('num_records: %d' % len(self.original_records))

        for record, review in zip(self.original_records, self.original_reviews):
            review.id = record[Constants.REVIEW_ID_FIELD]
            review.rating = record[Constants.RATING_FIELD]

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
开发者ID:melqkiades,项目名称:yelp,代码行数:18,代码来源:word_context_top_n_runner.py

示例7: main

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)
    # my_reviews = []
    # my_index = 0
    #
    # print("records:", len(my_records))
    #
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record['text']))
    #     print('index', my_index)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    # with open(binary_reviews_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
开发者ID:antoine-tran,项目名称:yelp,代码行数:27,代码来源:reviews_clusterer.py

示例8: classify_reviews

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        print(Constants.CLASSIFIED_RECORDS_FILE)
        training_records =\
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

        # If document level set to sentence (can be either 'sentence' or int)
        document_level = Constants.DOCUMENT_LEVEL
        if document_level != 'review':

            if document_level == 'sentence':
                document_level = float("inf")

            training_records = [
                record for record in training_records
                if record['sentence_index'] < document_level
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        training_records = self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
开发者ID:antoine-tran,项目名称:yelp,代码行数:29,代码来源:yelp_reviews_preprocessor.py

示例9: dataset_bucket_analysis_by_field

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
开发者ID:melqkiades,项目名称:yelp,代码行数:37,代码来源:main.py

示例10: update_labeled_reviews_records

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
        # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # We remove from the classifier records the ones who don't have agreed on a
    # label
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Finally we make the update of the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
开发者ID:melqkiades,项目名称:yelp,代码行数:33,代码来源:labeled_reviews_comparator.py

示例11: analyze_context_records

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def analyze_context_records():
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    records = ETLUtils.filter_records(records, 'context_type', ['context'])

    print('num records: %d' % len(records))

    for record in records:
        print(record[Constants.TEXT_FIELD])
开发者ID:melqkiades,项目名称:yelp,代码行数:10,代码来源:main.py

示例12: main

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def main():

    records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    context_transformer = ContextTransformer(records)
    context_transformer.load_data()
    context_transformer.transform_records()
    context_transformer.export_records()
开发者ID:melqkiades,项目名称:yelp,代码行数:10,代码来源:context_transformer.py

示例13: generate_report_fourcity_filtered

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
 def generate_report_fourcity_filtered():
     file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json'
     file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_fourcity.ipynb'
     reviews = ETLUtils.load_json_file(file_path)
     load_reviews_code =\
         'file_path = \'/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json\'\n' +\
         'reviews = ETLUtils.load_json_file(file_path)\n'
     ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Fourcity TripAdvisor', file_name, load_reviews_code)
开发者ID:antoine-tran,项目名称:yelp,代码行数:10,代码来源:reviews_dataset_analyzer_report.py

示例14: generate_report_ruihai

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
 def generate_report_ruihai():
     file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json'
     file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_ruihai.ipynb'
     reviews = ETLUtils.load_json_file(file_path)
     load_reviews_code =\
         'file_path = \'/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json\'\n' +\
         'reviews = ETLUtils.load_json_file(file_path)\n'
     ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Ruihai TripAdvisor', file_name, load_reviews_code)
开发者ID:antoine-tran,项目名称:yelp,代码行数:10,代码来源:reviews_dataset_analyzer_report.py

示例15: get_categories

# 需要导入模块: from etl import ETLUtils [as 别名]
# 或者: from etl.ETLUtils import load_json_file [as 别名]
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
开发者ID:anuragreddygv323,项目名称:yelp,代码行数:10,代码来源:business_clusterer.py


注:本文中的etl.ETLUtils.load_json_file方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。