当前位置: 首页>>代码示例>>Python>>正文


Python etl.ETLUtils类代码示例

本文整理汇总了Python中etl.ETLUtils的典型用法代码示例。如果您正苦于以下问题：Python ETLUtils类的具体用法？Python ETLUtils怎么用？Python ETLUtils使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了ETLUtils类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

def main():
    """Train a ReviewsClassifier and use it to label a shuffled review set.

    The classifier is trained with the ``classified_<dataset>_reviews``
    JSON records and their pickled reviews, then applied to the shuffled
    records/reviews of the same dataset. The labelled records are written
    to the ``..._shuffled_tagged.json`` file in the same folder.
    """

    def unpickle(path):
        # Small local helper: read a single pickled object from ``path``.
        with open(path, 'rb') as read_file:
            return pickle.load(read_file)

    # dataset = 'hotel'
    dataset = 'restaurant'
    base_dir = '/Users/fpena/UCC/Thesis/datasets/context/'

    # Files holding the already classified (training) data.
    labelled_records_path = (
        base_dir + 'classified_' + dataset + '_reviews.json')
    labelled_reviews_path = (
        base_dir + 'classified_' + dataset + '_reviews.pkl')

    labelled_records = ETLUtils.load_json_file(labelled_records_path)
    labelled_reviews = unpickle(labelled_reviews_path)

    classifier = ReviewsClassifier()
    classifier.train(labelled_records, labelled_reviews)

    # Files holding the data to label and the destination of the result.
    input_records_path = (
        base_dir + 'yelp_training_set_review_' + dataset + 's_shuffled.json')
    input_reviews_path = (
        base_dir + 'reviews_' + dataset + '_shuffled.pkl')
    output_records_path = (
        base_dir + 'yelp_training_set_review_' + dataset +
        's_shuffled_tagged.json')

    input_reviews = unpickle(input_reviews_path)
    input_records = ETLUtils.load_json_file(input_records_path)

    tagged_records = classifier.label_json_reviews(
        input_records, input_reviews)
    ETLUtils.save_json_file(output_records_path, tagged_records)
开发者ID:bachlog,项目名称:yelp,代码行数:33,代码来源:reviews_classifier.py

示例2: export_without_context

    def export_without_context(self):
        """Write ``self.records`` to CSV_FILE as ratings with a dummy context.

        Every exported row carries the integer user id, the integer item id,
        the rating and a constant ``context:na`` column set to 1. When
        CSV_FILE already exists nothing is regenerated. In both cases
        CSV_FILE is passed to ``copy_to_workspace`` at the end.
        """
        timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
        print('%s: exporting to CARSKit binary ratings format without context' %
              timestamp)

        if not os.path.exists(CSV_FILE):
            # The global numpy seed is reset here even though this method
            # draws no random numbers itself.
            numpy.random.seed(0)

            rows = [
                {
                    Constants.USER_ID_FIELD:
                        record[Constants.USER_INTEGER_ID_FIELD],
                    Constants.ITEM_ID_FIELD:
                        record[Constants.ITEM_INTEGER_ID_FIELD],
                    Constants.RATING_FIELD: record[Constants.RATING_FIELD],
                    'context:na': 1,
                }
                for record in self.records
            ]
            columns = [
                Constants.USER_ID_FIELD,
                Constants.ITEM_ID_FIELD,
                Constants.RATING_FIELD,
                'context:na'
            ]
            ETLUtils.save_csv_file(CSV_FILE, rows, columns)
        else:
            print('Binary ratings file already exists')

        copy_to_workspace(CSV_FILE)
开发者ID:melqkiades,项目名称:yelp,代码行数:32,代码来源:carskit_exporter.py

示例3: update_labeled_reviews_records

def update_labeled_reviews_records():
    """Relabel the classified records with the labels from compare_records().

    Records whose review id is not a key of the map returned by
    ``compare_records()`` are filtered out. The remaining records get their
    ``Constants.SPECIFIC`` field set to 'yes' for label 's' and 'no' for
    label 'g'. The updated records are only counted and printed here; they
    are neither returned nor saved by this function.
    """
    label_by_review_id = compare_records()
    agreed_ids = set(label_by_review_id.keys())

    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    record_ids = set(
        record[Constants.REVIEW_ID_FIELD] for record in records)
    disputed_ids = record_ids - agreed_ids

    print('number of records before: %d' % len(records))

    print(label_by_review_id)
    print(disputed_ids)
    specific_value_by_label = {'s': 'yes', 'g': 'no'}

    # Discard the records for which no label was agreed on.
    records = ETLUtils.filter_out_records(
        records, Constants.REVIEW_ID_FIELD, disputed_ids)

    # Overwrite the label of every remaining record with the agreed one.
    for record in records:
        agreed_label = label_by_review_id[record[Constants.REVIEW_ID_FIELD]]
        record[Constants.SPECIFIC] = specific_value_by_label[agreed_label]

    print('number of records after: %d' % len(records))
开发者ID:melqkiades,项目名称:yelp,代码行数:31,代码来源:labeled_reviews_comparator.py

示例4: run

    def run(self, dataset, output_folder, train_records, test_records, train_reviews=None, test_reviews=None):
        """Build the contextual train/test sets and export them as CSV and LibFM.

        :param dataset: name inserted into the exported file names
        :param output_folder: prefix of the exported files; it is concatenated
            as-is, so it must already end with a path separator
        :param train_records: training records forwarded to ``full_cycle``
        :param test_records: test records forwarded to ``full_cycle``
        :param train_reviews: optional training reviews forwarded to
            ``full_cycle``
        :param test_reviews: optional test reviews forwarded to ``full_cycle``
        """
        # Year/month/day, the same layout used by the other log messages of
        # the project (the previous "%Y/%d/%m" swapped day and month).
        timestamp_format = "%Y/%m/%d-%H:%M:%S"

        contextual_train_set, contextual_test_set = self.full_cycle(
            train_records, test_records, train_reviews, test_reviews
        )

        print("Prepared data: %s" % time.strftime(timestamp_format))

        csv_train_file = output_folder + "yelp_" + dataset + "_context_shuffled_train5.csv"
        csv_test_file = output_folder + "yelp_" + dataset + "_context_shuffled_test5.csv"

        ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)
        ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)

        # Only CSV files are written by this method, so the message no longer
        # mentions JSON.
        print("Exported CSV files: %s" % time.strftime(timestamp_format))

        csv_files = [csv_train_file, csv_test_file]

        num_cols = len(self.headers)
        print("num_cols", num_cols)

        # NOTE(review): the fourth argument presumably lists the columns that
        # csv_to_libfm leaves out (columns 3.. for the ".no_context" export,
        # none for the ".context" export) - confirm against libfm_converter.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, num_cols), ",", has_header=True, suffix=".no_context.libfm"
        )
        libfm_converter.csv_to_libfm(csv_files, 0, [1, 2], [], ",", has_header=True, suffix=".context.libfm")

        print("Exported LibFM files: %s" % time.strftime(timestamp_format))
开发者ID:bachlog,项目名称:yelp,代码行数:34,代码来源:context_data_converter.py

示例5: drop_unwanted_fields

    def drop_unwanted_fields(dictionary_list):
        """
        Removes from the business records the fields that are not needed for
        data analysis, by delegating to ``ETLUtils.drop_fields``

        :rtype : void
        :param dictionary_list: the list of dictionaries (one per business)
        from which the fields are dropped
        """
        fields_to_remove = [
            'attributes', 'business_id', 'categories', 'city', 'full_address',
            'latitude', 'longitude', 'hours', 'name', 'neighborhoods', 'open',
            'review_count', 'stars', 'state', 'type',
        ]

        ETLUtils.drop_fields(fields_to_remove, dictionary_list)
开发者ID:antoine-tran,项目名称:yelp,代码行数:27,代码来源:business_etl.py

示例6: multiple_lineal_regression

    def multiple_lineal_regression(file_path):
        """
        Fits a linear regression that predicts the 'stars' field of the
        reviews from all their remaining fields, and prints the RMSE obtained
        on the training data and with a 10-fold cross-validation

        :param file_path: path of the file given to ``ReviewETL.load_file``
        """
        records = ReviewETL.load_file(file_path)
        ratings = np.array([record['stars'] for record in records])
        # The code relies on drop_fields removing 'stars' from the records in
        # place, since its return value is not used.
        ETLUtils.drop_fields(['stars'], records)
        # list() is required on Python 3, where dict.values() returns a view
        # that numpy would otherwise store as an opaque object.
        data = np.array([list(record.values()) for record in records])

        model = linear_model.LinearRegression(fit_intercept=True)
        model.fit(data, ratings)

        # Predict every row in a single call: predict() expects a 2-D array of
        # samples, and this yields a flat vector aligned with ``ratings``.
        training_errors = model.predict(data) - ratings
        total_error = np.dot(training_errors, training_errors)
        rmse_train = np.sqrt(total_error / len(data))

        # NOTE(review): this call signature matches the legacy
        # sklearn.cross_validation.KFold - confirm which KFold is imported.
        kf = KFold(len(data), n_folds=10)
        err = 0
        for train, test in kf:
            model.fit(data[train], ratings[train])
            fold_errors = model.predict(data[test]) - ratings[test]
            err += np.dot(fold_errors, fold_errors)

        rmse_10cv = np.sqrt(err / len(data))
        print('RMSE on training: {}'.format(rmse_train))
        print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
开发者ID:antoine-tran,项目名称:yelp,代码行数:32,代码来源:review_analysis.py

示例7: export_records

 def export_records(self):
     """Save the dictionary and the records, in full and in reduced form.

     The records are written twice: first as they are, to
     Constants.FULL_PROCESSED_RECORDS_FILE, and then, once
     ``drop_unnecessary_fields`` has run, to
     Constants.PROCESSED_RECORDS_FILE.
     """
     timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
     print('%s: get_records_to_predict_topn records' % timestamp)

     self.dictionary.save(Constants.DICTIONARY_FILE)

     # Full export first, because the next step removes fields.
     full_records_path = Constants.FULL_PROCESSED_RECORDS_FILE
     ETLUtils.save_json_file(full_records_path, self.records)

     self.drop_unnecessary_fields()
     reduced_records_path = Constants.PROCESSED_RECORDS_FILE
     ETLUtils.save_json_file(reduced_records_path, self.records)
开发者ID:neostoic,项目名称:yelp-1,代码行数:7,代码来源:yelp_reviews_preprocessor.py

示例8: main_evaluate

def main_evaluate():
    """Evaluate stored predictions with a TopNEvaluator and return the recall.

    The records are read from RECORDS_FILE and the test records from
    RECORDS_FILE + '_test'. The predictions are read from the
    'predictions_<DATASET>.txt' file inside GENERATED_FOLDER.

    :return: the ``recall`` attribute of the evaluator after evaluating
    """
    evaluator_i = my_i

    all_records = ETLUtils.load_json_file(RECORDS_FILE)
    held_out_records = ETLUtils.load_json_file(RECORDS_FILE + '_test')

    evaluator = TopNEvaluator(
        all_records, held_out_records, DATASET, 10, evaluator_i)
    evaluator.find_important_records()
    evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions = rmse_calculator.read_targets_from_txt(
        GENERATED_FOLDER + 'predictions_' + DATASET + '.txt')

    evaluator.evaluate(predictions)
    print('recall', evaluator.recall)

    return evaluator.recall
开发者ID:bachlog,项目名称:yelp,代码行数:25,代码来源:top_n_runner.py

示例9: export_records_to_predict

 def export_records_to_predict(self, records_file):
     """Save the records to predict as JSON and the items to predict as pickle.

     ``self.records_to_predict`` is computed with
     ``get_records_to_predict`` when it is still None. The pickle file is
     written next to the JSON file, using the same path plus '.pkl'.

     :param records_file: path of the JSON file to write
     """
     if self.records_to_predict is None:
         self.records_to_predict = self.get_records_to_predict()
     ETLUtils.save_json_file(records_file, self.records_to_predict)

     pickle_path = records_file + '.pkl'
     with open(pickle_path, 'wb') as pickle_file:
         pickle.dump(
             self.items_to_predict, pickle_file, pickle.HIGHEST_PROTOCOL)
开发者ID:melqkiades,项目名称:yelp,代码行数:7,代码来源:top_n_evaluator.py

示例10: analyze_context_records

def analyze_context_records():
    """Print the count and the text of the records of type 'context'.

    The records come from Constants.CLASSIFIED_RECORDS_FILE and are
    filtered on their 'context_type' field.
    """
    all_records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    context_records = ETLUtils.filter_records(
        all_records, 'context_type', ['context'])

    print('num records: %d' % len(context_records))

    for context_record in context_records:
        print(context_record[Constants.TEXT_FIELD])
开发者ID:melqkiades,项目名称:yelp,代码行数:8,代码来源:main.py

示例11: get_categories

def get_categories(file_path):
    """Return the keys of the first business record after processing.

    The records are loaded from ``file_path``, passed through
    ``ETLUtils.add_transpose_list_column`` for the 'categories' field and
    then through ``BusinessETL.drop_unwanted_fields``.

    :param file_path: path of the JSON file with the business records
    :return: the keys of the first processed record
    """
    loaded_businesses = ETLUtils.load_json_file(file_path)

    # Expand the 'categories' field of every business.
    businesses = ETLUtils.add_transpose_list_column(
        'categories', loaded_businesses)
    BusinessETL.drop_unwanted_fields(businesses)

    first_business = businesses[0]
    return first_business.keys()
开发者ID:anuragreddygv323,项目名称:yelp,代码行数:8,代码来源:business_clusterer.py

示例12: parallel_run_topn_test

def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Run the top-N test of every recommender in a pool of processes.

    The processed results of all the recommenders are saved in a
    tab-separated CSV file named after the current time.

    :param records_file: path of the file with the records
    :param recommenders: sized collection of recommenders to evaluate; one
        task is submitted to the pool for each of them
    :param binary_reviews_file: path of the pickle file with the reviews
    :param reviews_type: value forwarded to every task and stored in the
        results log
    :return: the list of raw results, in the same order as ``recommenders``
    :raise ValueError: if the number of records and reviews differ
    """
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # One argument tuple per recommender; the other values are constant.
    args = itertools.product(
        [records],
        recommenders,
        [top_n],
        [num_folds],
        [split],
        [min_like_score],
        [binary_reviews],
        [reviews_type]
    )

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()
    try:
        print('Total CPUs: %d' % pool._processes)
        results_list = pool.map(run_topn_test_wrapper, args)
    finally:
        # Shut the pool down even when a task fails, so that the worker
        # processes are not left behind.
        pool.close()
        pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = [
        context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)
    ]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
开发者ID:antoine-tran,项目名称:yelp,代码行数:58,代码来源:parallel_context_recommender_tests.py

示例13: main

def main():
    """Test every resampler/classifier pair on each document level.

    For each document level the records are loaded, preprocessed and
    transformed once, and then every combination of resampler and
    classifier is passed to ``test_classifier``. All the results are
    printed and saved to a CSV file inside Constants.DATASET_FOLDER.
    """
    topic_model_creator.plant_seeds()

    resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn'
    ]

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    levels = ['review', 'sentence', 1]

    total_cycles = len(resamplers) * len(classifiers) * len(levels)
    # Running cycle number, shared by all the document levels.
    cycle_numbers = itertools.count(1)

    all_results = []

    for level in levels:

        # The document level is set globally before the records are loaded.
        Constants.DOCUMENT_LEVEL = level
        level_records = load_records()
        preprocess_records(level_records)
        x_matrix, y_vector = transform(level_records)

        count_specific_generic(level_records)

        for resampler, classifier in itertools.product(resamplers, classifiers):

            print('Cycle %d/%d' % (next(cycle_numbers), total_cycles))

            all_results.append(
                test_classifier(x_matrix, y_vector, resampler, classifier))

    for cycle_results in all_results:
        print(cycle_results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
               '_sentence_classifier_results.csv'
    # The keys of the first result are used as the CSV headers.
    csv_headers = all_results[0].keys()
    ETLUtils.save_csv_file(csv_file, all_results, csv_headers)
开发者ID:antoine-tran,项目名称:yelp,代码行数:58,代码来源:classifier_evaluator.py

示例14: test_select_fields

    def test_select_fields(self):
        """Check ETLUtils.select_fields with three fields and with one field."""
        # Each case is: fields to select, input records, expected output.
        cases = [
            (['user_id', 'offering_id', 'overall_rating'],
             reviews_matrix_5, reviews_matrix_5_short),
            (['user_id'],
             reviews_matrix_5_short, reviews_matrix_5_users),
        ]

        for fields, input_records, expected in cases:
            actual = ETLUtils.select_fields(fields, input_records)
            self.assertEqual(actual, expected)
开发者ID:antoine-tran,项目名称:yelp,代码行数:9,代码来源:test_etl_utils.py

示例15: drop_unnecessary_fields

    def drop_unnecessary_fields(self):
        """Drop the text, POS tags, votes and bag-of-words fields.

        The fields are dropped from ``self.records`` by delegating to
        ``ETLUtils.drop_fields``.
        """
        timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
        print('%s: drop unnecessary fields' % timestamp)

        fields_to_drop = [
            Constants.TEXT_FIELD, Constants.POS_TAGS_FIELD,
            Constants.VOTES_FIELD, Constants.BOW_FIELD,
        ]
        ETLUtils.drop_fields(fields_to_drop, self.records)
开发者ID:neostoic,项目名称:yelp-1,代码行数:11,代码来源:yelp_reviews_preprocessor.py


注:本文中的etl.ETLUtils类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。