

Python datasets.load_files Function Code Examples

This article collects typical usage examples of the Python function sklearn.datasets.load_files. If you are wondering how load_files is used in practice, what its arguments do, or what working calls look like, the curated examples below should help.


The 15 load_files code examples shown below are sorted by popularity by default. You can upvote the examples you find useful; your ratings help the system recommend better Python code examples.
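Before diving into the examples, here is a minimal sketch of a typical call (the ./corpus path and its class folders are placeholders for illustration): load_files expects one sub-directory per class under the container path and returns a Bunch with data, target and target_names.

from sklearn.datasets import load_files

# Expected layout (placeholder paths):
#   ./corpus/pos/*.txt  -> class "pos"
#   ./corpus/neg/*.txt  -> class "neg"
dataset = load_files('./corpus', encoding='utf-8', decode_error='ignore',
                     shuffle=True, random_state=42)

print(dataset.target_names)   # class names taken from the folder names
print(len(dataset.data))      # documents decoded as strings
print(dataset.target[:5])     # integer labels aligned with dataset.data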

Example 1: export_classifier

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


def export_classifier():
    # note: this data is not in the git repo
    train_small = load_files('./training_data/')
    test_small = load_files('./test_data/')

    # Turn the text documents into vectors of word frequencies
    vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='ascii')
    X_train = vectorizer.fit_transform(train_small.data)
    y_train = train_small.target

    # Fit a classifier on the training set
    classifier = LogisticRegression(penalty='l2', tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    ).fit(X_train, y_train)
    print("Training score: {0:.1f}%".format(
        classifier.score(X_train, y_train) * 100))

    # Evaluate the classifier on the testing set
    X_test = vectorizer.transform(test_small.data)
    y_test = test_small.target
    print("Testing score: {0:.1f}%".format(
        classifier.score(X_test, y_test) * 100))
    export_pickle('LRclassifier.txt', classifier)
    export_pickle('LRvectorizer.txt', vectorizer)
Author: sazlin, Project: reTOracle, Lines: 26, Source file: LR.py
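export_pickle is a helper defined elsewhere in LR.py; the sketch below is an assumption of what such a helper and the matching reload step might look like, not the reTOracle implementation. It shows how the persisted vectorizer and classifier could later be applied to new documents.

import pickle

def export_pickle(filename, obj):
    # hypothetical helper: serialize an estimator or vectorizer to disk
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

# later, reload the persisted objects and score new text
with open('LRvectorizer.txt', 'rb') as f:
    vectorizer = pickle.load(f)
with open('LRclassifier.txt', 'rb') as f:
    classifier = pickle.load(f)

print(classifier.predict(vectorizer.transform(["some new document"])))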

Example 2: getData

def getData():
	train_data= load_files('training')    
	test_data=load_files("test")
	count_Vec=TfidfVectorizer(min_df=1,decode_error="replace")
	doc_train=count_Vec.fit_transform(train_data.data)
	doc_test=count_Vec.transform(test_data.data)  # note: transform, not fit_transform, on the test set
	return doc_train.toarray(),train_data.target,doc_test.toarray()
Author: chen33, Project: nlp, Lines: 7, Source file: index.py

Example 3: getData

def getData():
	train_data= load_files('dataset/train')    
	test_data=load_files("dataset/test")
	count_Vec=TfidfVectorizer(min_df=1,decode_error="replace")
	doc_train=count_Vec.fit_transform(train_data.data)
	doc_test=count_Vec.transform(test_data.data)
	return doc_train.toarray(),train_data.target,doc_test.toarray(),test_data.target
Author: chen33, Project: nlp, Lines: 7, Source file: lr.py

Example 4: createDataSet

def createDataSet(train_path,test_path,category,k):
	"""
	Create vectorized text features.
	'0' refers to 'atheism', '1' refers to 'sports'.
	"""
	train_set = datasets.load_files(train_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0)

	count_vect = CountVectorizer(encoding = 'utf-8',lowercase = True,
	 decode_error = 'ignore',  analyzer = 'word', ngram_range = (2,4),min_df = 1)

	tfidf_vecter = TfidfVectorizer( max_df = 0.8, stop_words = 'english')

	test_set = datasets.load_files(test_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8',  decode_error='ignore', random_state=0)

	X_train_tfidf = tfidf_vecter.fit_transform(train_set.data)
	X_train_counts = count_vect.fit_transform(train_set.data)

	X_test_tfidf = tfidf_vecter.transform(test_set.data)
	X_test_counts = count_vect.transform(test_set.data)

	# relabel the targets for a binary task: class k -> 1, every other class -> -1
	for i in range(X_train_counts.shape[0]):
		if train_set.target[i] == k:
			train_set.target[i] = 1
		else:
			train_set.target[i] = -1

	for i in range(X_test_counts.shape[0]):
		if test_set.target[i] == k:
			test_set.target[i] = 1
		else:
			test_set.target[i] = -1

	#X_train_normalize = preprocessing.normalize(X_train_counts, norm = 'l2')

	#print train_set.target_names
	#print train_set.target
	#print len(train_set.target)
	#print X_train_tfidf.shape
	#print X_train_counts
	#print X_train_normalize

	return X_train_counts, train_set.target, X_train_counts.shape,X_test_counts, test_set.target, X_test_counts.shape
Author: zoezou2015, Project: ML_hm1, Lines: 59, Source file: document_vectorize.py

Example 5: load

def load(dataset, categories):
    if dataset == 'full':
        train = load_files('aclImdb/aggregate/', categories=categories)
        return train

    elif dataset == 'split':    
        train = load_files('aclImdb/train/', categories=categories)
        test = load_files('aclImdb/test/', categories=categories)
        return (train, test)
Author: aakashjain, Project: ReviewClassification, Lines: 9, Source file: data_loader.py
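A hypothetical usage of the loader above, assuming the aclImdb folders contain the usual pos/neg class sub-directories (the category names here are an assumption about the local layout, not taken from the project):

# hypothetical call against the load() function above
train, test = load('split', ['pos', 'neg'])
print(len(train.data), len(test.data))
print(train.target_names)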

Example 6: vector_for_input_binary

def vector_for_input_binary(train_file_path="/mnt/hgfs/temp/machine learning/train",
                            test_file_path="/mnt/hgfs/temp/machine learning/test", categories=None):
    # `load` and `feature_extraction` are presumably sklearn.datasets and
    # sklearn.feature_extraction imported under these aliases elsewhere in Homework_1.py
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    vectorized = feature_extraction.CountVectorizer(min_df=1, binary=True)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
Author: zoezou2015, Project: ML_hm1, Lines: 10, Source file: Homework_1.py
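The only difference from the plain count version in example 9 below is binary=True, which turns raw term counts into 0/1 presence indicators. A self-contained toy illustration of that flag (the two sentences are made up for illustration):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["apple apple banana", "banana cherry"]

counts = CountVectorizer(min_df=1).fit_transform(docs)
presence = CountVectorizer(min_df=1, binary=True).fit_transform(docs)

print(counts.toarray())    # [[2 1 0] [0 1 1]] -- raw term frequencies
print(presence.toarray())  # [[1 1 0] [0 1 1]] -- 0/1 presence indicators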

Example 7: test_grid_search_cv_on_newsgroup

def test_grid_search_cv_on_newsgroup():
    ## load news group data
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    # note: charset= is an older scikit-learn parameter name; newer versions use encoding='latin-1'
    twenty_train_small = load_files('./data/20news-bydate-train/',
        categories=categories, charset='latin-1')
    twenty_test_small = load_files('./data/20news-bydate-test/',
        categories=categories, charset='latin-1')
    ## model pipeline using tfidf and passive aggressive
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', PassiveAggressiveClassifier(C=1)),
    ))
    param_grid = {
        'vec__min_df': [1, 2],
        'vec__max_df': [0.8, 1.0],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__use_idf': [True, False]
    }
    X, y = twenty_train_small.data, twenty_train_small.target
    ## cross validation on n_iter = 5
    grid_searcher = meta_search.GridSearch()
    # persist only once
    grid_searcher.persist_cv_splits('text_classification', X, y, './tmp/')
    grid_searcher.search(pipeline, param_grid)
    import time
    while not grid_searcher.isready():
        time.sleep(2)
        print 'progress:', grid_searcher.progress()
        print 'best result:', grid_searcher.best_params_so_far()
        if grid_searcher.best_params_so_far():
            pass#grid_searcher.abort()
    print len(grid_searcher.partial_result())
    ## run again with naive bayesian
    ## no need to persist_cv_splits
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', MultinomialNB()),
    ))
    grid_searcher10 = meta_search.GridSearch(datafiles = grid_searcher.datafiles)
    grid_searcher10.search(pipeline, param_grid)
    while not grid_searcher10.isready():
        time.sleep(2)
        print 'progress:', grid_searcher10.progress()
        print 'best result:', grid_searcher10.best_params_so_far()
        if grid_searcher10.best_params_so_far():
            pass#grid_searcher10.abort()
    print len(grid_searcher10.partial_result())    
Author: dolaameng, Project: machine-learning-toolkit, Lines: 52, Source file: test_meta_search.py

Example 8: main

def main():
    #buildTrainSet()
    #buildTestSet()
    train = load_files('model/train', encoding='utf-8')
    test = load_files('model/test', encoding='utf-8')
    print train.target_names
#    for l in train.target_names:
#        print l
#    for l in train.target:
#        print l
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    X_train = vectorizer.fit_transform(train.data)
    X_test = vectorizer.transform(test.data)
    print vectorizer.get_feature_names()
Author: titopsur, Project: python_test, Lines: 14, Source file: test.py

Example 9: vector_for_input

def vector_for_input(train_file_path=path1,
                     test_file_path=path2, categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    # vectorized_normalized = feature_extraction.TfidfVectorizer(min_df=1)
    # train_input_normalized = vectorized_normalized.fit_transform(train_data['data'])
    # test_input_normalized = vectorized_normalized.transform(test_data['data'])

    vectorized = feature_extraction.CountVectorizer(min_df=1)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
Author: zoezou2015, Project: ML_hm1, Lines: 14, Source file: Homework_1.py

Example 10: load_data

def load_data():
    # Download the data and unpack it into the ./data/txt_sentoken folder
    # "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz")
    dataset = load_files('./data/txt_sentoken', shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    return dataset
Author: yazquez, Project: poc-machine-learning, Lines: 7, Source file: MyGensim.py

Example 11: testdata_stats

def testdata_stats():
    test_dataset = datasets.load_files(project_root+"/testdata",
                                     encoding='utf-8',
                                  decode_error='ignore')

    # save_thing_to_file(test_dataset, "test_dataset.txt")

    bayes = get_thing_from_file("bayes.txt")
    # note: both classifiers below are (re)fitted on the test set, so the
    # reported accuracies are resubstitution scores, not held-out estimates
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)

    print "*****BAYESIAN STATS*****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_nb == test_dataset.target))

    print(metrics.classification_report(test_dataset.target, predicted_nb,
    target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)

    print "*****SVM STATS*****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
    target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
Author: colinricardo28, Project: Peepl, Lines: 31, Source file: analysis.py

Example 12: load_SRAA

def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342, \
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]

    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                              test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind]))

    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target

    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
Author: dzhuang2, Project: active_learn, Lines: 26, Source file: load_SRAA.py
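The pickled files above act as a cache, so later runs can skip the expensive load_files/vectorize step. A minimal sketch of the reload path, assuming the files written by load_SRAA are present in the working directory:

import pickle

# reload the cached matrices instead of re-running load_SRAA()
X_tr = pickle.load(open('SRAA_X_train.pickle', 'rb'))
y_tr = pickle.load(open('SRAA_y_train.pickle', 'rb'))
X_te = pickle.load(open('SRAA_X_test.pickle', 'rb'))
y_te = pickle.load(open('SRAA_y_test.pickle', 'rb'))
feature_names = pickle.load(open('SRAA_feature_names.pickle', 'rb'))

print(X_tr.shape, X_te.shape, len(feature_names))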

Example 13: text_sentiment

def text_sentiment(docs_new):
   docs_new=[docs_new]
   twenty_train= load_files('./Sentiment')  # the complete data is in this directory, one sub-folder per class (like comp.graphics etc.)
   count_vect = CountVectorizer()
   X_train_counts = count_vect.fit_transform(twenty_train.data)
   tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
   X_train_tf = tf_transformer.transform(X_train_counts)
   tfidf_transformer = TfidfTransformer()
   X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

   # Fit a classifier on the training set
   #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
   #f = open('my_classifier.pickle', 'wb')
   #pickle.dump(clf, f)
   #f = open('my_classifier.pickle',)
   #clf = pickle.load(f)
   #f.close()
   # save the classifier
   #with open('my_sentiment.pkl', 'wb') as fid:
      #cPickle.dump(clf, fid)    

   # load it again (note: the vectorizers re-fitted above must match the ones
   # used when my_sentiment.pkl was originally trained)
   with open('my_sentiment.pkl', 'rb') as fid:
      clf = cPickle.load(fid)
   X_new_counts = count_vect.transform(docs_new)
   X_new_tfidf = tfidf_transformer.transform(X_new_counts)

   predicted = clf.predict(X_new_tfidf)
   return twenty_train.target_names[predicted[0]]
Author: amangarg078, Project: TextGenius, Lines: 29, Source file: sentiment.py

Example 14: text_classifly_twang

def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)  
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)   
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    
    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    # restrict the vectorizer to the selected terms by overriding its vocabulary
    # (this relies on CountVectorizer internals; see the note after this example)
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit a MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    
    acc = np.mean(doc_test_predicted == doc_class_list_test)  
    print 'Accuracy: ', acc
    
    return acc
Author: ZHAOTING, Project: WebDataMining_Kaggle, Lines: 26, Source file: feature_selection_test.py
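As noted in the snippet, overriding fixed_vocabulary and vocabulary_ on an already-constructed vectorizer relies on CountVectorizer internals. The documented way to restrict a vectorizer to a selected term set is to pass vocabulary= to the constructor; the sketch below uses a placeholder term list in place of the term_set_fs produced by feature selection above.

from sklearn.feature_extraction.text import CountVectorizer

# placeholder for the terms kept by feature selection (term_set_fs above)
selected_terms = ['good', 'bad', 'boring', 'excellent']

vectorizer = CountVectorizer(binary=True, vocabulary=selected_terms)
doc_train_vec = vectorizer.fit_transform(["a good movie", "a boring movie"])
doc_test_vec = vectorizer.transform(["an excellent but boring film"])

print(vectorizer.get_feature_names())  # restricted to selected_terms
print(doc_train_vec.toarray())         # [[1 0 0 0] [0 0 1 0]]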

Example 15: __init__

    def __init__(self, file_path):
        self.training_documents = load_files(container_path='./20news-bydate/20news-bydate-train',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.test_documents = load_files(container_path='./20news-bydate/20news-bydate-test',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.file_path = file_path
Author: sherkin735, Project: dmsapp, Lines: 16, Source file: Classifier.py


Note: the sklearn.datasets.load_files examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and the copyright of each snippet remains with its original author; please consult the corresponding project's license before redistributing or reusing the code. Do not reproduce this article without permission.