This article collects typical usage examples of the Python function sklearn.datasets.load_files. If you are wondering what load_files does, how to call it, or what real-world uses of it look like, the curated code samples below may help.
A total of 15 code examples of the load_files function are shown below, sorted by popularity by default.
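Before the examples, here is a minimal sketch of how load_files is typically called. The container directory ./corpus used below is hypothetical: it is assumed to hold one sub-folder per category, each containing plain-text documents, and load_files uses the sub-folder names as class labels.

from sklearn.datasets import load_files

# Assumed layout: ./corpus/<category_name>/<document>.txt
dataset = load_files('./corpus', encoding='utf-8',
                     decode_error='ignore', shuffle=True, random_state=42)

print(dataset.target_names)   # category names, taken from the sub-folder names
print(len(dataset.data))      # number of documents loaded
print(dataset.target[:5])     # integer labels aligned with dataset.data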
Example 1: export_classifier
def export_classifier():
    # Note that this data is not in the git repo.
    train_small = load_files('./training_data/')
    test_small = load_files('./test_data/')
    # Turn the text documents into vectors of word frequencies.
    vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='ascii')
    X_train = vectorizer.fit_transform(train_small.data)
    y_train = train_small.target
    # Fit a classifier on the training set.
    classifier = LogisticRegression(penalty='l2', tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    ).fit(X_train, y_train)
    print("Training score: {0:.1f}%".format(
        classifier.score(X_train, y_train) * 100))
    # Evaluate the classifier on the testing set.
    X_test = vectorizer.transform(test_small.data)
    y_test = test_small.target
    print("Testing score: {0:.1f}%".format(
        classifier.score(X_test, y_test) * 100))
    export_pickle('LRclassifier.txt', classifier)
    export_pickle('LRvectorizer.txt', vectorizer)
Example 2: getData
def getData():
    train_data = load_files('training')
    test_data = load_files('test')
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    doc_test = count_Vec.transform(test_data.data)  # note: transform here, not fit_transform
    return doc_train.toarray(), train_data.target, doc_test.toarray()
Example 3: getData
def getData():
    train_data = load_files('dataset/train')
    test_data = load_files('dataset/test')
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    doc_test = count_Vec.transform(test_data.data)
    return doc_train.toarray(), train_data.target, doc_test.toarray(), test_data.target
Example 4: createDataSet
def createDataSet(train_path, test_path, category, k):
    """
    Create vectorized text features.
    '0' refers to 'atheism'
    '1' refers to 'sports'
    """
    train_set = datasets.load_files(train_path, categories=category,
                                    load_content=True, shuffle=True, encoding='utf-8',
                                    decode_error='ignore', random_state=0)
    count_vect = CountVectorizer(encoding='utf-8', lowercase=True,
                                 decode_error='ignore', analyzer='word',
                                 ngram_range=(2, 4), min_df=1)
    tfidf_vecter = TfidfVectorizer(max_df=0.8, stop_words='english')
    test_set = datasets.load_files(test_path, categories=category,
                                   load_content=True, shuffle=True, encoding='utf-8',
                                   decode_error='ignore', random_state=0)
    X_train_tfidf = tfidf_vecter.fit_transform(train_set.data)
    X_train_counts = count_vect.fit_transform(train_set.data)
    X_test_tfidf = tfidf_vecter.transform(test_set.data)
    X_test_counts = count_vect.transform(test_set.data)
    # Relabel: class k becomes +1, every other class becomes -1.
    for i in range(X_train_counts.shape[0]):
        if train_set.target[i] == k:
            train_set.target[i] = 1
        else:
            train_set.target[i] = -1
    for i in range(X_test_counts.shape[0]):
        if test_set.target[i] == k:
            test_set.target[i] = 1
        else:
            test_set.target[i] = -1
    #X_train_normalize = preprocessing.normalize(X_train_counts, norm='l2')
    #print train_set.target_names
    #print train_set.target
    #print len(train_set.target)
    #print X_train_tfidf.shape
    #print X_train_counts
    #print X_train_normalize
    return X_train_counts, train_set.target, X_train_counts.shape, \
           X_test_counts, test_set.target, X_test_counts.shape
Example 5: load
def load(dataset, categories):
    if dataset == 'full':
        train = load_files('aclImdb/aggregate/', categories=categories)
        return train
    elif dataset == 'split':
        train = load_files('aclImdb/train/', categories=categories)
        test = load_files('aclImdb/test/', categories=categories)
        return (train, test)
Example 6: vector_for_input_binary
def vector_for_input_binary(train_file_path="/mnt/hgfs/temp/machine learning/train",
                            test_file_path="/mnt/hgfs/temp/machine learning/test",
                            categories=None):
    train_data = load.load_files(train_file_path, categories=categories,
                                 encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories,
                                encoding='utf-8', decode_error='ignore')
    vectorized = feature_extraction.CountVectorizer(min_df=1, binary=True)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])
    return train_input, train_data['target'], test_input, test_data['target']
Example 7: test_grid_search_cv_on_newsgroup
def test_grid_search_cv_on_newsgroup():
    ## load newsgroup data
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    twenty_train_small = load_files('./data/20news-bydate-train/',
                                    categories=categories, charset='latin-1')
    twenty_test_small = load_files('./data/20news-bydate-test/',
                                   categories=categories, charset='latin-1')
    ## model pipeline using tfidf and passive aggressive
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', PassiveAggressiveClassifier(C=1)),
    ))
    param_grid = {
        'vec__min_df': [1, 2],
        'vec__max_df': [0.8, 1.0],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__use_idf': [True, False],
    }
    X, y = twenty_train_small.data, twenty_train_small.target
    ## cross validation with n_iter = 5
    grid_searcher = meta_search.GridSearch()
    # persist only once
    grid_searcher.persist_cv_splits('text_classification', X, y, './tmp/')
    grid_searcher.search(pipeline, param_grid)
    import time
    while not grid_searcher.isready():
        time.sleep(2)
        print 'progress:', grid_searcher.progress()
        print 'best result:', grid_searcher.best_params_so_far()
        if grid_searcher.best_params_so_far():
            pass  # grid_searcher.abort()
    print len(grid_searcher.partial_result())
    ## run again with multinomial naive Bayes
    ## no need to call persist_cv_splits again
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', MultinomialNB()),
    ))
    grid_searcher10 = meta_search.GridSearch(datafiles=grid_searcher.datafiles)
    grid_searcher10.search(pipeline, param_grid)
    while not grid_searcher10.isready():
        time.sleep(2)
        print 'progress:', grid_searcher10.progress()
        print 'best result:', grid_searcher10.best_params_so_far()
        if grid_searcher10.best_params_so_far():
            pass  # grid_searcher10.abort()
    print len(grid_searcher10.partial_result())
Example 8: main
def main():
    #buildTrainSet()
    #buildTestSet()
    train = load_files('model/train', encoding='utf-8')
    test = load_files('model/test', encoding='utf-8')
    print train.target_names
    # for l in train.target_names:
    #     print l
    # for l in train.target:
    #     print l
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    X_train = vectorizer.fit_transform(train.data)
    X_test = vectorizer.transform(test.data)
    print vectorizer.get_feature_names()
Example 9: vector_for_input
def vector_for_input(train_file_path=path1,
                     test_file_path=path2, categories=None):
    train_data = load.load_files(train_file_path, categories=categories,
                                 encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories,
                                encoding='utf-8', decode_error='ignore')
    # vectorized_normalized = feature_extraction.TfidfVectorizer(min_df=1)
    # train_input_normalized = vectorized_normalized.fit_transform(train_data['data'])
    # test_input_normalized = vectorized_normalized.transform(test_data['data'])
    vectorized = feature_extraction.CountVectorizer(min_df=1)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])
    return train_input, train_data['target'], test_input, test_data['target']
Example 10: load_data
def load_data():
    # Download the data and extract it into the ./data/txt_sentoken folder:
    # "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
    dataset = load_files('./data/txt_sentoken', shuffle=False)
    print("n_samples: %d" % len(dataset.data))
    return dataset
Example 11: testdata_stats
def testdata_stats():
    test_dataset = datasets.load_files(project_root + "/testdata",
                                       encoding='utf-8',
                                       decode_error='ignore')
    # save_thing_to_file(test_dataset, "test_dataset.txt")

    # Load the pickled naive Bayes model, refit it on the test set, and report its stats.
    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)
    print "*****BAYESIAN STATS*****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_nb == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_nb,
                                        target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    # Same evaluation for the pickled SVM model.
    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)
    print "*****SVM STATS*****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
                                        target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
Example 12: load_SRAA
def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342,
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]
    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind],
                                             target=data.target[train_ind]),
                           test=bunch.Bunch(data=[data.data[i] for i in test_ind],
                                            target=data.target[test_ind]))
    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target
    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
Example 13: text_sentiment
def text_sentiment(docs_new):
    docs_new = [docs_new]
    # the complete training data is in this directory (one sub-folder per class, like comp.graphics, etc.)
    twenty_train = load_files('./Sentiment')
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    # Fit a classifier on the training set
    #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    #f = open('my_classifier.pickle', 'wb')
    #pickle.dump(clf, f)
    #f = open('my_classifier.pickle')
    #clf = pickle.load(f)
    #f.close()
    # save the classifier
    #with open('my_sentiment.pkl', 'wb') as fid:
    #    cPickle.dump(clf, fid)
    # load it again
    with open('my_sentiment.pkl', 'rb') as fid:
        clf = cPickle.load(fid)
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)
    return twenty_train.target_names[predicted[0]]
Example 14: text_classifly_twang
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = \
        train_test_split(movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    print 'Feature selection...'
    print 'fs method: ' + fs_method, 'fs num: ' + str(fs_num)
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]

    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # train a multinomial naive Bayes classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print 'Accuracy: ', acc
    return acc
Example 15: __init__
def __init__(self, file_path):
    self.training_documents = load_files(container_path='./20news-bydate/20news-bydate-train',
                                         categories=CATEGORIES,
                                         decode_error='ignore',
                                         shuffle=True,
                                         encoding='utf-8',
                                         random_state=42)
    self.test_documents = load_files(container_path='./20news-bydate/20news-bydate-test',
                                     categories=CATEGORIES,
                                     decode_error='ignore',
                                     shuffle=True,
                                     encoding='utf-8',
                                     random_state=42)
    self.file_path = file_path