This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.build_tokenizer. If you have been wondering what TfidfVectorizer.build_tokenizer does, how to call it, or what real code that uses it looks like, the curated examples below should help. You may also want to read more about the class it belongs to, sklearn.feature_extraction.text.TfidfVectorizer.
The following shows 15 code examples of TfidfVectorizer.build_tokenizer, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
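Before the examples, a quick orientation: build_tokenizer() returns the plain callable that the vectorizer itself uses to split raw text into word tokens; it applies only the token_pattern (no lowercasing, stop-word removal or n-gram expansion), and it can be used without fitting the vectorizer first. A minimal sketch, using a made-up sentence:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tokenize = vectorizer.build_tokenizer()            # a plain function: str -> list of token strings
print(tokenize("Scikit-learn makes TF-IDF easy"))  # e.g. ['Scikit', 'learn', 'makes', 'TF', 'IDF', 'easy']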
Example 1: file
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def file():
    cats = ["alt.atheism", "sci.electronics"]
    newsgroups_train = fetch_20newsgroups(subset="train", categories=cats)
    newsgroups_test = fetch_20newsgroups(subset="test", categories=cats)
    vectorizer = TfidfVectorizer()  # tokenize every document and build the statistics
    vectors_train = vectorizer.fit_transform(newsgroups_train.data)
    vectors = vectorizer.transform(newsgroups_test.data)
    print vectors.shape[1]
    # f=open('test_all.txt','wb')
    for j in range(0, vectors.shape[0]):
        item_id = list()
        tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j])  # tokenized form of document j
        # print tokens
        word_sort = np.argsort(-vectors[j].data)
        print "vertex " + str(j)
        for i in range(0, len(word_sort)):
            word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]]  # the tf-idf term
            for line in range(0, len(tokens)):
                if tokens[line].lower() == word:
                    item_id.append((line, word_sort[i]))
        pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True)  # tf-idf terms ordered by position in the document
        word_word = np.zeros([len(word_sort), len(word_sort)])
        for p in range(0, len(pos_item)):
            if p < (len(pos_item) - 1):
                ki = word_sort[pos_item[p][1]]
                kj = word_sort[pos_item[p + 1][1]]
                word_word[ki, kj] = word_word[ki, kj] + 1
Example 2: create_vocab
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def create_vocab(train):
    init_time = time.time()
    vocab = set()
    t = TfidfVectorizer()
    tokenizer = t.build_tokenizer()
    for ex in train[0]:
        vocab.update(tokenizer(ex))
    end_time = time.time()
    print("it took " + str(end_time - init_time) + " to create the vocabulary")
    return vocab
Example 3: Analyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
class Analyzer(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer(min_df=1, binary=False, ngram_range=(1, 3), tokenizer=Tokenizer())
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        ret = self.ngram(sentence)
        terms = self.tokens(sentence)
        for term in terms:
            cate = term_category(term)
            if term != cate:
                ret.append(cate)
        return ret
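A note on Example 3, which uses both callables the vectorizer can hand back: build_tokenizer only splits raw text into word tokens, whereas build_analyzer additionally applies the preprocessor (lowercasing), stop-word filtering and the configured n-gram expansion. A minimal sketch with a made-up phrase:

from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(ngram_range=(1, 2))
print(v.build_tokenizer()("New York City"))  # ['New', 'York', 'City']
print(v.build_analyzer()("New York City"))   # ['new', 'york', 'city', 'new york', 'york city']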
Example 4: Vectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
class Vectorizer(object):
    def __init__(self):
        self.count_vec = TfidfVectorizer(binary=True,
                                         ngram_range=(1, 3),
                                         tokenizer=Tokenizer())
        self.last_vec = CountVectorizer(binary=True, ngram_range=(1, 1), tokenizer=Tokenizer())

    def collect_last_term(self, X):
        X_last = list()
        tokens = self.last_vec.build_tokenizer()
        _logger.debug("Extracting last term for each sentence")
        for sent in X:
            X_last.append(tokens(sent)[-1])
        _logger.debug("Fitting last-term vectorizer")
        return X_last

    def fit(self, X, y=None):
        _logger.debug("Fitting count vectorizer")
        self.count_vec.fit(X)
        X_last = self.collect_last_term(X)
        self.last_vec.fit(X_last)
        return self

    def transform(self, X, y=None):
        #return self.count_vec.transform(X)
        _logger.debug("Doing tfidf transform")
        Xc = self.count_vec.transform(X)
        X_last = self.collect_last_term(X)
        _logger.debug("Doing last term transform")
        Xl = self.last_vec.transform(X_last)
        _logger.debug("stacking features")
        ret = sparse.hstack([Xc, Xl])
        tokens = self.count_vec.build_tokenizer()
        l = list()
        for sent in X:
            terms = tokens(sent)
            l.append(1 if ("__LOCATION__" in terms and "__ORGNIZATION__" in terms) else 0)
        l = np.array(l)
        l.shape = len(l), 1
        ret = sparse.hstack([ret, l])
        _logger.debug("vectorization transform done")
        return ret
Example 5: transform_cnn_data
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def transform_cnn_data(self, X_raw, feat_and_param):
    feat_and_param['feats']['ngram_range'] = (1, 1)
    feat_and_param['feats']['use_idf'] = False
    feat_and_param['feats']['binary'] = False
    vectorizer = TfidfVectorizer(**feat_and_param['feats'])
    vectorizer.fit(X_raw)
    tokenizer = TfidfVectorizer.build_tokenizer(vectorizer)
    X_raw_tokenized = [tokenizer(ex) for ex in X_raw]
    train_X = []
    for example in X_raw_tokenized:
        for i in range(len(example)):
            example[i] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", example[i])
        train_X.append([vectorizer.transform(example)])
    index_to_word = {v: k for k, v in vectorizer.vocabulary_.items()}
    for key in index_to_word:
        index_to_word[key] = re.sub(r"[^A-Za-z0-9(),!?\'\`]", "", index_to_word[key])
    return train_X, index_to_word
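One detail worth noting in Example 5: TfidfVectorizer.build_tokenizer(vectorizer) is simply the unbound form of the usual vectorizer.build_tokenizer() call; both return the same tokenizing callable.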
Example 6: process_joke
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def process_joke(joke):
    data = {}
    # Lowercase text.
    joke.text = joke.text.lower()
    # Replace text with dict.
    stop_words = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer()
    tokenizer = vectorizer.build_tokenizer()

    def tokenize_text(text, prefix=''):
        d = {}
        for term in tokenizer(text):
            if term in stop_words:
                continue
            d[prefix + term] = d.get(prefix + term, 0) + 1
        return d

    data.update(tokenize_text(joke.text, 't_'))
    data.update({('cat_' + cat): 1 for cat in joke.categories})
    data.update({('subcat_' + cat): 1 for cat in joke.subcategories})
    return data
Example 7: TfidfVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
X_test = np.array([''.join(el) for el in nyt_data[trainset_size + 1:len(nyt_data)]])
y_test = np.array([el for el in nyt_labels[trainset_size + 1:len(nyt_labels)]])
#print(X_train)
vectorizer = TfidfVectorizer(min_df=2,
                             ngram_range=(1, 2),
                             stop_words='english',
                             strip_accents='unicode',
                             norm='l2')
test_string = unicode(nyt_data[0])
print "Example string: " + test_string
print "Preprocessed string: " + vectorizer.build_preprocessor()(test_string)
print "Tokenized string:" + str(vectorizer.build_tokenizer()(test_string))
print "N-gram data string:" + str(vectorizer.build_analyzer()(test_string))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
svm_classifier = LinearSVC().fit(X_train, y_train)
Example 8: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def main():
    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')  #, tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]
    min_size = 10  # max(10, args.fixk)
    args.fixk = None
    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    ## delete <br> to "." to recognize as end of sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)
    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## Get the features of the sentence dataset
    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    train_test_data = Bunch()
    expert_data.sentence, train_test_data.pool = split_data(data.train)
    expert_data.oracle, train_test_data.test = split_data(data.test)
    data.train.data = train_test_data.pool.train.data
    data.train.target = train_test_data.pool.train.target
    data.test.data = train_test_data.test.train.data
    data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector)
    expert_data.oracle.train.data = sent_train
    expert_data.oracle.train.target = np.array(labels)
    expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector)
    expert_data.sentence.train.data = sent_train
    expert_data.sentence.train.target = np.array(labels)
    expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
    sent_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### TESTING THE CLASSIFIERS
    test_target, test_data = split_data_sentences(data.test, sent_detector)
    test_data_bow = vct.transform(test_data)
    #pred_sent = sent_clf.predict(test_data_bow)
    pred_ora = exp_clf.predict(test_data_bow)
    y_probas = sent_clf.predict_proba(test_data_bow)
    pred_sent = sent_clf.classes_[np.argmax(y_probas, axis=1)]
    ## just based on one class probability
    # order = np.argsort(y_probas[:,0])
    order = np.argsort(y_probas.max(axis=1))
    print "ORACLE\tSENTENCE\tMAX-SENT"
    # for i in order[:500]:
    #     print pred_ora[i], pred_sent[i], y_probas[i, 0], test_data[i]
    for i in order[-500:]:
        print pred_ora[i], pred_sent[i], y_probas[i, 0], test_data[i]
    print "Accuracy of Sentences Classifier", metrics.accuracy_score(test_target, pred_sent)
    print "Class distribution: %s" % pred_sent.sum()
    print "Size of data: %s" % pred_sent.shape[0]
    sizes = [50, 100, 500, 1000, 2000, 3000, 4000, 20000]
#......... part of the code is omitted here .........
Example 9: CountVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
count_pos_test = count_neg_test + 1
label_test = test_data[:,1]
#vctr = CountVectorizer(stop_words='english',min_df = 1)
#vctr2 = HashingVectorizer(stop_words='english')
vctr = TfidfVectorizer(stop_words='english')  # initialising vectorizers; TF-IDF gives better accuracy by 1 percent compared to the other vectorizers
count_pos = 0
count_neg = 0
######################################################################################################
train = []
test = []
for i in range(len(train_data)):  # processing of the train data
    string = train_data[i, 0]
    string = vctr.build_preprocessor()(string.lower())
    string = vctr.build_tokenizer()(string.lower())
    train.append(' '.join(string))
for i in range(len(test_data)):  # processing of the test data
    string = test_data[i, 0]
    string = vctr.build_preprocessor()(string.lower())
    string = vctr.build_tokenizer()(string.lower())
    test.append(' '.join(string))
######################################################################################################
train_data1 = vctr.fit_transform(train).toarray() #fitting the dictionary for bag of words model using TF-IDF vectorizers
#X_test = vctr.transform(test).toarray()
y_train = np.asarray(label_train, dtype="|S6")
y_train = y_train.astype(int)
clf1 = GradientBoostingClassifier(n_estimators = 500) #initialising classifiers
clf2 = AdaBoostClassifier(n_estimators = 500)
Example 10: train_extractor_from_lines
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
class FeatureExtractor:
    vectorizer = None
    feature_names = None
    feature_matrix = None

    def train_extractor_from_lines(self, train_lines, labels, test_lines):
        self.vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=DISTINCT_WORDS_CNT)
        self.vectorizer.fit(train_lines + test_lines)
        pass

    def load_vectorizer(self):
        input_file = open('../models/tfidf_vectorizer.pkl', 'rb')
        self.vectorizer = pickle.load(input_file)
        input_file.close()
        pass

    def save_vectorizer(self):
        output_file = open('../models/tfidf_vectorizer.pkl', 'wb')
        pickle.dump(self.vectorizer, output_file)
        output_file.close()
        pass

    def train_extractor(self, full=False):
        if not full:
            train_lines = file2lines('../data/train_lite.csv')
            labels = file2labels('../data/train_lite.csv')
            test_lines = file2lines('../data/test_lite.csv')
        else:
            train_lines = file2lines('../data/train.csv')
            labels = file2labels('../data/train.csv')
            test_lines = file2lines('../data/test.csv')
        self.train_extractor_from_lines(train_lines, labels, test_lines)
        pass

    def lines2words(self, lines):
        self.tokenizer = self.vectorizer.build_tokenizer()
        return [self.tokenizer(line) for line in lines]

    def lines2features(self, lines, use_tense=False):
        """
        returns DataFrame(feature_matrix, feature_name)

        ['word_rainny', 'word_sunny'],
        array([
            [1, 0.4, 0.2],
            [0.2, 1, 0.2],
        ])
        """
        self.feature_names = []
        self.feature_matrix = None

        # tf-idf features
        data = self.vectorizer.transform(lines).toarray()
        self.feature_names = self.vectorizer.get_feature_names()
        self.feature_matrix = data

        # additional features
        add_features = []
        important_words = ['sunny', 'wind', 'humid', 'hot', 'cold', 'dry', 'ice', 'rain', 'snow', 'tornado', 'storm', 'hurricane']
        important_words = ['cloud', 'cold', 'dry', 'hot', 'humid', 'hurricane', 'ice', 'rain', 'snow', 'storm', 'sunny', 'tornado', 'wind']
        self.feature_names = self.feature_names + ['impt_words:' + word for word in important_words]
        if use_tense:
            self.feature_names = self.feature_names + ['past_tense_num', 'present_tense_num']

        all_words = self.lines2words(lines)
        for words in all_words:
            # important words
            important_words_ftr = [int(word in words) for word in important_words]
            add_features.append(important_words_ftr)
            # tense
            if use_tense:
                tagz = zip(*nltk.pos_tag(nltk.word_tokenize(words)))[1]
                past_num = len([v for v in tagz if v == 'VBD'])
                present_num = len([v for v in tagz if v in ['VBP', 'VB']])
                add_features.append([past_num, present_num])

        self.feature_matrix = np.hstack((self.feature_matrix, add_features))
        return DataFrame(self.feature_matrix, columns=self.feature_names)
Example 11: TfidfVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
dataset = fetch_20newsgroups(categories=['alt.atheism', 'talk.religion.misc', 'sci.space'])  # take only 3 categories
vect = TfidfVectorizer()
tok = vect.build_tokenizer()  # tokenizes everything nicely
texts=[]
lem=WordNetLemmatizer()
lemms=[]
#for text in dataset.data:
# for token in tok(text):
# lemms.append(lem.lemmatize(token))
# texts.append(lemms)
#models = models.Word2Vec(texts,size=100, window=5,min_count=5,workers=4)
#models.save('texts.dat')
model = models.Word2Vec.load('texts.dat')
#print(model['theory'])
#print(model.similarity('man','car'))
#print(model.most_similar(positive=['man'],negative=['computer']))
print model.doesnt_match("car wheel glass engine".split())
Example 12: WordNetLemmatizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
self.wnl = WordNetLemmatizer()

def __call__(self, doc):
    doc = str(doc)
    s = "".join(doc.split("__EOS__"))
    doc = s.translate(None, string.punctuation)
    tokens = nltk.word_tokenize(doc)  # doc is a plain string at this point
    bi = list(p1 + " " + p2 for p1, p2 in nltk.bigrams(tokens))
    tokens.extend(bi)
    return [self.wnl.lemmatize(t) for t in tokens]

if _use_TFIDF_:
    #vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1, stop_words=stoplist, max_features=no_of_features, tokenizer=LemmaTokenizer())
    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1, stop_words=stoplist, max_features=no_of_features)
    func_tokenizer = vectorizer.build_tokenizer()

'''
I was using two functions earlier for tokenization and data preprocessing.
Later implemented the LemmaTokenizer class for this.
'''

def ispunct(some_string):
    return not any(char.isalnum() for char in some_string)

def get_tokens(s):
    # Tokenize into words in sentences. Returns list of strs
    retval = []
    sents = sent_tokenize(s)
    for sent in sents:
Example 13: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
def main():
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])
    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b')  #, tokenizer=StemTokenizer())
    #
    # vct = CountVectorizer(encoding='ISO-8859-1', min_df=1, max_df=1.0, binary=True, ngram_range=(1, 1),
    #                       token_pattern='\\b\\w+\\b')  #, tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]
    min_size = 10  # max(10, args.fixk)
    # if args.fixk < 0:
    args.fixk = None
    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)
    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))
    print("Vectorizer: %s" % vct)
    parameters = parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters
    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    ## delete <br> to "." to recognize as end of sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)
    # labels, sent_train = split_data_sentences(data.train, sent_detector)
    #
    # data.train.data = sent_train
    # data.train.target = np.array(labels)
    # labels, sent_train = split_data_sentences(data.test, sent_detector)
    # data.test.data = sent_train
    # data.test.target = np.array(labels)
    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))

    ## Get the features of the sentence dataset
    # data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER
    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    # clf = set_classifier(args.classifier)
    student = structured.AALStructured(model=clf, accuracy_model=None, budget=args.budget, seed=args.seed, vcn=vct,
                                       subpool=250, cost_model=cost_model)
    student.set_score_model(exp_clf)
    print "\nStudent Classifier: %s" % clf

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200
    print("Sentence Classification")
    t0 = time.time()
    tac = []
    tau = []
    # predition = exp_clf.predict(data.train.bow)
#......... part of the code is omitted here .........
Example 14: TfidfVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
__author__ = 'Alena'
from sklearn.datasets import fetch_20newsgroups
dataset=fetch_20newsgroups()
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
tok=vect.build_tokenizer()
texts=[]
Y=vect.fit_transform(dataset.data)
first=Y.getcol(0)
second=Y.getcol(1)
word1=[]
for i, el in enumerate(first):
    word1.append(first._get_single_element(i, 0))
word2 = []
for i, el in enumerate(second):
    word2.append(second._get_single_element(i, 0))
distance = 0
for i in range(len(word2)):
    distance += abs(word1[i] - word2[i])
print(distance)
Example 15: TfidfVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import build_tokenizer [as alias]
__author__ = '315-4'
# -*- coding: utf-8 -*-
from gensim import corpora, models, matutils
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
dataset = fetch_20newsgroups()  # the dataset: 20 newsgroups
vect = TfidfVectorizer()  # converter to a TF-IDF matrix
tok = vect.build_tokenizer()  # the tokenizer
texts = []

# tokenize the texts
for text in dataset.data:
    texts.append(tok(text))

# now gensim takes the stage
# Convert document (a list of words) into the bag-of-words
dictionary = corpora.Dictionary(texts)  # build the dictionary (the set of tokens)
corpus = [dictionary.doc2bow(text) for text in texts]  # the corpus
new_vec = dictionary.doc2bow(tok('Hello world'))  # this is not used anywhere

# train the LDA model
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                               num_topics=100, update_every=1, chunksize=10000, passes=1)

# print the V matrix from the U-E-V decomposition (the topics)
for item in lda.print_topics(100):
    print(item)