This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.build_analyzer. If you are unsure how TfidfVectorizer.build_analyzer is used in practice, the curated code samples below should help. You can also read further about its containing class, sklearn.feature_extraction.text.TfidfVectorizer.
The following presents 15 code examples of TfidfVectorizer.build_analyzer, ordered roughly by popularity.
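Before the examples, it helps to recall what build_analyzer returns: a callable that applies the vectorizer's preprocessing, tokenization, stop-word filtering and n-gram generation to a single raw document and returns the resulting list of terms. A minimal, self-contained illustration (not taken from any of the examples below):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
analyze = vectorizer.build_analyzer()

# no fitting is needed: the analyzer only tokenizes, it never touches idf statistics
print(analyze("The quick brown fox jumps over the lazy dog"))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', 'quick brown', 'brown fox', ...]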
Example 1: learn_vocabulary
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def learn_vocabulary(docs, only_noun_phrases=True):
    first_occurrence_all = []
    entropy_all = []
    #docs = [doc.decode('utf8', 'ignore') for doc in docs]
    '''
    noun_phrases = set()
    if only_noun_phrases:
        for i, doc in enumerate(docs):
            print "--extracting NP from doc", i
            #doc = doc.decode('utf8', 'ignore')
            noun_phrases.update([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])
    with open('./semeval_train_docs_noun_phrases.set', 'w') as f:
        pickle.dump(noun_phrases, f)
    '''
    print "loading pre-extracted set of noun_phrases"
    noun_phrases = set()
    with open('./semeval_train_docs_noun_phrases.set', 'r') as f:
        noun_phrases = pickle.load(f)
    vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    vocab = set()
    print "--learning vocabulary"
    for i, doc in enumerate(docs):
        print "--learning doc", i
        first_occurrence = {}
        entropy = {}
        phrases = analyzer(doc)  # all phrases from doc
        doc = preprocess(doc)
        doc_length = len(doc)
        chunks = get_chunks(doc)
        for i, phrase in enumerate(phrases):
            if valid_ngram(phrase, noun_phrases) and phrase not in first_occurrence:
                try:
                    # use index() so a missing phrase raises ValueError (find() would silently return -1)
                    pos = doc.index(phrase)
                except ValueError:
                    print "--phrase: '{}' not found".format(phrase)
                    continue
                # normalized position; float() avoids Python 2 integer division
                first_occurrence[phrase] = float(pos) / doc_length
                # calculate entropy
                entropy[phrase] = get_entropy(phrase, chunks)
                vocab.add(phrase)
        first_occurrence_all.append(first_occurrence)
        entropy_all.append(entropy)
    print "--size of vocabulary: ", len(vocab)
    return vocab, first_occurrence_all, entropy_all
Example 2: Analyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
class Analyzer(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer(min_df=1, binary=False, ngram_range=(1, 3), tokenizer=Tokenizer())
        self.tokens = self.tfidf.build_tokenizer()
        self.ngram = self.tfidf.build_analyzer()

    def __call__(self, sentence):
        ret = self.ngram(sentence)
        terms = self.tokens(sentence)
        for term in terms:
            cate = term_category(term)
            if term != cate:
                ret.append(cate)
        return ret
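Tokenizer() and term_category() above are project-specific helpers that are not shown. A self-contained sketch of the same pattern, using a toy term_category that collapses digit tokens into a shared category token, and showing how such a callable is then handed back to a vectorizer through the analyzer= parameter:

from sklearn.feature_extraction.text import TfidfVectorizer

# stock 1-3 gram analyzer, reused inside the custom callable
base_ngrams = TfidfVectorizer(ngram_range=(1, 3)).build_analyzer()

def term_category(term):
    # toy stand-in for the original helper: map all numbers to one token
    return 'NUM' if term.isdigit() else term

def analyzer_with_categories(sentence):
    ret = base_ngrams(sentence)
    for term in list(ret):
        cate = term_category(term)
        if cate != term:
            ret.append(cate)
    return ret

# a callable analyzer replaces the built-in preprocessing and tokenization entirely
vect = TfidfVectorizer(analyzer=analyzer_with_categories)
X = vect.fit_transform(["room 42 is free", "call me at 9"])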
Example 3: feed
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def feed(param):
    values = []
    result = {}
    tweetdata = rawtweets.find()
    json_str = json_util.dumps(tweetdata)
    tweetdata = json_util.loads(json_str)
    path = os.path.dirname(os.path.realpath(__file__))
    texts = []
    for tweetlist in tweetdata:
        tweet = tweetlist["text"]
        print(tweet)
        #d = datetime.strptime(tweetlist["_id"], '%Y/%m/%d/%H')
        text = unicodedata.normalize('NFKD', tweet).encode('ascii', 'ignore').decode('utf-8')
        texts.append(text)
    vectorizer = TfidfVectorizer(
        analyzer='char',
        #token_pattern=r'[a-z]{4,}',
        #use_idf=True,
        #strip_accents='unicode',
        #sublinear_tf=False
    )
    print(len(texts))
    vectorizer.build_analyzer()
    idf = vectorizer.fit_transform(texts)
    feature_names = np.asarray(vectorizer.get_feature_names())
    #print(idf.todense().T)
    #print((idf * idf.T).A)
    #print(idf.data)
    print("len ", (feature_names))
    z = (zip(feature_names, idf.data))
    d = {}
    for t in z:
        #print(t[0], t[1])
        d[t[0]] = t[1]
    #print(d)
    return d
Example 4: train
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def train(self, segments, ignore_before=4, ignore_after=4):
    '''
    This uses the 20newsgroups dataset for idf
    Parameters:
        :segments: list of strings where each string is a segment
    '''
    data = fetch_20newsgroups(subset='train').data
    stripped_data = []
    for d in data:
        lines = d.split('\n')
        if len(lines) > ignore_before + ignore_after:
            stripped_data.append('\n'.join(lines[ignore_before:-ignore_after]))
    txt = ''.join(segments)
    stripped_data.append(txt)
    # Train corpus tf-idf
    tfidf_corpus = TfidfVectorizer(stop_words='english')
    tfidf_corpus.fit(stripped_data)
    book_scores = tfidf_corpus.transform([txt])
    print 'Learned {} features CORPUS'.format(len(tfidf_corpus.get_feature_names()))
    # Train document segment-wise tf-idf
    tfidf_book = TfidfVectorizer(vocabulary=tfidf_corpus.vocabulary_)
    segment_scores = tfidf_book.fit_transform(segments)
    print 'Learned {} features BOOK'.format(len(tfidf_book.get_feature_names()))
    # Now get word scores in each segment
    final_scores = book_scores.multiply(segment_scores)
    idx_to_word = tfidf_corpus.get_feature_names()
    word_scores = []
    for i, segment_scores in enumerate(final_scores):
        scores = {}
        for j in segment_scores.indices:
            scores[idx_to_word[j]] = segment_scores[0, j]
        word_scores.append(scores)
    self.word_scores = word_scores
    self.analyze = tfidf_corpus.build_analyzer()
Example 5: keyword_extractor_tfidf
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def keyword_extractor_tfidf(corpus_list, is_stop_words_allowed, n_gram_min, n_gram_max):
    if n_gram_min > n_gram_max:
        raise Exception('Invalid input n_gram_min should be <= n_gram_max')
    corpus = []
    for doc in corpus_list:
        text = ''
        for word in doc:
            text = text + ' ' + word
        corpus.append(text)
    if is_stop_words_allowed == False:
        vectorizer = TfidfVectorizer(ngram_range=(n_gram_min, n_gram_max), stop_words='english')
    else:
        vectorizer = TfidfVectorizer(ngram_range=(n_gram_min, n_gram_max))
    analyzer = vectorizer.build_analyzer()
    analyzer(corpus[0])
    features_array = vectorizer.fit_transform(corpus).toarray()
    features_transform_list = features_array.tolist()[0]
    features_dictionary = dict(zip(vectorizer.get_feature_names(), features_transform_list))
    sorted_features_dictionary = OrderedDict(sorted(features_dictionary.items(), key=itemgetter(1)))
    return sorted_features_dictionary
Example 6: sentence_tokenizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def sentence_tokenizer(dataset_name="pascal"):
    """
    Parameters
    ----------
    dataset_name : string
        'memorability' or 'pascal' or 'clipart'

    Returns
    -------
    analyze : object
        breaks sentences into words using scikit-learn tokenizer
    vectorizer : object of class TfidfVectorizer
        see scikit-learn documentation
    """
    if dataset_name == "memorability":
        mat = scipy.io.loadmat("../../data/sentences/memorability_888_img_5_sent.mat")
        sentences = mat["memorability_sentences"]
    elif dataset_name == "pascal":
        mat = scipy.io.loadmat("../../data/sentences/pascal_1000_img_50_sent.mat")
        sentences = mat["pascal_sentences"]
    elif dataset_name == "clipart":
        mat = scipy.io.loadmat("../../data/sentences/clipart_500_img_48_sent.mat")
        sentences = mat["clipart_sentences"]

    # Build corpus
    corpus = list()
    for sent_group in sentences:
        corpus.append(" ".join([sent[0] for sent in sent_group]))

    ### Build tf-idf vectorizer ###
    # at least three letters in a word
    vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w\\w\\w+\\b")
    vectorizer.fit(corpus)
    analyze = vectorizer.build_analyzer()
    return analyze, vectorizer
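The only non-default setting here is the token_pattern, which requires at least three word characters per token. A small standalone check of what such an analyzer keeps (the sentence is made up for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer

analyze = TfidfVectorizer(token_pattern="(?u)\\b\\w\\w\\w+\\b").build_analyzer()
# one- and two-letter words are dropped by the three-\w token pattern
print(analyze("A man on an old red bike"))   # -> ['man', 'old', 'red', 'bike']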
Example 7: extract_candidates_doc
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size=450):
    #vocab = set(phrase_list)
    idf_dic = {}
    #print "phrase list len", len(phrase_list)
    #print "len idf_vec", len(idf_vec)
    for i, phrase in enumerate(phrase_list):
        idf_dic[phrase] = idf_vec[i]
    noun_phrases = set()
    print "--extracting NP"
    noun_phrases = set([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])
    vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
    analyzer = vectorizer.build_analyzer()
    phrases = list(set([phrase for phrase in analyzer(doc) if valid_ngram(phrase, noun_phrases)]))
    doc = preprocess(doc)
    #print "candidate phrases", phrases
    #tfidf = []
    #first_occurrence = []
    #entropy = []
    #length = []
    doc_len = len(doc)
    entropy = get_entropy_doc(doc, phrases)
    # get feature vectors
    features = []
    for i, phrase in enumerate(phrases):
        # float() avoids Python 2 integer division when normalizing the position
        first_occurrence = float(doc.find(phrase)) / doc_len
        tf = doc.count(phrase)
        if phrase in idf_dic:
            tfidf = tf * idf_dic[phrase]
        else:
            tfidf = tf * log10(training_size)
        feature_vec = get_feature_vector(phrase, tfidf, first_occurrence, entropy[i])
        features.append(feature_vec)
    return phrases, features
Example 8: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    pprint(LemmaTokenizer()("this is testing the stemming functionality"))
    param_grid = [
        {'C': [.125, .25, .5, 1, 10, 100, 1000]},
        {'penalty': ('l1', 'l2')}
    ]
    svm_param_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    lines = [line for line in fileinput.input()]
    sentences = map(lambda x: x.split('\t')[1], lines)
    Y = map(lambda x: int(x.split('\t')[0]), lines)
    vectorizer = TfidfVectorizer(min_df=1,
                                 tokenizer=POSTokenizer(),
                                 preprocessor=preprocess_sentence,
                                 ngram_range=(2, 2),
                                 stop_words='english')
    pipeline = Pipeline([
        ('vect', vectorizer),
        ('clf', SGDClassifier()),
    ])
    # pprint(parameters)
    # t0 = time()
    # grid_search.fit(sentences, Y)
    # print("done in %0.3fs" % (time() - t0))
    # print()
    # print("Best score: %0.3f" % grid_search.best_score_)
    X = vectorizer.fit_transform(sentences)
    num_samples = len(Y)
    num_train = int(num_samples * .8)
    print "Num training: %d" % num_train
    X_train = X[0:num_train]
    Y_train = Y[0:num_train]
    X_test = X[num_train:]
    Y_test = Y[num_train:]
    analyze = vectorizer.build_analyzer()
    for sentence in sentences[0:10]:
        print preprocess_sentence(sentence)
        print analyze(sentence)
        print "LemmaTokenizer" + str(LemmaTokenizer()(sentence))
        print StemmingTokenizer()(sentence)
    # tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    # tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    logistic = linear_model.LogisticRegression(C=.5, class_weight=None, dual=False,
                                               fit_intercept=True, intercept_scaling=1, max_iter=100,
                                               multi_class='ovr', penalty='l2', random_state=None,
                                               solver='liblinear', tol=0.0001, verbose=0)
    # grid_search = GridSearchCV(SVC(), svm_param_grid, n_jobs=-1, verbose=1)
    # grid_search.fit(X_train, Y_train)
    # print grid_search.score(X_test, Y_test)
    # best_parameters = grid_search.best_estimator_.get_params()
    # print best_parameters
    # grid_search = GridSearchCV(logistic, param_grid, n_jobs=-1, verbose=1)
    # grid_search.fit(X_train, Y_train)
    # print grid_search.score(X_test, Y_test)
    # best_parameters = grid_search.best_estimator_.get_params()
    # print best_parameters
    print logistic.fit(X_train, Y_train).score(X_test, Y_test)
    show_most_informative_features(vectorizer, logistic, 25)
    num_errors = 0
    feature_names = vectorizer.vocabulary_
    feature_index = inv_map = {v: k for k, v in feature_names.items()}
    y_pred = []
    for (i, x) in enumerate(X_test):
        y_hat = logistic.predict(x)
        y_pred.append(y_hat)
        if y_hat != Y_test[i]:
            num_errors += 1
            print "\n\nError predicting sentence: " + sentences[i + num_train]
            print print_features(x, feature_index)
            print "Label: " + str(Y_test[i])
    error_rate = float(num_errors) / len(Y_test)
    print "Accuracy : " + str(1 - error_rate)
Example 9: build_analyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def build_analyzer(self):
    analyzer = TfidfVectorizer.build_analyzer(self)
    return lambda doc: (StemmedTfidfVectorizer.english_stemmer.stem(w) for w in analyzer(doc))
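This override is only a fragment; it assumes a TfidfVectorizer subclass that exposes english_stemmer as a class attribute. A plausible reconstruction of the full class (the SnowballStemmer choice is an assumption, not confirmed by the snippet):

from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    # class-level stemmer, matching the StemmedTfidfVectorizer.english_stemmer reference above
    english_stemmer = SnowballStemmer('english')

    def build_analyzer(self):
        analyzer = TfidfVectorizer.build_analyzer(self)
        return lambda doc: (StemmedTfidfVectorizer.english_stemmer.stem(w) for w in analyzer(doc))

# usage: inflected forms collapse onto a single stemmed feature
vect = StemmedTfidfVectorizer(stop_words='english')
X = vect.fit_transform(["graphics cards", "a graphic card"])
print(sorted(vect.vocabulary_))   # e.g. ['card', 'graphic']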
Example 10: open
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
sentences = scipy.io.loadmat('../../data/sentences/memorability_888_img_5_sent.mat')
sentences = sentences['memorability_sentences']
f = open('../../automated_specificity.txt', 'w')
sent_pairs, scores_w = list(), list()
vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w\\w\\w+\\b')
corpus = list()

# Build corpus
for sent_group in sentences:
    corpus.append(' '.join([sent[0] for sent in sent_group]))

vectorizer.fit(corpus)
analyze = vectorizer.build_analyzer()
specificity_max, specificity_w = list(), list()
for im_idx, sentence_group in enumerate(sentences):
    similarity_max, similarity_w = list(), list()
    for (sent1, sent2) in combinations(sentence_group, 2):
        words1, words2 = analyze(sent1[0]), analyze(sent2[0])
        sent1_weights = [vectorizer.transform(sent1).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words1]
        sent2_weights = [vectorizer.transform(sent2).toarray()[0][vectorizer.vocabulary_.get(w)] for w in words2]
        print >> f, [w.encode('utf-8') for w in words1]
        print >> f, [PrettyFloat(w) for w in sent1_weights]
        print >> f, [w.encode('utf-8') for w in words2]
Example 11: TfidfVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
tfidf_train = tfidftransformer.fit(counts_train).transform(counts_train)
tfidf_test = tfidftransformer.fit(counts_test).transform(counts_test)
# alternatively, let the two tf-idf models share a vocabulary
# method 2: TfidfVectorizer
print '*************************\nTfidfVectorizer\n*************************'
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(sublinear_tf=True,
                     max_df=0.5,
                     stop_words='english')
tfidf_train_2 = tv.fit_transform(newsgroup_train.data)
tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
tfidf_test_2 = tv2.fit_transform(newsgroups_test.data)
print "the shape of train is " + repr(tfidf_train_2.shape)
print "the shape of test is " + repr(tfidf_test_2.shape)
analyze = tv.build_analyzer()
tv.get_feature_names()  # statistical features/terms

# F1 = 2 * (precision * recall) / (precision + recall)
def calculate_result(actual, pred):
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall)
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred))

# or use sklearn's ready-made feature loader, fetch_20newsgroups_vectorized
print '*************************\nfetch_20newsgroups_vectorized\n*************************'
from sklearn.datasets import fetch_20newsgroups_vectorized
tfidf_train_3 = fetch_20newsgroups_vectorized(subset='train')
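As a side note, the shared-vocabulary trick above (TfidfVectorizer(vocabulary=tv.vocabulary_)) keeps the test columns aligned with the training columns, but fitting a second vectorizer recomputes IDF from the test documents. A minimal sketch of the more common pattern, which reuses both the vocabulary and the training IDF weights by calling transform() on the already-fitted vectorizer:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
tfidf_train = tv.fit_transform(train.data)
# transform() keeps the training vocabulary and idf weights,
# so the test matrix is directly comparable to the training matrix
tfidf_test = tv.transform(test.data)
print(tfidf_train.shape, tfidf_test.shape)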
Example 12: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def main():
    global X
    logging.info('Started')
    # pulling primary bill sponsor to match with party information
    sponsors_query = db.bills_details.find({},
        {'_id': 1, 'sponsors.leg_id': 1, 'sponsors.type': 1, 'sponsors.name': 1,
         'action_dates.signed': 1})  # able to limit number of records for testing
    sponsors = list(sponsors_query)
    bill_party = []
    # sponsors[0]['sponsors'][0]
    # Creates a list of dicts: bill database ID, passed status, legislator ID and party
    for i in range(len(sponsors)):
        bill_dbid = sponsors[i]['_id']
        leg_id = sponsors[i]['sponsors'][0]['leg_id']
        if leg_id == None:
            leg_id = 'CA0000'
            party = sponsors[i]['sponsors'][0]['name']
        else:
            party = GetParty(leg_id)
            if party == None:
                party = sponsors[i]['sponsors'][0]['name']
        if sponsors[i]['action_dates']['signed'] == None:
            bill_signed = False
        else:
            bill_signed = True
        k = ['id', 'leg_id', 'party', 'passed']
        v = [bill_dbid, leg_id, party, bill_signed]
        bill_party.append(dict(zip(k, v)))
    logging.info('populated list of sponsor and party')
    # note to self/presentation: show number of bills sponsored by non-legislators
    # graph bills by party that passed .....
    # Do I need to create/update a dictionary? This pulls MongoDB _id and texts
    # all_legtext = list(db.legtext.find({}, {'text': 1}).limit(25))
    # adds vectorized features of bigrams using function
    # for i in range(len(bill_party)):
    #     vec = GetBigramsVector(bill_party[i]['id'])
    #     bill_party[i]['vec'] = vec
    # logging.info('loaded vectorized bigrams')
    bigram_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    for i in range(len(bill_party)):
        #oid = bill_party[i]['id']
        #print "Getting text for item", i, bill_party[i]['id']
        leg_text = list(db.legtext.find({'_id': bill_party[i]['id']}, {'text': 1}))[0]['text']
        raw = nltk.clean_html(leg_text)
        # bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        bigram_features = analyze(raw)
        bill_party[i]['features'] = bigram_features
        bill_party[i]['raw'] = raw
        # bill_party[i]['vec'] = bigram_vectorizer.fit_transform(bigram_features).toarray()
    party_options = {'democratic': 0, 'republican': 1}
    X = bigram_vectorizer.fit_transform([x['raw'] for x in bill_party if x['party'].lower() in party_options])
    print bigram_vectorizer
    logging.info('loaded tfidf vectorized bigrams')
    # Creates numpy arrays, results = party and features = vectorized words
    # party only = democrat or republican and vectorized text
    bp_target = []
    bp_data = []
    for i in range(len(bill_party)):
        if bill_party[i]['party'].lower() in ('democratic', 'republican'):
            bp_target.append(party_options[bill_party[i]['party'].lower()])
        else:
            continue
    targets = np.array(bp_target)
    data = X.toarray()
    #====================================================================================
    # Random Forests Modeling and Plotting
    #====================================================================================
    # Parameters
    n_classes = 2
    n_estimators = 30
    plot_colors = "ryb"
    cmap = pl.cm.RdYlBu
    plot_step = 0.02  # fine step width for decision surface contours
    plot_step_coarser = 0.5  # step widths for coarse classifier guesses
    RANDOM_SEED = 9  # fix the seed on each iteration ???
    plot_idx = 1
    models = [DecisionTreeClassifier(max_depth=None),
              RandomForestClassifier(n_estimators=n_estimators),
              ExtraTreesClassifier(n_estimators=n_estimators),
              AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                 n_estimators=n_estimators)]
    # ......... remainder of this example omitted .........
Example 13: build_analyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def build_analyzer(self):
    analyzer = TfidfVectorizer.build_analyzer(self)
    english_stemmer = SnowballStemmer('english')
    return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
Example 14: __init__
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
def __init__(self, n_features, voc_file):
    self.n_features = n_features
    self.voc_file = voc_file
    self.word_clusters, self.grouped_words = self.read_word_cluster(voc_file)
    tfidf = TfidfVectorizer(encoding='iso-8859-1', stop_words='english')
    self.vectorize = tfidf.build_analyzer()
Example 15: fetch_20newsgroups
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or alternatively: from sklearn.feature_extraction.text.TfidfVectorizer import build_analyzer [as alias]
data_full.append(SiteData('fb/srsplit/fullfbsearch_results_combined{i:02d}'.format(i=file_counter), categories, full_candidate_dict))
"""
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42)
"""
print 'data loaded'
import conversions as conv
from ersatzpg.utffile import utffile
special_terms = []
vocabulary = []
basic_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, use_idf=False,
                                   stop_words='english')
basic_analyze = basic_vectorizer.build_analyzer()
with utffile('searchterms.csv') as f:
    for s in f:
        if s.startswith('<'):
            special_terms.append(s.strip('<>'))
        else:
            vocabulary.append(s.decode('utf-8').strip())
fb_page_data = {}
with open('fb/facebookpolsurls_bkp.csv') as f:
    csvr = csv.DictReader(f)
    for l in csvr:
        fb_page_data.update({l['url']: {'fans': l['Fan Count'].replace(',', ''), 'authentic': l['Authentic Category']}})

def analyze(s):
    d = eval(s)
    special_keys = []