This article collects typical usage examples of the Python method sklearn.feature_extraction.text.TfidfVectorizer.inverse_transform. If you are wondering what TfidfVectorizer.inverse_transform does or how to use it, the curated code examples below should help. You can also explore further usage examples of the class it belongs to, sklearn.feature_extraction.text.TfidfVectorizer.
The following shows 7 code examples of TfidfVectorizer.inverse_transform, sorted by popularity by default.
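Before the project examples, here is a minimal, self-contained sketch (not drawn from any of the examples below, and using a toy corpus of my own) of what inverse_transform does: after fitting, it maps each sparse row of the TF-IDF matrix back to the array of terms that received a non-zero weight in that document.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog chased the cat"]

vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(docs)   # sparse matrix, shape (n_docs, n_terms)

# One array per document, holding the terms with non-zero TF-IDF weight in that row;
# the ordering inside each array is not significant.
terms_per_doc = vectorizer.inverse_transform(X)
print(terms_per_doc)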
Example 1: buildVectorizer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
def buildVectorizer(bio):
    nounlist = []
    for doc in bio:
        st = ""
        for (word, pos) in tag(doc):
            if pos in ["JJ", "NNS", "NN", "NNP"]:
                st = st + word + " "
            else:
                if st != "":
                    st = st[0:-1] + " "
                    #print "got one"
        nounlist.extend([st])
    sciencestopwords = set([u'model', 'according', 'data', u'models', 'function', 'properties', 'approach', 'parameters',
                            'systems', 'number', 'order', u'data', 'analysis', u'information', u'journal',
                            'results', 'using', 'research', 'consumers', 'scientists', 'model', 'models', 'journal',
                            'researchers', 'paper', 'new', 'study', 'time', 'case', 'simulation', u'simulation', 'equation',
                            'based', 'years', 'better', 'theory', 'particular', 'many', 'due', 'much', 'set', 'studies', 'systems',
                            'simple', 'example', 'work', 'non', 'experiments', 'large', 'small', 'experiment', u'experiments',
                            'provide', 'analysis', 'problem', 'method', 'used', 'methods'])
    # now build the new vectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    english = nltk.corpus.stopwords.words('english')
    newstop = english + list(sciencestopwords)
    vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=newstop, decode_error='ignore')
    X = vectorizer.fit_transform(nounlist)
    Xinv = vectorizer.inverse_transform(X)
    # X is a sparse matrix of docs x vocab size (7638),
    # so X[doc_num] is the sparse vector of its words and ||X[doc_num]|| = 1.
    # There are 7638 unique words and 755 docs, with a total of 38888 non-zeros.
    # Xinv[doc_num] is the list of words in the doc.
    return nounlist, vectorizer, X, Xinv
Example 2: test_add_hashtag_bow_to_graph
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
def test_add_hashtag_bow_to_graph(self):
    g = IU.add_hastag_bow_to_graph(self.g_undecom)
    tfidf = TfidfVectorizer(preprocessor=None,
                            tokenizer=lambda s: s.split(),
                            stop_words=None)
    tfidf.fit([' '.join(g.node[n]['hashtags'])
               for n in g.nodes_iter()])
    for n in g.nodes_iter():
        assert_true(issparse(g.node[n]['hashtag_bow']))
        assert_equal(
            sorted(g.node[n]['hashtags']),
            sorted(
                tfidf.inverse_transform(
                    g.node[n]['hashtag_bow']
                )[0].tolist()
            )
        )
Example 3: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
def main(args):
    logger.debug("Arguments: %r", args)
    tfidf_vect = TfidfVectorizer(
        preprocessor=get_preprocessor(args.fields),
        analyzer='word',  # maybe callable
        token_pattern=r'\b[a-z]\w+\b',
        ngram_range=(args.min_ngrams, args.max_ngrams),
        max_df=args.max_df,
        max_features=args.max_features,
        sublinear_tf=args.sublinear_tf,
        stop_words=STOP_WORDS,
        norm=args.norm,
    )
    with LogRuntime("Loaded input data in {elapsed} seconds", logger):
        data = get_data(args)
    if data:
        logger.debug("Corpus size: {0}".format(len(data)))
    else:
        logger.error("Empty data")
        return
    with LogRuntime("Fitted in {0.elapsed} seconds", logger):
        X = tfidf_vect.fit_transform(data)
    logger.debug("Vocabulary size: {}".format(len(tfidf_vect.vocabulary_)))
    logger.debug("Max DF stop words size: {}".format(
        len(tfidf_vect.stop_words_)))
    logger.debug("Stop words size: {}".format(len(tfidf_vect.stop_words)))
    if args.clusters:
        true_k = args.clusters
    else:
        # ref: http://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#Finding_Number_of_Clusters_in_Text_Databases
        m_docs, n_terms = X.shape
        t_nonzeros = len(X.nonzero()[0])
        true_k = (m_docs * n_terms) / t_nonzeros
        logger.debug("Calculated number of clusters: {}".format(true_k))
    if args.minibatch:
        km = MiniBatchKMeans(
            n_clusters=true_k,
            init='k-means++',
            n_init=10,
            init_size=1000,
            batch_size=1000,
            verbose=-1)
    else:
        km = KMeans(
            n_clusters=args.clusters,
            init='random',
            max_iter=100,
            n_init=10,
            verbose=1,
            n_jobs=-1)
    with LogRuntime("KMeans Fitted in {0.elapsed} seconds", logger):
        km.fit(X)
    if args.sample_random and args.sample_size:
        sample = [
            data[i]
            for i in np.random.random_integers(0, len(data), args.sample_size)
        ]
    elif args.sample_size:
        sample = data[args.sample_skip:args.sample_size]
    else:
        sample = data
    Y = tfidf_vect.transform(sample)
    sample_terms = tfidf_vect.inverse_transform(Y)
    labels = km.predict(Y)
    distances = km.transform(Y)
    center_terms = tfidf_vect.inverse_transform(km.cluster_centers_)
    clusters = defaultdict(list)
    vocabulary = tfidf_vect.vocabulary_
    for i, doc in enumerate(sample):
        clusters[labels[i]].append((i, doc))
    truncate = lambda t: t[:100] + '...' if len(t) > 100 else t
    for label, result in sorted(clusters.iteritems()):
        # skip single results
        if len(result) < args.cluster_minsize:
            continue
        terms_joined = ', '.join(
            sorted(
                center_terms[label],
                reverse=True,
                key=lambda t: km.cluster_centers_[label, vocabulary[t]]))
        print '=' * 79
        print '=' * 79
        print '=' * 79
        print '-> ' + truncate(terms_joined) + '\n\n'
        result = sorted(
            result,
            key=lambda (i, _): distances[i, label],
#......... rest of the code omitted .........
Example 4: DSOM
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
class DSOM(object):
    def __init__(self, inputFile=None, fileType=None, widthOfMap=2, useGPU=True):
        self.inputFile = inputFile
        self.fileType = fileType
        self.widthOfMap = widthOfMap
        self.useGPU = useGPU
        self.arrayTrain = []
        self.Y = None
        self.vectorizer = None
        self.nodeHolder = dict()
        self.text = ""
        self.dataset = ""

    def readDocument(self):
        if(self.fileType == 'pdf'):
            self.text = readPDF.pdfparser(self.inputFile)
        else:
            self.text = open(self.inputFile, "r").read()
        self.dataset = self.text.split("\n\n")

    def train(self, inputFile=None):
        ###############################################################################
        #clean_file = open("data/paragraph_vector_output (copy).txt")
        #dataset = clean_file.read().split("\n\n")
        # print(dataset)
        # print("%d Paragraphs " % len(dataset))
        # print()
        # print("Extracting features from the dataset using a sparse vectorizer")
        #t0 = time()
        self.vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000,
                                          min_df=2, stop_words='english',
                                          use_idf=True, sublinear_tf=True)
        self.Y = self.vectorizer.fit_transform(self.dataset)
        #arrayTrain = X.toarray()
        svd = TruncatedSVD(n_components=100, random_state=42)
        X = svd.fit_transform(self.Y)
        self.arrayTrain = X
        #print("done in %fs" % (time() - t0))
        #print("n_samples: %d, n_features: %d" % X.shape)
        #print()
        ###############################################################################
        ## SOM
        #For plotting the images
        #Train a 20x30 SOM with 400 iterations
        #print("<-- Starting SOM -- >")
        mapSide = self.widthOfMap
        som = SOM.SOM(DATA=self.arrayTrain, num_units=mapSide*mapSide, width=mapSide, height=mapSide)
        #print("<-- Training SOM -- >")
        #t0 = time()
        if(self.useGPU == True):
            try:
                import theano.sandbox.cuda
                theano.sandbox.cuda.use('gpu')
            except:
                print("Switching to GPU didn't work, will fallback to CPU.")
            som.train_batch_theano(verbose=False)
        else:
            som.train_batch(verbose=False)
        #print("<-- Done Training SOM %fs -- >" % (time() - t0))
        #Get output grid
        #print("<-- Testing SOM -- >")
        #print("<-- Begin Output -- >")
        #np.set_printoptions(threshold='nan')
        clusters = som.ins_unit_assign
        #print(clusters)
        for i in range(mapSide*mapSide):
            self.nodeHolder[i] = []
        for i, m in enumerate(clusters):
            if (m) in self.nodeHolder:
                self.nodeHolder[m].append(i)
            else:
                self.nodeHolder[m] = [i]

    def getClusters(self):
        return self.nodeHolder

    def getDataset(self):
        return self.dataset

    def tfIDFArray(self):
        inverse = self.vectorizer.inverse_transform(self.Y)
        outList = []
        for x in inverse:
            outList.append([y.encode('UTF8') for y in x])
        return outList
Example 5: summarize_cisco_support_forum_texts
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
def summarize_cisco_support_forum_texts():
    # cisco_plain_text = LazyCorpusLoader(
    #    'content', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin_1')
    cisco_plain_text = LazyCorpusLoader(
        "cisco_forum_subset", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin_1"
    )
    token_dict = {}
    for article in cisco_plain_text.fileids():
        token_dict[article] = cisco_plain_text.raw(article)
    tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words="english", decode_error="ignore")
    sys.stdout.flush()
    # creates Compressed Sparse Row format numpy matrix
    tdm = tfidf.fit_transform(token_dict.values())
    feature_names = tfidf.get_feature_names()
    # problem_statement_#1 - summarize support_forum articles automatically
    for article_id in range(0, tdm.shape[0] - 2):
        article_text = cisco_plain_text.raw(cisco_plain_text.fileids()[article_id])
        sent_scores = []
        for sentence in nltk.sent_tokenize(article_text):
            score = 0
            sent_tokens = tokenize_and_stem(sentence)
            for token in (t for t in sent_tokens if t in feature_names):
                score += tdm[article_id, feature_names.index(token)]
            sent_scores.append((score / len(sent_tokens), sentence))
        summary_length = int(math.ceil(len(sent_scores) / 5))
        sent_scores.sort(key=lambda sent: sent[0])
        print "\n*** SUMMARY ***"
        for summary_sentence in sent_scores[:summary_length]:
            print summary_sentence[1]
        print "\n*** ORIGINAL ***"
        print article_text
    # problem_statement_#2 - automatically categorize forum posts by tags into various groups
    reduce_dimensionality_and_cluster_docs(tfidf, tdm, num_features=200)
    # problem_statement_#3 - find similar documents to a current document (that user is reading) automatically
    # eg - quora: find similar questions, find similar answers
    cosine_similarity(tdm[0:1], tdm)
    """
    output looks like this
    array([[ 1.        ,  0.22185251,  0.0215558 ,  0.03805012,  0.04796646,
             0.05069365,  0.05507056,  0.03374501,  0.03643342,  0.05308392,
             0.06002623,  0.0298806 ,  0.04177088,  0.0844478 ,  0.07951179,
             0.02822186,  0.03036787,  0.11022385,  0.0535391 ,  0.10009412,
             0.07432719,  0.03753424,  0.06596462,  0.01256566,  0.02135591,
             0.13931643,  0.03062681,  0.02595649,  0.04897851,  0.06276997,
             0.03173952,  0.01822134,  0.04043555,  0.06629454,  0.05436211,
             0.0549144 ,  0.04400169,  0.05157118,  0.05409632,  0.09541703,
             0.02473209,  0.05646599,  0.05728387,  0.04672681,  0.04519217,
             0.04126276,  0.06289187,  0.03116767,  0.04828476,  0.04745193,
             0.01404426,  0.04201325,  0.023492  ,  0.07138136,  0.03778315,
             0.03677206,  0.02553581]])
    The first document is compared to the rest; the most similar to it is itself with a score of 1,
    and the next most similar is the document with score 0.22185251.
    """
    cosine_similarities = linear_kernel(tdm[0:1], tdm).flatten()
    # mapping back to document_name space
    related_docs_indices = cosine_similarities.argsort()
    """
    document_ids
    array([23, 50, 31, 24,  2, 52, 40, 56, 27, 15, 11, 16, 26, 47, 30,  7,  8,
           55, 21, 54,  3, 32, 45, 12, 51, 36, 44, 43, 49,  4, 48, 28,  5, 37,
            9, 18, 38, 34, 35,  6, 41, 42, 10, 29, 46, 22, 33, 53, 20, 14, 13,
           39, 19, 17, 25,  1,  0])
    docs 0 and 1 are very similar; they are the following posts (the last 2 array elements above when sorted):
    https://supportforums.cisco.com/discussion/11469881/aniserver-failed-run-lms-40
    and
    supportforums.cisco.com/discussion/11469606/eos-lms-31-support-quest
    """
    cosine_similarities[related_docs_indices]
    for key, value in token_dict.iteritems():
        print key, value
    # find the actual posts which are the most similar
    tfidf.inverse_transform(tdm)[0]
    tfidf.inverse_transform(tdm)[1]
Example 6: CommentsAnalyzer
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
class CommentsAnalyzer(pmlutil.Configurable):
    def configTypes(self):
        return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int, max_df=float,
                    use_idf=int, alpha=readArray, l1_ratio=readArray, n_folds=int)

    def _loadData(self):
        logging.info("loading data")
        self.data = []
        count = 0
        for fn in os.listdir(self._datafolder):
            if not self._amount < 1 and count >= self._amount:
                break
            if fn.endswith(self._metaextension):
                mfn = self._datafolder + "/" + fn
                ddm = pml.Datum(mfn, None)
                if len(ddm.meta()['comments']) > 0:
                    self.data.append(ddm)
                    count += 1
        logging.info("loaded %d data" % count)

    def __init__(self):
        self.data = []

    def _aggregateComments(self, subset):
        allcomments = []
        for datum in subset:
            comments = []
            for comment in datum.meta()['comments']:
                comments.append(comment['text'])
            allcomments.append(" ".join(comments))
        return np.array(allcomments)

    def _buildDictionary(self, allcomments):
        print allcomments
        self.vectorizer = TfidfVectorizer(analyzer=self._analyzer, ngram_range=(self._min_ngram, self._max_ngram),
                                          min_df=self._min_df, max_df=self._max_df, norm='l2',
                                          smooth_idf=True, use_idf=bool(self._use_idf))
        self.vectorizer.fit(allcomments)

    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)
        # create representation of documents
        tfidfArray = self.vectorizer.transform(allcomments)
        # create labelling
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)
        print self.vectorizer.get_params()
        print self.vectorizer.get_feature_names()
        # training
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio, fit_intercept=True,
                                       normalize=False, precompute='auto', max_iter=1000, copy_X=True,
                                       tol=0.0001, rho=None, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray, labels)
        for i, l1_ratio in enumerate(self._l1_ratio):
            for j, alpha in enumerate(self._alpha):
                print "alpha: %f, l1_ratio: %f --> %f" % (alpha, l1_ratio, np.mean(self.elasticNet.mse_path_[i, j, :]))
        print self.vectorizer.inverse_transform(self.elasticNet.coef_)
Example 7: main
# Required import: from sklearn.feature_extraction.text import TfidfVectorizer [as alias]
# Or: from sklearn.feature_extraction.text.TfidfVectorizer import inverse_transform [as alias]
#......... beginning of the code omitted .........
            article_info.popitem()
        else:
            article_info[article['newid']]['topic'].append(topic_list)
    '''
    Extracting the dictionary of features into a .csv file
    Format:
    Article ID,Topic,Place,Label
    20057,[u'south-korea'],[],TEST
    '''
    # with open('dictionary.csv', 'wb') as f:
    #     f.write('Article ID,Topic,Place,Label')
    #     f.write('\n')
    #     for key, value in article_info.iteritems():
    #         f.write(key)
    #         f.write(',')
    #         for inner_key, inner_value in value.items():
    #             f.write(str(inner_value))
    #             f.write(',')
    #         f.write('\n')
    print 'No of valid articles = {}'.format(len(article_list))
    # Build a global list of topics used during tokenization (in the tokenize function)
    # to make sure feature words do not belong to the topic list.
    global topics_list
    topics_list = list()
    topics = getTopics(article_info, [])
    for topic_article in topics:
        if topic_article:
            for topic in topic_article:
                if topic:
                    for top in topic:
                        topics_list.append(top)
    with open('topic_labels', 'wb') as outfile:
        pickle.dump(topics, outfile, pickle.HIGHEST_PROTOCOL)
    # with open('initial_word_count.txt', 'wb') as ini:
    #     sum = 0
    #     for word in article_list:
    #         sum += len(word.split())
    #     ini.write('Total words in body tag of all the 21578 documents initially :' + str(sum))
    vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.9, tokenizer=tokenize, strip_accents='unicode', smooth_idf=True)
    feature_vector = vectorizer.fit_transform(article_list)
    feature_list = vectorizer.get_feature_names()
    with open('feature_vector', 'wb') as outfile:
        pickle.dump(feature_vector, outfile, pickle.HIGHEST_PROTOCOL)
    with open('features_list', 'wb') as f:
        pickle.dump(feature_list, f, pickle.HIGHEST_PROTOCOL)
    # with open('feature_list.csv', 'wb') as feature:
    #     for value in feature_list:
    #         feature.write(str(value) + '\n')
    counter_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_, strip_accents='unicode')
    # for the word frequency counts
    data_matrix = counter_vectorizer.fit_transform(article_list)  # data matrix
    transaction_matrix = vectorizer.inverse_transform(feature_vector)  # transaction matrix
    # terms = counter_vectorizer.get_feature_names()
    # freqs = data_matrix
    # result = dict(zip(terms, freqs))
    # print result
    # print(len(result))
    ## Un-comment from here to generate data_matrix and transaction_matrix
    # with open('data_matrix.dat', 'wb') as outfile:
    #     pickle.dump(data_matrix, outfile, pickle.HIGHEST_PROTOCOL)
    #
    # with open('transaction_matrix.dat', 'wb') as outfile:
    #     pickle.dump(transaction_matrix, outfile, pickle.HIGHEST_PROTOCOL)
    # with open('unigram_word_count.txt', 'wb') as ini:
    #     sum = len(vectorizer.get_feature_names())
    #     ini.write('Total words in body tag remaining after stemming, removing stop words and computing tf-idf counts :' + str(sum))
    bigram_vectorizer = TfidfVectorizer(min_df=0.001, tokenizer=tokenize, ngram_range=(2, 2), strip_accents='unicode', max_df=0.9, smooth_idf=True)
    bigram_feature_vector = bigram_vectorizer.fit_transform(article_list)
    indices = np.argsort(bigram_vectorizer.idf_)[::-1]
    features = bigram_vectorizer.get_feature_names()
    top_n = 20
    top_features = [features[i] for i in indices[:top_n]]
    print top_features
    # with open('top_20_bigrams.txt', 'wb') as ini:
    #     ini.write(str(top_features))
    print("Done in %0.3fs" % (time() - t0))