This article collects typical usage examples of gensim.models.TfidfModel in Python. If you are wondering what models.TfidfModel does or how exactly to use it, the curated code samples below may help. You can also explore the rest of the containing module, gensim.models, for related usage.
Below are 15 code examples of models.TfidfModel, sorted by popularity by default.
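Before the examples, the recurring pattern is worth seeing in isolation: build a Dictionary from tokenized documents, convert them to bag-of-words vectors, fit TfidfModel on those vectors, then apply the fitted model to weight any document or query. A minimal, self-contained sketch (the two toy documents are invented for illustration):

from gensim import corpora, models

# toy tokenized documents (invented for illustration)
docs = [["human", "computer", "interaction"],
        ["graph", "minors", "survey"]]

dictionary = corpora.Dictionary(docs)               # token -> integer id mapping
bow_corpus = [dictionary.doc2bow(d) for d in docs]  # sparse (id, count) vectors

tfidf = models.TfidfModel(bow_corpus)               # fit idf weights on the corpus
vec = tfidf[dictionary.doc2bow(["graph", "survey"])]  # weight an unseen query
print(vec)  # [(id, tfidf_weight), ...]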
Example 1: test_miislita_high_level
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def test_miislita_high_level(self):
    # construct corpus from file
    miislita = CorpusMiislita(datapath('miIslita.cor'))

    # initialize tfidf transformation and similarity index
    tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
    index = similarities.SparseMatrixSimilarity(tfidf[miislita],
                                                num_features=len(miislita.dictionary))

    # compare to query
    query = 'latent semantic indexing'
    vec_bow = miislita.dictionary.doc2bow(query.lower().split())
    vec_tfidf = tfidf[vec_bow]

    # perform a similarity query against the corpus
    sims_tfidf = index[vec_tfidf]

    # for the expected results see the article
    expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
    for i, value in enumerate(expected):
        self.assertAlmostEqual(sims_tfidf[i], value, 2)
Example 2: train_TFIDF
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def train_TFIDF():
    list_cut_short_text = get_data.get_cut_PARTI_short_text()
    print("list_cut_short_text has %d entries" % len(list_cut_short_text))
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # build the dictionary and the bag-of-words corpus
    dictionary = corpora.Dictionary(list_cut_short_text)
    dictionary.save("dictionary.tfidf.dic")
    corpus = [dictionary.doc2bow(text) for text in list_cut_short_text]
    # train and persist the tf-idf model
    tfidf = models.TfidfModel(corpus)
    tfidf.save('./model/PARTI_tfidf_model')
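The persisted artifacts can be loaded back with the matching load methods. A hedged sketch, reusing the paths written by train_TFIDF above (the query tokens are invented):

from gensim import corpora, models

dictionary = corpora.Dictionary.load("dictionary.tfidf.dic")
tfidf = models.TfidfModel.load('./model/PARTI_tfidf_model')

# weight a new, already-segmented short text (tokens invented for illustration)
bow = dictionary.doc2bow(["new", "short", "text"])
print(tfidf[bow])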
Example 3: __init__
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)
        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")
    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens")
Example 4: generate_dtm
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def generate_dtm(self, corpus, tfidf=False):
    """ Generate the internal document-term matrix and other peripheral
    information objects. This is run when the class is instantiated.

    :param corpus: corpus.
    :param tfidf: whether to weight terms using tf-idf. (Default: False)
    :return: None
    :type corpus: list
    :type tfidf: bool
    """
    self.dictionary = Dictionary(corpus)
    self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=float)
    bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
    if tfidf:
        weighted_model = TfidfModel(bow_corpus)
        bow_corpus = weighted_model[bow_corpus]
    for docid in self.docids:
        for tokenid, count in bow_corpus[self.docid_dict[docid]]:
            self.dtm[self.docid_dict[docid], tokenid] = count
Example 5: train
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def train(self, prefix: str, corporas: list):
    """ Train the model and persist the dictionary, corpus and tf-idf model to disk.

    Arguments:
        prefix {str} -- model name prefix
        corporas {list} -- tokenized documents
    """
    # build the dictionary and the vector corpus
    dictionary = corpora.Dictionary(corporas)
    dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the generated dictionary
    corpus = [dictionary.doc2bow(text) for text in corporas]
    corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # save the generated corpus
    tfidf_model = models.TfidfModel(corpus)
    tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # save the tf-idf model
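Everything this method saves can be restored with the corresponding load calls; MmCorpus streams the serialized corpus from disk rather than holding it in memory. A sketch, assuming a hypothetical prefix 'demo' and the paths used above:

from gensim import corpora, models

prefix = 'demo'  # hypothetical model name prefix
dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))
corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # streamed from disk
tfidf_model = models.TfidfModel.load('./models/{}_tfidf_model.model'.format(prefix))

corpus_tfidf = tfidf_model[corpus]  # lazily tf-idf-weighted corpus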
Example 6: init_tfidf_chinese_or_pinyin
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def init_tfidf_chinese_or_pinyin(sources_path):
    """
    Build the tf-idf model.
    :param sources_path: path to the source text file
    :return: None
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    # segment each line with jieba
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    # pickle the dictionary and the model together
    with open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') as f:
        pickle.dump([dictionary, tfidf_model], f)
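Because the dictionary and the model are pickled together as a two-element list, a consumer can unpickle both in one step. A sketch, assuming a hypothetical source file questions.csv was processed by the function above:

import pickle
import jieba

with open("questions_dictionary_model.pkl", 'rb') as f:  # hypothetical path
    dictionary, tfidf_model = pickle.load(f)

tokens = list(jieba.cut("一个新的问题"))  # segment a new sentence with jieba
print(tfidf_model[dictionary.doc2bow(tokens)])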
Example 7: get_tfidf_weighted_keyphrases
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def get_tfidf_weighted_keyphrases(sentences,
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    # extract noun-phrase chunks and weight them by tf-idf
    valid_chunks = get_chunks(sentences, grammar=grammar)
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # map token ids back to phrases, keeping the rounded tf-idf weight
    weighted_phrases = {dictionary.get(id): round(value, 3)
                        for doc in corpus_tfidf
                        for id, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(),
                              key=itemgetter(1), reverse=True)
    return weighted_phrases[:top_n]
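The helper get_chunks is not shown in this snippet; assuming it POS-tags the input sentences and returns one list of noun-phrase chunk strings per sentence, the function can be called directly on raw text. A hedged usage sketch (the sentences and the sample output are invented):

sentences = ["The quick brown fox jumps over the lazy dog.",
             "The dog sleeps in the warm sun."]

# e.g. [('lazy dog', 0.447), ('quick brown fox', 0.447), ...]
print(get_tfidf_weighted_keyphrases(sentences, top_n=5))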
Example 8: buildCorpus
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def buildCorpus(self):
    """
    Build the corpus from the documents:
      1. Remove words that only appeared once.
      2. Create the Dictionary object.
      3. Convert the documents to a simple bag-of-words representation.
      4. Convert the bag-of-words vectors to tf-idf.
    """
    # Remove words that only appear once.
    self.documents = [[token for token in doc if self.frequency[token] > 1]
                      for doc in self.documents]

    # Build a dictionary from the text.
    self.dictionary = corpora.Dictionary(self.documents)

    # Map the documents to vectors.
    corpus = [self.dictionary.doc2bow(text) for text in self.documents]

    # Delete the tokenized representation of the documents -- no need to
    # carry this around!
    del self.documents[:]

    # Convert the simple bag-of-words vectors to a tf-idf representation.
    self.tfidf_model = TfidfModel(corpus)
    self.corpus_tfidf = self.tfidf_model[corpus]
Example 9: summarize
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def summarize(self, text):
    # split the text into sentences and build a sentence-level corpus
    self.sentences = self.factory.text2sentences(text)
    self.num_sentences = len(self.sentences)
    self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count,
                                 self.no_above_word_portion, self.max_dictionary_size)
    # weight each sentence's bag-of-words with a normalized tf-idf model
    self.model = TfidfModel(self.corpus.bows, id2word=self.corpus.dictionary, normalize=True)
    self.tfidfs = self.model[self.corpus.bows]
    self._inject_tfidfs()
    self._build_matrix()
    self._clustering()
    if self.compactify:
        self._compactify()
    # rank the sentences of each cluster with PageRank
    self.graphs = []
    for i in range(self.num_clusters):
        graph = self.sentences2graph(self.clusters[i])
        pagerank = networkx.pagerank(graph, weight='weight')
        self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True)
        self.graphs.append(graph)
Example 10: _train_model
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def _train_model(self, min_freq=1):
    # Create the dictionary.
    self.dct = corpora.Dictionary(self.contexts)
    # Filter low-frequency words from the dictionary.
    low_freq_ids = [id_ for id_, freq in
                    self.dct.dfs.items() if freq <= min_freq]
    self.dct.filter_tokens(low_freq_ids)
    self.dct.compactify()
    # Build the tf-idf model.
    self.corpus = [self.dct.doc2bow(s) for s in self.contexts]
    self.tfidf_model = models.TfidfModel(self.corpus)
Example 11: prepare_gensim_corpus
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def prepare_gensim_corpus(corpus_name, corpus, output_path, min_freq=5):
    if not output_path.endswith('/'):
        output_path = output_path + '/'
    check_dir(output_path)  # if the directory does not exist, create it
    logging.info('building gensim corpus and dictionary for %s corpus', corpus_name)
    logging.info('loading corpus')
    texts = [[word for word in process_text(document, removePunct=True, removeSW=True, removeNum=True)]
             for document in corpus]
    logging.info('tokenizing')
    all_tokens = [item for sublist in texts for item in sublist]
    logging.info('mark tokens which have frequency less than %d', min_freq)
    tokens_once = set(k for k, v in collections.Counter(all_tokens).items() if v < min_freq)
    logging.info('|D|=%d', len(texts))
    logging.info('filter low frequency tokens')
    texts = [[word for word in text if word not in tokens_once] for text in texts]
    logging.info('|D|=%d', len(texts))
    logging.info('building dictionary')
    dictionary = corpora.Dictionary(texts)
    logging.info('saving dictionary')
    dictFile = output_path + corpus_name + '.dict'
    dictionary.save(dictFile)
    logging.info('building corpus in mm format')
    corpus = [dictionary.doc2bow(text) for text in texts]
    logging.info('saving corpus')
    gensim_corpus_file = output_path + corpus_name + '.mm'
    corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
    logging.info('computing tfidf')
    tfidf = models.TfidfModel(corpus)  # tf-idf model
    corpus_tfidf = tfidf[corpus]  # tf-idf corpus
    logging.info('saving tfidf corpus')
    corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
    corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
    logging.info('gensim corpus is ready')
Example 12: _build_idf
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def _build_idf(dictionary: gensim.corpora.Dictionary) -> np.ndarray:
    # fit a tf-idf model directly from the dictionary's document frequencies
    model = TfidfModel(dictionary=dictionary)
    # copy the idf value of every token id into a dense array
    idf = np.zeros(len(dictionary.token2id))
    for idx, value in model.idfs.items():
        idf[idx] = value
    return idf
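Passing dictionary= lets gensim compute the idf values directly from the dictionary's document frequencies, without re-scanning a corpus. A small usage sketch of the function above, with toy documents:

from gensim.corpora import Dictionary

docs = [["apple", "banana"], ["apple", "cherry"], ["banana"]]  # toy data
dct = Dictionary(docs)

idf = _build_idf(dct)  # dense idf vector indexed by token id
print(idf[dct.token2id["apple"]])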
Example 13: train
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def train(self):
    corpus = self.preprocess()
    dictionary = Dictionary(corpus)
    doc2bow = [dictionary.doc2bow(text) for text in corpus]
    # weight the bag-of-words corpus with tf-idf before topic modeling
    tf_idf = TfidfModel(doc2bow)
    corpus_tf_idf = tf_idf[doc2bow]
    model = LdaModel(corpus_tf_idf, num_topics=2)
    return dictionary, tf_idf, model
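The returned triple is all that is needed to infer topics for unseen text: convert tokens with the dictionary, weight them with the tf-idf model, and feed the result to the LDA model. A hedged sketch (the instance name and tokens are invented):

dictionary, tf_idf, model = trainer.train()  # hypothetical instance of the class above

new_doc = ["some", "unseen", "tokens"]  # invented example tokens
bow = dictionary.doc2bow(new_doc)
print(model[tf_idf[bow]])  # [(topic_id, probability), ...]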
Example 14: xform_tfidf
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def xform_tfidf(self):
    # fit a tf-idf transformation on the instance's bag-of-words vectors
    self.transformation = models.TfidfModel(self.get_vectors())
Example 15: __init__
# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def __init__(self, data_list):
    data_list = self._check(data_list)
    self.dictionary = corpora.Dictionary(data_list)
    corpus = [self.dictionary.doc2bow(doc) for doc in data_list]
    self.tfidf = models.TfidfModel(corpus)  # build a tf-idf model over the documents
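A typical next step after this constructor is a similarity index over the tf-idf-weighted corpus. A hedged sketch continuing from the class above (the class name, instance, and query tokens are invented):

from gensim import similarities

engine = TfidfSearcher(data_list)  # hypothetical name for the class above
corpus = [engine.dictionary.doc2bow(doc) for doc in data_list]
index = similarities.MatrixSimilarity(engine.tfidf[corpus],
                                      num_features=len(engine.dictionary))

query_bow = engine.dictionary.doc2bow(["example", "query"])  # invented tokens
sims = index[engine.tfidf[query_bow]]  # cosine similarity to every document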