

Python models.TfidfModel Method Code Examples

This article collects typical usage examples of the Python method gensim.models.TfidfModel. If you are wondering what models.TfidfModel does, how to call it, or want to see it used in context, the curated code examples below may help. You can also explore further usage examples from the enclosing gensim.models module.


Fifteen code examples of models.TfidfModel are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
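
Before the examples, here is a minimal end-to-end sketch of the pattern they all share: build a Dictionary, convert each tokenized text to a bag-of-words vector, fit a TfidfModel on those vectors, and use the model to weight new documents. The toy corpus is made up for illustration.

from gensim import corpora, models

texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"],
         ["human", "graph", "survey"]]

dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(bow_corpus)  # fit IDF weights on the corpus
vec = tfidf[dictionary.doc2bow(["graph", "survey"])]  # weight a new document
print(vec)  # list of (token_id, tf-idf weight) pairs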

Example 1: test_miislita_high_level

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2) 
Developer: largelymfs | Project: topical_word_embeddings | Lines: 22 | Source: test_miislita.py

Example 2: train_TFIDF

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def train_TFIDF():

    list_cut_short_text = get_data.get_cut_PARTI_short_text()

    print("list_cut_short_text is %d" % len(list_cut_short_text))  # Python 3 print

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(list_cut_short_text)
    dictionary.save("dictionary.tfidf.dic")

    corpus = [dictionary.doc2bow(text) for text in list_cut_short_text]

    tfidf = models.TfidfModel(corpus)
    tfidf.save('./model/PARTI_tfidf_model')
Developer: yangzhiye | Project: Short-Text-Summarization | Lines: 19 | Source: train_TFIDF_model.py
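
As a hedged companion to Example 2 (my own sketch, not from the Short-Text-Summarization repo): the saved dictionary and model can be loaded back with the matching gensim load() calls and applied to a new, already-segmented text. The token list here is a placeholder.

from gensim import corpora, models

dictionary = corpora.Dictionary.load("dictionary.tfidf.dic")
tfidf = models.TfidfModel.load('./model/PARTI_tfidf_model')

tokens = ["some", "segmented", "tokens"]  # placeholder; real input is a cut short text
print(tfidf[dictionary.doc2bow(tokens)])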

Example 3: __init__

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print("Gathering sentences and removing stopwords")
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print(len(documents), "documents read")
        print(len(self.dictionary), "unique tokens")
Developer: davidsbatista | Project: Snowball | Lines: 27 | Source: VectorSpaceModel.py

Example 4: generate_dtm

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def generate_dtm(self, corpus, tfidf=False):
        """ Generate the inside document-term matrix and other peripherical information
        objects. This is run when the class is instantiated.

        :param corpus: corpus.
        :param tfidf: whether to weigh using tf-idf. (Default: False)
        :return: None
        :type corpus: list
        :type tfidf: bool
        """
        self.dictionary = Dictionary(corpus)
        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=float)  # np.float was removed in NumPy 1.24
        bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
        if tfidf:
            weighted_model = TfidfModel(bow_corpus)
            bow_corpus = weighted_model[bow_corpus]
        for docid in self.docids:
            for tokenid, count in bow_corpus[self.docid_dict[docid]]:
                self.dtm[self.docid_dict[docid], tokenid] = count 
Developer: stephenhky | Project: PyShortTextCategorization | Lines: 21 | Source: dtm.py

Example 5: train

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def train(self, prefix: str, corporas: list):
        """ Train the model.
        Save the dictionary, corpus, and model to disk.

        Arguments:
            prefix {str} -- model name prefix
            corporas {list} -- tokenized documents
        """
        # build the dictionary and the vector corpus
        dictionary = corpora.Dictionary(corporas)
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the generated dictionary

        corpus = [dictionary.doc2bow(text) for text in corporas]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # save the generated corpus
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # save the tf-idf model
Developer: jarvisqi | Project: nlp_learning | Lines: 18 | Source: docsim.py

Example 6: init_tfidf_chinese_or_pinyin

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def init_tfidf_chinese_or_pinyin(sources_path):
    """
      Build the tf-idf model.
    :param sources_path: path to the source text file
    :return: None
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    with open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') as file:  # context manager ensures the file is closed
        pickle.dump([dictionary, tfidf_model], file)
Developer: yongzhuo | Project: nlp_xiaojiang | Lines: 20 | Source: cut_td_idf.py
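
A plausible loader for the pickle written above (an assumption mirroring the [dictionary, tfidf_model] list that init_tfidf_chinese_or_pinyin dumps; the path is hypothetical):

import pickle
import jieba

with open("questions_dictionary_model.pkl", "rb") as f:  # hypothetical path
    dictionary, tfidf_model = pickle.load(f)

seg = list(jieba.cut("这是一个测试问题"))  # segment a sample question
print(tfidf_model[dictionary.doc2bow(seg)])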

Example 7: get_tfidf_weighted_keyphrases

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    
    valid_chunks = get_chunks(sentences, grammar=grammar)
                                     
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    # note: if a chunk occurs in several documents, the weight from the
    # last document seen wins in this dict comprehension
    weighted_phrases = {dictionary.get(id_): round(value, 3)
                        for doc in corpus_tfidf
                        for id_, value in doc}
                            
    weighted_phrases = sorted(weighted_phrases.items(), 
                              key=itemgetter(1), reverse=True)
    
    return weighted_phrases[:top_n] 
Developer: dipanjanS | Project: text-analytics-with-python | Lines: 22 | Source: keyphrase_extraction.py
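
get_chunks() is not shown in the snippet above. A plausible sketch under that assumption (not necessarily the book's actual code) extracts noun-phrase chunks per sentence with nltk's RegexpParser, using the same grammar, and returns one list of chunk strings per sentence:

import nltk

def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}'):
    chunker = nltk.RegexpParser(grammar)
    all_chunks = []
    for sentence in sentences:
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        tree = chunker.parse(tagged)
        # keep each NP subtree as a single space-joined chunk string
        chunks = [' '.join(word for word, tag in subtree.leaves())
                  for subtree in tree.subtrees()
                  if subtree.label() == 'NP']
        all_chunks.append(chunks)
    return all_chunks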

Example 8: buildCorpus

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def buildCorpus(self):
        """
        Build the corpus from the documents:
            1. Remove words that only appeared once.
            2. Create the Dictionary object.
            3. Convert the documents to simple bag-of-words representation.
            4. Convert the bag-of-words vectors to tf-idf.
        """
        # Remove words that only appear once.
        self.documents = [[token for token in doc if self.frequency[token] > 1]
                          for doc in self.documents]
        
        # Build a dictionary from the text.
        self.dictionary = corpora.Dictionary(self.documents)
        
        # Map the documents to vectors.
        corpus = [self.dictionary.doc2bow(text) for text in self.documents]

        # Delete the tokenized representation of the documents--no need to
        # carry this around!
        del self.documents[:]

        # Convert the simple bag-of-words vectors to a tf-idf representation.        
        self.tfidf_model = TfidfModel(corpus)
        self.corpus_tfidf = self.tfidf_model[corpus] 
Developer: chrisjmccormick | Project: simsearch | Lines: 27 | Source: corpusbuilder.py

Example 9: summarize

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def summarize(self, text):
        self.sentences = self.factory.text2sentences(text)
        self.num_sentences = len(self.sentences)
        self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count, self.no_above_word_portion, self.max_dictionary_size)
        self.model = TfidfModel(self.corpus.bows, id2word=self.corpus.dictionary, normalize=True)
        self.tfidfs = self.model[self.corpus.bows]
        self._inject_tfidfs()
        self._build_matrix()
        self._clustering()
        if self.compactify:
            self._compactify()
        self.graphs = []
        for i in range(self.num_clusters):
            graph = self.sentences2graph(self.clusters[i])
            pagerank = networkx.pagerank(graph, weight='weight')
            self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True)
            self.graphs.append(graph) 
Developer: theeluwin | Project: lexrankr | Lines: 19 | Source: lexrankr.py

Example 10: _train_model

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def _train_model(self, min_freq=1):
        # Create tfidf model.
        self.dct = corpora.Dictionary(self.contexts)
        # Filter low frequency words from dictionary.
        low_freq_ids = [id_ for id_, freq in
                        self.dct.dfs.items() if freq <= min_freq]
        self.dct.filter_tokens(low_freq_ids)
        self.dct.compactify()
        # Build tfidf model.
        self.corpus = [self.dct.doc2bow(s) for s in self.contexts]
        self.tfidf_model = models.TfidfModel(self.corpus) 
Developer: shibing624 | Project: dialogbot | Lines: 13 | Source: tfidfmodel.py

Example 11: prepare_gensim_corpus

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def prepare_gensim_corpus(corpus_name, corpus, output_path, min_freq=5):
	if not output_path.endswith('/'): output_path = output_path + '/'
	check_dir(output_path) # if directory does not exist, then create
	
	logging.info( 'building gensim corpus and dictionary for %s corpus', corpus_name )
	logging.info( 'loading corpus' )
	texts = [[word for word in process_text(document, removePunct=True, removeSW=True, removeNum=True)] for document in corpus]
	logging.info( 'tokenizing' )
	all_tokens = [item for sublist in texts for item in sublist]
	logging.info( 'mark tokens which have frequency less than %d', min_freq )
	tokens_once = set([k for k, v in collections.Counter(all_tokens).items() if v < min_freq])  # .iteritems() is Python 2; use .items()
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'filter low frequency tokens' )
	texts = [[word for word in text if word not in tokens_once] for text in texts]
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'building dictionary' )
	dictionary = corpora.Dictionary(texts)
	logging.info( 'saving dictionary' )
	dictFile = output_path + corpus_name + '.dict'
	dictionary.save(dictFile) 
	logging.info( 'building corpus in  mm format' )
	corpus = [dictionary.doc2bow(text) for text in texts]
	logging.info( 'saving corpus' )
	gensim_corpus_file = output_path + corpus_name + '.mm'
	corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
	logging.info( 'computing tfidf' )
	tfidf = models.TfidfModel(corpus) # tfidf model 
	corpus_tfidf = tfidf[corpus] # tfidf corpus 
	logging.info( 'saving tfidf corpus' )
	corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
	corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
	logging.info( 'gensim corpus is ready' )
################################################################################## 
Developer: motazsaad | Project: comparable-text-miner | Lines: 35 | Source: textpro.py
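
A hedged follow-up (not part of textpro.py): the serialized dictionary and tf-idf corpus can be read back for downstream use. The paths below assume corpus_name='wiki' and output_path='out/'.

from gensim import corpora

dictionary = corpora.Dictionary.load('out/wiki.dict')
corpus_tfidf = corpora.MmCorpus('out/wiki.tfidf.mm')
print(len(corpus_tfidf), 'documents,', len(dictionary), 'unique tokens')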

Example 12: _build_idf

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def _build_idf(dictionary: gensim.corpora.Dictionary) -> np.ndarray:
        model = TfidfModel(dictionary=dictionary)
        idf = np.zeros(len(dictionary.token2id))
        for idx, value in model.idfs.items():
            idf[idx] = value
        return idf 
Developer: m3dev | Project: redshells | Lines: 8 | Source: scdv.py
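
A small usage sketch with toy data (mine, not from the redshells project): TfidfModel can be fit from a Dictionary alone, because the dictionary already stores per-token document frequencies, which is exactly what _build_idf relies on.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import numpy as np

dictionary = Dictionary([["apple", "banana"], ["apple"], ["banana", "cherry"]])
model = TfidfModel(dictionary=dictionary)  # fit IDF from document frequencies only

idf = np.zeros(len(dictionary.token2id))
for token_id, value in model.idfs.items():
    idf[token_id] = value
for token, token_id in dictionary.token2id.items():
    print(token, idf[token_id])  # rarer tokens get higher IDF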

Example 13: train

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def train(self):
        corpus = self.preprocess()
        dictionary = Dictionary(corpus)
        doc2bow = [dictionary.doc2bow(text) for text in corpus]

        tf_idf = TfidfModel(doc2bow)
        corpus_tf_idf = tf_idf[doc2bow]

        model = LdaModel(corpus_tf_idf, num_topics=2)
        return dictionary, tf_idf, model 
Developer: msgi | Project: nlp-journey | Lines: 12 | Source: lda_topic.py
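
A hedged usage note (the instance name is hypothetical): once train() returns, the topics of the fitted LdaModel can be inspected with show_topics(). Since no id2word mapping was passed to LdaModel, the topics print token ids rather than words.

dictionary, tf_idf, model = topic_trainer.train()  # hypothetical instance
for topic_id, topic in model.show_topics(num_topics=2, num_words=5):
    print(topic_id, topic)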

Example 14: xform_tfidf

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def xform_tfidf(self):
        self.transformation = models.TfidfModel(self.get_vectors()) 
Developer: christabor | Project: MoAL | Lines: 4 | Source: tm_gensim.py

Example 15: __init__

# Required import: from gensim import models [as alias]
# Or: from gensim.models import TfidfModel [as alias]
def __init__(self, data_list):
        data_list = self._check(data_list)
        self.dictionary = corpora.Dictionary(data_list)
        corpus = [self.dictionary.doc2bow(doc) for doc in data_list]
        self.tfidf = models.TfidfModel(corpus)  # build the tf-idf model over the documents
Developer: zhufz | Project: nlp_research | Lines: 7 | Source: recall.py


Note: the gensim.models.TfidfModel examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before redistributing or reusing the code. Do not repost without permission.