當前位置: 首頁>>代碼示例>>Python>>正文


Python models.TfidfModel方法代碼示例

本文整理匯總了Python中gensim.models.TfidfModel方法的典型用法代碼示例。如果您正苦於以下問題:Python models.TfidfModel方法的具體用法?Python models.TfidfModel怎麼用?Python models.TfidfModel使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在gensim.models的用法示例。


在下文中一共展示了models.TfidfModel方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: test_miislita_high_level

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def test_miislita_high_level(self):
    """Check tf-idf similarity scores for a query against the Miislita corpus.

    Builds an un-normalized tf-idf model over the corpus loaded from
    ``miIslita.cor``, indexes the transformed corpus, and compares the
    similarity of the query to each document with reference values
    (two decimal places).
    """
    # Load the corpus from its data file.
    corpus = CorpusMiislita(datapath('miIslita.cor'))

    # Fit the tf-idf transformation and build the similarity index on top of it.
    tfidf = models.TfidfModel(corpus, corpus.dictionary, normalize=False)
    weighted_corpus = tfidf[corpus]
    index = similarities.SparseMatrixSimilarity(
        weighted_corpus, num_features=len(corpus.dictionary))

    # Turn the query into a tf-idf vector in the corpus vocabulary.
    query = 'latent semantic indexing'
    query_tokens = query.lower().split()
    query_tfidf = tfidf[corpus.dictionary.doc2bow(query_tokens)]

    # Similarity of the query to every document in the corpus.
    scores = index[query_tfidf]

    # Reference values; for the expected results see the article.
    expected_scores = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
    for position, wanted in enumerate(expected_scores):
        self.assertAlmostEqual(scores[position], wanted, 2)
開發者ID:largelymfs,項目名稱:topical_word_embeddings,代碼行數:22,代碼來源:test_miislita.py

示例2: train_TFIDF

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def train_TFIDF():
    """Train a tf-idf model on the PARTI short texts and save it to disk.

    Reads the texts from ``get_data.get_cut_PARTI_short_text()``, builds a
    gensim dictionary and bag-of-words corpus from them, and writes:

    * the dictionary to ``dictionary.tfidf.dic``
    * the tf-idf model to ``./model/PARTI_tfidf_model``

    :return: None
    """
    list_cut_short_text = get_data.get_cut_PARTI_short_text()

    # Parenthesized single-argument print runs on both Python 2 and Python 3;
    # the original print statement was a SyntaxError on Python 3.
    print("list_cut_short_text is %d" % (len(list_cut_short_text)))

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(list_cut_short_text)
    dictionary.save("dictionary.tfidf.dic")

    corpus = [dictionary.doc2bow(text) for text in list_cut_short_text]

    tfidf = models.TfidfModel(corpus)
    tfidf.save('./model/PARTI_tfidf_model')
開發者ID:yangzhiye,項目名稱:Short-Text-Summarization,代碼行數:19,代碼來源:train_TFIDF_model.py

示例3: __init__

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def __init__(self, sentences_file, stopwords):
    """Build a dictionary, bag-of-words corpus and tf-idf model from a sentence file.

    Each line of ``sentences_file`` (read as UTF-8) has ``<TAG>...</TAG>``
    spans removed, is lower-cased, tokenized with ``nltk.word_tokenize`` and
    filtered against ``stopwords``. The results are stored in
    ``self.dictionary``, ``self.corpus`` and ``self.tf_idf_model``.

    :param sentences_file: path of the text file, one sentence per line.
    :param stopwords: container of tokens to drop (tested with ``in``).
    """
    self.dictionary = None
    self.corpus = None
    documents = list()
    # The context manager closes the file even when tokenization raises,
    # which the previous open()/close() pair did not guarantee.
    with codecs.open(sentences_file, encoding='utf-8') as f_sentences:
        print("Gathering sentences and removing stopwords")
        for count, line in enumerate(f_sentences, 1):
            # strip tagged spans such as <ORG>...</ORG>
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
            documents.append(document)
            # progress marker every 10000 lines
            if count % 10000 == 0:
                sys.stdout.write(".")

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents red")
    print(len(self.dictionary), " unique tokens")
開發者ID:davidsbatista,項目名稱:Snowball,代碼行數:27,代碼來源:VectorSpaceModel.py

示例4: generate_dtm

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def generate_dtm(self, corpus, tfidf=False):
    """ Generate the inside document-term matrix and other peripheral information
    objects. This is run when the class is instantiated.

    Sets ``self.dictionary`` (built from ``corpus``) and ``self.dtm``, a
    ``dok_matrix`` with one row per document and one column per dictionary
    token, filled with raw counts or, if ``tfidf`` is true, tf-idf weights.
    Rows are addressed through ``self.docids`` / ``self.docid_dict``, which
    must already be set on the instance.

    :param corpus: corpus.
    :param tfidf: whether to weigh using tf-idf. (Default: False)
    :return: None
    :type corpus: list
    :type tfidf: bool
    """
    self.dictionary = Dictionary(corpus)
    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
    self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=float)
    bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
    if tfidf:
        weighted_model = TfidfModel(bow_corpus)
        bow_corpus = weighted_model[bow_corpus]
    for docid in self.docids:
        row = self.docid_dict[docid]
        for tokenid, count in bow_corpus[row]:
            self.dtm[row, tokenid] = count
開發者ID:stephenhky,項目名稱:PyShortTextCategorization,代碼行數:21,代碼來源:dtm.py

示例5: train

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def train(self, prefix: str, corporas: list):
    """ Train the model.
    Saves the dictionary, the corpus and the tf-idf model to disk under ``./models/``.

    Arguments:
        prefix {str} -- model name prefix used in the output file names
        corporas {list} -- tokenized documents
    """
    # Build the dictionary and persist it.
    vocabulary = corpora.Dictionary(corporas)
    dict_path = './models/{}_dict.dic'.format(prefix)
    vocabulary.save(dict_path)

    # Convert every document to a bag-of-words vector and persist the corpus.
    bow_docs = []
    for tokens in corporas:
        bow_docs.append(vocabulary.doc2bow(tokens))
    corpus_path = './models/{}_corpuse.mm'.format(prefix)
    corpora.MmCorpus.serialize(corpus_path, bow_docs)

    # Fit the tf-idf model and persist it.
    model = models.TfidfModel(bow_docs)
    model_path = "./models/{}_tfidf_model.model".format(prefix)
    model.save(model_path)
開發者ID:jarvisqi,項目名稱:nlp_learning,代碼行數:18,代碼來源:docsim.py

示例6: init_tfidf_chinese_or_pinyin

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def init_tfidf_chinese_or_pinyin(sources_path):
    """
      Build a tf-idf model from the texts in ``sources_path`` and pickle it.

      Each item returned by ``txtRead(sources_path)`` is stripped and segmented
      with ``jieba.cut``; the resulting ``[dictionary, tfidf_model]`` pair is
      pickled next to the source file, with ``.csv`` replaced by
      ``_dictionary_model.pkl`` in the path.
    :param sources_path: path of the source file (expected to contain ``.csv``).
    :return: None
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    # NOTE(review): if sources_path contains no ".csv" the replace is a no-op and
    # the dump below overwrites the source file itself -- confirm callers always
    # pass a .csv path.
    model_path = sources_path.replace(".csv", "_dictionary_model.pkl")
    # The context manager flushes and closes the file; the original left the
    # handle open, so the pickle was only written out at garbage collection.
    with open(model_path, 'wb') as model_file:
        pickle.dump([dictionary, tfidf_model], model_file)
開發者ID:yongzhuo,項目名稱:nlp_xiaojiang,代碼行數:20,代碼來源:cut_td_idf.py

示例7: get_tfidf_weighted_keyphrases

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    """Return the ``top_n`` chunks with the highest tf-idf weight.

    Chunks are obtained from ``get_chunks(sentences, grammar=grammar)``; each
    chunk list is treated as one document. When the same chunk occurs in
    several documents, the weight from the last document wins. Weights are
    rounded to three decimals.

    :param sentences: input passed through to ``get_chunks``.
    :param grammar: chunk grammar passed through to ``get_chunks``.
    :param top_n: maximum number of ``(phrase, weight)`` pairs returned.
    :return: list of ``(phrase, weight)`` tuples sorted by descending weight.
    """
    chunks = get_chunks(sentences, grammar=grammar)

    vocabulary = corpora.Dictionary(chunks)
    bows = [vocabulary.doc2bow(chunk) for chunk in chunks]

    model = models.TfidfModel(bows)

    # Later documents overwrite the weight recorded for a repeated phrase.
    phrase_weights = {}
    for weighted_doc in model[bows]:
        for token_id, weight in weighted_doc:
            phrase_weights[vocabulary.get(token_id)] = round(weight, 3)

    ranked = sorted(phrase_weights.items(), key=itemgetter(1), reverse=True)
    return ranked[:top_n]
開發者ID:dipanjanS,項目名稱:text-analytics-with-python,代碼行數:22,代碼來源:keyphrase_extraction.py

示例8: buildCorpus

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def buildCorpus(self):
    """
    Turn ``self.documents`` into a tf-idf corpus.

    Steps:
        1. Drop tokens whose ``self.frequency`` count is not greater than 1.
        2. Build ``self.dictionary`` from the remaining tokens.
        3. Convert each document to a bag-of-words vector.
        4. Fit ``self.tfidf_model`` and store the transformed corpus in
           ``self.corpus_tfidf``.

    ``self.documents`` is emptied once the vectors have been built.
    """
    # Step 1: keep only tokens that appear more than once.
    filtered_docs = []
    for doc in self.documents:
        filtered_docs.append(
            [token for token in doc if self.frequency[token] > 1])
    self.documents = filtered_docs

    # Step 2: dictionary over the filtered text.
    self.dictionary = corpora.Dictionary(self.documents)

    # Step 3: bag-of-words vector per document.
    bow_vectors = []
    for tokens in self.documents:
        bow_vectors.append(self.dictionary.doc2bow(tokens))

    # The tokenized documents are no longer needed; clear the list in place.
    del self.documents[:]

    # Step 4: tf-idf model and the tf-idf view of the corpus.
    self.tfidf_model = TfidfModel(bow_vectors)
    self.corpus_tfidf = self.tfidf_model[bow_vectors]
開發者ID:chrisjmccormick,項目名稱:simsearch,代碼行數:27,代碼來源:corpusbuilder.py

示例9: summarize

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def summarize(self, text):
        """Run the summarization pipeline on ``text`` and store the results on ``self``.

        The steps run in a fixed order because each one reads state set by the
        previous ones: sentence splitting, corpus construction, tf-idf
        weighting, matrix building, clustering, optional compactification and
        finally a PageRank ordering inside every cluster.

        After the call, ``self.clusters[i]`` holds the nodes of cluster ``i``
        sorted by descending PageRank score and ``self.graphs[i]`` holds the
        graph built for that cluster.

        :param text: input handed to ``self.factory.text2sentences``.
        :return: None
        """
        self.sentences = self.factory.text2sentences(text)
        self.num_sentences = len(self.sentences)
        self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count, self.no_above_word_portion, self.max_dictionary_size)
        # Normalized tf-idf model over the sentence bag-of-words vectors.
        self.model = TfidfModel(self.corpus.bows, id2word=self.corpus.dictionary, normalize=True)
        self.tfidfs = self.model[self.corpus.bows]
        self._inject_tfidfs()
        self._build_matrix()
        # NOTE(review): self.clusters / self.num_clusters are presumably set by
        # _clustering(); they are not assigned in this method -- confirm.
        self._clustering()
        if self.compactify:
            self._compactify()
        self.graphs = []
        for i in range(self.num_clusters):
            graph = self.sentences2graph(self.clusters[i])
            pagerank = networkx.pagerank(graph, weight='weight')
            # Replace the cluster with its nodes ordered by descending PageRank score.
            self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True)
            self.graphs.append(graph) 
開發者ID:theeluwin,項目名稱:lexrankr,代碼行數:19,代碼來源:lexrankr.py

示例10: _train_model

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def _train_model(self, min_freq=1):
    """Build the dictionary, bag-of-words corpus and tf-idf model from ``self.contexts``.

    Tokens whose ``dfs`` count in the dictionary is less than or equal to
    ``min_freq`` are removed before the corpus is vectorized.

    :param min_freq: tokens with a ``dfs`` count at or below this value are dropped.
    """
    self.dct = corpora.Dictionary(self.contexts)

    # Collect the ids of low-frequency tokens, then remove them in one go.
    rare_ids = []
    for token_id, doc_freq in self.dct.dfs.items():
        if doc_freq <= min_freq:
            rare_ids.append(token_id)
    self.dct.filter_tokens(rare_ids)
    self.dct.compactify()

    # Vectorize the contexts with the filtered dictionary and fit tf-idf.
    self.corpus = [self.dct.doc2bow(context) for context in self.contexts]
    self.tfidf_model = models.TfidfModel(self.corpus)
開發者ID:shibing624,項目名稱:dialogbot,代碼行數:13,代碼來源:tfidfmodel.py

示例11: prepare_gensim_corpus

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def prepare_gensim_corpus(corpus_name, corpus, output_path, min_freq=5):
    """Build and save the gensim dictionary, bag-of-words corpus and tf-idf corpus.

    Every document is tokenized with ``process_text`` (punctuation, stop
    words and numbers removed), tokens occurring fewer than ``min_freq``
    times over the whole corpus are dropped, and three files are written to
    ``output_path``:

    * ``<corpus_name>.dict``     -- the dictionary
    * ``<corpus_name>.mm``       -- the bag-of-words corpus (Matrix Market)
    * ``<corpus_name>.tfidf.mm`` -- the tf-idf weighted corpus (Matrix Market)

    :param corpus_name: base name used for the output files and log messages.
    :param corpus: iterable of raw documents passed to ``process_text``.
    :param output_path: output directory; a trailing ``/`` is added if missing.
    :param min_freq: minimum total occurrence count for a token to be kept.
    :return: None
    """
    if not output_path.endswith('/'):
        output_path = output_path + '/'
    check_dir(output_path)  # if directory does not exist, then create

    logging.info('building gensim corpus and dictionary for %s corpus', corpus_name)
    logging.info('loading corpus')
    texts = [list(process_text(document, removePunct=True, removeSW=True, removeNum=True)) for document in corpus]
    logging.info('tokenizing')
    all_tokens = [item for sublist in texts for item in sublist]
    logging.info('mark tokens which have frequency less than %d', min_freq)
    # ``.items()`` works on Python 2 and 3; ``.iteritems()`` does not exist on
    # Python 3 and raised AttributeError there.
    tokens_once = {k for k, v in collections.Counter(all_tokens).items() if v < min_freq}
    logging.info('|D|=%d', len(texts))
    logging.info('filter low frequency tokens')
    texts = [[word for word in text if word not in tokens_once] for text in texts]
    logging.info('|D|=%d', len(texts))
    logging.info('building dictionary')
    dictionary = corpora.Dictionary(texts)
    logging.info('saving dictionary')
    dictFile = output_path + corpus_name + '.dict'
    dictionary.save(dictFile)
    logging.info('building corpus in  mm format')
    corpus = [dictionary.doc2bow(text) for text in texts]
    logging.info('saving corpus')
    gensim_corpus_file = output_path + corpus_name + '.mm'
    corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
    logging.info('computing tfidf')
    tfidf = models.TfidfModel(corpus)  # tfidf model
    corpus_tfidf = tfidf[corpus]  # tfidf corpus
    logging.info('saving tfidf corpus')
    corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
    corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
    logging.info('gensim corpus is ready')
################################################################################## 
開發者ID:motazsaad,項目名稱:comparable-text-miner,代碼行數:35,代碼來源:textpro.py

示例12: _build_idf

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def _build_idf(dictionary: gensim.corpora.Dictionary) -> np.ndarray:
    """Return the idf weights of ``dictionary`` as a dense vector indexed by token id.

    The vector has ``len(dictionary.token2id)`` entries; positions with no
    entry in the fitted model's ``idfs`` mapping stay at zero.
    """
    tfidf = TfidfModel(dictionary=dictionary)
    weights = np.zeros(len(dictionary.token2id))
    for token_id in tfidf.idfs:
        weights[token_id] = tfidf.idfs[token_id]
    return weights
開發者ID:m3dev,項目名稱:redshells,代碼行數:8,代碼來源:scdv.py

示例13: train

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def train(self):
    """Fit a tf-idf model and a two-topic LDA model on the preprocessed corpus.

    :return: tuple ``(dictionary, tf-idf model, LDA model)``.
    """
    documents = self.preprocess()
    vocabulary = Dictionary(documents)

    bows = []
    for tokens in documents:
        bows.append(vocabulary.doc2bow(tokens))

    tfidf = TfidfModel(bows)
    # LDA is trained on the tf-idf weighted vectors, not on the raw counts.
    lda = LdaModel(tfidf[bows], num_topics=2)
    return vocabulary, tfidf, lda
開發者ID:msgi,項目名稱:nlp-journey,代碼行數:12,代碼來源:lda_topic.py

示例14: xform_tfidf

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def xform_tfidf(self):
    """Fit a tf-idf model on ``self.get_vectors()`` and store it in ``self.transformation``."""
    vectors = self.get_vectors()
    self.transformation = models.TfidfModel(vectors)
開發者ID:christabor,項目名稱:MoAL,代碼行數:4,代碼來源:tm_gensim.py

示例15: __init__

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import TfidfModel [as 別名]
def __init__(self, data_list):
    """Build ``self.dictionary`` and ``self.tfidf`` from ``data_list``.

    :param data_list: documents; passed through ``self._check`` before use.
    """
    checked_docs = self._check(data_list)
    self.dictionary = corpora.Dictionary(checked_docs)
    bows = []
    for doc in checked_docs:
        bows.append(self.dictionary.doc2bow(doc))
    # Build the tf-idf model over the documents.
    self.tfidf = models.TfidfModel(bows)
開發者ID:zhufz,項目名稱:nlp_research,代碼行數:7,代碼來源:recall.py


注:本文中的gensim.models.TfidfModel方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。