當前位置: 首頁>>代碼示例>>Python>>正文


Python models.Phrases方法代碼示例

本文整理匯總了Python中gensim.models.Phrases方法的典型用法代碼示例。如果您正苦於以下問題:Python models.Phrases方法的具體用法?Python models.Phrases怎麽用?Python models.Phrases使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在gensim.models的用法示例。


在下文中一共展示了models.Phrases方法的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: train_phrases

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files and save it to disk.

    Args:
        paths: iterable of text file paths; each file is opened once to
            count its lines and is then handed to `_phrase_doc_stream`.
        out: destination path passed to the trained model's `save`.
        tokenizer: tokenizer forwarded to `_phrase_doc_stream`.
        **kwargs: extra keyword arguments for `Phrases`. They override the
            defaults used here (`max_vocab_size=40000000`, `threshold=8.`).
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        # Context manager so the handle is closed as soon as counting is done.
        with open(path, 'r') as f:
            n += sum(1 for _ in f)
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    # dict.update() mutates in place and returns None, so the defaults are
    # built first and the caller's overrides are merged in a second step.
    params = {
        'max_vocab_size': 40000000,
        'threshold': 8.,
    }
    params.update(kwargs)

    print('Training bigrams...')
    # Forward the caller-supplied tokenizer rather than a hard-coded one.
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **params)

    print('Saving...')
    bigram.save(out)
開發者ID:frnsys,項目名稱:broca,代碼行數:23,代碼來源:phrases.py

示例2: add_phrases

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def add_phrases(self, corpus):
		'''
		Train a chain of gensim phrase models on the sentences of a corpus.

		Parameters
		----------
		corpus : ParsedCorpus
			Corpus whose sentences are used for training (checked by assertion).

		Returns
		-------
		self, with ``self.phrases`` replaced by a list holding the first-level
		model followed by one further model per iteration of
		``range(1, self.max_tokens_per_phrase)``.
		'''
		from gensim.models import Phrases

		assert isinstance(corpus, ParsedCorpus)

		# First level is trained on the raw sentences, with ' ' as delimiter.
		first_level = Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')
		self.phrases = [first_level]

		# Every further level is trained on the sentences as transformed by the
		# most recently trained model; the sentences are re-fetched each time.
		for _ in range(1, self.max_tokens_per_phrase):
			latest_model = self.phrases[-1]
			transformed = latest_model[CorpusAdapterForGensim.get_sentences(corpus)]
			self.phrases.append(Phrases(transformed))

		return self
開發者ID:JasonKessler,項目名稱:scattertext,代碼行數:21,代碼來源:Word2VecFromParsedCorpus.py

示例3: __init__

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def __init__(self, phrases, gram_size):
		'''
		Parameters
		----------
		phrases : gensim.models.Phrases
			Phrase model to store on the instance; anything that is not a
			Phrases instance fails the assertion below.
		gram_size : int, maximum number of words per phrase
		'''
		from gensim.models import Phrases

		# isinstance (not an exact type comparison) so subclasses of Phrases
		# are accepted as well.
		assert isinstance(phrases, Phrases)
		self.gram_size = gram_size
		self.phrases = phrases
開發者ID:JasonKessler,項目名稱:scattertext,代碼行數:17,代碼來源:Word2VecFromParsedCorpus.py

示例4: _scan_and_build_vocab

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def _scan_and_build_vocab(self):
		'''
		Build the vocabulary of ``self.model`` from bigram-transformed sentences.

		A Phrases model is trained on the sentences of ``self.corpus``; the
		model's ``scan_vocab`` is attempted on the raw sentences, and then
		``build_vocab`` is called on the sentences as transformed by the
		bigram model. The sentences are re-fetched from the corpus for every
		pass.
		'''
		from gensim.models import Phrases
		bigram_transformer = Phrases(CorpusAdapterForGensim.get_sentences(self.corpus))
		try:
			self.model.scan_vocab(CorpusAdapterForGensim.get_sentences(self.corpus))
		except Exception:
			# Deliberately best-effort: a failing scan must not prevent
			# build_vocab below. Catching Exception (instead of a bare
			# except) lets KeyboardInterrupt and SystemExit propagate.
			# NOTE(review): presumably guards against model versions where
			# scan_vocab is missing or has a different signature - confirm.
			pass
		self.model.build_vocab(bigram_transformer[CorpusAdapterForGensim.get_sentences(self.corpus)])
開發者ID:JasonKessler,項目名稱:scattertext,代碼行數:10,代碼來源:Word2VecFromParsedCorpus.py

示例5: compute_vectors

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def compute_vectors(input_path: Path, output_path: Path):
    """
    Train Word2Vec embeddings on a sentence file and save them as text.

    The input file is read through ``LineSentence``, a ``Phrases`` model is
    fitted on it, and Word2Vec is trained on the phrase-transformed
    sentences. The resulting vectors are written to ``output_path`` in the
    non-binary word2vec format.
    """
    print(f"Processing {input_path}")

    corpus = LineSentence(input_path)
    phrase_model = Phrases(corpus)
    phrased_corpus = phrase_model[corpus]

    w2v_settings = dict(size=150, window=5, min_count=5, workers=4)
    embeddings = Word2Vec(phrased_corpus, **w2v_settings)

    print(f"Saving vectors to {output_path}")
    embeddings.wv.save_word2vec_format(output_path, binary=False)
開發者ID:ICLRandD,項目名稱:Blackstone,代碼行數:16,代碼來源:train_vectors.py

示例6: tokenize

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def tokenize(self, docs):
        """
        Tokenize documents and merge tokens using bigram and trigram models.

        Documents are first tokenized with ``RAKETokenizer``, then each
        document is passed through ``pre_tokenize`` together with its RAKE
        tokens (in-process when ``self.n_jobs == 1``, otherwise through
        ``parallel``). ``self.bigram`` and ``self.trigram`` are created on
        first use and extended via ``add_vocab`` on later calls.

        Args:
            docs: the documents to tokenize.

        Returns:
            A list with one entry per item produced by applying the bigram
            model and then the trigram model to the tokenized documents.
        """
        # Always bind `lem`: it is handed to pre_tokenize below regardless of
        # self.lemmatize, and leaving it unbound raised UnboundLocalError.
        # NOTE(review): assumes pre_tokenize tolerates lem=None - confirm.
        lem = WordNetLemmatizer() if self.lemmatize else None

        # RAKE keyword tokenization.
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        # NOTE(review): looks like leftover debugging output (prints every
        # token starting with 'one' and its document index); kept so stdout
        # is unchanged - consider removing.
        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        # Additional per-document tokenization.
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        # Bigram model: train on first use, extend afterwards.
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        # Trigram model: same scheme, fed with the bigram-transformed docs.
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return list(self.trigram[self.bigram[tdocs]])
開發者ID:frnsys,項目名稱:broca,代碼行數:40,代碼來源:overkill.py

示例7: quad_gram_words

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def quad_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    """
    Apply three stacked Phrases models (bigram, trigram, quadgram) to sentences.

    Each model is trained on the output of the previous one, using '_' as the
    delimiter and a threshold of 10.

    Args:
        tokenized_sentences_tokenized_words: the tokenized sentences. The
            value is iterated more than once (training, then transformation),
            so it must be re-iterable.
        minimum_count_for_vectorization: passed as ``min_count`` to every
            Phrases model.

    Returns:
        The result of indexing the quadgram model with the
        trigram-transformed sentences.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words,
                     min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)

    print("performing tri gram")
    # Materialise each transformed corpus once and reuse it, instead of
    # re-running the lower-level models for every later stage.
    bigram_sentences = list(bigram[tokenized_sentences_tokenized_words])
    trigram = Phrases(bigram_sentences,
                      min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)

    print("performing quad gram")
    trigram_sentences = list(trigram[bigram_sentences])
    quadgram = Phrases(trigram_sentences,
                       min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)

    return quadgram[trigram_sentences]
開發者ID:nicolas-ivanov,項目名稱:Seq2Seq_Upgrade_TensorFlow,代碼行數:11,代碼來源:cf.py

示例8: bi_gram_words

# 需要導入模塊: from gensim import models [as 別名]
# 或者: from gensim.models import Phrases [as 別名]
def bi_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    """
    Train a bigram Phrases model on the sentences and apply it to them.

    Args:
        tokenized_sentences_tokenized_words: the tokenized sentences; used
            both to train the model and as the input that is transformed.
        minimum_count_for_vectorization: passed as ``min_count`` to Phrases.

    Returns:
        The result of indexing the trained bigram model with the input
        sentences (delimiter '_', threshold 10).
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words,
                     min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    return bigram[tokenized_sentences_tokenized_words]
開發者ID:nicolas-ivanov,項目名稱:Seq2Seq_Upgrade_TensorFlow,代碼行數:7,代碼來源:cf.py


注:本文中的gensim.models.Phrases方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。