This page collects typical usage examples of the Python gensim.models.Phrases class. If you are unsure what models.Phrases does or how to use it, the curated code examples below may help; you can also explore the containing module, gensim.models, for related usage.
Eight code examples of models.Phrases are shown below, ordered by popularity by default.
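Before the examples, a minimal sketch of what Phrases does (the toy sentences are invented, and min_count/threshold are lowered only so the tiny corpus can promote a phrase):

from gensim.models import Phrases

# Toy corpus: each sentence is a list of tokens.
sentences = [
    ["new", "york", "is", "big"],
    ["i", "love", "new", "york"],
    ["new", "york", "never", "sleeps"],
]

# Train the phrase detector; real corpora should keep larger defaults.
bigram = Phrases(sentences, min_count=1, threshold=1)

# Applying the model merges statistically frequent pairs into single tokens.
print(bigram[["i", "love", "new", "york"]])
# expected with gensim >= 4.0's default '_' delimiter: ['i', 'love', 'new_york']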
Example 1: train_phrases
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
from nltk.tokenize import word_tokenize
from gensim.models import Phrases

def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        with open(path, 'r') as f:
            n += sum(1 for line in f)
    print('Processing {0} lines...'.format(n))

    # Override these to use less memory; gensim's default max_vocab_size
    # is 40m (40 million entries).
    defaults = {
        'max_vocab_size': 40000000,
        'threshold': 8.
    }
    defaults.update(kwargs)

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **defaults)

    print('Saving...')
    bigram.save(out)
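The example leans on a _phrase_doc_stream helper whose body is not shown. A hypothetical sketch of what such a streaming helper might look like (the progress interval and lowercasing are assumptions):

def _phrase_doc_stream(paths, n, tokenizer):
    # Hypothetical helper: yield one tokenized line at a time so Phrases
    # never holds the full corpus in memory. n is the total line count,
    # used here only for progress reporting.
    processed = 0
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                processed += 1
                if processed % 10000 == 0:
                    print('{0}/{1} lines...'.format(processed, n))
                yield tokenizer(line.lower())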
Example 2: add_phrases
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
def add_phrases(self, corpus):
    '''
    Parameters
    ----------
    corpus : ParsedCorpus to augment with phrases

    Returns
    -------
    self, for chaining; its phrase models cover the unigrams in corpus
    plus the newly detected phrases
    '''
    from gensim.models import Phrases
    assert isinstance(corpus, ParsedCorpus)
    self.phrases = [Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')]
    for i in range(1, self.max_tokens_per_phrase):
        # Re-apply Phrases to the previous model's output so longer
        # spans can merge on each pass.
        self.phrases.append(Phrases(self.phrases[-1][CorpusAdapterForGensim.get_sentences(corpus)]))
    return self
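A note on the design: each additional Phrases model is trained on the output of the previous one, so a stack of k models can merge spans of up to k + 1 tokens. The space delimiter keeps merged phrases human-readable ('new york' rather than 'new_york'); gensim 4.x expects a str delimiter as used here, whereas gensim 3.x expected bytes.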
Example 3: __init__
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
def __init__(self, phrases, gram_size):
    '''
    Parameters
    ----------
    phrases : gensim.models.Phrases
    gram_size : int, maximum number of words per phrase
    '''
    from gensim.models import Phrases
    assert type(phrases) == Phrases
    self.gram_size = gram_size
    self.phrases = phrases
Example 4: _scan_and_build_vocab
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
def _scan_and_build_vocab(self):
    from gensim.models import Phrases
    bigram_transformer = Phrases(CorpusAdapterForGensim.get_sentences(self.corpus))
    try:
        self.model.scan_vocab(CorpusAdapterForGensim.get_sentences(self.corpus))
    except Exception:
        # scan_vocab is not available on every gensim model/version;
        # fall through and let build_vocab do the scanning.
        pass
    self.model.build_vocab(bigram_transformer[CorpusAdapterForGensim.get_sentences(self.corpus)])
Example 5: compute_vectors
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
from pathlib import Path

from gensim.models import Phrases, Word2Vec
from gensim.models.word2vec import LineSentence

def compute_vectors(input_path: Path, output_path: Path):
    """
    Builds word embeddings using gensim Word2Vec. This function takes
    a file containing one sentence per line and writes the computed
    vectors in text format to the specified output path.
    """
    print(f"Processing {input_path}")
    sentences = LineSentence(input_path)
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(
        bigram_transformer[sentences], size=150, window=5, min_count=5, workers=4
    )  # gensim < 4.0 API; gensim >= 4.0 renames size to vector_size
    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(output_path, binary=False)
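A possible follow-up (the file names are hypothetical): the text-format vectors written above load back with gensim's KeyedVectors, after which similarity queries work on both unigrams and the merged bigram tokens.

from pathlib import Path
from gensim.models import KeyedVectors

compute_vectors(Path("corpus.txt"), Path("vectors.txt"))
wv = KeyedVectors.load_word2vec_format("vectors.txt", binary=False)
# Phrase tokens such as "new_york" appear alongside unigrams, assuming
# the pair was frequent enough in corpus.txt to be merged.
print(wv.most_similar("new_york", topn=5))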
Example 6: tokenize
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
def tokenize(self, docs):
    # Avoid a NameError when lemmatization is disabled; pre_tokenize is
    # assumed to accept lem=None.
    lem = WordNetLemmatizer() if self.lemmatize else None

    # RAKE tokenizing
    pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

    # Debug leftover from the original: flag tokens starting with 'one'.
    for i, tdoc in enumerate(pre_tdocs):
        for t in tdoc:
            if t.startswith('one'):
                print(t)
                print(i)

    # Additional tokenizing of the docs
    if self.n_jobs == 1:
        tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
    else:
        tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

    # Train the bigram model, or extend an existing one
    if self.bigram is None:
        self.bigram = Phrases(tdocs,
                              min_count=self.min_count,
                              threshold=self.threshold,
                              delimiter=b' ')  # bytes delimiter: gensim < 4.0
    else:
        self.bigram.add_vocab(tdocs)

    # Train the trigram model on bigram output, or extend an existing one
    if self.trigram is None:
        self.trigram = Phrases(self.bigram[tdocs],
                               min_count=self.min_count,
                               threshold=self.threshold,
                               delimiter=b' ')
    else:
        self.trigram.add_vocab(self.bigram[tdocs])

    return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
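A note on the incremental branches above: gensim's Phrases.add_vocab merges counts from new sentences into an already-trained model, so repeated calls to tokenize keep refining the bigram and trigram statistics instead of retraining from scratch.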
Example 7: quad_gram_words
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
from gensim.models import Phrases

def quad_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    print("performing tri gram")
    trigram = Phrases(list(bigram[tokenized_sentences_tokenized_words]), min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    print("performing quad gram")
    quadgram = Phrases(list(trigram[list(bigram[tokenized_sentences_tokenized_words])]), min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    quadgramprocessed = quadgram[list(trigram[list(bigram[tokenized_sentences_tokenized_words])])]
    return quadgramprocessed
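A hypothetical usage sketch (the sentences are invented for illustration):

sentences = [
    ["new", "york", "city", "hall"],
    ["the", "new", "york", "city", "hall", "steps"],
    ["visit", "new", "york", "city", "hall"],
]
merged = list(quad_gram_words(sentences, minimum_count_for_vectorization=1))
# Note the hard-coded threshold=10: a toy corpus this small will usually
# merge nothing; on a large corpus, frequent four-token spans can surface
# as e.g. "new_york_city_hall" after the three stacked passes.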
Example 8: bi_gram_words
# Required import: from gensim import models [as alias]
# Or: from gensim.models import Phrases [as alias]
from gensim.models import Phrases

def bi_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    bigramprocessed = bigram[tokenized_sentences_tokenized_words]
    return bigramprocessed