当前位置: 首页>>代码示例>>Python>>正文


Python models.Phrases方法代码示例

本文整理汇总了Python中gensim.models.Phrases方法的典型用法代码示例。如果您正苦于以下问题:Python models.Phrases方法的具体用法?Python models.Phrases怎么用?Python models.Phrases使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在gensim.models的用法示例。


在下文中一共展示了models.Phrases方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: train_phrases

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.

    Parameters
    ----------
    paths : list of str
        Input text files, one document per line.
    out : str
        Path where the trained Phrases model is saved.
    tokenizer : callable
        Tokenizes a line into a list of tokens (forwarded to the doc stream).
    kwargs : dict
        Extra keyword arguments for gensim.models.Phrases; the defaults
        below are used unless explicitly overridden by the caller.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        # Use a context manager so each file handle is closed promptly
        # (the original left handles open until GC).
        with open(path, 'r') as f:
            n += sum(1 for line in f)
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    # BUG FIX: dict.update() returns None, so the original
    # `kwargs = {...}.update(kwargs)` set kwargs to None and the
    # `**kwargs` expansion below raised TypeError. Merge defaults with
    # caller overrides instead (caller values win).
    defaults = {
        'max_vocab_size': 40000000,
        'threshold': 8.,
    }
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    # BUG FIX: honor the `tokenizer` parameter; the original hard-coded
    # word_tokenize, silently ignoring any custom tokenizer.
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **kwargs)

    print('Saving...')
    bigram.save(out)
开发者ID:frnsys,项目名称:broca,代码行数:23,代码来源:phrases.py

示例2: add_phrases

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def add_phrases(self, corpus):
	'''
	Learn multi-word phrases from a corpus by stacking gensim Phrases
	models, one per additional token of phrase length.

	Parameters
	----------
	corpus: Corpus for phrase augmentation

	Returns
	-------
	New ParsedCorpus containing unigrams in corpus and new phrases
	'''
	from gensim.models import Phrases

	assert isinstance(corpus, ParsedCorpus)
	# First model learns bigrams directly from the raw sentence stream.
	self.phrases = [Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')]

	# Each subsequent model is trained on the previous model's output,
	# growing the maximum phrase length by one token per iteration.
	for _ in range(1, self.max_tokens_per_phrase):
		previous_model = self.phrases[-1]
		merged_stream = previous_model[CorpusAdapterForGensim.get_sentences(corpus)]
		self.phrases.append(Phrases(merged_stream))

	return self
开发者ID:JasonKessler,项目名称:scattertext,代码行数:21,代码来源:Word2VecFromParsedCorpus.py

示例3: __init__

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def __init__(self, phrases, gram_size):
	'''
	Parameters
	----------
	phrases : gensim.models.Phrases
		A single trained phrase model. NOTE(review): the original
		docstring said list[gensim.models.Phrases] and mentioned a
		nonexistent ``kwargs`` parameter, but the assertion below
		requires one Phrases instance — documented to match the code.
	gram_size : int, maximum number of words per phrase
	'''
	from gensim.models import Phrases

	# BUG FIX: removed the no-op locals `phrases = phrases` and
	# `gram_size = gram_size` left over from a refactor; use
	# isinstance instead of an exact type comparison so Phrases
	# subclasses are accepted.
	assert isinstance(phrases, Phrases)
	self.gram_size = gram_size
	self.phrases = phrases
开发者ID:JasonKessler,项目名称:scattertext,代码行数:17,代码来源:Word2VecFromParsedCorpus.py

示例4: _scan_and_build_vocab

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def _scan_and_build_vocab(self):
	'''
	Train a bigram Phrases transformer on the corpus and build the
	model's vocabulary from the phrase-merged sentence stream.
	'''
	from gensim.models import Phrases
	bigram_transformer = Phrases(CorpusAdapterForGensim.get_sentences(self.corpus))
	try:
		# Best-effort: presumably some gensim versions need an explicit
		# scan_vocab pass while others fold it into build_vocab —
		# failures here are deliberately tolerated. TODO confirm the
		# targeted gensim versions.
		self.model.scan_vocab(CorpusAdapterForGensim.get_sentences(self.corpus))
	except Exception:
		# BUG FIX: narrowed the bare `except:`, which also swallowed
		# KeyboardInterrupt and SystemExit; `except Exception` keeps
		# the best-effort behavior without blocking interpreter exit.
		pass
	self.model.build_vocab(bigram_transformer[CorpusAdapterForGensim.get_sentences(self.corpus)])
开发者ID:JasonKessler,项目名称:scattertext,代码行数:10,代码来源:Word2VecFromParsedCorpus.py

示例5: compute_vectors

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def compute_vectors(input_path: Path, output_path: Path):
    """
    Builds word embeddings using gensim Word2Vec. This function takes
    a file contained single sentences per line and writes the computed
    vectors in text format to the specified output path.
    """
    print(f"Processing {input_path}")
    corpus = LineSentence(input_path)
    # Learn common bigrams first so multi-word expressions get a single vector.
    phrase_model = Phrases(corpus)
    merged_corpus = phrase_model[corpus]
    model = Word2Vec(
        merged_corpus,
        size=150,
        window=5,
        min_count=5,
        workers=4,
    )
    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(output_path, binary=False)
开发者ID:ICLRandD,项目名称:Blackstone,代码行数:16,代码来源:train_vectors.py

示例6: tokenize

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def tokenize(self, docs):
    """
    Tokenize documents and merge frequent collocations into bigram and
    trigram phrases, updating the phrase models incrementally across calls.

    Parameters
    ----------
    docs : iterable of str
        Raw documents to tokenize.

    Returns
    -------
    list of list of str
        Token lists with common bigrams/trigrams joined into single
        (space-delimited) tokens.
    """
    # BUG FIX: `lem` was only bound when self.lemmatize was true, yet it
    # is passed to pre_tokenize unconditionally below — NameError when
    # lemmatization is disabled. Default it to None (presumably
    # pre_tokenize treats lem=None as "no lemmatization" — verify).
    lem = WordNetLemmatizer() if self.lemmatize else None

    #print('RAKE tokenizing...')
    pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

    # NOTE(review): leftover debug output — prints every token starting
    # with 'one' plus its doc index. Kept to preserve behavior; consider
    # deleting.
    for i, tdoc in enumerate(pre_tdocs):
        for t in tdoc:
            if t.startswith('one'):
                print(t)
                print(i)

    #print('Additional Tokenizing docs...')
    if self.n_jobs == 1:
        tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
    else:
        tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

    #print('Training bigram...')
    # First call trains a fresh model; later calls fold new vocab in.
    if self.bigram is None:
        self.bigram = Phrases(tdocs,
                              min_count=self.min_count,
                              threshold=self.threshold,
                              delimiter=b' ')
    else:
        self.bigram.add_vocab(tdocs)

    #print('Training trigram...')
    if self.trigram is None:
        self.trigram = Phrases(self.bigram[tdocs],
                               min_count=self.min_count,
                               threshold=self.threshold,
                               delimiter=b' ')
    else:
        self.trigram.add_vocab(self.bigram[tdocs])

    return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
开发者ID:frnsys,项目名称:broca,代码行数:40,代码来源:overkill.py

示例7: quad_gram_words

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def quad_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    """
    Merge frequent collocations up to 4-grams by stacking gensim Phrases
    models (bigram -> trigram -> quadgram), joining tokens with '_'.

    Parameters
    ----------
    tokenized_sentences_tokenized_words : list of list of str
        Tokenized sentences.
    minimum_count_for_vectorization : int
        min_count passed to each Phrases model.

    Returns
    -------
    TransformedCorpus yielding sentences with up-to-4-word phrases merged.
    """
    # FIX: parenthesized print works under both Python 2 and 3; the
    # original used py2-only print statements.
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    # Materialize each transformed corpus once; the original recomputed
    # bigram[...] three times and trigram[...] twice.
    bigram_sentences = list(bigram[tokenized_sentences_tokenized_words])
    print("performing tri gram")
    trigram = Phrases(bigram_sentences, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    trigram_sentences = list(trigram[bigram_sentences])
    print("performing quad gram")
    quadgram = Phrases(trigram_sentences, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    quadgramprocessed = quadgram[trigram_sentences]
    return quadgramprocessed
开发者ID:nicolas-ivanov,项目名称:Seq2Seq_Upgrade_TensorFlow,代码行数:11,代码来源:cf.py

示例8: bi_gram_words

# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Phrases [as 别名]
def bi_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    """
    Merge frequent two-word collocations into single '_'-joined tokens
    using a gensim Phrases model.

    Parameters
    ----------
    tokenized_sentences_tokenized_words : list of list of str
        Tokenized sentences.
    minimum_count_for_vectorization : int
        min_count passed to the Phrases model.

    Returns
    -------
    TransformedCorpus yielding sentences with bigram phrases merged.
    """
    # FIX: parenthesized print works under both Python 2 and 3; the
    # original used a py2-only print statement.
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    bigramprocessed = bigram[tokenized_sentences_tokenized_words]
    return bigramprocessed
开发者ID:nicolas-ivanov,项目名称:Seq2Seq_Upgrade_TensorFlow,代码行数:7,代码来源:cf.py


注:本文中的gensim.models.Phrases方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。