Python word2vec.LineSentence方法代码示例

本文整理汇总了Python中gensim.models.word2vec.LineSentence类的典型用法代码示例。如果您正苦于以下问题：Python word2vec.LineSentence的具体用法？Python word2vec.LineSentence怎么用？Python word2vec.LineSentence使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块gensim.models.word2vec的用法示例。

在下文中一共展示了word2vec.LineSentence类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: train_word2vec_by_word

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec_by_word():
    """Train a Word2Vec model on ``cut_zhwiki_wiki_parse.txt`` and save it.

    The corpus is streamed through ``LineSentence``. Two artifacts are
    written: the full model (``w2v_model_wiki.model``) and the word vectors
    in text word2vec format (``w2v_model_wiki_word.vec``).
    """
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    corpus_path = "cut_zhwiki_wiki_parse.txt"
    model_path = "w2v_model_wiki.model"
    vectors_path = "w2v_model_wiki_word.vec"

    # Use one training worker per available CPU core.
    worker_count = multiprocessing.cpu_count()
    print(worker_count)

    training_options = dict(
        size=300,
        window=10,
        min_count=1,
        # Per gensim's Word2Vec parameters: sg=1 selects skip-gram and
        # hs=1 enables hierarchical softmax.
        sg=1,
        hs=1,
        iter=10,
        workers=worker_count,
    )
    model = Word2Vec(LineSentence(corpus_path), **training_options)

    model.save(model_path)
    model.wv.save_word2vec_format(vectors_path, binary=False)

开发者ID:yongzhuo，项目名称:nlg-yongzhuo，代码行数:18，代码来源:keyword_word2vec.py

示例2: train_fasttext

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    Train a FastText model on ``input_file`` and save it to ``output_file``.

    Arguments
    ---------
    input_file : Pre-processed wiki dump, read through ``LineSentence``
    output_file : Location passed to ``model.save``
    skipgram : Training architecture, forwarded as ``sg``
        (0 - CBOW, 1 - Skipgram)
    loss : Loss function, forwarded as ``hs``
        (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs, forwarded as ``iter``

    The remaining hyperparameters are fixed: alpha=0.05, window=5,
    min_count=5, min_n=2, max_n=5 and 3 workers.
    """
    hyperparameters = dict(
        sg=skipgram,
        hs=loss,
        size=size,
        alpha=0.05,
        window=5,
        min_count=5,
        min_n=2,
        max_n=5,
        workers=3,
        iter=epochs,
    )
    corpus = LineSentence(input_file)
    trained = FastText(corpus, **hyperparameters)
    trained.save(output_file)

开发者ID:dbpedia，项目名称:embeddings，代码行数:26，代码来源:pre_train.py

示例3: train_word2vec

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    Train a Word2Vec model on ``input_file`` and save it to ``output_file``.

    Arguments
    ---------
    input_file : Pre-processed wiki dump, read through ``LineSentence``
    output_file : Location passed to ``model.save``
    skipgram : Training architecture, forwarded as ``sg``
        (0 - CBOW, 1 - Skipgram)
    loss : Loss function, forwarded as ``hs``
        (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs, forwarded as ``iter``

    The learning rate (alpha=0.05), window (5), min_count (5) and worker
    count (3) are fixed.
    """
    corpus = LineSentence(input_file)
    fixed_options = {"alpha": 0.05, "window": 5, "min_count": 5, "workers": 3}
    trained = Word2Vec(corpus, sg=skipgram, hs=loss, size=size,
                       iter=epochs, **fixed_options)
    trained.save(output_file)

开发者ID:dbpedia，项目名称:embeddings，代码行数:25，代码来源:pre_train.py

示例4: build

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    """Train word vectors on the train/test sentences and pickle a word -> vector dict.

    Steps, in order:
      1. collect sentences with ``extract_sentence`` and write them to
         ``sentence_path`` via ``save_sentence``;
      2. train a skip-gram (``sg=1``) Word2Vec model of size 256 on that file
         and store it in binary word2vec format at ``w2v_bin_path``;
      3. reload the binary file as ``KeyedVectors`` and save a dict mapping
         every vocabulary word to its vector at ``out_path`` with ``save_pkl``.
    """
    corpus = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(corpus, sentence_path)

    print('train w2v model...')
    trained = Word2Vec(sentences=LineSentence(sentence_path), sg=1,
                       size=256, window=5, min_count=min_count, iter=40)
    trained.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)

    # Read the vectors back from the file that was just written and flatten
    # them into a plain dict for pickling.
    vectors = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    embedding_by_word = {token: vectors[token] for token in vectors.vocab}
    save_pkl(embedding_by_word, out_path, overwrite=True)

开发者ID:shibing624，项目名称:text-classifier，代码行数:21，代码来源:build_w2v.py

示例5: bigrammer

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    Detect bigrams in a corpus and append the re-tokenized text to ``outfile``.

    :param source_file: corpus path, read through ``LineSentence``
    :param outfile: file the transformed sentences are appended to
        (opened in ``'a'`` mode, one space-joined sentence per line)
    :param mincount: forwarded to ``Phrases`` as ``min_count``
    :param threshold: forwarded to ``Phrases`` as ``threshold``
    :param scoring: forwarded to ``Phrases`` as ``scoring``
    :param commonfile: text file with one common term per line; the stripped
        lines are passed to ``Phrases`` as ``common_terms``
    :return: number of entries in ``phrasegrams`` of the resulting ``Phraser``
    """
    # Read the common terms inside a ``with`` block so the handle is closed
    # promptly instead of being left to the garbage collector.
    with open(commonfile, 'r') as common_handle:
        common = {line.strip() for line in common_handle}

    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000, delimiter=b':::',
                                 progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)

    # The context manager guarantees the output file is flushed and closed
    # even if transforming or writing a sentence raises.
    with open(outfile, 'a') as out_handle:
        print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
        for sentence in bigrams[data]:
            out_handle.write(' '.join(sentence) + '\n')
    return len(bigrams.phrasegrams)

开发者ID:akutuzov，项目名称:webvectors，代码行数:25，代码来源:helpers.py

示例6: train_model

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_model(corpus, size=200, window=5, workers=3, model_path=None,
                word_freq=None, corpus_count=None):
    """Train a new Skipgram model, or continue training a saved one.

    Args:
        corpus (str):       file path of corpus
        size (int):         embedding size (default=200)
        window (int):       window size (default=5)
        workers (int):      number of workers (default=3)
        model_path (str):   file path of an existing model to update; when
                            None a fresh model is trained instead
        word_freq (dict):   dictionary of word frequencies (new model only)
        corpus_count (int): corpus size (new model only)

    Returns:
        Word2Vec: word2vec model
    """
    sentences = LineSentence(corpus)

    # No saved model given: train from scratch.
    if model_path is None:
        return Skipgram(sentences=sentences, size=size, window=window,
                        min_count=1, workers=workers, raw_vocab=word_freq,
                        corpus_count=corpus_count)

    logger.info("Updating pre-existing model: %s", model_path)
    assert os.path.isfile(model_path), "File does not exist"
    existing = Word2Vec.load(model_path)
    # Extend the stored vocabulary with the new corpus before training on it.
    existing.build_vocab(sentences, update=True)
    existing.train(sentences, total_examples=existing.corpus_count,
                   epochs=existing.iter)
    return existing

开发者ID:jwplayer，项目名称:jwalk，代码行数:31，代码来源:skipgram.py

示例7: train_word2vec_model

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec_model(text_file_path, model_file_path):
    """Train Word2Vec on ``text_file_path`` and export binary vectors.

    The corpus is read through ``LineSentence``; every word is kept
    (``min_count=1``) and the vector size comes from
    ``global_config.embedding_size``. The vectors are written to
    ``model_file_path`` in binary word2vec format.
    """
    logger.info("Loading input file and training mode ...")
    corpus = LineSentence(text_file_path)
    trained = Word2Vec(sentences=corpus, min_count=1,
                       size=global_config.embedding_size)

    # Log a summary of the trained model.
    summary = "Model Details: {}".format(trained)
    logger.info(summary)

    trained.wv.save_word2vec_format(model_file_path, binary=True)
    logger.info("Model saved")

开发者ID:vineetjohn，项目名称:linguistic-style-transfer，代码行数:12，代码来源:train_word2vec_model.py

示例8: main

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def main():
    """Train Word2Vec (vector size 250) on ``wiki_seg.txt`` and save the model.

    The model is stored as ``word2vec.model`` and can be restored later with
    ``word2vec.Word2Vec.load("your_model_name")``.
    """
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)

    corpus = word2vec.LineSentence("wiki_seg.txt")
    trained = word2vec.Word2Vec(corpus, size=250)

    # Persist the model for later use.
    trained.save(u"word2vec.model")

开发者ID:zake7749，项目名称:word2vec-tutorial，代码行数:13，代码来源:train.py

示例9: main

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def main():
    """Train a 250-dimensional Word2Vec model on ``wiki_seg.txt`` and save it.

    Output goes to ``word2vec.model``; reload it later with
    ``word2vec.Word2Vec.load("your_model_name")``.
    """
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )

    # Train directly on the line-per-sentence corpus, then persist the result
    # for later use.
    trained = word2vec.Word2Vec(word2vec.LineSentence("wiki_seg.txt"), size=250)
    trained.save("word2vec.model")

开发者ID:zake7749，项目名称:word2vec-tutorial，代码行数:13，代码来源:train.py

示例10: train_w2v

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_w2v(self, filename):
        """
        Train a Word2Vec model on ``filename`` and save it to disk.

        The vector size is ``self.size``; window (100) and worker count (48)
        are fixed. The model is written to ``20w_size_win100_<size>.model``
        relative to the current working directory.

        :param filename: path of the corpus; one text per line
        :return: None
        """
        # Load the corpus; LineSentence expects one text per line.
        sentences = word2vec.LineSentence(filename)
        # Single-argument print(...) calls produce the same output whether
        # print is a statement (Python 2) or a function (Python 3); the
        # original print statements do not parse on Python 3.
        print('正在训练w2v 针对语料： %s' % (str(filename),))
        print('size is:  %s' % (self.size,))
        # Original author's note: the window parameter affects the results;
        # typical values range from 5 to 100.
        model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
        savepath = '20w_size_win100_' + str(self.size) + '.model'  # where the model is saved
        # Save first, then report: the message claims the model has been saved.
        model.save(savepath)
        print('训练完毕，已保存:  %s' % (savepath,))

开发者ID:prozhuchen，项目名称:2016CCF-sougou，代码行数:15，代码来源:class_w2v.py

示例11: testLineSentenceWorksWithFilename

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithFilename(self):
        """LineSentence built from a filename yields each line split into tokens."""
        corpus_path = datapath('lee_background.cor')
        with utils.smart_open(corpus_path) as orig:
            # Walk the reference file in lockstep with the sentence iterator.
            for words in word2vec.LineSentence(corpus_path):
                expected = utils.to_unicode(orig.readline()).split()
                self.assertEqual(words, expected)

开发者ID:largelymfs，项目名称:topical_word_embeddings，代码行数:8，代码来源:test_word2vec.py

示例12: testLineSentenceWorksWithCompressedFile

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithCompressedFile(self):
        """Does LineSentence work with a compressed file object argument?

        Each sentence read from the bz2 file object must equal the
        corresponding whitespace-split line of the uncompressed reference.
        """
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            # Open the archive in a ``with`` block so the handle is closed
            # even when an assertion fails; the original never closed it.
            with bz2.BZ2File(datapath('head500.noblanks.cor.bz2')) as compressed:
                sentences = word2vec.LineSentence(compressed)
                for words in sentences:
                    self.assertEqual(words, utils.to_unicode(orig.readline()).split())

开发者ID:largelymfs，项目名称:topical_word_embeddings，代码行数:8，代码来源:test_word2vec.py

示例13: testLineSentenceWorksWithNormalFile

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithNormalFile(self):
        """LineSentence built from an open file object yields each line split into tokens."""
        corpus_path = datapath('head500.noblanks.cor')
        # Two independent handles on the same file: ``orig`` is the reference,
        # ``fin`` is consumed by LineSentence.
        with utils.smart_open(corpus_path) as orig, utils.smart_open(corpus_path) as fin:
            for words in word2vec.LineSentence(fin):
                expected = utils.to_unicode(orig.readline()).split()
                self.assertEqual(words, expected)
#endclass TestWord2VecSentenceIterators

开发者ID:largelymfs，项目名称:topical_word_embeddings，代码行数:10，代码来源:test_word2vec.py

示例14: word2vec_model

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def word2vec_model(blog_seg_path):
    """Train and return a Word2Vec model on the corpus at ``blog_seg_path``.

    The corpus is read through ``LineSentence``; training uses 4 workers and
    otherwise the library defaults.
    """
    return word2vec.Word2Vec(word2vec.LineSentence(blog_seg_path), workers=4)

开发者ID:03pie，项目名称:SMPCUP2017，代码行数:6，代码来源:word2vec_model.py

示例15: learn_embeddings

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def learn_embeddings():
	"""
	Learn embeddings from ``random_walks.txt`` with a skip-gram Word2Vec model.

	Dimensions, window size, worker count and iteration count are read from the
	module-level ``args``; the vectors are written to ``args.output`` in
	word2vec format. Returns None.
	"""
	logging.info("Initializing creation of the representations...")
	walks = LineSentence('random_walks.txt')
	hyperparameters = dict(
		size=args.dimensions,
		window=args.window_size,
		min_count=0,
		hs=1,
		sg=1,
		workers=args.workers,
		iter=args.iter,
	)
	embeddings = Word2Vec(walks, **hyperparameters)
	embeddings.wv.save_word2vec_format(args.output)
	logging.info("Representations created.")

开发者ID:leoribeiro，项目名称:struc2vec，代码行数:13，代码来源:main.py

注：本文中的gensim.models.word2vec.LineSentence方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。