

Python word2vec.LineSentence Method Code Examples

This article collects typical usage examples of the Python method gensim.models.word2vec.LineSentence. If you are wondering what word2vec.LineSentence does, how to call it, or what working examples look like, the curated code samples below may help. You can also explore further usage examples from its containing module, gensim.models.word2vec.


Below are 15 code examples of word2vec.LineSentence, sorted by popularity by default.
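
As background for the examples that follow: LineSentence streams a plain-text corpus in which each line is one sentence of whitespace-separated tokens, yielding one token list per line. Here is a minimal sketch of that input format and iteration behaviour (the file name and contents are made up for illustration; the pre-4.0 gensim API is assumed, as in the examples below):

from gensim.models.word2vec import LineSentence

# Write a tiny corpus: one sentence per line, tokens separated by spaces.
with open("toy_corpus.txt", "w", encoding="utf-8") as f:
    f.write("the quick brown fox\n")
    f.write("jumps over the lazy dog\n")

# LineSentence lazily yields each line as a list of tokens.
for tokens in LineSentence("toy_corpus.txt"):
    print(tokens)  # e.g. ['the', 'quick', 'brown', 'fox']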

Example 1: train_word2vec_by_word

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def train_word2vec_by_word():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"

    print(multiprocessing.cpu_count())
    model = Word2Vec(LineSentence(inp), size=300, window=10,
                     # skip-gram with hierarchical softmax
                     min_count=1, sg=1, hs=1, iter=10, workers=multiprocessing.cpu_count())

    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False) 
Developer: yongzhuo, Project: nlg-yongzhuo, Lines of code: 18, Source: keyword_word2vec.py

Example 2: train_fasttext

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_fasttext(args**) -> Takes the input file, the
    output file and the model
    hyperparameters as arguments
    and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss Function (0 - Negative Sampling, 1 - Heirarichal Loss)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)

    model = FastText(sentence, sg=skipgram, hs=loss, size=size,
                     alpha=0.05, window=5, min_count=5, min_n=2,
                     max_n=5, workers=3, iter=epochs)

    model.save(output_file) 
Developer: dbpedia, Project: embeddings, Lines of code: 26, Source: pre_train.py

Example 3: train_word2vec

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_word2vec(args**) -> Takes the input file,
    the output file and the model hyperparameters as
    arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss Function (0 - Negative Sampling, 1 - Heirarichal Loss)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)

    model = Word2Vec(sentence, sg=skipgram, hs=loss,
                     size=size, alpha=0.05, window=5,
                     min_count=5, workers=3, iter=epochs)

    model.save(output_file) 
Developer: dbpedia, Project: embeddings, Lines of code: 25, Source: pre_train.py

Example 4: build

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True) 
Developer: shibing624, Project: text-classifier, Lines of code: 21, Source: build_w2v.py

Example 5: bigrammer

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    :param source_file:
    :param outfile:
    :param mincount:
    :param threshold:
    :param scoring:
    :param commonfile:
    :return:
    """
    common = set([word.strip() for word in open(commonfile, 'r').readlines()])
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000, delimiter=b':::',
                                 progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)
    tempfile = open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    tempfile.close()
    return len(bigrams.phrasegrams) 
Developer: akutuzov, Project: webvectors, Lines of code: 25, Source: helpers.py

Example 6: train_model

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def train_model(corpus, size=200, window=5, workers=3, model_path=None,
                word_freq=None, corpus_count=None):
    """Train using Skipgram model.

    Args:
        corpus (str):       file path of corpus
        size (int):         embedding size (default=200)
        window (int):       window size (default=5)
        workers (int):      number of workers (default=3)
        model_path (str):   file path of model we want to update
        word_freq (dict):   dictionary of word frequencies
        corpus_count (int): corpus size

    Returns:
        Word2Vec: word2vec model
    """
    sentences = LineSentence(corpus)
    if model_path is not None:
        logger.info("Updating pre-existing model: %s", model_path)
        assert os.path.isfile(model_path), "File does not exist"
        model = Word2Vec.load(model_path)
        model.build_vocab(sentences, update=True)
        model.train(sentences, total_examples=model.corpus_count,
                    epochs=model.iter)
    else:
        model = Skipgram(sentences=sentences, size=size, window=window,
                         min_count=1, workers=workers, raw_vocab=word_freq,
                         corpus_count=corpus_count)
    return model 
Developer: jwplayer, Project: jwalk, Lines of code: 31, Source: skipgram.py

Example 7: train_word2vec_model

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def train_word2vec_model(text_file_path, model_file_path):
    # define training data
    # train model
    logger.info("Loading input file and training mode ...")
    model = Word2Vec(sentences=LineSentence(text_file_path), min_count=1, size=global_config.embedding_size)
    # summarize the loaded model
    logger.info("Model Details: {}".format(model))
    # save model
    model.wv.save_word2vec_format(model_file_path, binary=True)
    logger.info("Model saved") 
Developer: vineetjohn, Project: linguistic-style-transfer, Lines of code: 12, Source: train_word2vec_model.py

Example 8: main

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def main():

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, size=250)

    # Save the model for later use
    model.save(u"word2vec.model")

    # How to load the model later
    # model = word2vec.Word2Vec.load("your_model_name") 
Developer: zake7749, Project: word2vec-tutorial, Lines of code: 13, Source: train.py

Example 9: main

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def main():

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, size=250)

    # Save the model for later use
    model.save("word2vec.model")

    # How to load the model later
    # model = word2vec.Word2Vec.load("your_model_name") 
Developer: zake7749, Project: word2vec-tutorial, Lines of code: 13, Source: train.py

Example 10: train_w2v

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def train_w2v(self, filename):
        """
        訓練wv模型
        :param filename:path
        :return:none
        """
        sentences = word2vec.LineSentence(filename)  # 加載語料,要求語料為“一行一文本”的格式
        print '正在訓練w2v 針對語料:',str(filename)
        print 'size is: ',self.size
        model = word2vec.Word2Vec(sentences, size=self.size, window=100,workers=48)  # 訓練模型; 注意參數window 對結果有影響 一般5-100
        savepath = '20w_size_win100_' + str(self.size)+'.model' # 保存model的路徑
        print '訓練完畢,已保存: ', savepath,
        model.save(savepath) 
Developer: prozhuchen, Project: 2016CCF-sougou, Lines of code: 15, Source: class_w2v.py

Example 11: testLineSentenceWorksWithFilename

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def testLineSentenceWorksWithFilename(self):
        """Does LineSentence work with a filename argument?"""
        with utils.smart_open(datapath('lee_background.cor')) as orig:
            sentences = word2vec.LineSentence(datapath('lee_background.cor'))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
Developer: largelymfs, Project: topical_word_embeddings, Lines of code: 8, Source: test_word2vec.py

Example 12: testLineSentenceWorksWithCompressedFile

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def testLineSentenceWorksWithCompressedFile(self):
        """Does LineSentence work with a compressed file object argument?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
Developer: largelymfs, Project: topical_word_embeddings, Lines of code: 8, Source: test_word2vec.py

Example 13: testLineSentenceWorksWithNormalFile

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def testLineSentenceWorksWithNormalFile(self):
        """Does LineSentence work with a file object argument, rather than filename?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
                sentences = word2vec.LineSentence(fin)
                for words in sentences:
                    self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators 
Developer: largelymfs, Project: topical_word_embeddings, Lines of code: 10, Source: test_word2vec.py

Example 14: word2vec_model

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def word2vec_model(blog_seg_path):
    sentences = word2vec.LineSentence(blog_seg_path)
    model = word2vec.Word2Vec(sentences, workers=4)
    return model 
Developer: 03pie, Project: SMPCUP2017, Lines of code: 6, Source: word2vec_model.py

Example 15: learn_embeddings

# Required module: from gensim.models import word2vec [as alias]
# Or: from gensim.models.word2vec import LineSentence [as alias]
def learn_embeddings():
	'''
	Learn embeddings by optimizing the Skipgram objective using SGD.
	'''
	logging.info("Initializing creation of the representations...")
	walks = LineSentence('random_walks.txt')
	model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1, workers=args.workers, iter=args.iter)
	model.wv.save_word2vec_format(args.output)
	logging.info("Representations created.")
	
	return 
Developer: leoribeiro, Project: struc2vec, Lines of code: 13, Source: main.py
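
Once a model has been written out with save_word2vec_format, as in several of the examples above, the vectors can be reloaded without retraining. Here is a minimal sketch, assuming the text-format vectors file produced in Example 1; the query word is a placeholder and must exist in the trained vocabulary:

from gensim.models import KeyedVectors

# Load vectors written with save_word2vec_format(..., binary=False).
wv = KeyedVectors.load_word2vec_format("w2v_model_wiki_word.vec", binary=False)

# Nearest neighbours of a word from the training vocabulary (placeholder query).
some_word = "中國"
print(wv.most_similar(some_word, topn=5))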


Note: The gensim.models.word2vec.LineSentence examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors. For distribution and use, please follow the license of the corresponding project; do not reproduce without permission.