Python word2vec.LineSentence 类代码示例

本文整理汇总了 Python 中 gensim.models.word2vec.LineSentence 类的典型用法代码示例。如果您正苦于以下问题:Python word2vec.LineSentence 的具体用法?Python word2vec.LineSentence 怎么用?Python word2vec.LineSentence 使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 gensim.models.word2vec 的用法示例。


示例1: train_word2vec_by_word

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec_by_word():
    """Train a Word2Vec model on the segmented wiki corpus and export its vectors.

    The corpus is read from ``cut_zhwiki_wiki_parse.txt`` through
    ``LineSentence`` and the resulting word vectors are written in the
    non-binary word2vec format to ``w2v_model_wiki_word.vec``.
    """
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')

    corpus_path = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"  # defined but not used by the code below
    vectors_path = "w2v_model_wiki_word.vec"

    corpus = LineSentence(corpus_path)
    worker_count = multiprocessing.cpu_count()  # one worker per CPU
    # sg=1 together with hs=1 are the training switches chosen for this run.
    model = Word2Vec(
        corpus,
        size=300,
        window=10,
        min_count=1,
        sg=1,
        hs=1,
        iter=10,
        workers=worker_count,
    )

    model.wv.save_word2vec_format(vectors_path, binary=False)

示例2: train_fasttext

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """Train a FastText model with the given hyperparameters.

    Args:
        input_file: Input pre-processed wiki dump; it is wrapped in
            ``LineSentence`` before training.
        output_file: Output location intended for the trained model.
        skipgram: Layers of the model (0 - CBOW, 1 - Skipgram); forwarded
            as ``sg``.
        loss: Loss function (0 - Negative Sampling, 1 - Hierarchical Loss);
            forwarded as ``hs``.
        size: Embedding size (100 ~ 300).
        epochs: Number of epochs; forwarded as ``iter``.
    """
    sentence = LineSentence(input_file)

    # NOTE(review): nothing visible in this function writes the model to
    # ``output_file`` or returns it -- the save step may be missing from this
    # copy; confirm against the upstream source.
    model = FastText(sentence, sg=skipgram, hs=loss, size=size,
                     alpha=0.05, window=5, min_count=5, min_n=2,
                     max_n=5, workers=3, iter=epochs)


示例3: train_word2vec

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """Train a Word2Vec model with the given hyperparameters.

    Args:
        input_file: Input pre-processed wiki dump; it is wrapped in
            ``LineSentence`` before training.
        output_file: Output location intended for the trained model.
        skipgram: Layers of the model (0 - CBOW, 1 - Skipgram); forwarded
            as ``sg``.
        loss: Loss function (0 - Negative Sampling, 1 - Hierarchical Loss);
            forwarded as ``hs``.
        size: Embedding size (100 ~ 300).
        epochs: Number of epochs; forwarded as ``iter``.
    """
    sentence = LineSentence(input_file)

    # NOTE(review): nothing visible in this function writes the model to
    # ``output_file`` or returns it -- the save step may be missing from this
    # copy; confirm against the upstream source.
    model = Word2Vec(sentence, sg=skipgram, hs=loss,
                     size=size, alpha=0.05, window=5,
                     min_count=5, workers=3, iter=epochs)


示例4: build

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    """Train word vectors on the train/test sentences and pickle a word -> vector dict.

    Sentences are extracted from both segmented files, written to
    ``sentence_path``, and used to train a Word2Vec model whose vectors are
    stored in binary word2vec format at ``w2v_bin_path``. The vectors are then
    loaded back and saved as a dictionary to ``out_path``.
    """
    corpus_sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(corpus_sentences, sentence_path)

    # Fit the model on the sentence file that was just written.
    print('train w2v model...')
    line_corpus = LineSentence(sentence_path)
    w2v = Word2Vec(sg=1, sentences=line_corpus, size=256, window=5,
                   min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)

    # Reload the stored vectors and flatten them into a plain dict.
    vectors = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {token: vectors[token] for token in vectors.vocab}
    save_pkl(word_dict, out_path, overwrite=True)

示例5: bigrammer

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
    # NOTE(review): the signature above is cut off in this copy -- the
    # ``commonfile`` parameter (and its default, if any) plus the closing
    # ``):`` are missing. Restore them from the upstream source.
    """
    Learn a bigram model from a corpus and write the transformed corpus out.

    :param source_file: corpus path handed to ``LineSentence``
    :param outfile: path opened in append mode; each transformed sentence is
        written as space-joined tokens on its own line
    :param mincount: forwarded to ``Phrases`` as ``min_count``
    :param threshold: forwarded to ``Phrases`` as ``threshold``
    :param scoring: forwarded to ``Phrases`` as ``scoring``
    :param commonfile: path of a text file read line by line; the stripped
        lines form the ``common_terms`` set
    :return: ``len(bigrams.phrasegrams)``
    """
    # NOTE(review): the handle opened on ``commonfile`` is never closed explicitly.
    common = set([word.strip() for word in open(commonfile, 'r').readlines()])
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000, delimiter=b':::',
                                 progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)
    # NOTE(review): ``tempfile`` is opened for appending and no close() is
    # visible in this function.
    tempfile = open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    return len(bigrams.phrasegrams) 

示例6: train_model

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_model(corpus, size=200, window=5, workers=3, model_path=None,
                word_freq=None, corpus_count=None):
    """Train using Skipgram model.

    Args:
        corpus (str):       file path of corpus
        size (int):         embedding size (default=200)
        window (int):       window size (default=5)
        workers (int):      number of workers (default=3)
        model_path (str):   file path of model we want to update
        word_freq (dict):   dictionary of word frequencies
        corpus_count (int): corpus size

    Returns:
        Word2Vec: word2vec model
    """
    sentences = LineSentence(corpus)
    if model_path is not None:
        logger.info("Updating pre-existing model: %s", model_path)
        assert os.path.isfile(model_path), "File does not exist"
        model = Word2Vec.load(model_path)
        model.build_vocab(sentences, update=True)
        # NOTE(review): the ``model.train(`` call below is cut off in this copy
        # (no closing parenthesis), no ``else:`` is visible before the
        # ``Skipgram(`` construction, and that call is also left unclosed.
        # Restore the missing lines from the upstream source.
        model.train(sentences, total_examples=model.corpus_count,
        model = Skipgram(sentences=sentences, size=size, window=window,
                         min_count=1, workers=workers, raw_vocab=word_freq,
    return model 

示例7: train_word2vec_model

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec_model(text_file_path, model_file_path):
    """Train Word2Vec on a text file and store the vectors in binary word2vec format.

    The corpus at ``text_file_path`` is read through ``LineSentence``; the
    embedding size comes from ``global_config.embedding_size``. The trained
    vectors are written to ``model_file_path``.
    """
    logger.info("Loading input file and training mode ...")
    corpus = LineSentence(text_file_path)
    embedding_size = global_config.embedding_size
    model = Word2Vec(sentences=corpus, min_count=1, size=embedding_size)

    # Log a summary of the trained model.
    logger.info("Model Details: {}".format(model))

    # Persist the vectors.
    model.wv.save_word2vec_format(model_file_path, binary=True)
    logger.info("Model saved")

示例8: main

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def main():
    """Train a Word2Vec model with ``size=250`` on ``wiki_seg.txt`` with INFO logging enabled."""
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)

    corpus = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(corpus, size=250)

    # A stored model can be read back with:
    # model = word2vec.Word2Vec.load("your_model_name")

示例9: main

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def main():
    """Configure INFO-level logging, then fit Word2Vec (``size=250``) on ``wiki_seg.txt``."""
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )
    wiki_corpus = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(wiki_corpus, size=250)

    # To reuse a previously stored model instead:
    # model = word2vec.Word2Vec.load("your_model_name")

示例10: train_w2v

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_w2v(self, filename):
    """Train a Word2Vec model of dimension ``self.size`` on a corpus file.

    :param filename: path of the corpus file
    """
    # Load the corpus; it must be in "one text per line" format.
    sentences = word2vec.LineSentence(filename)
    print('正在训练w2v 针对语料:', str(filename))
    print('size is: ', self.size)
    # Train the model. The window parameter affects the result; values of
    # 5-100 are typical.
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
    savepath = '20w_size_win100_' + str(self.size) + '.model'  # path intended for the saved model
    # NOTE(review): the message below reports the model as saved, but no save
    # call is visible in this copy of the method -- confirm against upstream.
    # ``end=' '`` reproduces the trailing comma of the original Python 2 print.
    print('训练完毕,已保存: ', savepath, end=' ')

示例11: testLineSentenceWorksWithFilename

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithFilename(self):
    """LineSentence built from a filename yields the same tokens as the raw file lines."""
    reference_path = datapath('lee_background.cor')
    with utils.smart_open(reference_path) as reference:
        corpus = word2vec.LineSentence(datapath('lee_background.cor'))
        for tokens in corpus:
            expected = utils.to_unicode(reference.readline()).split()
            self.assertEqual(tokens, expected)

示例12: testLineSentenceWorksWithCompressedFile

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithCompressedFile(self):
    """LineSentence built from a BZ2 file object matches the uncompressed file line by line."""
    plain_path = datapath('head500.noblanks.cor')
    with utils.smart_open(plain_path) as reference:
        compressed = bz2.BZ2File(datapath('head500.noblanks.cor.bz2'))
        corpus = word2vec.LineSentence(compressed)
        for tokens in corpus:
            expected = utils.to_unicode(reference.readline()).split()
            self.assertEqual(tokens, expected)

示例13: testLineSentenceWorksWithNormalFile

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithNormalFile(self):
    """LineSentence built from an open file object matches the file read line by line."""
    with utils.smart_open(datapath('head500.noblanks.cor')) as reference, \
            utils.smart_open(datapath('head500.noblanks.cor')) as source:
        corpus = word2vec.LineSentence(source)
        for tokens in corpus:
            expected = utils.to_unicode(reference.readline()).split()
            self.assertEqual(tokens, expected)
#endclass TestWord2VecSentenceIterators 

示例14: word2vec_model

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def word2vec_model(blog_seg_path):
    """Return a Word2Vec model trained with 4 workers on the corpus at ``blog_seg_path``."""
    corpus = word2vec.LineSentence(blog_seg_path)
    return word2vec.Word2Vec(corpus, workers=4)

示例15: learn_embeddings

# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def learn_embeddings():
	"""Learn embeddings by optimizing the Skipgram objective using SGD.

	Walks are read from ``random_walks.txt`` through ``LineSentence``; the
	dimensions, window size, worker count and iteration count come from the
	module-level ``args`` object.
	"""
	logging.info("Initializing creation of the representations...")
	walks = LineSentence('random_walks.txt')
	# NOTE(review): the trained model is not saved or returned in the lines
	# visible here -- check whether the function continues in the upstream source.
	model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1, workers=args.workers, iter=args.iter)
	logging.info("Representations created.")
