本文整理汇总了Python中gensim.models.word2vec.LineSentence方法的典型用法代码示例。如果您正苦于以下问题:Python word2vec.LineSentence方法的具体用法?Python word2vec.LineSentence怎么用?Python word2vec.LineSentence使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块gensim.models.word2vec的用法示例。
在下文中一共展示了word2vec.LineSentence方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: train_word2vec_by_word
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec_by_word():
    """Train a word2vec model on a fixed corpus file and save it twice.

    The corpus is read from ``cut_zhwiki_wiki_parse.txt`` through
    ``LineSentence``. The trained model is written to
    ``w2v_model_wiki.model`` and its vectors are exported in text format to
    ``w2v_model_wiki_word.vec``. All paths are relative to the working
    directory.
    """
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    corpus_path = "cut_zhwiki_wiki_parse.txt"
    model_path = "w2v_model_wiki.model"
    vectors_path = "w2v_model_wiki_word.vec"

    print(multiprocessing.cpu_count())

    sentences = LineSentence(corpus_path)
    # sg=1 selects skip-gram; one worker thread is used per CPU core.
    hyperparams = {
        'size': 300,
        'window': 10,
        'min_count': 1,
        'sg': 1,
        'hs': 1,
        'iter': 10,
        'workers': multiprocessing.cpu_count(),
    }
    model = Word2Vec(sentences, **hyperparams)

    model.save(model_path)
    model.wv.save_word2vec_format(vectors_path, binary=False)
示例2: train_fasttext
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """Train a FastText model on a corpus file and save it.

    Args:
        input_file: Input pre-processed wiki dump, read via ``LineSentence``.
        output_file: Location the trained model is saved to.
        skipgram: Passed as ``sg`` (0 - CBOW, 1 - skip-gram).
        loss: Passed as ``hs`` (0 - negative sampling,
            1 - hierarchical softmax).
        size: Embedding size (100 ~ 300).
        epochs: Number of epochs, passed as ``iter``.
    """
    corpus = LineSentence(input_file)
    hyperparams = {
        'sg': skipgram,
        'hs': loss,
        'size': size,
        'alpha': 0.05,
        'window': 5,
        'min_count': 5,
        'min_n': 2,
        'max_n': 5,
        'workers': 3,
        'iter': epochs,
    }
    FastText(corpus, **hyperparams).save(output_file)
示例3: train_word2vec
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """Train a Word2Vec model on a corpus file and save it.

    Args:
        input_file: Input pre-processed wiki dump, read via ``LineSentence``.
        output_file: Location the trained model is saved to.
        skipgram: Passed as ``sg`` (0 - CBOW, 1 - skip-gram).
        loss: Passed as ``hs`` (0 - negative sampling,
            1 - hierarchical softmax).
        size: Embedding size (100 ~ 300).
        epochs: Number of epochs, passed as ``iter``.
    """
    corpus = LineSentence(input_file)
    hyperparams = {
        'sg': skipgram,
        'hs': loss,
        'size': size,
        'alpha': 0.05,
        'window': 5,
        'min_count': 5,
        'workers': 3,
        'iter': epochs,
    }
    Word2Vec(corpus, **hyperparams).save(output_file)
示例4: build
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    """Train skip-gram vectors on extracted sentences and pickle a word->vector dict.

    Steps: extract sentences from the train/test files, save them to
    ``sentence_path``, train a 256-dimensional Word2Vec model on that file,
    export the vectors in binary word2vec format to ``w2v_bin_path``, reload
    them, and store a ``{word: vector}`` dict at ``out_path`` via
    ``save_pkl``.
    """
    corpus = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(corpus, sentence_path)

    print('train w2v model...')
    trained = Word2Vec(
        sg=1,
        sentences=LineSentence(sentence_path),
        size=256,
        window=5,
        min_count=min_count,
        iter=40,
    )
    trained.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)

    # Reload the exported vectors and flatten them into a plain dict.
    vectors = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    embedding_by_word = {word: vectors[word] for word in vectors.vocab}
    save_pkl(embedding_by_word, out_path, overwrite=True)
示例5: bigrammer
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
              commonfile='common_tagged.txt'):
    """Detect bigrams in a corpus and append the bigrammed text to a file.

    :param source_file: path of the corpus, read via ``LineSentence``
    :param outfile: path of the file the transformed sentences are appended to
    :param mincount: passed to ``Phrases`` as ``min_count``
    :param threshold: passed to ``Phrases`` as ``threshold``
    :param scoring: passed to ``Phrases`` as ``scoring``
    :param commonfile: path of a file whose stripped lines are used as the
        ``common_terms`` set
    :return: number of entries in the trained phraser's ``phrasegrams``
    """
    # Read the common terms inside a context manager so the handle is closed.
    with open(commonfile, 'r') as common_fh:
        common = {line.strip() for line in common_fh}

    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000, delimiter=b':::',
                                 progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)

    # Append mode is kept from the original; the context manager guarantees
    # the output file is closed even if writing fails part-way through.
    with open(outfile, 'a') as out_fh:
        print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
        for sentence in bigrams[data]:
            out_fh.write(' '.join(sentence) + '\n')

    return len(bigrams.phrasegrams)
示例6: train_model
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_model(corpus, size=200, window=5, workers=3, model_path=None,
                word_freq=None, corpus_count=None):
    """Train a new Skipgram model, or update an existing saved model.

    Args:
        corpus (str): file path of corpus
        size (int): embedding size (default=200)
        window (int): window size (default=5)
        workers (int): number of workers (default=3)
        model_path (str): file path of a model to update; when None a new
            Skipgram model is built instead
        word_freq (dict): dictionary of word frequencies, passed as
            ``raw_vocab``
        corpus_count (int): corpus size

    Returns:
        Word2Vec: the updated model, or the newly built Skipgram model
    """
    sentences = LineSentence(corpus)

    # No existing model to update: build a fresh one and return right away.
    if model_path is None:
        return Skipgram(sentences=sentences, size=size, window=window,
                        min_count=1, workers=workers, raw_vocab=word_freq,
                        corpus_count=corpus_count)

    logger.info("Updating pre-existing model: %s", model_path)
    assert os.path.isfile(model_path), "File does not exist"
    existing = Word2Vec.load(model_path)
    existing.build_vocab(sentences, update=True)
    existing.train(sentences, total_examples=existing.corpus_count,
                   epochs=existing.iter)
    return existing
示例7: train_word2vec_model
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_word2vec_model(text_file_path, model_file_path):
    """Train Word2Vec on a text file and export the vectors in binary format.

    The embedding size comes from ``global_config.embedding_size``; the
    vectors are written to ``model_file_path``.
    """
    logger.info("Loading input file and training mode ...")
    corpus = LineSentence(text_file_path)
    embedding_size = global_config.embedding_size
    trained = Word2Vec(sentences=corpus, min_count=1, size=embedding_size)

    # Log a summary of the trained model.
    logger.info("Model Details: {}".format(trained))

    trained.wv.save_word2vec_format(model_file_path, binary=True)
    logger.info("Model saved")
示例8: main
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def main():
    """Train a 250-dimensional word2vec model on ``wiki_seg.txt`` and save it."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    corpus = word2vec.LineSentence("wiki_seg.txt")
    trained = word2vec.Word2Vec(corpus, size=250)

    # Save the model for later use. It can be loaded again with
    # word2vec.Word2Vec.load("your_model_name").
    trained.save(u"word2vec.model")
示例9: main
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def main():
    """Train a 250-dimensional word2vec model on ``wiki_seg.txt`` and save it."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    corpus = word2vec.LineSentence("wiki_seg.txt")
    trained = word2vec.Word2Vec(corpus, size=250)

    # Save the model for later use. It can be loaded again with
    # word2vec.Word2Vec.load("your_model_name").
    trained.save("word2vec.model")
示例10: train_w2v
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def train_w2v(self, filename):
    """Train a word2vec model on a corpus file and save it.

    The model is saved to ``'20w_size_win100_<size>.model'`` in the working
    directory, where ``<size>`` is ``self.size``.

    :param filename: path of the corpus; it must hold one text per line
    :return: None
    """
    # Load the corpus; it is required to be in "one text per line" format.
    sentences = word2vec.LineSentence(filename)
    print('正在训练w2v 针对语料:', str(filename))
    print('size is: ', self.size)

    # Train the model. The window parameter affects the result
    # (typically somewhere in 5-100).
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)

    # Path the model is saved to.
    savepath = '20w_size_win100_' + str(self.size) + '.model'
    model.save(savepath)
    # Report success only after the save has actually completed.
    print('训练完毕,已保存: ', savepath)
示例11: testLineSentenceWorksWithFilename
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithFilename(self):
    """Check that LineSentence, given a filename, yields each line's tokens."""
    corpus_path = datapath('lee_background.cor')
    with utils.smart_open(corpus_path) as orig:
        # Walk the reference file in lockstep with the sentences produced.
        for tokens in word2vec.LineSentence(corpus_path):
            expected = utils.to_unicode(orig.readline()).split()
            self.assertEqual(tokens, expected)
示例12: testLineSentenceWorksWithCompressedFile
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithCompressedFile(self):
    """Check that LineSentence accepts a bz2 file object as its source."""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        compressed = bz2.BZ2File(datapath('head500.noblanks.cor.bz2'))
        # Compare each sentence with the matching line of the plain file.
        for tokens in word2vec.LineSentence(compressed):
            expected = utils.to_unicode(orig.readline()).split()
            self.assertEqual(tokens, expected)
示例13: testLineSentenceWorksWithNormalFile
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def testLineSentenceWorksWithNormalFile(self):
    """Check that LineSentence accepts an open file object, not just a filename."""
    corpus_path = datapath('head500.noblanks.cor')
    # Two independent handles on the same file: one is the reference, the
    # other is consumed by LineSentence.
    with utils.smart_open(corpus_path) as orig, utils.smart_open(corpus_path) as fin:
        for tokens in word2vec.LineSentence(fin):
            expected = utils.to_unicode(orig.readline()).split()
            self.assertEqual(tokens, expected)
#endclass TestWord2VecSentenceIterators
示例14: word2vec_model
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def word2vec_model(blog_seg_path):
    """Train a Word2Vec model with four workers on the given file and return it."""
    corpus = word2vec.LineSentence(blog_seg_path)
    return word2vec.Word2Vec(corpus, workers=4)
示例15: learn_embeddings
# 需要导入模块: from gensim.models import word2vec [as 别名]
# 或者: from gensim.models.word2vec import LineSentence [as 别名]
def learn_embeddings():
    """Learn embeddings from ``random_walks.txt`` and write them to ``args.output``.

    The model is trained with ``sg=1`` (skip-gram) and ``hs=1``. Dimensions,
    window size, worker count and iteration count are read from the
    module-level ``args``.
    """
    logging.info("Initializing creation of the representations...")

    corpus = LineSentence('random_walks.txt')
    hyperparams = {
        'size': args.dimensions,
        'window': args.window_size,
        'min_count': 0,
        'hs': 1,
        'sg': 1,
        'workers': args.workers,
        'iter': args.iter,
    }
    trained = Word2Vec(corpus, **hyperparams)
    trained.wv.save_word2vec_format(args.output)

    logging.info("Representations created.")