本文整理汇总了Python中gensim.models.Doc2Vec类的典型用法代码示例。如果您正苦于以下问题:Python models.Doc2Vec的具体用法?Python models.Doc2Vec怎么用?Python models.Doc2Vec使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块gensim.models的用法示例。
在下文中一共展示了models.Doc2Vec方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: train_word2vec_model
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def train_word2vec_model(df, columns):
    """Train a word2vec model over ``columns`` of ``df`` and save it to disk.

    Most hyper-parameters are read from the project ``config`` module; the
    ``sg``, ``hs`` and ``sample`` values are hard-coded here. The output file
    name encodes the embedding size and the minimum word count.

    :param df: data frame handed to ``DataFrameWord2Vec``.
    :param columns: columns of ``df`` handed to ``DataFrameWord2Vec``.
    """
    hyper_params = dict(
        alpha=config.EMBEDDING_ALPHA,
        learning_rate_decay=config.EMBEDDING_LEARNING_RATE_DECAY,
        n_epoch=config.EMBEDDING_N_EPOCH,
        sg=1,
        hs=1,
        min_count=config.EMBEDDING_MIN_COUNT,
        size=config.EMBEDDING_DIM,
        sample=0.001,
        window=config.EMBEDDING_WINDOW,
        workers=config.EMBEDDING_WORKERS,
    )
    output_dir = config.WORD2VEC_MODEL_DIR
    # File name carries the two parameters that distinguish saved models.
    name_fields = (hyper_params["size"], hyper_params["min_count"])
    output_name = "Homedepot-word2vec-D%d-min_count%d.model" % name_fields

    trainer = DataFrameWord2Vec(df, columns, hyper_params)
    trainer.train()
    trainer.save(output_dir, output_name)
#---------------------- Doc2Vec ----------------------
示例2: learn_non_pooled_embeddings
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def learn_non_pooled_embeddings(walks, counts, args):
    """
    Fit a Doc2Vec model on the pre-processed walks and save the embedding.

    :param walks: Linear vertex sequences.
    :param counts: Number of nodes.
    :param args: Arguments; ``dimensions``, ``alpha``, ``iter`` and
        ``workers`` are read here.
    """
    documents = process_non_pooled_model_data(walks, counts, args)
    # NOTE(review): ``size`` and ``iter`` are the older gensim keyword names;
    # confirm they match the gensim version this project pins.
    doc2vec_options = dict(
        size=args.dimensions,
        window=0,
        dm=0,
        alpha=args.alpha,
        iter=args.iter,
        workers=args.workers,
    )
    embedding_model = Doc2Vec(documents, **doc2vec_options)
    save_embedding(args, embedding_model, counts)
示例3: __init__
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def __init__(self, df, columns, model_param):
    """Initialise the base class and create an untrained Doc2Vec model.

    :param df: forwarded unchanged to the parent initialiser.
    :param columns: forwarded unchanged to the parent initialiser.
    :param model_param: forwarded unchanged to the parent initialiser; the
        model settings are then read back from ``self.model_param``
        (presumably stored there by the parent -- verify).
    """
    super().__init__(df, columns, model_param)
    settings = self.model_param
    # ``min_alpha`` is deliberately given the same value as ``alpha``.
    self.model = Doc2Vec(
        dm=settings["dm"],
        hs=settings["hs"],
        alpha=settings["alpha"],
        min_alpha=settings["alpha"],
        min_count=settings["min_count"],
        size=settings["size"],
        sample=settings["sample"],
        window=settings["window"],
        workers=settings["workers"],
    )
示例4: doc2vec
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def doc2vec(corpus_fname, output_fname):
    """Train a Doc2Vec model with 100-dimensional vectors and save it.

    :param corpus_fname: corpus location wrapped by ``Doc2VecInput``.
    :param output_fname: path the trained model is saved to; it is first
        passed to ``make_save_path``.
    """
    make_save_path(output_fname)
    trained_model = Doc2Vec(Doc2VecInput(corpus_fname), vector_size=100)
    trained_model.save(output_fname)
示例5: extract_instances
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def extract_instances(self, train_instances):
    """Build kernel features from Doc2Vec vectors of each sentence pair.

    Both sentences of every instance are tagged (``sa_<idx>`` / ``sb_<idx>``),
    a Doc2Vec model is trained on all of them, and the two learned document
    vectors of each pair are passed to ``vk.get_all_kernel``.

    :param train_instances: sized iterable of instances exposing
        ``get_word(type=..., lower=...)`` that returns the two word lists.
    :return: ``(features, infos)`` where ``features`` holds one kernel
        feature per instance and ``infos`` holds one empty list per instance.
    """
    tagged_sentences = []
    for position, instance in enumerate(train_instances):
        first_words, second_words = instance.get_word(type='lemma', lower=True)
        tagged_sentences.extend([
            TaggedDocument(words=first_words, tags=['sa_%d' % position]),
            TaggedDocument(words=second_words, tags=['sb_%d' % position]),
        ])

    model = Doc2Vec(tagged_sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

    features = []
    for position in range(len(train_instances)):
        first_vec = model.docvecs['sa_%d' % position]
        second_vec = model.docvecs['sb_%d' % position]
        # The second value returned by the kernel helper is not used.
        feature, _ = vk.get_all_kernel(first_vec, second_vec)
        features.append(feature)

    # One fresh, empty info list per instance.
    infos = [[] for _ in features]
    return features, infos
# def load_instances(self, train_instances):
# """
# extract cosine distance from already trained feature file
# without modify the feature_file
# this function's priority is higher that the above extract_instances
# """
#
# _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
# features = []
# infos = []
# ''' get features from train instances'''
# for _feature in _features:
# feature = Feature._feat_string_to_list(_feature, _n_dim)
# features.append([feature[1]])
# infos.append(['cosine'])
#
# features = [ Feature._feat_list_to_string(feature) for feature in features ]
#
# return features, 1, _n_instance
示例6: __init__
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def __init__(self,
             analyzer=None, matching=None,
             name=None,
             verbose=0,
             n_epochs=10,
             alpha=0.25,
             min_alpha=0.05,
             n_jobs=4,
             **kwargs):
    """Configure the paragraph-vector retrieval model.

    :param analyzer: stored as ``self.analyzer``.
    :param matching: ``True`` creates a default ``Matching``; ``False`` or
        ``None`` disables matching; any other value is converted with
        ``dict(...)`` and used as keyword arguments for ``Matching``.
    :param name: display name; defaults to ``"paragraph-vectors"``.
    :param verbose: verbosity level stored on the instance.
    :param n_epochs: number of epochs stored on the instance.
    :param alpha: initial learning rate, stored and given to ``Doc2Vec``.
    :param min_alpha: stored on the instance only; the ``Doc2Vec`` model
        itself is created with ``min_alpha`` equal to ``alpha``.
    :param n_jobs: passed to ``Doc2Vec`` as ``workers``.
    :param kwargs: forwarded to ``NearestNeighbors``.
    """
    self.alpha = alpha
    self.min_alpha = min_alpha
    self.verbose = verbose
    self.name = name if name is not None else "paragraph-vectors"

    if matching is None or matching is False:
        self._matching = None
    elif matching is True:
        self._matching = Matching()
    else:
        self._matching = Matching(**dict(matching))

    self.analyzer = analyzer

    doc2vec_options = dict(
        alpha=alpha,
        min_alpha=alpha,
        size=500,
        window=8,
        min_count=1,
        sample=1e-5,
        workers=n_jobs,
        negative=20,
        # Original author's open question: are words only used with dm != 0?
        dm=0,
        dbow_words=1,
        # Original author's note: dm_mean is unused when in concat mode.
        dm_mean=0,
        dm_concat=1,
        dm_tag_count=1,
    )
    self.model = Doc2Vec(**doc2vec_options)
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
示例7: fit
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def fit(self, docs, y):
    """Train the Doc2Vec model on ``docs`` tagged with the labels in ``y``.

    The learning rate is lowered by a fixed step after every epoch, and
    ``min_alpha`` is pinned to the current ``alpha`` so that no decay is
    applied inside a single epoch. Afterwards either the matching component
    is fitted on the raw documents or, when matching is disabled, the
    nearest-neighbour index is fitted on the learned document vectors.

    :param docs: documents; each one is passed through ``self.analyzer``.
    :param y: one tag per document, same length as ``docs``.
    :return: ``self``.
    """
    assert len(docs) == len(y)
    model, n_epochs, verbose = self.model, self.n_epochs, self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs

    tagged_docs = []
    for doc, label in zip(docs, y):
        tagged_docs.append(TaggedDocument(self.analyzer(doc), [label]))

    if verbose > 0:
        print("First 3 tagged documents:\n", tagged_docs[:3])
        print("Training doc2vec model")

    model.build_vocab(tagged_docs)
    for epoch_number in range(1, n_epochs + 1):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch_number, n_epochs))
        # NOTE(review): newer gensim releases expect ``total_examples`` and
        # ``epochs`` arguments here -- confirm against the pinned version.
        model.train(tagged_docs)
        # Global decay between epochs, none within an epoch.
        model.alpha -= decay
        model.min_alpha = model.alpha

    if verbose > 0:
        print("Finished.")
        print("model:", self.model)

    if not self._matching:
        # Without matching it is enough to fit the nearest-neighbour index
        # on all document vectors once, before query time.
        doc_vectors = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(doc_vectors)
    else:
        self._matching.fit(docs)

    self._y = y
    return self
示例8: __init__
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def __init__(self, docs: DocumentSequence, pretrained_word2vec=None):
    """
    Entry point to several ways of computing document embeddings.

    Embedding mechanisms listed by the original author:
        Doc2Vec:                    see self.get_doc2vec()
        Naive Doc2Vec:              see self.get_naive_doc2vec()
        One-Hot Sum:                see self.get_onehot()
        Attention is all you need:  to be implemented
        FastText:                   to be implemented

    :param docs: a DocumentSequence instance
    :param pretrained_word2vec: path to pretrained word2vec model, in .bin format
    """
    self.docs, self.pretrained = docs, pretrained_word2vec
示例9: _set_doc2vec
# 需要导入模块: from gensim import models [as 别名]
# 或者: from gensim.models import Doc2Vec [as 别名]
def _set_doc2vec(self, vector_size=300, window=5, min_count=5, dm=1, epochs=20):
    """Train a Doc2Vec model on ``self.docs`` and cache its document vectors.

    Sets ``self._d2v`` to the trained model and ``self._d2v_embedding`` to an
    array with one row per tagged document, ordered by integer tag index.

    :param vector_size: forwarded to ``Doc2Vec``.
    :param window: forwarded to ``Doc2Vec``.
    :param min_count: forwarded to ``Doc2Vec``.
    :param dm: forwarded to ``Doc2Vec``.
    :param epochs: forwarded to ``Doc2Vec`` and to ``train``.
    """
    # NOTE(review): ``pretrained`` is not a keyword of stock gensim
    # ``Doc2Vec``; presumably this relies on a patched build -- confirm.
    self._d2v = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, dm=dm, epochs=epochs,
                        pretrained=self.pretrained)
    # build vocabulary from corpus
    self._d2v.build_vocab(self.docs.get_tagged())
    # No corpus was passed to the constructor, so training has to be
    # started explicitly.
    self._d2v.train(self.docs.get_tagged(), total_examples=self._d2v.corpus_count, epochs=epochs)
    # List document embeddings by order of their tags. ``np.stack`` needs a
    # real sequence: generators were deprecated in NumPy 1.16 and are
    # rejected by newer releases, so build a list first.
    self._d2v_embedding = np.stack(
        [self._d2v.docvecs[index] for index in range(len(self.docs.get_tagged()))]
    )