This article collects typical usage examples of the Doc2Vec.load method from Python's gensim.models.doc2vec module. If you are wondering what exactly Doc2Vec.load does and how to use it, the curated code examples here may help. You can also browse further usage examples of the class the method belongs to, gensim.models.doc2vec.Doc2Vec.
The following presents 15 code examples of the Doc2Vec.load method, sorted by popularity by default.
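Before diving into the examples, here is a minimal, self-contained sketch of the save/load round trip that all of them rely on. The corpus, file name, and hyperparameter values below are illustrative only, and the size/iter parameter names follow the older gensim releases these snippets target (renamed vector_size/epochs in gensim 4.x):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# toy corpus: every document needs at least one unique tag for later lookup
corpus = [TaggedDocument(words=['human', 'machine', 'interface'], tags=['doc0']),
          TaggedDocument(words=['graph', 'minors', 'survey'], tags=['doc1'])]

model = Doc2Vec(corpus, size=50, min_count=1, iter=10)  # train a tiny model
model.save('toy.d2v')                                   # persist the full model to disk

model = Doc2Vec.load('toy.d2v')                         # restore it, ready for inference or more training
print(model.docvecs['doc0'])                            # look up a document vector by tag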
Example 1: setUp
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def setUp(self):
    filename = datapath("alldata-id-10.txt")
    train_docs = read_sentiment_docs(filename)
    self.train_docs = train_docs
    self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
    self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")
    self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
    self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
Example 2: do_command
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def do_command(args):
    # Load data
    data = load_data(args.input)
    # ids, documents = zip(*data)
    data = [(id, tokenize(doc)) for id, doc in data]
    ids = [id for id, _ in data]
    if not os.path.exists(args.modelfile):
        model = embed_documents(data)
        # Save model
        model.save(args.modelfile)
    else:
        model = Doc2Vec.load(args.modelfile)
        # map(model.infer_tokens, tokenized)
        print("Loaded model.")
    # Do k-nearest neighbors search.
    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(["id1", "id2", "score"])
    count = int(args.count) if args.count > 0 else len(model.docvecs)
    vectors = np.array([model.docvecs[i] for i in range(count)])
    del model  # clear up memory
    for i, j, score in find_nearest_neighbors(vectors):
        id1, id2 = ids[i], ids[j]
        writer.writerow([id1, id2, score])
Example 3: __init__
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def __init__(self, sentences, name, dataset_name, epochs=1, dimension=50, modelfile=None):
    self.inner_model = None
    # parameters
    self.dataset = dataset_name
    self.sentences = sentences
    self.name = name
    self.epochs = epochs
    self.dimension = dimension
    # data file path
    models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
    if modelfile is not None:
        filename = modelfile
    else:
        filename = "DOC2VEC_%s_%s_%s_%s" % (self.dataset, self.name, self.epochs, self.dimension)
    self.filepath = os.path.join(models_folder, filename)
    model_exists = os.path.isfile(self.filepath)
    # load existing model from disk, or train an initial one
    if model_exists:
        logging.info("found data file %s" % (self.filepath,))
        self.inner_model = Doc2Vec.load(self.filepath)
    else:
        self.inner_model = Doc2Vec(sentences, size=self.dimension)
        print(self.inner_model.vocab.keys())
        self.inner_model.save(fname=self.filepath)
Example 4: test_category
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy', 'sad'))
Example 5: __init__
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def __init__(self, size=300, window=8, min_count=2, workers=8, path_to_model=None, stream_train=False):
    '''
    Initializes the Doc2Vec_Wrapper class.

    Args:
        size (int): Specifies the size of the feature vector. Defaults to 300
        window (int): Specifies the size of the context window from which the feature vector is learned
        min_count (int): Specifies the minimum number of instances of each word that is saved in the model
        workers (int): number of parallel processes
        path_to_model (str): Specifies a saved model on disk
        stream_train (bool): If True, update word vectors with new sentences. If False, just get doc vecs
    '''
    self.stream_train = stream_train
    self.is_trained = False
    self.model = None
    ## if a path is passed, try to load from disk; otherwise, retrain anyway
    if path_to_model:
        try:
            self.model = Doc2Vec.load(path_to_model)
            self.is_trained = True  # only mark as trained once the load has succeeded
        except Exception:
            pass
    ## params for Doc2Vec
    self.size = size  ## size of the vector
    self.window = window  ## size of the context window
    self.min_count = min_count  ## minimum count of vocab to store in binary tree
    self.workers = workers  ## number of parallel processes == number of cores on the computer
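A hypothetical caller of this wrapper might look as follows. The class name Doc2Vec_Wrapper is taken from the docstring above; the model path and tag are made up for illustration:

wrapper = Doc2Vec_Wrapper(size=300, window=8, path_to_model='models/reviews.d2v')
if wrapper.is_trained:
    # the underlying gensim model is exposed as wrapper.model
    print(wrapper.model.docvecs.most_similar('some_tag', topn=5))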
Example 6: load_external
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def load_external(self, model_file_name):
    """
    Load a doc2vec model from the file specified.
    :param model_file_name: name of the model file
    :return:
    """
    self.model = Doc2Vec.load(model_file_name)
Example 7: varify
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def varify():
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('data/doc2vec.d2v')
    documents = pickle.load(open('data/fedcorpus.pick', 'rb'))  # pickle files must be read in binary mode
    for i in range(3):
        inferred_docvec = model.infer_vector(documents[i].words)
        print(documents[i].tags)
        print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
Example 8: get_model
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C binary format
    return model
Example 9: create_and_train_models_d2vec
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def create_and_train_models_d2vec(tag, cores=6):
    """
    Build vocabulary and train models.
    :param tag: small or big
    :param cores: number of cores
    :return: the current models
    """
    simple_models = get_models_d2vec(cores)
    model_files = get_models_filename_d2vec(tag)
    if all([os.path.exists(file) for file in model_files]):
        print('Models exist, loading...')
        for i, fname in enumerate(model_files):
            simple_models[i] = Doc2Vec.load(fname)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        return models_by_name
    else:
        print('Building models...')
        voc_model = build_vocab_d2vec(tag, cores)
        # Share vocabulary between models
        for model in simple_models:
            model.reset_from(voc_model)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        print('Training models...')
        print("START %s" % datetime.datetime.now())
        best_error = defaultdict(lambda: 1.0)  # to selectively print only the best errors achieved
        alpha, min_alpha, passes = (0.025, 0.001, 20)
        alpha_delta = (alpha - min_alpha) / passes
        file = x_train_str.format(tag)
        x_train = pd.read_hdf(file)
        train_list = x_train.tolist()
        for epoch in range(passes):
            shuffle(train_list)  # shuffling gets best results
            for name, train_model in models_by_name.items():
                # train with a fixed learning rate for this pass
                train_model.alpha, train_model.min_alpha = alpha, alpha
                with elapsed_timer() as elapsed:
                    train_model.train(CorpusStream(train_list, 'train'), total_examples=train_model.corpus_count,
                                      epochs=train_model.iter)
                duration = '%.1f' % elapsed()
            print('completed pass %i at alpha %f' % (epoch + 1, alpha))
            alpha -= alpha_delta
        print("END %s" % str(datetime.datetime.now()))
        for name, model in models_by_name.items():
            name = name.replace('/', '').replace(',', '_')
            model.save('models/{0}_{1}.m'.format(name, tag))
        return models_by_name
Example 10: get_WordVector_matrix
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def get_WordVector_matrix(label):
    model = Doc2Vec.load('./WordVector_model.d2v')
    size = len(label)
    vectors = np.zeros((size, depth))
    for i in range(size):
        try:
            doc_vector = model.docvecs[str(i)]
            vectors[i] = doc_vector  # store the whole vector, not just its first component
        except KeyError:
            print(str(i) + ' raised a KeyError')
    return [list(v) for v in vectors]
Example 11: test_models
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def test_models(FULL_SIM, models_files):
    test_papers = pd.read_csv(TEST_FILEPATH)
    # NOTE: only needed for testing with AII:
    keywords_docsrels = populate_iks_dict()
    authorities = initialize_authorities()
    for mod_f in models_files:
        print('Testing ' + mod_f)
        model = Doc2Vec.load(mod_f)
        print('Model loaded.')
        test_model(FULL_SIM, model, test_papers, keywords_docsrels, authorities)
Example 12: build_model
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def build_model(x_train, x_test, iteration=5, save=True):
    if save:
        big_list = x_train + x_test
        model = Doc2Vec(min_count=2, window=10, size=100, sample=1e-4, negative=5, workers=8)
        model.build_vocab(big_list)
        for i in range(iteration):
            model.train(big_list)
        print('saving model to file...')
        model.save('./sentim.d2v')
    else:
        print('loading model from file...')
        model = Doc2Vec.load('./sentim.d2v')
    return model
Example 13: datacluster
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def datacluster(data):
    infered_vectors_list = []
    print("load model...")
    model_dm = Doc2Vec.load(model_path)
    print("infer train vectors...")
    for text, label in data:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)
    '''
    print("Check the optimal parameter...")
    Nc = range(1, 50)
    pca_data = [PCA(n_components=i).fit(infered_vectors_list).transform(infered_vectors_list) for i in Nc]
    kmeans = cluster.KMeans(init='k-means++', n_clusters=20, max_iter=300)
    score = [kmeans.fit(pca_data[i]).score(pca_data[i]) for i in range(len(pca_data))]
    print(score)
    plt.plot(Nc, score)
    plt.xlabel('PCA components')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
    '''
    print("PCA decomposition...")
    pca = PCA(n_components=10).fit(infered_vectors_list)
    pca_data = pca.transform(infered_vectors_list)
    print("train K-Means model...")
    kmean_model = cluster.KMeans(init='k-means++', n_clusters=16, max_iter=300)
    kmean_model.fit(pca_data)
    # get the cluster assignment of each sample
    result = kmean_model.fit_predict(pca_data)
    print("Predicting result:", result)
    # save the clustering model
    joblib.dump(kmean_model, cluster_path)
    # load the clustering model
    # new_km = joblib.load(cluster_path)
    numSamples = len(pca_data)
    print(numSamples)
    centroids = kmean_model.labels_
    # print(centroids, type(centroids))  # show the cluster label of each sample
    # print(kmean_model.inertia_)  # show the clustering quality (inertia)
    '''
    marker = ['o', '.', ',', 'x', '*', 'd', 's', 'p']
    color = ['r', 'g', 'b', 'c', 'm', 'k', 'y', 'w']
    for i in range(numSamples):
        plt.scatter(pca_data[i][0], pca_data[i][1],
                    marker=marker[centroids[i]], color=color[centroids[i]])
    plt.show()
    '''
    return centroids
Example 14: get_vec
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def get_vec(vector_file, id_file, w_file):
    p2v = Doc2Vec.load(vector_file)
    fout = open(w_file, "w")
    index = 0
    with open(id_file) as f:
        for line in f:
            index += 1
            if index % 1000 == 0:
                logging.info("%d cases" % index)
            line = line.strip()
            vec = p2v.docvecs[line]
            line_w = line + "\t" + "\t".join([str(x) for x in vec]) + "\t" + "\n"
            fout.write(line_w)
    fout.close()
Example 15: main
# Required import: from gensim.models.doc2vec import Doc2Vec [as alias]
# Or: from gensim.models.doc2vec.Doc2Vec import load [as alias]
def main():
    # load data set
    training_reviews = load_dataset(TRAIN_FILE)
    testing_reviews = load_dataset(TEST_FILE)
    # load doc2vec model
    doc2vec_model = Doc2Vec.load(DOC2VEC_MODEL)
    cate_index = get_all_categories(training_reviews)
    cates = dict2list(cate_index)
    n_cates = len(cates)
    train_X = get_X(training_reviews, doc2vec_model)
    test_X = get_X(testing_reviews, doc2vec_model)
    train_labels = get_labels(training_reviews, cate_index)
    test_labels = get_labels(testing_reviews, cate_index)
    labelwise_acc = []
    labelwise_output = []
    for cate in range(n_cates):
        # train a binary model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)
        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')
        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)
    # evaluation
    p, r, f = microF1(labelwise_output, test_labels)
    # output
    out_dir = "../data/use_doc2vec/"
    out_file = out_dir + "laptop.txt"
    labelwise_acc = [(cates[i], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x: x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0], labelwise_acc[cate_i][1]))