This article collects typical usage examples of the Python class gensim.models.doc2vec.Doc2Vec. If you are wondering what the Doc2Vec class is for, how to use it, or simply want working examples, the curated code samples below should help.
The following sections show 15 code examples of the Doc2Vec class, ordered by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python examples.
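Before the individual examples, here is a minimal sketch of the workflow they all build on: train on TaggedDocument objects, save, reload with Doc2Vec.load, and infer vectors for new text. The corpus, file name, and vector_size value are illustrative assumptions, and the parameter names follow gensim 4.x (older releases use size instead of vector_size and model.docvecs instead of model.dv).

# Minimal Doc2Vec workflow sketch (gensim 4.x naming assumed; corpus and paths are made up).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [
    TaggedDocument(words=["human", "machine", "interface"], tags=["doc_0"]),
    TaggedDocument(words=["graph", "of", "trees"], tags=["doc_1"]),
]

model = Doc2Vec(corpus, vector_size=50, window=5, min_count=1, epochs=20, workers=2)
model.save("example.d2v")            # persist to disk
model = Doc2Vec.load("example.d2v")  # reload, as most examples below do
vec = model.infer_vector(["human", "interface"])  # vector for an unseen document
print(len(vec))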
Example 1: setUp
def setUp(self):
    filename = datapath("alldata-id-10.txt")
    train_docs = read_sentiment_docs(filename)
    self.train_docs = train_docs
    self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
    self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")
    self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
    self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
Example 2: __init__
def __init__(self, size=300, window=8, min_count=2, workers=8, path_to_model=None, stream_train=False):
    '''
    Initializes the Doc2Vec_Wrapper class.

    Args:
        size (int): Size of the feature vector. Defaults to 300
        window (int): Size of the context window from which the feature vector is learned
        min_count (int): Minimum number of occurrences a word needs in order to be kept in the model
        workers (int): Number of parallel processes
        path_to_model (str): Path to a model on disk
        stream_train (bool): If True, update word vectors with new sentences. If False, just get doc vecs
    '''
    self.stream_train = stream_train
    self.is_trained = False
    self.model = None

    ## if a path is passed, try to load from disk. Otherwise, retrain anyway
    if path_to_model:
        try:
            self.model = Doc2Vec.load(path_to_model)
            self.is_trained = True
        except Exception:
            # fall back to an untrained wrapper if the model cannot be loaded
            pass

    ## params for Doc2Vec
    self.size = size              ## size of the vector
    self.window = window          ## size of the context window
    self.min_count = min_count    ## minimum count of vocab to store in binary tree
    self.workers = workers        ## number of parallel processes == number of cores on the machine
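A hypothetical usage of this constructor might look like the following; the path saved_model.d2v and the fallback message are assumptions for illustration, not part of the original project.

# Hypothetical usage of the Doc2Vec_Wrapper constructor shown above.
wrapper = Doc2Vec_Wrapper(size=300, window=8, min_count=2,
                          workers=8, path_to_model='saved_model.d2v')
if not wrapper.is_trained:
    print('No model on disk; a training step would be required here.')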
Example 3: __init__
def __init__(self, sentences, name, dataset_name, epochs=1, dimension=50, modelfile=None):
    self.inner_model = None

    # parameters
    self.dataset = dataset_name
    self.sentences = sentences
    self.name = name
    self.epochs = epochs
    self.dimension = dimension

    # data file path
    models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
    if modelfile is not None:
        filename = modelfile
    else:
        filename = "DOC2VEC_%s_%s_%s_%s" % (self.dataset, self.name, self.epochs, self.dimension)
    self.filepath = os.path.join(models_folder, filename)
    model_exists = os.path.isfile(self.filepath)

    # load an existing model, or train an initial one
    if model_exists:
        logging.info("found data file %s" % (self.filepath, ))
        self.inner_model = Doc2Vec.load(self.filepath)
    else:
        self.inner_model = Doc2Vec(sentences, size=self.dimension)
        print(self.inner_model.vocab.keys())
        self.inner_model.save(fname=self.filepath)
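Note that this snippet targets an older gensim API (size=, model.vocab, save(fname=...)). Under gensim 4.x, an assumed but common environment today, the else-branch would look roughly like this sketch:

# Rough gensim 4.x equivalent of the else-branch above (sketch, not the original code).
model = Doc2Vec(sentences, vector_size=50)   # 'size' was renamed to 'vector_size'
print(list(model.wv.key_to_index))           # 'model.vocab' is now exposed as 'model.wv.key_to_index'
model.save("/tmp/doc2vec_example.model")     # pass the path positionally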
Example 4: test_category
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy', 'sad'))
Example 5: do_command
def do_command(args):
    # Load data
    data = load_data(args.input)
    #ids, documents = zip(*data)
    data = [(id, tokenize(doc)) for id, doc in data]
    ids = [id for id, _ in data]
    if not os.path.exists(args.modelfile):
        model = embed_documents(data)
        # Save model
        model.save(args.modelfile)
    else:
        model = Doc2Vec.load(args.modelfile)
        #map(model.infer_tokens, tokenized)
    print("Loaded model.")

    # Do k-nearest neighbors search.
    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(["id1", "id2", "score"])

    count = int(args.count) if args.count > 0 else len(model.docvecs)
    vectors = np.array([model.docvecs[i] for i in range(count)])
    del model  # clear up memory

    for i, j, score in find_nearest_neighbors(vectors):
        id1, id2 = ids[i], ids[j]
        writer.writerow([id1, id2, score])
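The helper find_nearest_neighbors is defined elsewhere in that project. As a rough idea of what such a helper could do, here is a cosine-similarity sketch; the name, signature, and top_k parameter are assumptions, not the original implementation.

import numpy as np

def find_nearest_neighbors(vectors, top_k=1):
    """Yield (i, j, score) for each row's nearest neighbours by cosine similarity.
    Illustrative sketch only."""
    normed = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    sims = normed @ normed.T
    np.fill_diagonal(sims, -np.inf)  # ignore self-similarity
    for i, row in enumerate(sims):
        for j in np.argsort(row)[::-1][:top_k]:
            yield i, int(j), float(row[j])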
Example 6: load_external
def load_external(self, model_file_name):
    """
    Load a doc2vec model from the file specified.
    :param model_file_name: name of the model file
    :return:
    """
    self.model = Doc2Vec.load(model_file_name)
Example 7: varify
def varify():
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('data/doc2vec.d2v')
    documents = pickle.load(open('data/fedcorpus.pick', 'rb'))
    for i in range(3):
        inferred_docvec = model.infer_vector(documents[i].words)
        print(documents[i].tags)
        print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
Example 8: main
def main():
    """
    1. Divide the total dataset into several data bins by randomly extracting data entries with a given ratio.
    2. Run cross-validation for the given number of iterations in either SMOTE or non-SMOTE mode.
    3. Report and present statistical evaluations for each data bin.
    """
    stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = list(), list(), list()  # ns for non-SMOTE
    stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = list(), list(), list()  # ws for with SMOTE
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    print("Loading Doc2Vec model ...")
    model_doc2vec = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  # load pretrained vectors in word2vec binary format
    print("Doc2Vec model loading done!")
    models = {"SVC": sklearn.svm.SVC(),
              "Logit": sklearn.linear_model.LogisticRegression(),
              "DT": sklearn.tree.DecisionTreeClassifier(),
              "NBayes": sklearn.naive_bayes.GaussianNB(),
              "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
    model_chosen = "NBayes"
    print("Classifier Type:", model_chosen)
    for binIndex in range(NUM_OF_BINS):
        print("Experiment on DataSet#", str(binIndex))
        random.shuffle(data_pos)
        random.shuffle(data_neg)
        size_pos_bin, size_neg_bin = int(len(data_pos) * SAMPLE_SIZE_RATIO), int(len(data_neg) * SAMPLE_SIZE_RATIO)
        data_pos_bin, data_neg_bin = data_pos[:size_pos_bin], data_neg[:size_neg_bin]  # dataset bin
        sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = list(), list(), list()
        sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION):
            random.seed(iteration)
            random.shuffle(data_pos_bin)
            random.shuffle(data_neg_bin)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos_bin, data_neg_bin, model_doc2vec)  # convert to doc vectors
            print("non-SMOTE experiment")
            accuracys, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=False)  # cross validation
            sFscores_iter_ns.extend(Fscores)
            sRecalls_iter_ns.extend(recalls)
            sPrecisions_iter_ns.extend(precisions)
            print("with SMOTE experiment")
            accuracys, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=True)  # cross validation
            sFscores_iter_ws.extend(Fscores)
            sRecalls_iter_ws.extend(recalls)
            sPrecisions_iter_ws.extend(precisions)
        stats_Fscores_ns.append(sFscores_iter_ns)
        stats_precisions_ns.append(sPrecisions_iter_ns)
        stats_recalls_ns.append(sRecalls_iter_ns)
        stats_Fscores_ws.append(sFscores_iter_ws)
        stats_precisions_ws.append(sPrecisions_iter_ws)
        stats_recalls_ws.append(sRecalls_iter_ws)
    print("All Experiments Done!")
    save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns, stats_Fscores_ws, stats_recalls_ws,
               stats_precisions_ws, model_name=model_chosen)
    print("Statistics ready!")
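The helper feature_extraction_Doc2Vec is not part of the excerpt. Since load_word2vec_format only provides word vectors (no document tags to look up and no trained inference step), one plausible implementation is to average the word vectors of each tokenized document; the sketch below is an assumption, not the original author's code.

import numpy as np

def feature_extraction_Doc2Vec(data_pos, data_neg, model, dim=300):
    """Illustrative sketch only: average pretrained word vectors per document."""
    def doc_vector(tokens):
        # assumes dict-style word lookup, as the old gensim API allowed
        vecs = [model[w] for w in tokens if w in model]
        return np.mean(vecs, axis=0) if vecs else np.zeros(dim)
    return [doc_vector(d) for d in data_pos], [doc_vector(d) for d in data_neg]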
Example 9: get_model
def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C binary format
    return model
Example 10: create_and_train_models_d2vec
def create_and_train_models_d2vec(tag, cores=6):
    """
    Build vocabulary and train models.
    :param tag: small or big
    :param cores: number of cores
    :return: the current models
    """
    simple_models = get_models_d2vec(cores)
    model_files = get_models_filename_d2vec(tag)
    if all([os.path.exists(file) for file in model_files]):
        print('Models exist, loading...')
        for i, fname in enumerate(model_files):
            simple_models[i] = Doc2Vec.load(fname)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        return models_by_name
    else:
        print('Building models...')
        voc_model = build_vocab_d2vec(tag, cores)

        # Share vocabulary between models
        for model in simple_models:
            model.reset_from(voc_model)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)

        print('Training models...')
        print("START %s" % datetime.datetime.now())
        best_error = defaultdict(lambda: 1.0)  # to selectively print only the best errors achieved
        alpha, min_alpha, passes = (0.025, 0.001, 20)
        alpha_delta = (alpha - min_alpha) / passes
        file = x_train_str.format(tag)
        x_train = pd.read_hdf(file)
        train_list = x_train.tolist()
        for epoch in range(passes):
            shuffle(train_list)  # shuffling gets best results
            for name, train_model in models_by_name.items():
                # train
                duration = 'na'
                train_model.alpha, train_model.min_alpha = alpha, alpha
                with elapsed_timer() as elapsed:
                    train_model.train(CorpusStream(train_list, 'train'), total_examples=train_model.corpus_count,
                                      epochs=train_model.iter)
                    duration = '%.1f' % elapsed()
            print('completed pass %i at alpha %f' % (epoch + 1, alpha))
            alpha -= alpha_delta
        print("END %s" % str(datetime.datetime.now()))
        for name, model in models_by_name.items():
            name = name.replace('/', '').replace(',', '_')
            model.save('models/{0}_{1}.m'.format(name, tag))
        return models_by_name
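The manual alpha/min_alpha schedule and the multiple shuffled passes above reflect an older gensim training idiom. With a recent gensim release (4.x assumed), the learning-rate decay is handled internally and a single train call is usually enough, as in this sketch:

# Sketch of the simpler modern training pattern (gensim 4.x assumed; 'corpus' is an
# iterable of TaggedDocument objects, not the original CorpusStream class).
model = Doc2Vec(vector_size=100, min_count=2, workers=cores)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=20)  # alpha decays internally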
Example 11: get_WordVector_matrix
def get_WordVector_matrix(label):
    model = Doc2Vec.load('./WordVector_model.d2v')
    size = len(label)
    vectors = np.zeros((size, depth))
    for i in range(size):
        try:
            doc_vector = model.docvecs[str(i)]
            vectors[i] = doc_vector[0]
        except KeyError:
            print(str(i) + ' occurs KeyError')
    return list(map(list, vectors))
Example 12: test_models
def test_models(FULL_SIM, models_files):
    test_papers = pd.read_csv(TEST_FILEPATH)
    # NOTE: Only needed for testing with AII:
    keywords_docsrels = populate_iks_dict()
    authorities = initialize_authorities()
    for mod_f in models_files:
        print('Testing ' + mod_f)
        model = Doc2Vec.load(mod_f)
        print('Model loaded.')
        test_model(FULL_SIM, model, test_papers, keywords_docsrels, authorities)
Example 13: build_model
def build_model(x_train, x_test, iteration=5, save=True):
    if save:
        big_list = x_train + x_test
        model = Doc2Vec(min_count=2, window=10, size=100, sample=1e-4, negative=5, workers=8)
        model.build_vocab(big_list)
        for i in range(iteration):
            model.train(big_list)
        print('saving model to file.....')
        model.save('./sentim.d2v')
    else:
        print('loading model from file.....')
        model = Doc2Vec.load('./sentim.d2v')
    return model
Example 14: get_vec
def get_vec(vector_file, id_file, w_file):
    p2v = Doc2Vec.load(vector_file)
    fout = open(w_file, "w")
    index = 0
    with open(id_file) as f:
        for line in f:
            index += 1
            if index % 1000 == 0:
                logging.info("%d cases" % index)
            line = line.strip()
            vec = p2v.docvecs[line]
            line_w = line + "\t" + "\t".join([str(x) for x in vec]) + "\t" + "\n"
            fout.write(line_w)
    fout.close()
Example 15: datacluster
def datacluster(data):
    infered_vectors_list = []
    print("load model...")
    model_dm = Doc2Vec.load(model_path)
    print("load train vectors...")
    for text, label in data:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)
    '''
    print("Check the optimized parameter...")
    Nc = range(1, 50)
    pca_data = [PCA(n_components=i).fit(infered_vectors_list).transform(infered_vectors_list) for i in Nc]
    kmeans = cluster.KMeans(init='k-means++', n_clusters=20, max_iter=300)
    score = [kmeans.fit(pca_data[i]).score(pca_data[i]) for i in range(len(pca_data))]
    print(score)
    plt.plot(Nc, score)
    plt.xlabel('PCA components')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
    '''
    print("PCA decomposition...")
    pca = PCA(n_components=10).fit(infered_vectors_list)
    pca_data = pca.transform(infered_vectors_list)
    print("train K-Means model...")
    kmean_model = cluster.KMeans(init='k-means++', n_clusters=16, max_iter=300)
    kmean_model.fit(pca_data)
    # get the classified index
    result = kmean_model.fit_predict(pca_data)
    print("Predicting result:", result)
    # save the cluster result
    joblib.dump(kmean_model, cluster_path)
    # load the cluster result
    # new_km = joblib.load(cluster_path)
    numSamples = len(pca_data)
    print(numSamples)
    centroids = kmean_model.labels_
    # print(centroids, type(centroids))  # show the cluster label assigned to each sample
    # print(kmean_model.inertia_)        # show the clustering quality (within-cluster sum of squares)
    '''
    marker = ['o', '.', ',', 'x', '*', 'd', 's', 'p']
    color = ['r', 'g', 'b', 'c', 'm', 'k', 'y', 'w']
    for i in range(numSamples):
        plt.scatter(pca_data[i][0], pca_data[i][1],
                    marker=marker[centroids[i]], color=color[centroids[i]])
    plt.show()
    '''
    return centroids
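As a complement to the commented-out elbow-curve check in the middle of this example, a silhouette score can help pick n_clusters. The sketch below reuses the pca_data array from above and assumes scikit-learn is available, as elsewhere in the example; it is illustrative, not part of the original function.

from sklearn.metrics import silhouette_score
from sklearn import cluster

# Illustrative sketch: choose n_clusters by silhouette score on the PCA-reduced vectors.
best_k, best_score = None, -1.0
for k in range(2, 21):
    labels = cluster.KMeans(init='k-means++', n_clusters=k, max_iter=300).fit_predict(pca_data)
    score = silhouette_score(pca_data, labels)
    if score > best_score:
        best_k, best_score = k, score
print("best n_clusters:", best_k, "silhouette:", best_score)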