本文整理匯總了 Python 中 gensim.models.doc2vec.Doc2Vec.load_word2vec_format 方法的典型用法代碼示例。如果您正苦於以下問題：Python Doc2Vec.load_word2vec_format 方法的具體用法？Python Doc2Vec.load_word2vec_format 怎麼用？Python Doc2Vec.load_word2vec_format 使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 gensim.models.doc2vec.Doc2Vec 的用法示例。
在下文中一共展示了 Doc2Vec.load_word2vec_format 方法的 3 個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: main
# 需要導入模塊: from gensim.models.doc2vec import Doc2Vec [as 別名]
# 或者: from gensim.models.doc2vec.Doc2Vec import load_word2vec_format [as 別名]
def main():
    """Run the non-SMOTE vs. SMOTE cross-validation experiment over data bins.

    1. For each of ``NUM_OF_BINS`` bins, shuffle the positive and negative
       samples and keep the leading ``SAMPLE_SIZE_RATIO`` fraction of each.
    2. For each bin, run ``NUM_OF_ITERATION`` rounds of cross-validation,
       once without SMOTE and once with SMOTE.
    3. Collect F-scores, recalls and precisions per bin and pass them to
       ``save_stats``.
    """
    # One inner list of scores per data bin; "ns" = non-SMOTE, "ws" = with SMOTE.
    stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = [], [], []
    stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = [], [], []

    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)

    print("Loading Doc2Vec model ...")
    # Loads pre-trained vectors stored in the word2vec binary format.
    model_doc2vec = Doc2Vec.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin.gz', binary=True)
    print("Doc2Vec model loading done!")

    # Candidate classifiers; only the one named by ``model_chosen`` is used.
    # NearestCentroid is referenced through the public ``sklearn.neighbors``
    # namespace, which is valid on both old and current scikit-learn releases.
    models = {
        "SVC": sklearn.svm.SVC(),
        "Logit": sklearn.linear_model.LogisticRegression(),
        "DT": sklearn.tree.DecisionTreeClassifier(),
        "NBayes": sklearn.naive_bayes.GaussianNB(),
        "NNeighbors": sklearn.neighbors.NearestCentroid(),
    }
    model_chosen = "NBayes"
    print("Classifier Type: %s" % model_chosen)

    for binIndex in range(NUM_OF_BINS):
        print("Experiment on DataSet# %s" % binIndex)
        random.shuffle(data_pos)
        random.shuffle(data_neg)
        size_pos_bin = int(len(data_pos) * SAMPLE_SIZE_RATIO)
        size_neg_bin = int(len(data_neg) * SAMPLE_SIZE_RATIO)
        # Slices are copies, so shuffling a bin below leaves the full set alone.
        data_pos_bin = data_pos[:size_pos_bin]
        data_neg_bin = data_neg[:size_neg_bin]

        sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = [], [], []
        sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = [], [], []

        for iteration in range(NUM_OF_ITERATION):
            # Re-seed so the shuffles of iteration i are reproducible.
            random.seed(iteration)
            random.shuffle(data_pos_bin)
            random.shuffle(data_neg_bin)
            # Convert the documents to vectors.
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(
                data_pos_bin, data_neg_bin, model_doc2vec)

            print("non-SMOTE experiment")
            # The accuracy list (first return value) is not reported.
            _, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen],
                num_cross=NUM_OF_CROSSFOLD, smote_flag=False)
            sFscores_iter_ns.extend(Fscores)
            sRecalls_iter_ns.extend(recalls)
            sPrecisions_iter_ns.extend(precisions)

            print("with SMOTE experiemnt")
            _, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen],
                num_cross=NUM_OF_CROSSFOLD, smote_flag=True)
            sFscores_iter_ws.extend(Fscores)
            sRecalls_iter_ws.extend(recalls)
            sPrecisions_iter_ws.extend(precisions)

        stats_Fscores_ns.append(sFscores_iter_ns)
        stats_precisions_ns.append(sPrecisions_iter_ns)
        stats_recalls_ns.append(sRecalls_iter_ns)
        stats_Fscores_ws.append(sFscores_iter_ws)
        stats_precisions_ws.append(sPrecisions_iter_ws)
        stats_recalls_ws.append(sRecalls_iter_ws)

    print("All Experiments Done!")
    save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns,
               stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws,
               model_name=model_chosen)
    print("Statistics ready!")
示例2: main
# 需要導入模塊: from gensim.models.doc2vec import Doc2Vec [as 別名]
# 或者: from gensim.models.doc2vec.Doc2Vec import load_word2vec_format [as 別名]
def _median_and_iqr(scores):
    """Return ``(median, iqr)`` of *scores* using index-based quantiles.

    The median is the element at index ``n // 2`` of the sorted scores and
    the IQR is the difference between the elements at ``int(n * 0.75)`` and
    ``int(n * 0.25)``. An empty input yields ``(nan, nan)``.
    """
    ordered = sorted(scores)
    count = len(ordered)
    if not count:
        return float("nan"), float("nan")
    median = ordered[count // 2]
    iqr = ordered[int(count * 0.75)] - ordered[int(count * 0.25)]
    return median, iqr


def main():
    """Compare cross-validation scores with and without SMOTE.

    Runs ``NUM_OF_ITERATION`` rounds of cross-validation in each of two modes
    (``smote_flag`` 0, then 1), plots the F-scores of both modes to
    ``../results/<model>-ValidationStats.png`` and prints/writes the median
    and IQR of precision, recall and F-score to
    ``../results/<model>-ValidationStats.txt``.
    """
    # One inner list of scores per test mode (index 0: no SMOTE, 1: SMOTE).
    stats_Fscore, stats_recall, stats_precision = [], [], []
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    model = Doc2Vec.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin.gz', binary=True)
    print("Model loading done!")

    # Built once up front: the choice does not depend on the iteration, and
    # ``model_chosen`` is needed for the output file names even if no
    # iteration runs.
    models = {
        "SVC": sklearn.svm.SVC(),
        "Logit": sklearn.linear_model.LogisticRegression(),
        "DT": sklearn.tree.DecisionTreeClassifier(),
        "NBayes": sklearn.naive_bayes.GaussianNB(),
        "NNeighbors": sklearn.neighbors.NearestCentroid(),
    }
    model_chosen = "SVC"

    for test_mode in range(2):
        if test_mode == 0:
            print("non-SMOTE")
        else:
            print("SMOTE")
        sFscores, sRecalls, sPrecisions = [], [], []
        for iteration in range(NUM_OF_ITERATION):
            # Re-seed so the shuffles of iteration i are reproducible.
            random.seed(iteration)
            random.shuffle(data_pos)
            random.shuffle(data_neg)
            # Convert the documents to vectors.
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(
                data_pos, data_neg, model)
            print("%d %d" % (len(data_pos_vec), len(data_neg_vec)))
            # The accuracy list (first return value) is not reported.
            _, precisions, recalls, Fscores = cross_validationS(
                data_pos_vec, data_neg_vec, models[model_chosen],
                num_cross=NUM_OF_CROSSFOLD, smote_flag=test_mode)
            sFscores.extend(Fscores)
            sRecalls.extend(recalls)
            sPrecisions.extend(precisions)
        stats_Fscore.append(sFscores)
        stats_recall.append(sRecalls)
        stats_precision.append(sPrecisions)

    # Plot the F-scores of both modes in collection order.
    plt.figure()
    colors = ["red", "blue"]
    modes = ["no-SMOTE", "SMOTE"]
    for i, fscores in enumerate(stats_Fscore):
        plt.plot(fscores, marker='o', color=colors[i],
                 label=modes[i] + "_Fscore")
    plt.ylim([0, 1.0])
    plt.legend(loc=4, borderaxespad=0.5)
    plt.ylabel("Scores")
    plt.xlabel("Data Sequence")
    plt.savefig("../results/" + model_chosen + "-ValidationStats.png")

    savefile_name = "../results/" + model_chosen + "-ValidationStats.txt"
    # The context manager guarantees the report is flushed and closed.
    with open(savefile_name, 'w') as fp:
        print("******** Evaluation **********\n")
        fp.write("******** Evaluation **********\n")
        for test_mode in range(2):
            # Quantiles are taken over this mode's own score list; indexing by
            # the length of the outer per-mode list (always 2) would pick the
            # wrong elements.
            p_median, iqr_p = _median_and_iqr(stats_precision[test_mode])
            r_median, iqr_r = _median_and_iqr(stats_recall[test_mode])
            f_median, iqr_f = _median_and_iqr(stats_Fscore[test_mode])

            print(modes[test_mode])
            fp.write(modes[test_mode] + '\n')
            print("\t p_median \t r_median \t f_median")
            fp.write("\t p_median \t r_median \t f_median \n")
            print("\t%.5f \t%.5f \t%.5f" % (p_median, r_median, f_median))
            fp.write("\t%.5f \t%.5f \t%.5f \n" % (p_median, r_median, f_median))
            print("\t iqr_p \t iqr_r \t iqr_f")
            fp.write("\t iqr_p \t iqr_r \t iqr_f \n")
            print("\t%.5f \t%.5f \t%.5f" % (iqr_p, iqr_r, iqr_f))
            fp.write("\t%.5f \t%.5f \t%.5f \n" % (iqr_p, iqr_r, iqr_f))
            print('\n')
示例3: filter_essay
# 需要導入模塊: from gensim.models.doc2vec import Doc2Vec [as 別名]
# 或者: from gensim.models.doc2vec.Doc2Vec import load_word2vec_format [as 別名]
import feature_extractor
from gensim.models.doc2vec import Doc2Vec
import parser
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pdb
import pickle
# Load the pre-trained vectors (word2vec binary format) once at import time;
# the helper functions below read this module-level ``model``.
model = Doc2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary = True)
print("MODEL LOADED")

# Build the stopword set from a file with one word per line. The context
# manager closes the file handle as soon as the set has been built.
with open('stopwords.txt') as f:
    stoplist = set(line.rstrip('\n') for line in f)
def filter_essay(essay):
    """Return the words of *essay* usable for similarity computation.

    The essay is split on whitespace; words found in the module-level
    ``stoplist`` or missing from ``model.vocab`` are dropped. Word order is
    preserved.

    A list is returned on both Python 2 and Python 3 (``filter`` would return
    a one-shot iterator on Python 3), so the result can be reused safely.
    """
    return [
        word for word in essay.split()
        if word not in stoplist and word in model.vocab
    ]
def filter_essays(essays):
    """Apply ``filter_essay`` to every essay and return the results as a list."""
    filtered = []
    for text in essays:
        filtered.append(filter_essay(text))
    return filtered
def calc_similarity(i1, i2):
    """Return ``model.n_similarity`` of the two word collections *i1* and *i2*."""
    score = model.n_similarity(i1, i2)
    return score
def classify(k, instance, training_data, training_scores):
similarity = np.array([calc_similarity(instance, x) for x in training_data])