This article collects typical usage examples of the gensim.models.Doc2Vec class in Python. If you have been wondering what the Doc2Vec class is for, how to use it, or what real code built on it looks like, the hand-picked examples below should help.
The following shows 15 code examples of the Doc2Vec class, sorted by popularity by default.
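Most of the examples share the same basic pattern: load a previously trained model with Doc2Vec.load, look up the vector of a training document through model.docvecs, and embed unseen text with model.infer_vector. The minimal sketch below illustrates that pattern; the model path and document tag are placeholders, and note that gensim 4.0+ exposes the document vectors as model.dv rather than model.docvecs.

from gensim.models import Doc2Vec

model = Doc2Vec.load('my_model.d2v')        # load a previously trained and saved model (placeholder path)
stored_vec = model.docvecs['SOME_TAG']      # vector of a training document, looked up by its tag
                                            # (model.dv['SOME_TAG'] in gensim >= 4.0)
new_vec = model.infer_vector(['some', 'new', 'tokens'])  # vector for unseen, pre-tokenized text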
Example 1: test_build
def test_build(Xtrain, ytrain, Xtest, ytest):
    '''
    Load the three varieties of Doc2Vec models that were previously saved.
    Build a random forest model for each Doc2Vec model. Test each random
    forest model with the same test data, and write the results to a CSV
    file for each Doc2Vec model.
    '''
    print "Loading the model..."
    models = [Doc2Vec.load("Doc2Vec_dbow_d300_n5_t4"),
              Doc2Vec.load("Doc2Vec_dm-c_d300_n5_w5_t4"),
              Doc2Vec.load("Doc2Vec_dm-m_d300_n5_w10_t4")]
    filenames = ['Doc2Vec_dbow.csv', 'Doc2Vec_dm-c.csv', 'Doc2Vec_dm-m.csv']
    forests = []
    for model in models:
        forests.append(build_forest(model, Xtrain, ytrain))
    for i in xrange(3):
        model = models[i]
        forest = forests[i]
        filename = filenames[i]
        features = []
        print "Creating feature list for test data..."
        for id in Xtest['id']:
            # remove the extra quotes around the id
            features.append(model.docvecs[id[1:-1]])
        print "Predicting test sentiment..."
        use_forest(forest, features, ytest, filename)
Example 2: __init__
def __init__(self, note_type, model_file, max_notes, dbow_file=None):
    self.note_type = note_type
    self.max_notes = max_notes
    self.model = Doc2Vec.load(model_file)
    if dbow_file:
        self.dbow = Doc2Vec.load(dbow_file)
    else:
        self.dbow = None
Example 3: load_model
def load_model():
    '''
    Loading and building train and test data.
    '''
    # loading labels
    labels = pickle.load(open('labels.p', 'rb'))
    # using LabelEncoder to convert strings to numerical values
    label_encoder = preprocessing.LabelEncoder()
    transformed_labels = label_encoder.fit_transform(labels)
    transformed_labels = np.array(transformed_labels)
    transformed_labels = label_binarize(transformed_labels,
                                        np.unique(transformed_labels))
    print('Found %d Labels' % len(label_encoder.classes_))
    print('Labels:', label_encoder.classes_)
    # initialising feature array
    cow_arrays = np.zeros((247543, 300))
    # learning model: Distributed Memory (DM)
    model = Doc2Vec.load('./acm_cow.d2v')
    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        cow_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays_cow, test_arrays_cow, train_labels_cow, test_labels_cow = \
        train_test_split(cow_arrays, transformed_labels,
                         test_size=0.1, random_state=42)
    # initialising feature array
    skip_arrays = np.zeros((247543, 300))
    # learning model: Distributed Bag of Words (DBOW)
    model = Doc2Vec.load('./acm_skip.d2v')
    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        skip_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays_skip, test_arrays_skip, train_labels_skip, test_labels_skip = \
        train_test_split(skip_arrays, transformed_labels,
                         test_size=0.1, random_state=42)
    to_return = (train_arrays_cow, train_labels_cow,
                 test_arrays_cow, test_labels_cow,
                 train_arrays_skip, train_labels_skip,
                 test_arrays_skip, test_labels_skip)
    return to_return
Example 4: load_questions
def load_questions(modelname, f_name, mapname, a_modelname):
    model = Doc2Vec.load(modelname)
    a_model = Doc2Vec.load(a_modelname)
    qids = list(enumerate([int(q) for q in open(f_name)]))
    rev_qids = [(item, index) for index, item in qids]
    qid_dict = dict(rev_qids)
    Q = []
    doc_dict = load_doc_hashes(mapname)
    for fname in os.listdir("questions"):
        Q.append(load_question(fname, model.docvecs.doctag_syn0, qid_dict, doc_dict, a_model))
    return Q
Example 5: main
def main():
    model = Doc2Vec.load('400_pvdm_doc2vec.d2v')
    model_dbow = Doc2Vec.load('400_pvdbow_doc2vec.d2v')
    # mistake: pvdm is actually pv-dbow
    path = 'datasets/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    files.pop(0)
    data_loader = DataLoader(path)
    domains = data_loader.csv_files
    names = {1: 'title', 4: 'abstract', 5: 'mesh', 'y': 6}
    domain_features = data_loader.get_feature_matrix(names)
    # get the total number of documents
    n_total_documents = 0
    for domain in domain_features:
        n_total_documents += len(domain[0])
    all_features = numpy.zeros(shape=(n_total_documents, 800))
    all_labels = numpy.asarray([])
    i = 0
    for domain in domain_features:
        features, labels = domain
        all_labels = numpy.hstack((all_labels, labels))
        for feature_vector in features:
            preprocessed_line = list(preprocess(feature_vector))
            all_features[i, 0:400] = numpy.float_(model.infer_vector(preprocessed_line))
            all_features[i, 400:] = numpy.float_(model_dbow.infer_vector(preprocessed_line))
            i += 1
    all_labels = numpy.asarray(all_labels)
    all_labels[all_labels == -1] = 0
    all_labels = numpy.intc(all_labels)
    train, test = data_loader.create_random_samples(all_features, all_labels)
    train_x, train_y = train
    test_x, test_y = test
    classifier = NeuralNet(n_hidden_units=[200], output_size=2, batch_size=20, n_epochs=200, dropout=True,
                           activation_function='relu', learning_rate=.3, momentum=True, momentum_term=.5)
    classifier.train(train_x, train_y)
    classifier.test(test_x, test_y)
Example 6: do
def do():
    global shouldStemData
    global shouldSaveModel
    from os.path import isfile
    from gensim.models import Doc2Vec
    from sys import argv
    if not isfile(modelname):  # or (len(argv) > 1 and argv[1] == '--update'):
        parsed = parseData(trainData)
        print 'Begin stemming data'
        parsed = stemData(parsed[:10000])
        if False:
            try:
                print 'Write stemmed data'
                f = open('stemmed_data.csv', 'w')
                f.write('\n'.join(map(lambda x: ' '.join(x), parsed)))
            except Exception:
                print 'Failed to write'
            finally:
                try:
                    f.close()
                except Exception:
                    print ''
        print 'Begin training'
        if False:  # len(argv) > 1 and argv[1] == '--update':
            print 'Update model'
            model = Doc2Vec.load(modelname)
            model.train(documents=parsed)
        else:
            model = Doc2Vec(documents=parsed)  # , size=100, workers=4, window=5, min_count=5)
        if shouldSaveModel:
            print 'Save model'
            model.save(modelname)
    else:
        stemData([])
        model = Doc2Vec.load(modelname)
    print 'Get results'
    t = ''
    try:
        t = getResults(model)
    except Exception:
        for x in model.most_similar(happy):
            print x[0].encode('utf8')
    open('res.txt', 'w').write(t.encode('utf8'))
Example 7: transform_input
def transform_input(vectorsize):
    # loads the premade model saved as amzn.d2v and writes its vectors into
    # arrays that can be fed into the scikit-learn algorithms
    print('Loading Doc2Vec model...')
    try:
        model = Doc2Vec.load('./amzn.d2v')
    except Exception as exception:
        print('No existing model found. Starting to create a model...')
        train_size = 50000
        d2v_source(train_size)
        model = create_doc2vec_model(vectorsize)
    # load or generate train and test data
    try:
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    except Exception as exception:
        print('No train data found. Generating new train and test files...')
        train_size = 50000
        test_size = 20000
        review_lines(train_size, test_size)
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    # infer vectors for the sentences of the train and test sets:
    # turn each document into a list of tokens and convert it into a vector.
    # this is slow, so the vectors are computed only for new train and test
    # sets and saved to disk for reuse.
    try:
        train_arrays = np.loadtxt('train_vectors.txt')
        test_arrays = np.loadtxt('test_vectors.txt')
    except Exception as exception:
        train_arrays = np.zeros((target.shape[0], vectorsize))
        test_arrays = np.zeros((target_test.shape[0], vectorsize))
        print('Vectorizing the train and test data...')
        for i in range(target.shape[0]):
            train_arrays[i, :] = model.infer_vector(train_raw[i].split())
        for i in range(target_test.shape[0]):
            test_arrays[i, :] = model.infer_vector(test_raw[i].split())
        np.savetxt('train_vectors.txt', train_arrays)
        np.savetxt('test_vectors.txt', test_arrays)
    return train_arrays, target, test_arrays, target_test
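Because infer_vector starts from a random initialization, repeated calls on the same tokens return slightly different vectors, which is one more reason Example 7 caches the results to disk. If more stable inferences are wanted, the number of inference passes can be raised; a small sketch, assuming model is an already loaded Doc2Vec instance (the parameter is epochs in gensim 4.x, steps in older releases):

# assumes `model` is a trained or loaded Doc2Vec instance
tokens = 'this product was surprisingly good'.split()
vec = model.infer_vector(tokens, epochs=50)   # more passes give a more stable vector, at some cost in time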
Example 8: load_or_train
def load_or_train(sentences=None, dim=83, epochs=10):
    # Doc2Vec params
    # --------------
    # min_count: ignore words with total frequency lower than this
    # window: maximum distance between the current and predicted word
    # size: vector embedding size
    # sample: threshold for downsampling higher-frequency words
    # negative: number of noise words drawn for negative sampling
    # workers: number of worker threads for parallel training
    try:
        print "> Loading model.."
        model = Doc2Vec.load("doc2vec.model")
    except IOError:
        print "> No pretrained model found or loading failed."
        model = Doc2Vec(min_count=1, size=dim, window=10, negative=5, sample=1e-4, workers=7)
        if not sentences:
            print "> No labeled sentences provided. Building them now."
            sentences = labeled_sentences()
        print "> Building vocabulary.. (this may take a while)"
        train_sentences, test_sentences = sentences.to_array()
        model.build_vocab(train_sentences + test_sentences)
        print "> Training Doc2Vec.. (this may take a while)"
        for i in range(epochs):
            print "--> Epoch %d" % i
            model.train(sentences.permutate())
        model.train_size = sentences.train_size
        model.test_size = sentences.test_size
        model.test_sentences = test_sentences
        model.save('./doc2vec.model')
    return model
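Example 8 targets the older gensim API, where the dimensionality is passed as size and train() is called once per epoch with no arguments. Under gensim 4.x the same flow looks roughly like the sketch below; corpus is a placeholder for a list of TaggedDocument objects such as the ones Example 8 builds.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [TaggedDocument(words=['an', 'example', 'sentence'], tags=['DOC_0'])]  # placeholder corpus

model = Doc2Vec(vector_size=83, min_count=1, window=10,
                negative=5, sample=1e-4, workers=7)
model.build_vocab(corpus)
# train() now takes the epoch count explicitly and performs the passes itself,
# so the manual epoch loop from Example 8 is not needed
model.train(corpus, total_examples=model.corpus_count, epochs=10)
model.save('doc2vec.model')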
Example 9: get_model
def get_model():
    try:
        model = Doc2Vec.load(DOC2VEC_MODEL)
        return model
    except:
        print "Model couldn't be loaded"
        return None
Example 10: instance_generator
def instance_generator(reviews_path, model_path):
    print "Loading model"
    model = Doc2Vec.load(model_path)
    print "Model loaded"
    with gzip.open(reviews_path, 'rt') as file:
        for index, line in enumerate(file):
            review = json.loads(line)
            yield model.infer_vector(review['reviewText'].split()), review['overall']
Example 11: load_embeddings
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('cn_word2vec'), binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(get_file_path('wordvecs_CVAT'))
    elif arg == 'IMDb':  # dim = 100
        model = Doc2Vec.load(get_file_path('test_doc2vec_model'))
    elif arg == 'CVAT_docvecs':  # dim = 50
        model = Doc2Vec.load(get_file_path('docvecs_CVAT'))
    elif arg == 'google_news':
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('google_news'), binary=True)
    elif arg == 'vader':
        model = gensim.models.Word2Vec.load('./data/vader_wordvecs.w2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
Example 12: __init__
def __init__(self, filename=None, min_count=1, alpha_initial=0.002,
             alpha_start=0.0005, alpha_end=0.0002, min_iters=10,
             monitor=None):
    Doc2Vec.__init__(self)
    if filename is not None:
        self.load_from_pickle(filename)
    self.checkpoint = {}
    self.filename = filename
    self.min_count = min_count
    self.alpha_initial = alpha_initial
    self.alpha_start = alpha_start
    self.alpha_end = alpha_end
    self.min_iters = min_iters
    if monitor is None:
        monitor = lambda *x: None
    self.monitor = monitor
    assert 'train_lbls' in dir(self)
Example 13: puebaSimpleCosenos
def puebaSimpleCosenos():
    model = Doc2Vec.load('./imdb_dm.d2v')
    source = 'data/trainneg.txt'
    generador = GeneraVectores(model)
    vecs = generador.getVecsFromFile(source)
    # cosine similarity between the first generated vector and the stored TRAIN_NEG_0 document vector
    print "cosine of first vector, trainneg"
    print dot(matutils.unitvec(vecs[0]), matutils.unitvec(model.docvecs["TRAIN_NEG_0"]))
Example 14: load_model
def load_model(language, models_path, models):
    if check_lang:
        path = models_path.format(language) + models[language]
        print path
        model = Doc2Vec.load(path)
        assert model.docvecs.count > 0
        return model
    else:
        return None
Example 15: do_doc2vec
def do_doc2vec(label_tweet, text_tweet):
    # Processing: run Doc2Vec on all of the labeled tweets passed as parameters.
    # Returns: the matrix of row vectors associated with each tweet.
    print("-> Doc2Vec...")
    documents = [TaggedDocument(words=text.split(), tags=[label])
                 for (label, text) in zip(label_tweet, text_tweet)]
    model = None
    filename_cache = ('model_nbdocs_' + str(args.amount) +
                      '_dim_' + str(args.dim) +
                      '.doc2vec')
    if not os.path.exists(filename_cache):
        model = Doc2Vec(documents, size=args.dim,
                        min_count=1, workers=4)
        model.save(filename_cache)
    else:
        model = Doc2Vec.load(filename_cache)
    data = None
    if args.coeff != 1:
        print("  #tag weighting: " + str(args.coeff))
    if args.tfidf:
        print("  tfidf...")
        data = do_tfidf(text_tweet, model)
    elif args.mean:
        print("  mean...")
        data = do_mean(text_tweet, model, True)
    else:
        print("  sum...")
        data = do_mean(text_tweet, model)
    print("  ok!")
    # gather the label of each tweet with the corresponding vector
    data = pd.DataFrame(data)
    final_data = pd.DataFrame({'id': label_tweet})
    final_data = pd.concat([final_data, data], axis=1)
    return final_data