本文整理汇总了Python中gensim.models.Doc2Vec.load方法的典型用法代码示例。如果您正苦于以下问题：Python Doc2Vec.load方法的具体用法？Python Doc2Vec.load怎么用？Python Doc2Vec.load使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.models.Doc2Vec。
def load_idf_dict(self, dict_name='idf_dict'):
    """Return the IDF weight table, loading and caching it on first use.

    The table is read from ``config.EX_DICT_DIR + '/word-frequencies.txt'``.
    The first line of that file is parsed as the total frequency count;
    every following line is parsed as ``<word> <frequency>``.

    Args:
        dict_name: Key under which the table is cached in
            ``self.dict_manager``.

    Returns:
        The cached mapping of word to ``log2(total_frequency / frequency)``.
    """
    if dict_name not in self.dict_manager:
        word_frequencies = {}
        file_name = config.EX_DICT_DIR + '/word-frequencies.txt'
        print('load dict from file %s \n' % file_name)
        f_dict = utils.create_read_file(file_name)
        totfreq = None
        for idx, line in enumerate(f_dict):
            if idx == 0:
                # Header line: a single integer, not a "<word> <freq>" pair,
                # so it must not fall through to the two-field parsing below.
                totfreq = int(line)
                continue
            w, freq = line.strip().split()
            freq = float(freq)
            # NOTE(review): the body of this guard is not visible in the
            # original snippet; skipping low-count words is assumed here
            # (it also keeps a zero count away from the division below).
            # Confirm against the upstream project.
            if freq < 10:
                continue
            word_frequencies[w] = math.log(totfreq / freq) / math.log(2)
        self.dict_manager[dict_name] = word_frequencies
    return self.dict_manager[dict_name]
def load_from_pickle(self, filename):
    """Load a pretrained Word2Vec file into this Doc2Vec class.

    The file is opened with ``Doc2Vec.load`` and the attributes of the
    loaded model are copied onto ``self`` one by one.

    Args:
        filename: Path handed to ``Doc2Vec.load``.
    """
    model_w2v = Doc2Vec.load(filename)
    for attr in dir(model_w2v):
        # NOTE(review): '__dict__' is presumably skipped so the instance's
        # own attribute storage is not replaced wholesale - confirm.
        if attr == '__dict__':
            continue
        # Skip methods that we already have in this class
        if attr in dir(self) and callable(getattr(model_w2v, attr)):
            continue
        try:
            setattr(self, attr, getattr(model_w2v, attr))
        except AttributeError:
            # Some attributes cannot be assigned; leave those untouched
            # instead of aborting the whole copy.
            continue
def __init__(self, model_fname="data/doc2vec.vecs", use_notebook=False):
    """Load a Doc2Vec model and index its document tags.

    Args:
        model_fname: Path passed to ``Doc2Vec.load``.
        use_notebook: Flag stored unchanged on the instance.
    """
    self.model = Doc2Vec.load(model_fname)
    # Map every document tag to its position in the doctags key order.
    tag_positions = {}
    for position, tag in enumerate(self.model.docvecs.doctags.keys()):
        tag_positions[tag] = position
    self.doc2idx = tag_positions
    self.use_notebook = use_notebook
def load_doc2vec(mod_file):
    """Load the Doc2Vec model stored at ``mod_file`` and return it."""
    loaded_model = Doc2Vec.load(mod_file)
    return loaded_model
def load_dict(self, dict_name, path=config.DICT_DIR):
    """Return the dictionary ``dict_name``, loading and caching it on first use.

    The dictionary is read from ``dict_<dict_name>.txt``. Each line is split
    on tabs: a one-field line maps the field to its 1-based line number, a
    two-field line maps the first field to the evaluated second field.

    Args:
        dict_name: Name of the dictionary; also the cache key in
            ``self.dict_manager``.
        path: config.DICT_DIR

    Returns:
        The cached dictionary object.

    Raises:
        NotImplementedError: If a line has more than two tab-separated fields.
    """
    if dict_name not in self.dict_manager:
        dict_object = {}
        cur_dir = os.path.dirname(__file__)
        # NOTE(review): the ``path`` argument is overwritten here, so the
        # value passed by the caller is never used. Kept as-is because
        # callers may rely on the '../resources' location - confirm intent.
        path = os.path.join(cur_dir, '../resources')
        # load dict from file
        file_name = path + '/dict_%s.txt' % dict_name
        print('load dict from file %s \n' % file_name)
        f_dict = utils.create_read_file(file_name)
        for idx, line in enumerate(f_dict):
            fields = line.strip().split('\t')
            if len(fields) == 1:
                dict_object[fields[0]] = idx + 1
            elif len(fields) == 2:
                # SECURITY: eval() executes whatever expression the file
                # contains. Only safe while the dictionary files are trusted;
                # consider ast.literal_eval if values are plain literals.
                dict_object[fields[0]] = eval(fields[1])
            else:
                # Only malformed lines are rejected; well-formed lines must
                # not reach this raise.
                raise NotImplementedError
        self.dict_manager[dict_name] = dict_object
    return self.dict_manager[dict_name]
def load_doc2vec(self):
    """Return the Doc2Vec model, loading and caching it on first use.

    The model is cached in ``self.dict_manager`` under the key ``'doc2vec'``
    and is loaded from ``config.EX_DICT_DIR + '/doc2vec.model'`` when the key
    is absent.
    """
    cache = self.dict_manager
    key = 'doc2vec'
    if key in cache:
        return cache[key]
    # gensim is imported only when the model actually has to be loaded.
    from gensim.models import Doc2Vec
    cache[key] = Doc2Vec.load(config.EX_DICT_DIR + '/doc2vec.model')
    return cache[key]
def test_doc2vec_inference_saveload():
    """Check that a Doc2Vec model still drives retrieval after save and load.

    A small model is trained on ``documents``, written to ``TEST_FILE``,
    dropped, reloaded, and then used through ``Doc2VecInference`` and
    ``Retrieval`` to answer a query.
    """
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    # The model must be written out before it is deleted; otherwise the load
    # below fails or picks up a stale file from an earlier run.
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
def retrainModel(vectorFile, dataFile, outputFile, iterations):
    """Continue training a saved Doc2Vec model and save the result.

    Args:
        vectorFile: File name of the existing model inside ``Models\\``.
        dataFile: File name of the training data inside ``Data\\``.
        outputFile: File name for the retrained model inside ``Models\\``.
        iterations: Number of passes made over the training data.
    """
    documents = LabeledLineSentence("Data\\" + dataFile)
    model = Doc2Vec.load("Models\\" + vectorFile)
    for epoch in range(iterations):
        # NOTE(review): the training statement is not visible in the original
        # snippet (``documents`` was otherwise unused). This call follows
        # gensim's documented train() signature; ``total_examples`` assumes
        # the data has the same size as the corpus the model was built on -
        # confirm.
        model.train(documents, total_examples=model.corpus_count, epochs=1)
    # Save once, after all passes, rather than inside the loop.
    model.save("Models\\" + outputFile)
def testModel(inputFile):
    """Interactively query a saved Doc2Vec model from the console.

    Repeats a three-option menu until any other input is entered. Option 3
    reads tab-separated ``<tag>\\t<body>`` lines from ``testing.txt``, infers
    a vector for each body and appends the formatted results to
    ``clusteredResults.txt``.

    Args:
        inputFile: File name of the model inside ``Models\\``.
    """
    model = Doc2Vec.load("Models\\" + inputFile)
    while True:
        choice = input("Press 1 to compare documents within the model to each other.\n"
                       "Press 2 to run similarity tests on individual words.\n"
                       "Press 3 to get the top related subreddits for an inferred new vector (comment).\n"
                       "Hit any key to exit.\n")
        if choice == "1":
            docChoice = input("Enter the subreddit you want to test.\n")
            # NOTE(review): the original snippet reads this value and never
            # uses it; printing the most similar document tags is assumed
            # from the menu text - confirm.
            print(model.docvecs.most_similar(positive=[docChoice]))
        elif choice == "2":
            wordChoice = input("Enter the word you wish to analyze.\n").lower()
            # NOTE(review): same situation as option 1; the word-vector query
            # is assumed from the menu text - confirm.
            print(model.wv.most_similar(positive=[wordChoice]))
        elif choice == "3":
            resultList = []
            with open("testing.txt") as t:
                for doc in t:
                    fields = doc.split("\t")
                    if len(fields) < 2:
                        # Blank or malformed line: no body to infer from.
                        continue
                    tag = fields[0]
                    body = fields[1]
                    newVec = model.infer_vector(body.split())
                    resultList.append("The original category is {}: {}\n {}\n".
                                      format(tag, body, model.docvecs.most_similar(positive=[newVec])))
            with open("clusteredResults.txt", "a") as x:
                x.writelines(resultList)
        else:
            # "Hit any key to exit": any other input leaves the menu loop.
            break
def newKMeansModel(vectorFile, outputFile, numClusters):
    """Fit KMeans on a model's document vectors and dump the estimator.

    Args:
        vectorFile: File name of the Doc2Vec model inside ``Models\\``.
        outputFile: Path the fitted estimator is written to with joblib.
        numClusters: Value passed to ``KMeans`` as ``n_clusters``.
    """
    # https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering
    model = Doc2Vec.load("Models\\" + vectorFile)
    docVecs = model.docvecs.doctag_syn0
    km = KMeans(n_clusters=numClusters)
    print("Fitting Data")
    # Without this fit the dumped estimator has no ``labels_``, which
    # loadKMeansModel reads after loading the file.
    km.fit(docVecs)
    joblib.dump(km, outputFile)
def loadKMeansModel(vectorFile, clusterFile, csvFile):
    """Tabulate the cluster label of every document tag and write it as CSV.

    Args:
        vectorFile: File name of the Doc2Vec model inside ``Models\\``.
        clusterFile: Path of the joblib-dumped, fitted clustering estimator.
        csvFile: Destination path for the resulting table.
    """
    # https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering
    model = Doc2Vec.load("Models\\" + vectorFile)
    km = joblib.load(clusterFile)
    clusters = km.labels_.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    # NOTE(review): the original snippet built the frame and discarded it,
    # leaving ``csvFile`` unused; writing the table there is assumed from the
    # parameter name - confirm.
    sentenceDF.to_csv(csvFile)
def newDBSCANModel(vectorFile, outputFile):
    """Cluster a model's document vectors with DBSCAN and dump the estimator.

    Prints the estimated number of clusters, not counting the noise label -1.

    Args:
        vectorFile: File name of the Doc2Vec model inside ``Models\\``.
        outputFile: Path the fitted estimator is written to with joblib.
    """
    model = Doc2Vec.load("Models\\" + vectorFile)
    # One row per document vector. Stacking the vectors directly avoids the
    # hard-coded reshape to 300 columns, so any vector size works.
    docs = np.array([model.docvecs[doc] for doc in range(len(model.docvecs))],
                    dtype='float')
    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)
    # NOTE(review): core_samples_mask and sentenceDF are built but not used
    # in the visible code; kept in case later statements were cut from the
    # snippet - confirm.
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # Label -1 marks noise and is not counted as a cluster.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in
                                                            model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print('Estimated number of clusters: %d' % n_clusters_)
def plotModel3D(vectorFile, numClusters):
    """Draw a 3D scatter of PCA-reduced document vectors coloured by cluster.

    The document vectors are reduced to 10 principal components, clustered
    with KMeans, and components 5, 2 and 3 are plotted.

    Args:
        vectorFile: File name of the Doc2Vec model inside ``Models\\``.
        numClusters: Value passed to ``KMeans`` as ``n_clusters``.
    """
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html
    model = Doc2Vec.load("Models\\" + vectorFile)
    docVecs = model.docvecs.doctag_syn0
    reduced_data = PCA(n_components=10).fit_transform(docVecs)
    kmeans = KMeans(init='k-means++', n_clusters=numClusters, n_init=10)
    # ``labels_`` only exists on a fitted estimator.
    kmeans.fit(reduced_data)
    fig = plt.figure(1, figsize=(10, 10))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    labels = kmeans.labels_
    # Builtin float replaces the np.float alias, which newer NumPy releases
    # no longer provide.
    ax.scatter(reduced_data[:, 5], reduced_data[:, 2], reduced_data[:, 3], c=labels.astype(float))
    # Plot the ground truth
    fig = plt.figure(1, figsize=(10, 10))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    ax.scatter(reduced_data[:, 5], reduced_data[:, 2], reduced_data[:, 3], c=labels.astype(float))
    # NOTE(review): no show/save call is visible in the original snippet, so
    # none is added here - confirm how the figure is meant to be displayed.
def load_model(self):
    """Load the Doc2Vec model named by ``self.model_fname`` and return it.

    The file is looked up inside ``self.model_dir``; the loaded model is
    also stored on ``self.model``. Progress is logged before and after.
    """
    logger.info('loading doc2vec model name %s', self.model_fname)
    model_path = join(self.model_dir, self.model_fname)
    self.model = Doc2Vec.load(model_path)
    logger.info('doc2vec model %s loaded', self.model_fname)
    return self.model