This article collects typical usage examples of the Python method gensim.models.doc2vec.TaggedDocument. If you are wondering what doc2vec.TaggedDocument does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore the containing module, gensim.models.doc2vec, for further usage.
The following 15 code examples of doc2vec.TaggedDocument are shown below, sorted by popularity by default.
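Before the examples, here is a minimal, self-contained sketch of the pattern most of them follow: tokenized text is wrapped in TaggedDocument objects and fed to a Doc2Vec model. The toy sentences and the small parameter values below are purely illustrative and are not taken from any of the examples.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# Illustrative toy corpus; any iterable of token lists will do.
texts = ["the quick brown fox", "jumps over the lazy dog"]
documents = [TaggedDocument(words=simple_preprocess(text), tags=[str(i)])
             for i, text in enumerate(texts)]

# Tiny vector_size/epochs, chosen only to keep the demonstration fast.
model = Doc2Vec(documents, vector_size=16, window=2, min_count=1, epochs=10)
vector = model.infer_vector(simple_preprocess("a quick fox"))  # embed unseen text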
Example 1: fit
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def fit(self, graphs):
    """
    Fitting a Graph2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, self.attributed) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
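A usage sketch for a fitted estimator of this kind. It assumes the fit method above belongs to a Graph2Vec-style class (such as the one in karateclub) that also exposes a get_embedding() accessor; the synthetic graphs and the dimensions value are illustrative.

import networkx as nx
from karateclub import Graph2Vec  # assumed host class of the fit() shown above

graphs = [nx.newman_watts_strogatz_graph(50, 5, 0.3) for _ in range(10)]
model = Graph2Vec(dimensions=32)
model.fit(graphs)
embedding = model.get_embedding()  # one row of length 32 per input graph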
Example 2: fit
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def fit(self, graphs):
    """
    Fitting a GL2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    graphs = [self._create_line_graph(graph) for graph in graphs]
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, False) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example 3: _create_documents
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def _create_documents(self, walks, features):
    """
    Accumulating the WL feature in neighbourhoods.

    Arg types:
        * **walks** *(list of lists)* - Random walks with string ids.

    Return types:
        * **new_features** *(list of TaggedDocument objects)* - The pooled features of nodes.
    """
    new_features = {node: [] for node, feature in features.items()}
    walks = self._transform_walks(walks)
    for walk in walks:
        for i in range(self.walk_length - self.window_size):
            for j in range(self.window_size):
                source = walk[i]
                target = walk[i + j]
                new_features[source].append(features[target])
                new_features[target].append(features[source])

    new_features = {node: [feature for features in new_features[node] for feature in features] for node, _ in new_features.items()}
    new_features = [TaggedDocument(words=feature, tags=[str(node)]) for node, feature in new_features.items()]
    return new_features
Example 4: train
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def train(args):
    vocab = load_json(args.vocab)

    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)
    d2v = MyDoc2Vec(args.n_dim, window=args.window_size,
                    negative=args.negative, epoches=args.n_epoch, dm_concat=1)

    start = timeit.default_timer()
    d2v.train(corpus_iter)
    print('runtime: %ss' % (timeit.default_timer() - start))

    save_doc2vec(d2v.model, args.save_model)
    # import pdb; pdb.set_trace()
Example 5: process_non_pooled_model_data
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def process_non_pooled_model_data(walks, counts, args):
    """
    Function to extract proximity statistics.
    :param walks: Diffusion lists.
    :param counts: Number of nodes.
    :param args: Arguments object.
    :return docs: Processed walks.
    """
    print("Run feature extraction across windows.")
    features = {str(node): [] for node in range(counts)}
    for walk in tqdm(walks):
        for i in range(len(walk) - args.window_size):
            for j in range(1, args.window_size + 1):
                features[walk[i]].append(["+" + str(j) + "_" + walk[i + j]])
                features[walk[i + j]].append(["_" + str(j) + "_" + walk[i]])

    docs = [TaggedDocument(words=[x[0] for x in v], tags=[str(k)]) for k, v in features.items()]
    return docs
Example 6: read_corpus
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:  # ensure file is an English txt file
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words:  # count the number of words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    # exclude lines in which half the words have <= 3 characters or that have fewer than 10 words
                    if count < len(words) / 2 and len(words) > 10:
                        yield doc2vec.TaggedDocument(words, [i])
                        i += 1
    if targets:
        for key, val in targets.items():
            yield doc2vec.TaggedDocument(simple_preprocess(val), [i])
            i += 1
Example 7: fit
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def fit(self, texts):
    model_param = {
        "vector_size": self.vector_size,
        "epochs": self.epochs,
        "min_count": self.min_count,
        "workers": self.n_jobs,
        "window": self.window,
        "dm_concat": self.dm_concat,
        "dbow_words": self.dbow_words,
    }

    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    # If self.dm is 2, train both models and concatenate the feature
    # vectors later. Resulting vector size should be the same.
    if self.dm == 2:
        model_param["vector_size"] = int(model_param["vector_size"] / 2)
        self.model_dm = _train_model(corpus, **model_param, dm=1)
        self.model_dbow = _train_model(corpus, **model_param, dm=0)
    else:
        self.model = _train_model(corpus, **model_param, dm=self.dm)
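The transform step for the dm == 2 branch above is not shown in this example. A minimal sketch of the likely idea, assuming a hypothetical helper (not part of the original code) that infers a vector from each half-size model and concatenates them back to the full vector_size:

import numpy as np
from gensim.utils import simple_preprocess

def _transform_concat(model_dm, model_dbow, texts):
    # Hypothetical helper: infer with both trained models and concatenate,
    # so the combined dimensionality equals the original vector_size.
    vectors = []
    for text in texts:
        tokens = simple_preprocess(text)
        vectors.append(np.concatenate([model_dm.infer_vector(tokens),
                                       model_dbow.infer_vector(tokens)]))
    return np.array(vectors)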
Example 8: tagcol_paragraph_embeddings_features
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def tagcol_paragraph_embeddings_features(train_data):
    # Expects a dataframe with a 'values' column
    train_data_values = train_data['values']
    columns = [TaggedDocument(random.sample(col, min(1000, len(col))), [i]) for i, col in enumerate(train_data_values.values)]
    return columns
# Input: returned tagged document collection from tagcol_paragraph_embeddings_features
# Only needed for training.
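The training function described by the two comments above is not included in this example. A minimal sketch of what it might look like, assuming a plain gensim Doc2Vec is fit on the tagged columns; the function name and hyperparameter values are illustrative, not the project's actual settings.

from gensim.models.doc2vec import Doc2Vec

def train_paragraph_embeddings_features(columns, dim=400):
    # Hypothetical trainer: fit Doc2Vec on the TaggedDocument list returned
    # by tagcol_paragraph_embeddings_features; each column becomes one tag.
    model = Doc2Vec(columns, vector_size=dim, window=2, min_count=2, workers=4, epochs=20)
    return model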
Example 9: _create_documents
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def _create_documents(self, features):
    features_out = [TaggedDocument(words=[str(feat) for feat_elems in feature_set for feat in feat_elems], tags=[str(node)]) for node, feature_set in features.items()]
    return features_out
Example 10: _create_base_docs
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def _create_base_docs(self):
    features_out = [TaggedDocument(words=[str(feature) for feature in features], tags=[str(node)]) for node, features in self.features.items()]
    return features_out
Example 11: create_documents
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def create_documents(features):
    """
    From a feature hash create a list of TaggedDocuments.
    :param features: Feature hash table - keys are nodes, values are feature lists.
    :return docs: Tagged Documents list.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs
Example 12: __iter__
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def __iter__(self):
    with open(self.fname, encoding='utf-8') as f:
        for line in f:
            try:
                sentence, movie_id = line.strip().split("\u241E")
                tokens = self.tokenizer.morphs(sentence)
                tagged_doc = TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id])
                yield tagged_doc
            except:
                continue
Example 13: test
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def test(args):
    vocab = load_json(args.vocab)

    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)
    d2v = load_doc2vec(args.load_model)
    doc_codes = predict(d2v, corpus_iter)
    dump_json(doc_codes, args.output)
    # import pdb; pdb.set_trace()
Example 14: main
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, max_script_count, min_script_len, negative):
    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count:
            break
        if counter % 100000 == 0:
            logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)
    # logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI
Example 15: main
# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def main(script_folder, output_folder, min_script_len, max_total_files, max_per_pkl):
    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_total_files:
            break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files:
                    break
                if counter != 0 and counter % 50000 == 0:
                    logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info("Saving pickle file of tagged documents for size %d", max_per_pkl)
                    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False

    logger.info("Saving final pickle file of tagged documents for size %d", max_per_pkl)
    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))

# Run this when called from CLI