

Python doc2vec.TaggedDocument Method Code Examples

This article collects and summarizes typical usage examples of the Python method gensim.models.doc2vec.TaggedDocument. If you are wondering what exactly doc2vec.TaggedDocument does, how to use it, or what real code using it looks like, the curated examples below may help. You can also explore the other usage examples of its parent module, gensim.models.doc2vec.


Fifteen code examples of doc2vec.TaggedDocument are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
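Before the examples, here is a minimal, self-contained sketch of the basic TaggedDocument workflow: each document is a list of tokens plus one or more tags, and a Doc2Vec model learns one vector per tag. The toy corpus and parameter values below are illustrative only.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# toy corpus: one TaggedDocument per text, tagged with its index
texts = ["the quick brown fox", "jumps over the lazy dog"]
corpus = [TaggedDocument(words=simple_preprocess(t), tags=[str(i)])
          for i, t in enumerate(texts)]

# passing the corpus to the constructor builds the vocabulary and trains in one step
model = Doc2Vec(corpus, vector_size=50, window=2, min_count=1, epochs=40)

vec = model.infer_vector(simple_preprocess("a quick fox"))  # embed unseen text
# trained vectors: model.dv["0"] in gensim >= 4 (model.docvecs["0"] in gensim 3.x)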

Example 1: fit

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def fit(self, graphs):
        """
        Fitting a Graph2Vec model.

        Arg types:
            * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
        """
        self._set_seed()
        self._check_graphs(graphs)
        documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, self.attributed) for graph in graphs]
        documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

        model = Doc2Vec(documents,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        dm=0,
                        sample=self.down_sampling,
                        workers=self.workers,
                        epochs=self.epochs,
                        alpha=self.learning_rate,
                        seed=self.seed)

        self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)] 
Author: benedekrozemberczki, Project: karateclub, Lines of code: 26, Source file: graph2vec.py
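As context for the snippet above, a hedged usage sketch of karateclub's public Graph2Vec API on synthetic NetworkX graphs (karateclub expects connected graphs whose nodes are indexed 0..n-1; parameter values are illustrative):

import networkx as nx
from karateclub import Graph2Vec

# synthetic connected graphs with contiguous integer node labels
graphs = [nx.newman_watts_strogatz_graph(50, 5, 0.3) for _ in range(10)]

model = Graph2Vec(dimensions=128, wl_iterations=2, epochs=10)
model.fit(graphs)
embedding = model.get_embedding()  # numpy array of shape (10, 128)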

Example 2: fit

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def fit(self, graphs):
        """
        Fitting a GL2Vec model.

        Arg types:
            * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
        """
        self._set_seed()
        self._check_graphs(graphs)
        graphs = [self._create_line_graph(graph) for graph in graphs]
        documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, False) for graph in graphs]
        documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

        model = Doc2Vec(documents,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        dm=0,
                        sample=self.down_sampling,
                        workers=self.workers,
                        epochs=self.epochs,
                        alpha=self.learning_rate,
                        seed=self.seed)

        self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)] 
Author: benedekrozemberczki, Project: karateclub, Lines of code: 27, Source file: gl2vec.py

Example 3: _create_documents

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def _create_documents(self, walks, features):
        """
        Accumulating the WL features in neighbourhoods.

        Arg types:
            * **walks** *(list of lists)* - Random walks with string ids.
            * **features** *(dict of lists)* - Node feature lists keyed by node id.

        Return types:
            * **new_features** *(list of TaggedDocument objects)* - The pooled features of nodes.
        """
        new_features = {node: [] for node, feature in features.items()}
        walks = self._transform_walks(walks)
        for walk in walks:
            for i in range(self.walk_length-self.window_size):
                for j in range(self.window_size):
                    source = walk[i]
                    target = walk[i+j]
                    new_features[source].append(features[target])
                    new_features[target].append(features[source])

        # flatten each node's pooled feature lists into a single token list
        new_features = {node: [feature for feature_list in new_features[node] for feature in feature_list]
                        for node in new_features}
        new_features = [TaggedDocument(words=feature, tags=[str(node)]) for node, feature in new_features.items()]
        return new_features 
Author: benedekrozemberczki, Project: karateclub, Lines of code: 25, Source file: role2vec.py

Example 4: train

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def train(args):
    vocab = load_json(args.vocab)
    # load corpus (alternative corpus iterators kept for reference)
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    # keep only in-vocabulary tokens; corpus_iter is a callable so it can be re-iterated
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag)
                           for sentence, tag in corpus)

    d2v = MyDoc2Vec(args.n_dim, window=args.window_size,
                    negative=args.negative, epoches=args.n_epoch, dm_concat=1)

    start = timeit.default_timer()
    d2v.train(corpus_iter)
    print('runtime: %ss' % (timeit.default_timer() - start))

    save_doc2vec(d2v.model, args.save_model)
Author: hugochan, Project: KATE, Lines of code: 21, Source file: run_doc2vec.py
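MyDoc2Vec, save_doc2vec, and the corpus iterators are helpers defined elsewhere in the KATE repository. A purely illustrative stand-in for the wrapper (not KATE's actual implementation) could look like this; the corpus is passed as a callable precisely so it can be iterated once for build_vocab and again for train:

from gensim.models.doc2vec import Doc2Vec

class MyDoc2Vec(object):
    # illustrative wrapper; KATE's real class may differ
    def __init__(self, n_dim, window=5, negative=5, epoches=10, dm_concat=1):
        self.params = dict(vector_size=n_dim, window=window, negative=negative,
                           epochs=epoches, dm=1, dm_concat=dm_concat)
        self.model = None

    def train(self, corpus_iter):
        self.model = Doc2Vec(**self.params)
        self.model.build_vocab(corpus_iter())
        self.model.train(corpus_iter(), total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)

def save_doc2vec(model, path):
    model.save(path)  # gensim's native save format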

Example 5: process_non_pooled_model_data

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def process_non_pooled_model_data(walks, counts, args):
    """
    Function to extract proximity statistics.
    :param walks: Diffusion lists.
    :param counts: Number of nodes.
    :param args: Arguments objects.
    :return docs: Processed walks.
    """
    print("Run feature extraction across windows.")
    features = {str(node): [] for node in range(counts)}
    for walk in tqdm(walks):
        for i in range(len(walk)-args.window_size):
            for j in range(1, args.window_size+1):
                features[walk[i]].append(["+"+str(j)+"_"+walk[i+j]])
                features[walk[i+j]].append(["_"+str(j)+"_"+walk[i]])

    docs = [TaggedDocument(words=[x[0] for x in v], tags=[str(k)]) for k, v in features.items()]
    return docs 
Author: benedekrozemberczki, Project: diff2vec, Lines of code: 20, Source file: helper.py
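A quick illustration of what this function produces, using a made-up walk and a stand-in args object (window_size is the only field the function reads):

from types import SimpleNamespace

walks = [["0", "1", "2", "3"]]          # one diffusion walk over four nodes
args = SimpleNamespace(window_size=2)   # minimal stand-in for the real args

docs = process_non_pooled_model_data(walks, 4, args)
# docs[0] == TaggedDocument(words=['+1_1', '+2_2'], tags=['0']):
# node 0 sees node 1 at forward offset 1 and node 2 at forward offset 2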

Example 6: read_corpus

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        # only read English .txt files
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words:  # count words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    # keep lines where short words are a minority and length exceeds 10 words
                    if count < len(words) / 2 and len(words) > 10:
                        yield doc2vec.TaggedDocument(words, [i])
                        i += 1
    if targets:
        for key, val in targets.items():
            yield doc2vec.TaggedDocument(simple_preprocess(val), [i])
            i += 1
Author: IBM, Project: Semantic-Search-for-Sustainable-Development, Lines of code: 21, Source file: parseundp.py
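Since read_corpus is a generator and Doc2Vec needs several passes over the data, it is typically materialized once; a hedged consumption sketch (paths and parameter values illustrative):

from gensim.models.doc2vec import Doc2Vec

train_corpus = list(read_corpus(path='data/', exclude=['skip_me.txt']))

model = Doc2Vec(vector_size=300, min_count=2, epochs=40)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)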

Example 7: fit

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def fit(self, texts):

        model_param = {
            "vector_size": self.vector_size,
            "epochs": self.epochs,
            "min_count": self.min_count,
            "workers": self.n_jobs,
            "window": self.window,
            "dm_concat": self.dm_concat,
            "dbow_words": self.dbow_words,
        }

        corpus = [TaggedDocument(simple_preprocess(text), [i])
                  for i, text in enumerate(texts)]

        # If self.dm is 2, train both models and concatenate the feature
        # vectors later. Resulting vector size should be the same.
        if self.dm == 2:
            model_param["vector_size"] = int(model_param["vector_size"]/2)
            self.model_dm = _train_model(corpus, **model_param, dm=1)
            self.model_dbow = _train_model(corpus, **model_param, dm=0)
        else:
            self.model = _train_model(corpus, **model_param, dm=self.dm) 
Author: asreview, Project: asreview, Lines of code: 25, Source file: doc2vec.py
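_train_model is a private helper not shown in the snippet; a plausible minimal version, assuming it simply builds and trains a gensim Doc2Vec from the given keyword arguments:

from gensim.models.doc2vec import Doc2Vec

def _train_model(corpus, **kwargs):
    # hypothetical reconstruction; asreview's actual helper may differ
    model = Doc2Vec(**kwargs)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model

With dm=2, a document's final feature vector is presumably the concatenation of the PV-DM and PV-DBOW vectors for its tag, which is why vector_size is halved for each sub-model.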

Example 8: tagcol_paragraph_embeddings_features

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def tagcol_paragraph_embeddings_features(train_data):

    # Expects a dataframe with a 'values' column
    train_data_values = train_data['values']
    # Sample at most 1000 values per column; tag each column with its index
    columns = [TaggedDocument(random.sample(col, min(1000, len(col))), [i])
               for i, col in enumerate(train_data_values.values)]

    return columns

# Input: the tagged document collection returned by tagcol_paragraph_embeddings_features
# Only needed for training.
Author: megagonlabs, Project: sato, Lines of code: 12, Source file: paragraph_vectors.py
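The returned TaggedDocument list would then feed a Doc2Vec trainer; a hedged sketch (parameter values illustrative; the tags here are plain integers, so document vectors are indexed by position):

from gensim.models.doc2vec import Doc2Vec

columns = tagcol_paragraph_embeddings_features(train_data)  # train_data as described above
model = Doc2Vec(columns, vector_size=400, window=2, min_count=2,
                dm=0, workers=4, epochs=20)
col_vec = model.dv[0]  # vector of the first column (model.docvecs[0] in gensim 3.x)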

Example 9: _create_documents

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def _create_documents(self, features):
        features_out = [TaggedDocument(words=[str(feat) for feat_elems in feature_set for feat in feat_elems],
                                       tags=[str(node)])
                        for node, feature_set in features.items()]
        return features_out
Author: benedekrozemberczki, Project: karateclub, Lines of code: 5, Source file: musae.py

Example 10: _create_base_docs

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def _create_base_docs(self):
        features_out = [TaggedDocument(words=[str(feature) for feature in features],
                                       tags=[str(node)])
                        for node, features in self.features.items()]
        return features_out
Author: benedekrozemberczki, Project: karateclub, Lines of code: 5, Source file: musae.py

Example 11: create_documents

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def create_documents(features):
    """
    From a feature hash create a list of TaggedDocuments.
    :param features: Feature hash table - keys are nodes, values are feature lists.
    :return docs: Tagged Documents list.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs 
Author: benedekrozemberczki, Project: MUSAE, Lines of code: 10, Source file: utils.py

Example 12: __iter__

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            for line in f:
                try:
                    # each line is "sentence<RS>movie_id", delimited by U+241E (record separator)
                    sentence, movie_id = line.strip().split("\u241E")
                    tokens = self.tokenizer.morphs(sentence)
                    tagged_doc = TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id])
                    yield tagged_doc
                except Exception:
                    # skip malformed lines
                    continue
Author: ratsgo, Project: embedding, Lines of code: 12, Source file: sent_utils.py
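Assuming the enclosing class (name and constructor hypothetical) wraps a file name and a KoNLPy morphological analyzer such as Okt, it can be handed straight to Doc2Vec, since the object re-opens the file on every pass:

from gensim.models.doc2vec import Doc2Vec
from konlpy.tag import Okt  # Korean morphological analyzer (assumption)

corpus = Doc2VecInput("ratings.txt", tokenizer=Okt())  # hypothetical class around __iter__ above
model = Doc2Vec(corpus, vector_size=100, min_count=5, epochs=10)
print(model.dv.most_similar("MOVIE_12345", topn=3))    # movie id illustrative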

Example 13: test

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def test(args):
    vocab = load_json(args.vocab)
    # load corpus (alternative corpus iterators kept for reference)
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag)
                           for sentence, tag in corpus)

    d2v = load_doc2vec(args.load_model)
    doc_codes = predict(d2v, corpus_iter)
    dump_json(doc_codes, args.output)
Author: hugochan, Project: KATE, Lines of code: 15, Source file: run_doc2vec.py

Example 14: main

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, max_script_count, min_script_len, negative):

    doc2vec_tagged_documents = list()
    counter = 0

    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        if counter % 100000 == 0: logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own  method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI 
Author: Lab41, Project: altair, Lines of code: 42, Source file: build_doc2vec_model.py
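build_doc2vec_model is defined elsewhere in the Altair project; a plausible reconstruction, under the assumption that training_algorithm selects between PV-DM and PV-DBOW:

from gensim.models.doc2vec import Doc2Vec

def build_doc2vec_model(tagged_docs, training_algorithm, num_cores, epochs,
                        vector_size, window, min_count, alpha, negative):
    # hypothetical reconstruction; Altair's actual helper may differ
    dm = 1 if training_algorithm == "dm" else 0  # assumed flag convention
    return Doc2Vec(tagged_docs, dm=dm, workers=num_cores, epochs=epochs,
                   vector_size=vector_size, window=window,
                   min_count=min_count, alpha=alpha, negative=negative)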

Example 15: main

# Required import: from gensim.models import doc2vec
# Or: from gensim.models.doc2vec import TaggedDocument
def main(script_folder,output_folder,min_script_len,max_total_files,max_per_pkl):

    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter>= max_total_files: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files: break
                if counter!=0 and counter % 50000 == 0: logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info("Saving pickle file of tagged documents for size %d",max_per_pkl)
                    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder,"training"+str(counter)+".pkl"), "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'],py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False
        
    logger.info("Saving final pickle file of tagged documents for size %d",max_per_pkl)            
    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder,"training"+str(counter)+".pkl"), "wb"))

# Run this when called from CLI 
Author: Lab41, Project: altair, Lines of code: 38, Source file: build_doc2vec_trainingset.py


Note: The gensim.models.doc2vec.TaggedDocument examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors, and any distribution or use must follow the corresponding project's license. Please do not republish without permission.