

Python doc2vec.Doc2Vec Method Code Examples

This article collects typical usage examples of the Python method gensim.models.doc2vec.Doc2Vec. If you are wondering what doc2vec.Doc2Vec does, how to use it, or want to see it in context, the curated examples below should help. You can also explore other usage examples from the containing module, gensim.models.doc2vec.


The following presents 15 code examples of the doc2vec.Doc2Vec method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.

Example 1: fit

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def fit(self, graphs):
        """
        Fitting a Graph2Vec model.

        Arg types:
            * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
        """
        self._set_seed()
        self._check_graphs(graphs)
        documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, self.attributed) for graph in graphs]
        documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

        model = Doc2Vec(documents,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        dm=0,
                        sample=self.down_sampling,
                        workers=self.workers,
                        epochs=self.epochs,
                        alpha=self.learning_rate,
                        seed=self.seed)

        self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)] 
Author: benedekrozemberczki, Project: karateclub, Lines of code: 26, Source file: graph2vec.py
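
For context, here is a minimal usage sketch of the karateclub estimator that this fit() method belongs to. This is an assumption-laden sketch rather than part of the original listing: the constructor parameters shown mirror the attributes used in fit() above, and the toy graphs are generated with NetworkX.

import networkx as nx
from karateclub import Graph2Vec

# toy dataset: karateclub expects graphs whose nodes are labeled 0..n-1
graphs = [nx.newman_watts_strogatz_graph(50, 5, 0.3) for _ in range(10)]

model = Graph2Vec(dimensions=128, wl_iterations=2, epochs=10)
model.fit(graphs)                  # runs the fit() shown above
embedding = model.get_embedding()  # array of shape (len(graphs), dimensions)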

Example 2: fit

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def fit(self, graphs):
        """
        Fitting a GL2Vec model.

        Arg types:
            * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
        """
        self._set_seed()
        self._check_graphs(graphs)
        graphs = [self._create_line_graph(graph) for graph in graphs]
        documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, False) for graph in graphs]
        documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

        model = Doc2Vec(documents,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        dm=0,
                        sample=self.down_sampling,
                        workers=self.workers,
                        epochs=self.epochs,
                        alpha=self.learning_rate,
                        seed=self.seed)

        self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)] 
Author: benedekrozemberczki, Project: karateclub, Lines of code: 27, Source file: gl2vec.py

Example 3: _create_single_embedding

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def _create_single_embedding(self, features):
        """
        Learning an embedding from a feature hash table.
        :param features: A hash table with node keys and feature list values.
        :return embedding: Numpy array of embedding.
        """
        print("\nLearning the embedding.")
        document_collections = create_documents(features)

        model = Doc2Vec(document_collections,
                        vector_size=self.args.dimensions,
                        window=0,
                        min_count=self.args.min_count,
                        alpha=self.args.alpha,
                        dm=0,
                        negative=self.args.negative_samples,
                        ns_exponent=self.args.exponent,
                        min_alpha=self.args.min_alpha,
                        sample=self.args.down_sampling,
                        workers=self.args.workers,
                        epochs=self.args.epochs)

        emb = np.array([model.docvecs[str(n)] for n in range(self.graph.number_of_nodes())])
        return emb 
Author: benedekrozemberczki, Project: MUSAE, Lines of code: 26, Source file: musae.py

Example 4: initialize_model

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def initialize_model(self, corpus):
        logging.info("Building Doc2Vec vocabulary")
        self.corpus = corpus
        self.model = doc2vec.Doc2Vec(
            min_count=1,        # ignore all words with total frequency lower than this
            window=10,          # maximum distance between the current and predicted word within a sentence
            vector_size=300,    # dimensionality of the generated feature vectors
            workers=5,          # number of worker threads used to train the model
            alpha=0.025,        # the initial learning rate
            min_alpha=0.00025,  # learning rate will linearly drop to min_alpha as training progresses
            dm=1)               # training algorithm: dm=1 means 'distributed memory' (PV-DM),
                                # dm=0 means 'distributed bag of words' (PV-DBOW)
        self.model.build_vocab(self.corpus) 
Author: ibrahimsharaf, Project: doc2vec, Lines of code: 24, Source file: doc2vec_model.py
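
initialize_model only builds the vocabulary; it does not train. Below is a hedged sketch of the training and inference calls that would typically follow, assuming corpus is a list of gensim TaggedDocument objects (the epoch count is illustrative):

# train for a fixed number of epochs; gensim requires explicit counts here
self.model.train(self.corpus, total_examples=self.model.corpus_count, epochs=20)

# infer a vector for an unseen, pre-tokenized document
vector = self.model.infer_vector(["machine", "learning", "with", "gensim"])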

Example 5: forward

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def forward(self, graphs, **kwargs):
        if self.doc_collections is None:
            self.doc_collections = Parallel(n_jobs=self.worker)(
                delayed(Graph2Vec.feature_extractor)(graph, self.rounds, str(i)) for i, graph in enumerate(graphs)
            )
        self.model = Doc2Vec(
            self.doc_collections,
            vector_size=self.dimension,
            window=self.window_size,
            min_count=self.min_count,
            dm=self.dm,
            sample=self.sampling_rate,
            workers=self.worker,
            epochs=self.epoch,
            alpha=self.lr
        )
        vectors = np.array([self.model["g_"+str(i)] for i in range(len(graphs))])
        return vectors, None 
Author: THUDM, Project: cogdl, Lines of code: 20, Source file: graph2vec.py

Example 6: main

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def main(args):
    """
    Main function to read the graph list, extract features.
    Learn the embedding and save it.
    :param args: Object with the arguments.
    """
    graphs = glob.glob(args.input_path + "*.json")
    print("\nFeature extraction started.\n")
    document_collections = Parallel(n_jobs=args.workers)(delayed(feature_extractor)(g, args.wl_iterations) for g in tqdm(graphs))
    print("\nOptimization started.\n")

    model = Doc2Vec(document_collections,
                    vector_size=args.dimensions,
                    window=0,
                    min_count=args.min_count,
                    dm=0,
                    sample=args.down_sampling,
                    workers=args.workers,
                    epochs=args.epochs,
                    alpha=args.learning_rate)

    save_embedding(args.output_path, model, graphs, args.dimensions) 
Author: benedekrozemberczki, Project: graph2vec, Lines of code: 24, Source file: graph2vec.py
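
For reference, a hypothetical command-line invocation of this script. The flag names mirror the args attributes used above, but the exact option spelling and defaults should be checked against the graph2vec repository's argument parser, and the paths are placeholders:

python src/graph2vec.py --input-path dataset/ --output-path features/embedding.csv \
                        --dimensions 128 --wl-iterations 2 --workers 4 --epochs 10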

Example 7: train_doc2vec

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def train_doc2vec(paths, out='data/model.d2v', tokenizer=word_tokenize, sentences=False, **kwargs):
    """
    Train a doc2vec model on a list of files.
    """
    # dict.update() returns None, so the original `kwargs = {...}.update(kwargs)`
    # silently set kwargs to None; merge caller overrides into the defaults instead
    defaults = {
        'size': 400,
        'window': 8,
        'min_count': 2,
        'workers': 8
    }
    defaults.update(kwargs)
    kwargs = defaults

    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    print('Training doc2vec model...')
    m = Doc2Vec(_doc2vec_doc_stream(paths, n, tokenizer=tokenizer, sentences=sentences), **kwargs)

    print('Saving...')
    m.save(out) 
Author: frnsys, Project: broca, Lines of code: 24, Source file: doc2vec.py
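
A hypothetical invocation of the helper above (_doc2vec_doc_stream is defined elsewhere in the broca source; the corpus paths are placeholders, and window=10 demonstrates overriding one of the defaults via the keyword-argument merge):

from nltk.tokenize import word_tokenize

train_doc2vec(['data/corpus_a.txt', 'data/corpus_b.txt'],
              out='data/model.d2v',
              tokenizer=word_tokenize,
              window=10)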

Example 8: create_embedding

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def create_embedding(self):
        """
        Fitting an embedding.
        """
        document_collections = create_documents(self.pooled_features)

        model = Doc2Vec(document_collections,
                        vector_size=self.args.dimensions,
                        window=0,
                        min_count=self.args.min_count,
                        alpha=self.args.alpha,
                        dm=0,
                        min_alpha=self.args.min_alpha,
                        sample=self.args.down_sampling,
                        workers=self.args.workers,
                        epochs=self.args.epochs)

        embedding = np.array([model.docvecs[str(node)] for node in self.graph.nodes()])
        return embedding 
Author: benedekrozemberczki, Project: role2vec, Lines of code: 21, Source file: role2vec.py

Example 9: fit

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def fit(self, graph):
        """
        Fitting a Role2vec model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        self._check_graph(graph)
        walker = RandomWalker(self.walk_length, self.walk_number)
        walker.do_walks(graph)
 
        hasher = WeisfeilerLehmanHashing(graph=graph, wl_iterations=self.wl_iterations, attributed=False)
      
        node_features = hasher.get_node_features()
        documents = self._create_documents(walker.walks, node_features)

        model = Doc2Vec(documents,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        dm=0,
                        workers=self.workers,
                        sample=self.down_sampling,
                        epochs=self.epochs,
                        alpha=self.learning_rate,
                        seed=self.seed)

        self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)] 
Author: benedekrozemberczki, Project: karateclub, Lines of code: 31, Source file: role2vec.py

Example 10: _create_single_embedding

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def _create_single_embedding(self, document_collections):
        model = Doc2Vec(document_collections,
                        vector_size=self.dimensions,
                        window=0,
                        min_count=self.min_count,
                        alpha=self.learning_rate,
                        dm=0,
                        sample=self.down_sampling,
                        workers=self.workers,
                        epochs=self.epochs,
                        seed=self.seed)

        emb = np.array([model.docvecs[str(n)] for n in range(self.graph.number_of_nodes())])
        return emb 
Author: benedekrozemberczki, Project: karateclub, Lines of code: 16, Source file: musae.py

Example 11: gensim_doc2vec_vectorize

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def gensim_doc2vec_vectorize(corpus):
    from gensim.models.doc2vec import TaggedDocument, Doc2Vec

    corpus = [list(tokenize(doc)) for doc in corpus]
    docs   = [
        TaggedDocument(words, ['d{}'.format(idx)])
        for idx, words in enumerate(corpus)
    ]
    model = Doc2Vec(docs, size=5, min_count=0)
    return model.docvecs 
Author: foxbook, Project: atap, Lines of code: 12, Source file: vectorization.py
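
Note that the `size` parameter and the `model.docvecs` attribute belong to the pre-4.0 gensim API. Below is a sketch of the same function against gensim 4, where `size` was renamed to `vector_size` and `docvecs` to `dv` (behaviour otherwise unchanged; `tokenize` comes from the surrounding atap code):

def gensim_doc2vec_vectorize_v4(corpus):
    from gensim.models.doc2vec import TaggedDocument, Doc2Vec

    corpus = [list(tokenize(doc)) for doc in corpus]
    docs = [TaggedDocument(words, ['d{}'.format(idx)])
            for idx, words in enumerate(corpus)]
    model = Doc2Vec(docs, vector_size=5, min_count=0)
    return model.dv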

Example 12: transform

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def transform(self, documents):
        docs = [
            TaggedDocument(words, ['d{}'.format(idx)])
            for idx, words in enumerate(documents)
        ]
        model = Doc2Vec(docs, size=self.size, min_count=self.min_count)
        return np.array(list(model.docvecs)) 
Author: foxbook, Project: atap, Lines of code: 9, Source file: transformer.py

Example 13: build_doc2vec_model

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def build_doc2vec_model(doc2vec_tagged_documents,training_algorithm=2,num_cores=1,epochs=5,vector_size=300,window=5,min_count=10,alpha=0.05, negative=0):

    '''
    Doc2Vec parameters
    dm_mean - 0 uses sum, 1 uses mean. Only applies when dm is in non-concatenative mode.
    dm - defines the training algorithm. By default (dm=1), 'distributed memory' (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
    dbow_words - if set to 1, trains word vectors (in skip-gram fashion) simultaneously with DBOW doc-vector training; default is 0 (faster training of doc-vectors only).
    dm_concat - if 1, use concatenation of context vectors rather than sum/average; default is 0 (off). Note that concatenation results in a much larger model, as the input is no longer the size of one (sampled or arithmetically combined) word vector, but the size of the tag(s) and all words in the context strung together.
    dm_tag_count - expected constant number of document tags per document when using dm_concat mode; default is 1.
    trim_rule - vocabulary trimming rule; specifies whether certain words should remain in the vocabulary.
    size - the dimensionality of the feature vectors.
    window - the maximum distance between the predicted word and context words used for prediction within a document.
    alpha - the initial learning rate (will linearly drop to zero as training progresses).
    min_count - ignore all words with total frequency lower than this.
    max_vocab_size - limit RAM during vocabulary building.
    sample - threshold for configuring which higher-frequency words are randomly downsampled; default is 0 (off), useful value is 1e-5.
    iter - number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, but values of 10 or 20 are common in published 'Paragraph Vector' experiments.
    hs - if 1, hierarchical softmax will be used for model training (else set to 0).
    negative - if > 0, negative sampling will be used; the int for negative specifies how many "noise words" should be drawn (usually between 5 and 20).
    '''

    # build Doc2Vec's vocab
    doc2vec_model = doc2vec.Doc2Vec(dm=training_algorithm, size=vector_size, sample=1e-5, window=window, min_count=min_count, iter=20, dbow_words=1, workers=num_cores, alpha=0.05, min_alpha=0.001, negative=negative)
    doc2vec_model.build_vocab(doc2vec_tagged_documents)

    # run training epochs while shuffling the data and lowering the learning rate (alpha)
    # (this snippet targets the pre-1.0 gensim API; newer gensim requires
    # train(..., total_examples=..., epochs=...) with explicit counts)
    for i in range(epochs):
        logger.info("starting code epoch %d" % int(i+1))
        doc2vec_model.train(doc2vec_tagged_documents)
        doc2vec_model.alpha -= 0.002
        shuffle(doc2vec_tagged_documents)

    return doc2vec_model 
Author: Lab41, Project: altair, Lines of code: 35, Source file: build_doc2vec_model.py
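
Since the docstring and constructor above use the pre-1.0 parameter names, here is a hedged sketch of an equivalent call on gensim 4: `size` became `vector_size`, `iter` became `epochs`, `train()` now needs explicit counts, and `hs=1` is set explicitly to preserve the hierarchical-softmax behaviour that old gensim defaulted to when negative=0. gensim also decays alpha internally, making the manual decay loop above unnecessary.

from gensim.models import doc2vec

model = doc2vec.Doc2Vec(dm=1, vector_size=300, sample=1e-5, window=5,
                        min_count=10, epochs=20, dbow_words=1, workers=4,
                        alpha=0.05, min_alpha=0.001, hs=1, negative=0)
model.build_vocab(doc2vec_tagged_documents)
model.train(doc2vec_tagged_documents,
            total_examples=model.corpus_count,
            epochs=model.epochs)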

Example 14: main

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, max_script_count, min_script_len, negative):

    doc2vec_tagged_documents = list()
    counter = 0

    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        if counter % 100000 == 0: logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'],py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents,training_algorithm,num_cores,epochs,vector_size,window,min_count,alpha,negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI 
Author: Lab41, Project: altair, Lines of code: 42, Source file: build_doc2vec_model.py

Example 15: main

# Required import: from gensim.models import doc2vec [as alias]
# Or: from gensim.models.doc2vec import Doc2Vec [as alias]
def main(trainingset_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, negative):

    doc2vec_model = doc2vec.Doc2Vec(dm=training_algorithm, size=vector_size, sample=1e-5, window=window, min_count=min_count, iter=20, dbow_words=1, workers=num_cores, alpha=0.05, min_alpha=0.001, negative=negative)
    doc2vec_tagged_documents = list()

    for trainingset in os.listdir(trainingset_folder):
        logger.info("starting training set %s" % trainingset)
        doc2vec_tagged_documents += pickle.load(open(os.path.join(trainingset_folder,trainingset),"rb"))

    #doc2vec_model = train_doc2vec_model(doc2vec_model, doc2vec_tagged_documents,epochs)
    # build Doc2Vec's vocab
    logger.info("building vocabulary")
    doc2vec_model.build_vocab(doc2vec_tagged_documents)

    # run training epochs while shuffling data and lowering learning rate (alpha)
    for i in range(epochs):
        logger.info("starting code epoch %d" % int(i+1))
        doc2vec_model.train(doc2vec_tagged_documents)
        doc2vec_model.alpha -= 0.002
        shuffle(doc2vec_tagged_documents)
    #logger.info("saving model pickle for %s" % trainingset)
    #pickle.dump(doc2vec_model, open(model_pickle_filename[:-4]+"_"+str(int(time.time()))+os.path.splitext(model_pickle_filename)[1], "wb"))
    #doc2vec_model.alpha = 0.05
    #in_loop = True

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI 
Author: Lab41, Project: altair, Lines of code: 41, Source file: build_doc2vec_model_from_training_set.py


Note: The gensim.models.doc2vec.Doc2Vec method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from projects contributed by open-source developers, and copyright remains with the original authors; consult each project's license before distributing or reusing the code. Do not reproduce this compilation without permission.