

Python doc2vec.Doc2Vec Class Code Examples

This article collects and organizes typical usage examples of the Python class gensim.models.doc2vec.Doc2Vec. If you are wondering what the Doc2Vec class does, how to use it, or want to see concrete examples of it in real projects, the curated class examples below may help.


The sections below present 15 code examples of the Doc2Vec class, sorted by popularity by default.
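Before the collected examples, here is a minimal, self-contained sketch of the typical Doc2Vec life cycle (train, save, load, infer). It targets the gensim 4.x API, where the parameters are named vector_size and epochs and the document vectors live under model.dv; several examples below were written against older releases that use size/iter and model.docvecs instead. The corpus, file name, and hyperparameter values are illustrative only.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# A toy corpus: each document is a list of tokens plus a unique tag
corpus = [
    TaggedDocument(words=["machine", "learning", "is", "fun"], tags=["doc0"]),
    TaggedDocument(words=["doc2vec", "learns", "document", "vectors"], tags=["doc1"]),
]

# Train a tiny model (hyperparameters chosen only for illustration)
model = Doc2Vec(corpus, vector_size=50, window=5, min_count=1, workers=2, epochs=40)

# Save and reload -- the pattern used by most of the examples below
model.save("toy.d2v")
model = Doc2Vec.load("toy.d2v")

# Infer a vector for an unseen document and query for similar training docs
vec = model.infer_vector(["document", "embeddings"])
print(model.dv.most_similar([vec], topn=2))  # model.docvecs in gensim < 4.0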

Example 1: setUp

    def setUp(self):
        filename = datapath("alldata-id-10.txt")
        train_docs = read_sentiment_docs(filename)
        self.train_docs = train_docs
        self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
        self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")

        self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
        self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
Developer: RaRe-Technologies, Project: gensim, Lines: 9, Source: test_translation_matrix.py

Example 2: __init__

    def __init__(self, size=300, window=8, min_count=2, workers=8, path_to_model=None, stream_train=False):

        '''
        Initializes the Doc2Vec_Wrapper class. 

        Args:
            size (int): Specifies the size of the feature-vector. Defaults to 300
            window (int): Specifies the size of the context window from which the feature vector is learned
            min_count (int): Specifies the minimum number of instances of each word that is saved in the model
            workers (int): number of parallel processes
            path_to_model (str): Specifies model on disk 
            stream_train (bool): If true, update word vectors with new sentences. If false, just get doc vecs
        '''

        self.stream_train=stream_train

        self.is_trained = False
        self.model = None

        ## if a path is passed, try to load from disk; otherwise, retrain anyway
        if path_to_model:
            try:
                self.model = Doc2Vec.load(path_to_model)
                self.is_trained = True  ## only mark as trained if the load succeeded
            except Exception:
                pass

        ## params for Doc2Vec 
        self.size = size ## size of the vector
        self.window = window ## size of the context window
        self.min_count = min_count ## minimum count of vocab to store in binary tree
        self.workers = workers ## number of parallel processes == number of cores on the computer
Developer: redreamality, Project: broca, Lines: 32, Source: doc2vec_wrapper.py
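
For orientation, a hypothetical instantiation of the wrapper above might look like the following. The constructor signature is taken from the example; the path and parameter values are illustrative and not part of the original project.

# Illustrative usage only: assumes Doc2Vec_Wrapper (defined above) is importable
wrapper = Doc2Vec_Wrapper(size=100, window=5, min_count=2, workers=4,
                          path_to_model='models/docs.d2v')  # hypothetical path
if wrapper.is_trained:
    print('Loaded a pre-trained model from disk')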

Example 3: __init__

    def __init__(self, sentences, name, dataset_name, epochs=1, dimension=50, modelfile=None):
        self.inner_model = None

        # parameters
        self.dataset = dataset_name
        self.sentences = sentences
        self.name = name
        self.epochs = epochs
        self.dimension = dimension

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        if modelfile is not None:
            filename = modelfile
        else:
            filename = "DOC2VEC_%s_%s_%s_%s" % (self.dataset, self.name, self.epochs, self.dimension)
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        # train initial model
        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = Doc2Vec.load(self.filepath)
        else:
            self.inner_model = Doc2Vec(sentences, size=self.dimension)
            print self.inner_model.vocab.keys()
            self.inner_model.save(fname=self.filepath)
Developer: carriercomm, Project: medical-text, Lines: 27, Source: D2Vmodel.py

Example 4: test_category

def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy','sad'))
Developer: gsi-upm, Project: sematch, Lines: 7, Source: test_relatedness.py

Example 5: do_command

def do_command(args):
    # Load data
    data = load_data(args.input)
    #ids, documents = zip(*data)
    data = [(id, tokenize(doc)) for id, doc in data]
    ids = [id for id, _ in data]

    if not os.path.exists(args.modelfile):
        model = embed_documents(data)
        # Save model
        model.save(args.modelfile)
    else:
        model = Doc2Vec.load(args.modelfile)
        #map(model.infer_tokens, tokenized)
    print("Loaded model.")
    # Do k-nearest neighbors search.

    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(["id1", "id2", "score"])
    count = int(args.count) if args.count > 0 else len(model.docvecs)
    vectors = np.array([model.docvecs[i] for i in range(count)])
    del model # clear up memory

    for i, j, score in find_nearest_neighbors(vectors):
        id1, id2 = ids[i], ids[j]
        writer.writerow([id1, id2, score])
Developer: arunchaganty, Project: aeschines, Lines: 26, Source: doc2vec.py

Example 6: load_external

    def load_external(self, model_file_name):
        """
        load a doc2vec model from the file specified
        :param model_file_name: name of the model file
        :return:
        """
        self.model = Doc2Vec.load(model_file_name)
Developer: subhadeepmaji, Project: ml_algorithms, Lines: 7, Source: DocumentEmbedding.py

Example 7: varify

def varify():
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('data/doc2vec.d2v')
    documents = pickle.load(open('data/fedcorpus.pick', 'r'))
    for i in xrange(3):
        inferred_docvec = model.infer_vector(documents[i].words)
        print documents[i].tags
        print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
Developer: wtgme, Project: ohsn, Lines: 8, Source: doc2vec.py

Example 8: main

def main():
    """
    1. Divide total dataset into several data bins by randomly extracting data entries with given ratio.
    2. Run cross-validation for given numbers of iterations in either SMOTE or non-SMOTE mode.
    3. Report and present statistical evaluations for each data bin.
    """
    stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = list(), list(), list() # ns for non-SMOTE
    stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = list(), list(), list() # ws for with SMOTE
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    print "Loading Doc2Vec model ..."
    model_doc2vec = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True) # load Doc2Vec model
    print "Doc2Vec model loading done!"
    models = {"SVC": sklearn.svm.SVC(), \
              "Logit": sklearn.linear_model.LogisticRegression(), \
              "DT": sklearn.tree.DecisionTreeClassifier(), \
              "NBayes": sklearn.naive_bayes.GaussianNB(), \
              "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
    model_chosen = "NBayes"
    print "Classifier Type:", model_chosen
    for binIndex in range(NUM_OF_BINS):
        print "Experiment on DataSet#", str(binIndex)
        random.shuffle(data_pos)
        random.shuffle(data_neg)
        size_pos_bin, size_neg_bin = int(len(data_pos)*SAMPLE_SIZE_RATIO), int(len(data_neg)*SAMPLE_SIZE_RATIO)
        data_pos_bin, data_neg_bin = data_pos[:size_pos_bin], data_neg[:size_neg_bin] # dataset bin
        sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = list(), list(), list()
        sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION):
            random.seed(iteration)
            random.shuffle(data_pos_bin)
            random.shuffle(data_neg_bin)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos_bin, data_neg_bin, model_doc2vec) # convert to doc vectors
            print "non-SMOTE experiment"
            accuracys, precisions, recalls, Fscores = cross_validationS( \
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=False)  # cross validation
            sFscores_iter_ns.extend(Fscores)
            sRecalls_iter_ns.extend(recalls)
            sPrecisions_iter_ns.extend(precisions)
            print "with SMOTE experiemnt"
            accuracys, precisions, recalls, Fscores = cross_validationS( \
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=True)  # cross validation
            sFscores_iter_ws.extend(Fscores)
            sRecalls_iter_ws.extend(recalls)
            sPrecisions_iter_ws.extend(precisions)
        stats_Fscores_ns.append(sFscores_iter_ns)
        stats_precisions_ns.append(sPrecisions_iter_ns)
        stats_recalls_ns.append(sRecalls_iter_ns)
        stats_Fscores_ws.append(sFscores_iter_ws)
        stats_precisions_ws.append(sPrecisions_iter_ws)
        stats_recalls_ws.append(sRecalls_iter_ws)
    print "All Experiments Done!"
    save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns, stats_Fscores_ws, stats_recalls_ws,\
               stats_precisions_ws, model_name=model_chosen)
    print "Statistics ready!"
Developer: jerry-shijieli, Project: EmailSignature, Lines: 57, Source: EmailSignatureExtraction_v4.py

Example 9: get_model

def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C text format
    return model
Developer: DirkBrand, Project: Comment-Classification, Lines: 10, Source: ModelTraining.py

Example 10: create_and_train_models_d2vec

def create_and_train_models_d2vec(tag, cores=6):
    """
    Build vocabulary and train models
    :param tag: small or big 
    :param cores: number of cores
    :return: the current models
    """
    simple_models = get_models_d2vec(cores)
    model_files = get_models_filename_d2vec(tag)
    if all([os.path.exists(file) for file in model_files]):
        print('Models exist, loading...')
        for i, fname in enumerate(model_files):
            simple_models[i] = Doc2Vec.load(fname)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        return models_by_name
    else:
        print('Building models...')
        voc_model = build_vocab_d2vec(tag, cores)
        # Share vocabulary between models
        for model in simple_models:
            model.reset_from(voc_model)

        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        print('Training models...')
        print("START %s" % datetime.datetime.now())
        best_error = defaultdict(lambda: 1.0)  # to selectively-print only best errors achieved

        alpha, min_alpha, passes = (0.025, 0.001, 20)
        alpha_delta = (alpha - min_alpha) / passes
        file = x_train_str.format(tag)
        x_train = pd.read_hdf(file)
        train_list = x_train.tolist()

        for epoch in range(passes):
            shuffle(train_list)  # shuffling gets best results

            for name, train_model in models_by_name.items():
                # train
                duration = 'na'
                train_model.alpha, train_model.min_alpha = alpha, alpha
                with elapsed_timer() as elapsed:
                    train_model.train(CorpusStream(train_list, 'train'), total_examples=train_model.corpus_count,
                                      epochs=train_model.iter)
                    duration = '%.1f' % elapsed()

            print('completed pass %i at alpha %f' % (epoch + 1, alpha))
            alpha -= alpha_delta

        print("END %s" % str(datetime.datetime.now()))
        for name, model in models_by_name.items():
            name = name.replace('/', '').replace(',', '_')
            model.save('models/{0}_{1}.m'.format(name, tag))

    return models_by_name
Developer: papapana, Project: data_science, Lines: 54, Source: yelp_runner.py

Example 11: get_WordVector_matrix

def get_WordVector_matrix(label):
    model = Doc2Vec.load('./WordVector_model.d2v')
    size = len(label)
    vectors = np.zeros((size,depth))
    for i in range(size):
        try:
            doc_vector = model.docvecs[str(i)]
            vectors[i]=(doc_vector[0])
        except KeyError:
            print str(i) + ' occurs KeyError'
            pass
    return map(list,vectors)
Developer: azhe825, Project: CSC510, Lines: 12, Source: get_model.py

Example 12: test_models

def test_models( FULL_SIM, models_files ):
    test_papers = pd.read_csv( TEST_FILEPATH )

    # NOTE: Only need for testing with AII:
    keywords_docsrels = populate_iks_dict()
    authorities = initialize_authorities()

    for mod_f in models_files:
        print( 'Testing '+ mod_f )
        model = Doc2Vec.load( mod_f )
        print( 'Model loaded.' )

        test_model( FULL_SIM, model, test_papers, keywords_docsrels, authorities )
Developer: cuptrail, Project: papertrail-backend, Lines: 13, Source: doc2vec_train.py

Example 13: build_model

def build_model(x_train, x_test, iteration=5, save=True):
    if save:
        big_list = x_train + x_test
        model = Doc2Vec(min_count=2, window=10, size=100, sample=1e-4, negative=5, workers=8)
        model.build_vocab(big_list)
        for i in range(iteration):
            model.train(big_list)
        print 'saving model to file.....'
        model.save('./sentim.d2v')
    else:
        print 'loading model from file.....'
        model = Doc2Vec.load('./sentim.d2v')
    return model
Developer: moliq1, Project: sentiment_analysis, Lines: 13, Source: doc2vec.py

Example 14: get_vec

def get_vec(vector_file, id_file, w_file):
    p2v = Doc2Vec.load(vector_file)
    fout = open(w_file, "w")
    index = 0
    with open(id_file) as f:
        for line in f:
            index += 1
            if index % 1000 == 0:
                logging("%d cases" % index)
            line = line.strip()
            vec = p2v.docvecs[line]
            line_w = line + "\t" + "\t".join([str(x) for x in vec]) + "\t" + "\n"
            fout.write(line_w)
    fout.close()
Developer: lienzhen, Project: review_rating, Lines: 14, Source: generate_vector.py

Example 15: datacluster

def datacluster(data):
	infered_vectors_list = []
	print "load model..."
	model_dm = Doc2Vec.load(model_path)
	print "load train vectors..."
	for text, label in data:
		vector = model_dm.infer_vector(text)
		infered_vectors_list.append(vector)
	'''
	print "Check the optimized parameter..."
	Nc = range(1, 50)
	pca_data = [PCA(n_components = i).fit(infered_vectors_list).transform(infered_vectors_list) for i in Nc]
	kmeans = cluster.KMeans(init='k-means++',n_clusters=20,max_iter=300)
	score = [kmeans.fit(pca_data[i]).score(pca_data[i]) for i in range(len(pca_data))]
	print score
	plt.plot(Nc,score)
	plt.xlabel('PCA components')
	plt.ylabel('Score')
	plt.title('Elbow Curve')
	plt.show()
	'''

	print "PCA decomposition..."
	pca = PCA(n_components = 10).fit(infered_vectors_list)
	pca_data = pca.transform(infered_vectors_list)
	print "train K-Means model..."
	kmean_model = cluster.KMeans(init='k-means++',n_clusters=16,max_iter=300)
	kmean_model.fit(pca_data)
	#get the classified index
	result = kmean_model.fit_predict(pca_data)
	print "Predicting result:", result
	#save the cluster result
	joblib.dump(kmean_model, cluster_path)
	#load the cluster result
#	new_km = joblib.load(cluster_path)
	numSamples = len(pca_data) 
	print numSamples
	centroids = kmean_model.labels_
	
	#print centroids,type(centroids)  # show the cluster label assigned to each sample
	#print kmean_model.inertia_  # show the clustering quality (inertia)
	'''	
	marker = ['o', '.', ',', 'x', '*', 'd', 's', 'p']
	color = ['r', 'g', 'b', 'c', 'm', 'k', 'y', 'w']
	for i in xrange(numSamples):
		plt.scatter(pca_data[i][0], pca_data[i][1], \
				marker=marker[centroids[i]], color=color[centroids[i]])
	plt.show()
	'''
	return centroids
Developer: NeoCui, Project: Codebackup, Lines: 50, Source: cluster.py


Note: The gensim.models.doc2vec.Doc2Vec class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the corresponding project's license. Do not reproduce without permission.