This article collects typical usage examples of the gensim.models.Doc2Vec class in Python. If you have been wondering what the Doc2Vec class is for, how to use it, or what real code built on it looks like, the hand-picked examples below should help.
The following shows 15 code examples of the Doc2Vec class, sorted by popularity by default.
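Most of the examples share the same basic pattern: load a previously trained model with Doc2Vec.load, look up the vector of a training document through model.docvecs, and embed unseen text with model.infer_vector. The minimal sketch below illustrates that pattern; the model path and document tag are placeholders, and note that gensim 4.0+ exposes the document vectors as model.dv rather than model.docvecs.

from gensim.models import Doc2Vec

model = Doc2Vec.load('my_model.d2v')        # load a previously trained and saved model (placeholder path)
stored_vec = model.docvecs['SOME_TAG']      # vector of a training document, looked up by its tag
                                            # (model.dv['SOME_TAG'] in gensim >= 4.0)
new_vec = model.infer_vector(['some', 'new', 'tokens'])  # vector for unseen, pre-tokenized text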
Example 1: test_build
def test_build(Xtrain, ytrain, Xtest, ytest):
    '''
    Load the three varieties of Doc2Vec models that were previously saved.
    Build a random forest model for each Doc2Vec model. Test each random
    forest model with the same test data, and write the results to a CSV
    file for each Doc2Vec model.
    '''
    print "Loading the model..."
    models = [Doc2Vec.load("Doc2Vec_dbow_d300_n5_t4"),
              Doc2Vec.load("Doc2Vec_dm-c_d300_n5_w5_t4"),
              Doc2Vec.load("Doc2Vec_dm-m_d300_n5_w10_t4")]
    filenames = ['Doc2Vec_dbow.csv', 'Doc2Vec_dm-c.csv', 'Doc2Vec_dm-m.csv']
    forests = []
    for model in models:
        forests.append(build_forest(model, Xtrain, ytrain))
    for i in xrange(3):
        model = models[i]
        forest = forests[i]
        filename = filenames[i]
        features = []
        print "Creating feature list for test data..."
        for id in Xtest['id']:
            # remove the extra quotes around the id
            features.append(model.docvecs[id[1:-1]])
        print "Predicting test sentiment..."
        use_forest(forest, features, ytest, filename)
Example 2: __init__
def __init__(self, note_type, model_file, max_notes, dbow_file=None):
    self.note_type = note_type
    self.max_notes = max_notes
    self.model = Doc2Vec.load(model_file)
    if dbow_file:
        self.dbow = Doc2Vec.load(dbow_file)
    else:
        self.dbow = None
Example 3: load_model
def load_model():
    '''
    Loading and building train and test data.
    '''
    # loading labels
    labels = pickle.load(open('labels.p', 'rb'))
    # using LabelEncoder to convert strings to numerical values
    label_encoder = preprocessing.LabelEncoder()
    transformed_labels = label_encoder.fit_transform(labels)
    transformed_labels = np.array(transformed_labels)
    transformed_labels = label_binarize(transformed_labels,
                                        np.unique(transformed_labels))
    print('Found %d Labels' % len(label_encoder.classes_))
    print('Labels:', label_encoder.classes_)
    # initialising feature array
    cow_arrays = np.zeros((247543, 300))
    # learning model: Distributed Memory (DM)
    model = Doc2Vec.load('./acm_cow.d2v')
    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        cow_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays_cow, test_arrays_cow, train_labels_cow, test_labels_cow = \
        train_test_split(cow_arrays, transformed_labels,
                         test_size=0.1, random_state=42)
    # initialising feature array
    skip_arrays = np.zeros((247543, 300))
    # learning model: Distributed Bag of Words (DBOW)
    model = Doc2Vec.load('./acm_skip.d2v')
    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        skip_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays_skip, test_arrays_skip, train_labels_skip, test_labels_skip = \
        train_test_split(skip_arrays, transformed_labels,
                         test_size=0.1, random_state=42)
    to_return = (train_arrays_cow, train_labels_cow,
                 test_arrays_cow, test_labels_cow,
                 train_arrays_skip, train_labels_skip,
                 test_arrays_skip, test_labels_skip)
    return to_return
Example 4: load_questions
def load_questions(modelname, f_name, mapname, a_modelname):
    model = Doc2Vec.load(modelname)
    a_model = Doc2Vec.load(a_modelname)
    qids = list(enumerate([int(q) for q in open(f_name)]))
    rev_qids = [(item, index) for index, item in qids]
    qid_dict = dict(rev_qids)
    Q = []
    doc_dict = load_doc_hashes(mapname)
    for fname in os.listdir("questions"):
        Q.append(load_question(fname, model.docvecs.doctag_syn0, qid_dict, doc_dict, a_model))
    return Q
Example 5: main
def main():
    model = Doc2Vec.load('400_pvdm_doc2vec.d2v')
    model_dbow = Doc2Vec.load('400_pvdbow_doc2vec.d2v')
    # mistake: pvdm is actually pv-dbow
    path = 'datasets/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    files.pop(0)
    data_loader = DataLoader(path)
    domains = data_loader.csv_files
    names = {1: 'title', 4: 'abstract', 5: 'mesh', 'y': 6}
    domain_features = data_loader.get_feature_matrix(names)
    # get the total number of documents
    n_total_documents = 0
    for domain in domain_features:
        n_total_documents += len(domain[0])
    all_features = numpy.zeros(shape=(n_total_documents, 800))
    all_labels = numpy.asarray([])
    i = 0
    for domain in domain_features:
        features, labels = domain
        all_labels = numpy.hstack((all_labels, labels))
        for feature_vector in features:
            preprocessed_line = list(preprocess(feature_vector))
            all_features[i, 0:400] = numpy.float_(model.infer_vector(preprocessed_line))
            all_features[i, 400:] = numpy.float_(model_dbow.infer_vector(preprocessed_line))
            i += 1
    all_labels = numpy.asarray(all_labels)
    all_labels[all_labels == -1] = 0
    all_labels = numpy.intc(all_labels)
    train, test = data_loader.create_random_samples(all_features, all_labels)
    train_x, train_y = train
    test_x, test_y = test
    classifier = NeuralNet(n_hidden_units=[200], output_size=2, batch_size=20, n_epochs=200, dropout=True,
                           activation_function='relu', learning_rate=.3, momentum=True, momentum_term=.5)
    classifier.train(train_x, train_y)
    classifier.test(test_x, test_y)
Example 6: do
def do():
    global shouldStemData
    global shouldSaveModel
    from os.path import isfile
    from gensim.models import Doc2Vec
    from sys import argv
    if not isfile(modelname):  # or (len(argv) > 1 and argv[1] == '--update'):
        parsed = parseData(trainData)
        print 'Begin stemming data'
        parsed = stemData(parsed[:10000])
        if False:
            try:
                print 'Write stemmed data'
                f = open('stemmed_data.csv', 'w')
                f.write('\n'.join(map(lambda x: ' '.join(x), parsed)))
            except Exception:
                print 'Failed to write'
            finally:
                try:
                    f.close()
                except Exception:
                    print ''
        print 'Begin training'
        if False:  # len(argv) > 1 and argv[1] == '--update':
            print 'Update model'
            model = Doc2Vec.load(modelname)
            model.train(documents=parsed)
        else:
            model = Doc2Vec(documents=parsed)  # , size=100, workers=4, window=5, min_count=5)
        if shouldSaveModel:
            print 'Save model'
            model.save(modelname)
    else:
        stemData([])
        model = Doc2Vec.load(modelname)
    print 'Get results'
    t = ''
    try:
        t = getResults(model)
    except Exception:
        for x in model.most_similar(happy):
            print x[0].encode('utf8')
    open('res.txt', 'w').write(t.encode('utf8'))
Example 7: transform_input
def transform_input(vectorsize):
    # loads the premade model saved as amzn.d2v and writes its vectors into
    # arrays that can be fed into the scikit-learn algorithms
    print('Loading Doc2Vec model...')
    try:
        model = Doc2Vec.load('./amzn.d2v')
    except Exception as exception:
        print('No existing model found. Starting to create a model...')
        train_size = 50000
        d2v_source(train_size)
        model = create_doc2vec_model(vectorsize)
    # load or generate train and test data
    try:
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    except Exception as exception:
        print('No train data found. Generating new train and test files...')
        train_size = 50000
        test_size = 20000
        review_lines(train_size, test_size)
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    # infer vectors for the sentences of the train and test sets:
    # turn each document into a list of tokens and convert it into a vector.
    # this is slow, so the vectors are computed only for new train and test
    # sets and saved to disk for reuse.
    try:
        train_arrays = np.loadtxt('train_vectors.txt')
        test_arrays = np.loadtxt('test_vectors.txt')
    except Exception as exception:
        train_arrays = np.zeros((target.shape[0], vectorsize))
        test_arrays = np.zeros((target_test.shape[0], vectorsize))
        print('Vectorizing the train and test data...')
        for i in range(target.shape[0]):
            train_arrays[i, :] = model.infer_vector(train_raw[i].split())
        for i in range(target_test.shape[0]):
            test_arrays[i, :] = model.infer_vector(test_raw[i].split())
        np.savetxt('train_vectors.txt', train_arrays)
        np.savetxt('test_vectors.txt', test_arrays)
    return train_arrays, target, test_arrays, target_test
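Because infer_vector starts from a random initialization, repeated calls on the same tokens return slightly different vectors, which is one more reason Example 7 caches the results to disk. If more stable inferences are wanted, the number of inference passes can be raised; a small sketch, assuming model is an already loaded Doc2Vec instance (the parameter is epochs in gensim 4.x, steps in older releases):

# assumes `model` is a trained or loaded Doc2Vec instance
tokens = 'this product was surprisingly good'.split()
vec = model.infer_vector(tokens, epochs=50)   # more passes give a more stable vector, at some cost in time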
Example 8: load_or_train
def load_or_train(sentences=None, dim=83, epochs=10):
    # Doc2Vec params
    # --------------
    # min_count: ignore words with total frequency lower than this
    # window: maximum distance between the current and predicted word
    # size: vector embedding size
    # sample: threshold for downsampling higher-frequency words
    # negative: number of noise words drawn for negative sampling
    # workers: number of worker threads for parallel training
    try:
        print "> Loading model.."
        model = Doc2Vec.load("doc2vec.model")
    except IOError:
        print "> No pretrained model found or loading failed."
        model = Doc2Vec(min_count=1, size=dim, window=10, negative=5, sample=1e-4, workers=7)
        if not sentences:
            print "> No labeled sentences provided. Building them now."
            sentences = labeled_sentences()
        print "> Building vocabulary.. (this may take a while)"
        train_sentences, test_sentences = sentences.to_array()
        model.build_vocab(train_sentences + test_sentences)
        print "> Training Doc2Vec.. (this may take a while)"
        for i in range(epochs):
            print "--> Epoch %d" % i
            model.train(sentences.permutate())
        model.train_size = sentences.train_size
        model.test_size = sentences.test_size
        model.test_sentences = test_sentences
        model.save('./doc2vec.model')
    return model
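Example 8 targets the older gensim API, where the dimensionality is passed as size and train() is called once per epoch with no arguments. Under gensim 4.x the same flow looks roughly like the sketch below; corpus is a placeholder for a list of TaggedDocument objects such as the ones Example 8 builds.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [TaggedDocument(words=['an', 'example', 'sentence'], tags=['DOC_0'])]  # placeholder corpus

model = Doc2Vec(vector_size=83, min_count=1, window=10,
                negative=5, sample=1e-4, workers=7)
model.build_vocab(corpus)
# train() now takes the epoch count explicitly and performs the passes itself,
# so the manual epoch loop from Example 8 is not needed
model.train(corpus, total_examples=model.corpus_count, epochs=10)
model.save('doc2vec.model')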
Example 9: get_model
def get_model():
    try:
        model = Doc2Vec.load(DOC2VEC_MODEL)
        return model
    except:
        print "Model couldn't be loaded"
        return None
Example 10: instance_generator
def instance_generator(reviews_path, model_path):
    print "Loading model"
    model = Doc2Vec.load(model_path)
    print "Model loaded"
    with gzip.open(reviews_path, 'rt') as file:
        for index, line in enumerate(file):
            review = json.loads(line)
            yield model.infer_vector(review['reviewText'].split()), review['overall']
Example 11: load_embeddings
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('cn_word2vec'), binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(get_file_path('wordvecs_CVAT'))
    elif arg == 'IMDb':  # dim = 100
        model = Doc2Vec.load(get_file_path('test_doc2vec_model'))
    elif arg == 'CVAT_docvecs':  # dim = 50
        model = Doc2Vec.load(get_file_path('docvecs_CVAT'))
    elif arg == 'google_news':
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('google_news'), binary=True)
    elif arg == 'vader':
        model = gensim.models.Word2Vec.load('./data/vader_wordvecs.w2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
Example 12: __init__
def __init__(self, filename=None, min_count=1, alpha_initial=0.002,
             alpha_start=0.0005, alpha_end=0.0002, min_iters=10,
             monitor=None):
    Doc2Vec.__init__(self)
    if filename is not None:
        self.load_from_pickle(filename)
    self.checkpoint = {}
    self.filename = filename
    self.min_count = min_count
    self.alpha_initial = alpha_initial
    self.alpha_start = alpha_start
    self.alpha_end = alpha_end
    self.min_iters = min_iters
    if monitor is None:
        monitor = lambda *x: None
    self.monitor = monitor
    assert 'train_lbls' in dir(self)
Example 13: puebaSimpleCosenos
def puebaSimpleCosenos():
    model = Doc2Vec.load('./imdb_dm.d2v')
    source = 'data/trainneg.txt'
    generador = GeneraVectores(model)
    vecs = generador.getVecsFromFile(source)
    # cosine similarity between the first generated vector and the stored TRAIN_NEG_0 document vector
    print "cosine of first vector, trainneg"
    print dot(matutils.unitvec(vecs[0]), matutils.unitvec(model.docvecs["TRAIN_NEG_0"]))
Example 14: load_model
def load_model(language, models_path, models):
    if check_lang:
        path = models_path.format(language) + models[language]
        print path
        model = Doc2Vec.load(path)
        assert model.docvecs.count > 0
        return model
    else:
        return None
Example 15: do_doc2vec
def do_doc2vec(label_tweet, text_tweet):
    # Processing: run Doc2Vec on all of the labeled tweets passed as parameters.
    # Returns: the matrix of row vectors associated with each tweet.
    print("-> Doc2Vec...")
    documents = [TaggedDocument(words=text.split(), tags=[label])
                 for (label, text) in zip(label_tweet, text_tweet)]
    model = None
    filename_cache = ('model_nbdocs_' + str(args.amount) +
                      '_dim_' + str(args.dim) +
                      '.doc2vec')
    if not os.path.exists(filename_cache):
        model = Doc2Vec(documents, size=args.dim,
                        min_count=1, workers=4)
        model.save(filename_cache)
    else:
        model = Doc2Vec.load(filename_cache)
    data = None
    if args.coeff != 1:
        print("  #tag weighting: " + str(args.coeff))
    if args.tfidf:
        print("  tfidf...")
        data = do_tfidf(text_tweet, model)
    elif args.mean:
        print("  mean...")
        data = do_mean(text_tweet, model, True)
    else:
        print("  sum...")
        data = do_mean(text_tweet, model)
    print("  ok!")
    # gather the label of each tweet with the corresponding vector
    data = pd.DataFrame(data)
    final_data = pd.DataFrame({'id': label_tweet})
    final_data = pd.concat([final_data, data], axis=1)
    return final_data