当前位置: 首页>>代码示例>>Python>>正文


Python gensim.models方法代码示例

本文整理汇总了Python中gensim.models方法的典型用法代码示例。如果您正苦于以下问题:Python gensim.models方法的具体用法?Python gensim.models怎么用?Python gensim.models使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块gensim的用法示例。


在下文中一共展示了gensim.models方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: create_metadata_file

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def create_metadata_file(word2vec_file, output_file):
    """
    Write the model vocabulary to a metadata file, one word per line.

    Words are written in ascending order of their vocabulary index. The
    file is intended for the embedding visualization step.

    Args:
        word2vec_file: Path of the saved word2vec model file.
        output_file: Path of the metadata file to write.
    Raises:
        IOError: If ``word2vec_file`` is not an existing file.
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")

    w2v = gensim.models.Word2Vec.load(word2vec_file)
    index_of = {token: entry.index for token, entry in w2v.wv.vocab.items()}
    # Order the words by their vocabulary index (smallest first).
    ordered_tokens = sorted(index_of, key=index_of.get)

    with open(output_file, 'w+') as fout:
        for token in ordered_tokens:
            if token is not None:
                fout.write(token + '\n')
                continue
            # A missing token still needs a placeholder line in the output.
            print("[Warning] Empty Line, should replaced by any thing else, or will cause a bug of tensorboard")
            fout.write('<Empty Line>' + '\n')
开发者ID:RandolphVI,项目名称:Text-Pairs-Relation-Classification,代码行数:26,代码来源:data_helpers.py

示例2: load_poincare_model

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Load pre-trained Poincare embeddings from a file.

    When `word2vec_format` is true the file is read with
    `PoincareKeyedVectors.load_word2vec_format`; otherwise a saved
    `PoincareModel` is loaded and its `kv` attribute is returned.

    :param path: path of the file of the pre-trained Poincare embedding model
    :param word2vec_format: whether to load from word2vec format (default: True)
    :param binary: binary format, only used for the word2vec format (default: False)
    :return: a pre-trained Poincare embedding model
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if not word2vec_format:
        return PoincareModel.load(path).kv
    return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
开发者ID:stephenhky,项目名称:PyShortTextCategorization,代码行数:18,代码来源:wordembed.py

示例3: shorttext_to_avgvec

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def shorttext_to_avgvec(shorttext, wvmodel):
    """ Convert the short text into an averaged embedded vector representation.

    The text is tokenized, every token known to the word-embedding model is
    mapped to its vector, the vectors are summed, and the sum is scaled to
    unit length (unless its norm is zero).

    If none of the tokens is in the model, a zero vector of length
    `wvmodel.vector_size` is returned, so the result is always an array.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :rtype: numpy.ndarray
    """
    vectors = [wvmodel[token] for token in tokenize(shorttext) if token in wvmodel]
    if not vectors:
        # np.sum([], axis=0) collapses to the scalar 0.0, which is not a
        # vector; return an explicit zero vector instead.
        return np.zeros(wvmodel.vector_size)

    vec = np.sum(vectors, axis=0)

    # normalize
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm

    return vec
开发者ID:stephenhky,项目名称:PyShortTextCategorization,代码行数:25,代码来源:wordembed.py

示例4: load_word2vec_matrix

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    The matrix has one row per vocabulary entry, placed at that entry's
    vocabulary index. A row whose vocabulary key is ``None`` is left as zeros.

    Args:
        word2vec_file: The word2vec file
    Returns:
        A tuple ``(vocab_size, embedding_size, embedding_matrix)``.
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist. ")

    model = gensim.models.Word2Vec.load(word2vec_file)
    keyed_vectors = model.wv
    vocab_size = keyed_vectors.vectors.shape[0]
    embedding_size = model.vector_size
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for word, entry in keyed_vectors.vocab.items():
        if word is not None:
            # Look the vector up through `wv`: indexing the Word2Vec model
            # itself is deprecated, and the rest of this function uses `wv`.
            embedding_matrix[entry.index] = keyed_vectors[word]
    return vocab_size, embedding_size, embedding_matrix
开发者ID:RandolphVI,项目名称:Text-Pairs-Relation-Classification,代码行数:25,代码来源:data_helpers.py

示例5: compute_epoch_accuracies

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def compute_epoch_accuracies(root, prefix, analogy_file):
    """
    Evaluate every per-epoch model checkpoint on an analogy file.

    Checkpoints are the files matching ``<root>/<prefix>_epoch*.model``. The
    epoch number is the run of digits directly before ``.model`` in the file
    name and is used as the index into the per-epoch result lists.

    After each checkpoint is processed, two objects are written with
    ``save_obj`` under the ``models`` directory:

    * ``<prefix>_accuracies``: dict mapping section name to a per-epoch list
      of ``(correct, incorrect, accuracy)`` tuples;
    * ``<prefix>_loss``: the first loss followed by the differences between
      consecutive epoch losses.

    Args:
        root: Directory that contains the checkpoint files.
        prefix: File name prefix of the checkpoints.
        analogy_file: Analogy file passed to ``wv.accuracy``.
    """
    filenames = glob.glob(os.path.join(root, prefix + "_epoch*.model"))
    nr_epochs = len(filenames)
    accuracies = dict()
    losses = [0] * nr_epochs
    for filename in filenames:
        # Raw string avoids the invalid "\d" escape; "[:-6]" strips ".model".
        epoch = int(re.search(r"\d+\.model", filename).group()[:-6])
        m = Word2Vec.load(filename)
        losses[epoch] = m.get_latest_training_loss()
        sections = m.wv.accuracy(analogy_file)
        for sec in sections:
            if sec["section"] not in accuracies:
                accuracies[sec["section"]] = [0] * nr_epochs
            correct, incorrect = len(sec["correct"]), len(sec["incorrect"])
            # Guard on the total, not on `incorrect`: a section answered
            # entirely correctly must score 1.0, not 0.
            total = correct + incorrect
            accuracy = correct / total if total > 0 else 0
            accuracies[sec["section"]][epoch] = (correct, incorrect, accuracy)
        # Results are saved after every checkpoint, so partial progress is kept.
        save_obj(accuracies, os.path.join("models", prefix + "_accuracies"))
        save_obj(np.concatenate([np.array([losses[0]]), np.diff(losses)]), os.path.join("models", prefix + "_loss"))
开发者ID:materialsintelligence,项目名称:mat2vec,代码行数:23,代码来源:utils.py

示例6: train_artistsong2vec_model

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10,
                               size=250,
                               iter_n=50):
    """
    Train a Word2Vec model on flattened pair sequences and pickle it.

    Each element of ``input_datas`` is a sequence of items; the first two
    fields of every item are appended, in order, to one flat "sentence".
    The sentences are used to train ``gensim.models.Word2Vec`` and the
    trained model is pickled to ``fout_path``.

    Args:
        fout_path: Path of the file the trained model is pickled to.
        input_datas: Training data; if falsy and ``data_path`` is given,
            the data is unpickled from ``data_path`` instead.
        data_path: Path of a pickled training data file.
        min_count: Passed to Word2Vec as ``min_count``.
        sorted_vocab: Passed to Word2Vec as ``sorted_vocab``.
        window: Passed to Word2Vec as ``window``.
        size: Passed to Word2Vec as ``size``.
        iter_n: Passed to Word2Vec as ``iter``.
    """
    if not input_datas and data_path:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle the original code leaked.
        with open(data_path, 'rb') as fin:
            input_datas = pickle.load(fin)
    full_data = [
        [field for item in sequence for field in (item[0], item[1])]
        for sequence in input_datas
    ]
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        # Function-call form prints the same text on Python 2 and Python 3.
        print('model saved')
开发者ID:JayveeHe,项目名称:MusicTaster,代码行数:22,代码来源:rock_gensim.py

示例7: get_word

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def get_word(word):
    """
    Decide whether a title-like word should be kept.

    A word without a trailing ``_(qualifier)`` part is kept when it has
    fewer than five underscore-separated pieces. A word with a qualifier is
    rejected when the qualifier is ``disambiguation``; otherwise the
    qualifier is removed and the remainder must have fewer than five
    space-separated pieces.

    Returns:
        A tuple ``(keep, word)`` where ``keep`` is a bool and ``word`` is
        the unchanged input.
    """
    qualifier = re.search(r"_\(([A-Za-z0-9_]+)\)", word)

    if qualifier is None:
        keep = len(word.split("_")) < 5
    elif qualifier.group(1) == "disambiguation":
        keep = False
    else:
        stripped = re.sub(r'_\(.+\)', '', word)
        # NOTE(review): this branch splits on a space while the branch above
        # splits on "_"; kept as is, but confirm that this is intended.
        keep = len(stripped.split(" ")) < 5

    return keep, word

# Load the trained doc2vec and word2vec models. 
开发者ID:sb1992,项目名称:NETL-Automatic-Topic-Labelling-,代码行数:18,代码来源:get_indices.py

示例8: online_lda

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def online_lda(corpus, dictionary, k=25, alpha="symmetric", chunk_size=10000, update_every=1, passes=1):
    """
    Build the standard online LDA topic model (see gensim:
    http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)

    The corpus is split into chunks of 'chunk_size' documents, the model is
    updated every 'update_every' chunks, and 'passes' full passes are made
    over the corpus.

    Example: chunk_size=100, update_every=1, passes=1 makes a single pass
    over the corpus and updates the model after each chunk, so 500 documents
    give 5 chunks and 5 updates.

    Alpha values can be "symmetric", "asymmetric", and "auto". See:
    http://radimrehurek.com/gensim/models/ldamodel.html
    """
    lda_options = {
        "corpus": corpus,
        "id2word": dictionary,
        "num_topics": k,
        "alpha": alpha,
        "chunksize": chunk_size,
        "update_every": update_every,
        "passes": passes,
    }
    return models.ldamodel.LdaModel(**lda_options)
开发者ID:SMAPPNYU,项目名称:smappPy,代码行数:25,代码来源:build_models.py

示例9: batch_lda

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def batch_lda(corpus, dictionary, k=25, alpha="symmetric", passes=20):
    """
    Build basic batch LDA topic model (see gensim:
    http://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation)

    Makes 'passes' passes over the whole corpus. The model is built with
    update_every=0, i.e. batch rather than online updating.

    Alpha values can be "symmetric", "asymmetric", and "auto". See:
    http://radimrehurek.com/gensim/models/ldamodel.html
    """
    lda_options = {
        "corpus": corpus,
        "id2word": dictionary,
        "num_topics": k,
        "alpha": alpha,
        "update_every": 0,
        "passes": passes,
    }
    return models.ldamodel.LdaModel(**lda_options)
开发者ID:SMAPPNYU,项目名称:smappPy,代码行数:19,代码来源:build_models.py

示例10: predictData

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def predictData():
    """
    Run the model on a few sample review texts and print the results.

    The sample texts are converted with ``predict_wordtoVect``, classified
    with the model returned by ``get_model``, and each text is printed next
    to its predicted label (1 or 0, mapped through ``label2word``).
    """
    input_texts = ["很好很满意","不好不满意","质量有问题","商家态度很差","售后很渣,渣渣"]

    texts = predict_wordtoVect(input_texts)
    model = get_model()

    # Predict
    pred_result = model.predict_classes(texts)
    print(pred_result)

    label2word = {1: '正面', 0: '负面'}
    # Each prediction row carries its score in the first position.
    labels = [int(round(row[0])) for row in pred_result]
    for position, label in enumerate(labels):
        print('{0} -------- {1}'.format(label2word[label], input_texts[position]))
开发者ID:jarvisqi,项目名称:deep_learning,代码行数:23,代码来源:textAnalysis.py

示例11: predict_phrase

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def predict_phrase(phrase):
    """
    Classify a phrase with the module-level ``model``.

    Every character of ``phrase`` is mapped to a value through the
    character dictionary stored in ``EARL/models/char_dict.npy``. The
    resulting sequence is left-padded with zeros to 270 entries and fed to
    ``model.predict``.

    Args:
        phrase: The phrase to classify; no cleaning is applied here.
    Returns:
        'R' if the highest-scoring output position is 0, otherwise 'E'.
    Raises:
        KeyError: If a character of ``phrase`` is not in the dictionary.
    """
    max_len = 270  # fixed input length fed to the model

    # `.item()` unwraps a Python object from the loaded array, which is
    # presumably a pickled dict. NumPy >= 1.16.3 refuses to load object
    # arrays unless allow_pickle=True. Only load a trusted file this way.
    char_dict = np.load('EARL/models/char_dict.npy', allow_pickle=True).item()

    encoded = [char_dict[char] for char in phrase]
    padding = np.zeros(max_len - len(encoded))
    prediction = model.predict(np.concatenate((padding, encoded)).reshape(1, max_len))

    # Function-call form works on both Python 2 and Python 3.
    print(prediction[0])

    pred = np.argmax(prediction[0])

    return 'R' if pred == 0 else 'E'
开发者ID:AskNowQA,项目名称:EARL,代码行数:25,代码来源:predict_phrase.py

示例12: _expand_from

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def _expand_from(self, corpus, prefix=None, labels=None):
    """
    Add one new label entry per line of the corpus to the model.

    The corpus is iterated once to count its lines. ``syn0`` is enlarged by
    that many rows: the existing rows are kept, and the new rows are filled
    with uniform random values in [-0.5, 0.5) divided by ``layer1_size``.
    For every line a new vocabulary entry is created, registered in
    ``vocab`` under a label built from ``prefix`` and the line number, and
    appended to ``index2word``. The labels stand for documents/sentences,
    not for new vocabulary words.

    ``prefix`` defaults to 'SENT'. The ``labels`` argument is not used.

    Returns the length ``index2word`` had before the new labels were added.
    """
    prefix = 'SENT' if prefix is None else prefix
    num_lines = sum(1 for _ in corpus)

    # Expand syn0: random rows for the new labels, old rows copied back.
    old_rows, width = self.syn0.shape[0], self.syn0.shape[1]
    grown = np.random.random((old_rows + num_lines, width)).astype(self.syn0.dtype) - 0.5
    grown /= self.layer1_size
    grown[:old_rows] = self.syn0
    self.syn0 = grown

    index2word_start = len(self.index2word)
    for line_no in range(num_lines):
        # Expand vocab
        entry = gensim.models.doc2vec.Vocab()
        entry.index = len(self.index2word)
        entry.sample_probability = 1.0
        # We insert each sentence at the root of the
        # Huffman tree. It's a hack.
        entry.code = [1, ] * int(math.log(line_no + 1, 2) + 1)
        label = Document2Vec._make_label(prefix, str(line_no))
        self.vocab[label] = entry
        # Expand index2word
        self.index2word.append(label)
        assert len(self.index2word) == entry.index + 1
    return index2word_start
开发者ID:cemoody,项目名称:Document2Vec,代码行数:32,代码来源:document2vec.py

示例13: load_word2vec_model

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def load_word2vec_model(path, binary=True):
    """ Load pre-trained word vectors stored in the word2vec format.

    This is a thin wrapper around `KeyedVectors.load_word2vec_format`.

    :param path: path of the file of the pre-trained Word2Vec model
    :param binary: whether the file is in binary format (Default: True)
    :return: a pre-trained Word2Vec model
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(path, binary=binary)
    return keyed_vectors
开发者ID:stephenhky,项目名称:PyShortTextCategorization,代码行数:13,代码来源:wordembed.py

示例14: load_fasttext_model

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def load_fasttext_model(path, encoding='utf-8'):
    """ Load pre-trained FastText vectors.

    This is a thin wrapper around `gensim.models.fasttext.load_facebook_vectors`.

    :param path: path of the file of the pre-trained FastText model
    :param encoding: encoding passed on to the loader (Default: 'utf-8')
    :return: a pre-trained FastText model
    :type path: str
    :type encoding: str
    :rtype: gensim.models.keyedvectors.FastTextKeyedVectors
    """
    keyed_vectors = gensim.models.fasttext.load_facebook_vectors(path, encoding=encoding)
    return keyed_vectors
开发者ID:stephenhky,项目名称:PyShortTextCategorization,代码行数:11,代码来源:wordembed.py

示例15: save

# 需要导入模块: import gensim [as 别名]
# 或者: from gensim import models [as 别名]
def save(self, fname_or_handle, **kwargs):
    """ Refuse to save: persisting to a file is not supported.

    :param fname_or_handle: ignored
    :param kwargs: ignored
    :return: never returns
    :raise: IOError, always
    """
    message = 'The class RESTfulKeyedVectors do not persist models to a file.'
    raise IOError(message)
开发者ID:stephenhky,项目名称:PyShortTextCategorization,代码行数:10,代码来源:wordembed.py


注:本文中的gensim.models方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。