当前位置: 首页>>代码示例>>Python>>正文


Python KeyedVectors.load_word2vec_format方法代码示例

本文整理汇总了Python中gensim.models.KeyedVectors.load_word2vec_format方法的典型用法代码示例。如果您正苦于以下问题:Python KeyedVectors.load_word2vec_format方法的具体用法?Python KeyedVectors.load_word2vec_format怎么用?Python KeyedVectors.load_word2vec_format使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在gensim.models.KeyedVectors的用法示例。


在下文中一共展示了KeyedVectors.load_word2vec_format方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: load_word2vec_embeddings

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    """Build an embedding matrix for a tokenizer vocabulary from a binary word2vec file.

    Every row starts as a random draw from a normal distribution whose mean and
    standard deviation are those of the pre-trained vectors; rows whose word is
    present in the pre-trained model are then overwritten with its vector.

    Args:
        filepath: Path of the word2vec file (loaded with ``binary=True``).
        tokenizer: Object exposing ``word_index``, a mapping of word -> integer index.
        max_features: Upper bound on the number of rows in the matrix.
        embedding_size: Number of columns of the matrix.
            NOTE(review): presumably must equal the model's vector size - confirm.

    Returns:
        Array of shape ``(min(max_features, len(word_index)), embedding_size)``.
    """
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)

    # NOTE(review): ``wv`` / ``syn0`` are the attribute names the original code
    # relies on; confirm they exist in the installed gensim version.
    pretrained = model.wv.syn0
    emb_mean, emb_std = pretrained.mean(), pretrained.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        # Compare against the actual row count, not ``max_features``: when the
        # vocabulary is smaller than ``max_features`` an index equal to
        # ``len(word_index)`` would otherwise fall outside the matrix.
        if i >= nb_words:
            continue
        try:
            embedding_matrix[i] = model[word]
        except KeyError:
            # Word absent from the pre-trained model: keep the random row.
            continue
    return embedding_matrix
开发者ID:minerva-ml,项目名称:steppy-toolkit,代码行数:19,代码来源:embeddings.py

示例2: load_poincare_model

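# Required import for this example: from gensim.models.poincare import PoincareModel, PoincareKeyedVectors
# Note: the snippet calls PoincareKeyedVectors.load_word2vec_format; KeyedVectors is a class, so "from gensim.models.KeyedVectors import load_word2vec_format" is not a valid import.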
def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Read a pre-trained Poincare embedding model from disk.

    When `word2vec_format` is false the file is loaded as a full
    `PoincareModel` and its `kv` attribute is returned; otherwise the file is
    read directly through `PoincareKeyedVectors.load_word2vec_format`.

    :param path: location of the pre-trained Poincare embedding model file
    :param word2vec_format: load the file as word2vec format (default: True)
    :param binary: passed through as the `binary` flag of the word2vec loader (default: False)
    :return: the pre-trained Poincare embeddings
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if not word2vec_format:
        # Native gensim save: load the whole model and hand back its vectors.
        return PoincareModel.load(path).kv
    return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
开发者ID:stephenhky,项目名称:PyShortTextCategorization,代码行数:18,代码来源:wordembed.py

示例3: __init__

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def __init__(self, all_texts):
        """Load the pretrained vectors, print diagnostics and ingest ``all_texts``.

        The vectors are read from a hard-coded ``slovak.vec`` path and stored in
        ``self.sk_model``; ``self.dimension`` is set to the length of the vector
        for the word "auto". When ``all_texts`` is not ``None`` each text is
        passed to ``self.add_text_to_corpus`` and corpus statistics are printed.

        :param all_texts: iterable of texts to add to the corpus, or ``None``
            to only load the model
        """
        print("Reading the pretrained model for Word2VecEmbedder")
        vectors_path = '/data/verejne/datautils/embedding_data/slovak.vec'
        self.sk_model = KeyedVectors.load_word2vec_format(
            vectors_path, encoding='utf-8', unicode_errors='ignore')

        # Diagnostics about the loaded model.
        vocabulary_size = len(self.sk_model.vocab)
        print("Model contains", vocabulary_size, "tokens")
        print(self.sk_model.similarity("mesto", "mesta"))
        self.dimension = len(self.sk_model["auto"])
        # Membership probes for the same token as text and as UTF-8 bytes.
        print("sídlisk" in self.sk_model)
        print("sídlisk".encode('utf8') in self.sk_model)

        print("Dimension of embedding of 'auto' is", self.dimension)

        # Build the word frequency table only when texts were supplied.
        if all_texts is not None:
            for text in all_texts:
                self.add_text_to_corpus(text)
            self.print_corpus_stats()
开发者ID:verejnedigital,项目名称:verejne.digital,代码行数:21,代码来源:embed.py

示例4: _get_embedding_matrix

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def _get_embedding_matrix(self, tokenizer):
        """Build the embedding matrix for ``tokenizer`` from the pre-trained word2vec file.

        The file at ``self.pretrained_filepath`` is loaded in binary format. All
        rows start as random draws from a normal distribution with the mean and
        standard deviation of the pre-trained vectors; rows whose word exists in
        the pre-trained model are overwritten with its vector.

        Args:
            tokenizer: Object exposing ``word_index``, a mapping of word -> integer index.

        Returns:
            Array of shape
            ``(min(self.max_features, len(word_index)), self.embedding_size)``.
        """
        model = KeyedVectors.load_word2vec_format(self.pretrained_filepath, binary=True)

        # NOTE(review): ``syn0`` is the attribute name the original code relies
        # on; confirm it exists in the installed gensim version.
        pretrained = model.syn0
        emb_mean, emb_std = pretrained.mean(), pretrained.std()

        word_index = tokenizer.word_index
        nb_words = min(self.max_features, len(word_index))
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, self.embedding_size))
        for word, i in word_index.items():
            # Bound by the real row count: with a vocabulary smaller than
            # ``self.max_features`` an index equal to ``len(word_index)`` would
            # otherwise raise IndexError on assignment.
            if i >= nb_words:
                continue
            try:
                embedding_matrix[i] = model[word]
            except KeyError:
                # Word absent from the pre-trained model: keep the random row.
                continue
        return embedding_matrix
开发者ID:minerva-ml,项目名称:open-solution-toxic-comments,代码行数:19,代码来源:models.py

示例5: word2vec

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def word2vec(word2vec_path):
    """Return a lookup function backed by the word2vec file at ``word2vec_path``.

    If the file does not exist yet it is produced by converting the GloVe file
    returned by ``get_glove_file_path()``; the GloVe file is deleted afterwards.

    The returned callable maps a word to its vector, or to a zero vector of
    the model's ``vector_size`` when the word is unknown.
    """
    already_converted = path.exists(word2vec_path)
    if not already_converted:
        source_glove = get_glove_file_path()
        print('Converting Glove to word2vec...', end='')
        glove2word2vec(source_glove, word2vec_path)
        # Only the converted word2vec file is kept on disk.
        os.remove(source_glove)
        print('Done')

    print('Reading word2vec data... ', end='')
    model = KeyedVectors.load_word2vec_format(word2vec_path)
    print('Done')

    def get_word_vector(word):
        try:
            vector = model[word]
        except KeyError:
            # Unknown word: fall back to an all-zero vector.
            vector = np.zeros(model.vector_size)
        return vector

    return get_word_vector
开发者ID:YerevaNN,项目名称:R-NET-in-Keras,代码行数:22,代码来源:preprocessing.py

示例6: loadEmbeddings

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
        """Load word embeddings through an on-disk cache under ``data_path``.

        If ``<data_path>/embed.dat`` does not exist, the word2vec file at
        ``filepath`` is loaded, its vectors are written to ``embed.dat`` as a
        double-precision memmap, and its words (ordered by vocabulary index)
        are written one per line to ``<data_path>/embed.vocab`` as UTF-8.

        Afterwards ``self.W`` is set to a read-only memmap over ``embed.dat``
        with shape ``(vocab_size, self.embedding_size)`` and
        ``self.vocab_dict`` maps each word of ``embed.vocab`` to its line index.

        Args:
            filepath: Path of the word2vec file; only read when the cache is missing.
            data_path: Directory holding ``embed.dat`` and ``embed.vocab``.
            vocab_size: Number of rows to map from ``embed.dat``.
            binary_val: ``binary`` flag forwarded to ``load_word2vec_format``.
        """
        embed_short = os.path.normpath("%s/embed.dat" % data_path)
        vocab_path = os.path.normpath("%s/embed.vocab" % data_path)
        if not os.path.exists(embed_short):
            print("Caching word embeddings in memmapped format...")
            print(binary_val, filepath)
            wv = KeyedVectors.load_word2vec_format("%s" % (filepath), binary=binary_val)
            cache = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
            cache[:] = wv.syn0[:]
            # Push the vectors to disk before the file is re-opened below.
            cache.flush()
            # Write text through a UTF-8 codec so the file round-trips with the
            # UTF-8 read below; formatting encoded bytes with "%s" would store
            # their repr (b'...') on Python 3.
            with codecs.open(vocab_path, 'w', 'utf-8') as vocab_file:
                for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                    vocab_file.write("%s\n" % w)
            del cache, wv

        self.W = np.memmap(embed_short, dtype=np.double, mode="r", shape=(vocab_size, self.embedding_size))
        with codecs.open(vocab_path, 'r', 'utf-8') as f:
            vocab_list = [x.strip() for x in f.readlines()]
        self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
开发者ID:UKPLab,项目名称:acl2017-interactive_summarizer,代码行数:19,代码来源:loadEmbeddings.py

示例7: main

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def main(opt):
    """Extract word2vec vectors for a pickled vocabulary and save them as ``.npy`` files.

    Reads the vocabulary from ``opt.vocab_path`` and the binary word2vec model
    from ``opt.embed_weight``. For each position of ``vocab.idx2word`` the
    word's 300-dimensional vector is copied into the weight matrix and the
    mask is set to 1; when the lookup raises ``KeyError`` the row is filled
    with random normal values (scale 0.1) and the mask stays 0.

    Prints the vocabulary size and the number of words found, then writes the
    matrix and the mask under ``./embed/``.

    Args:
        opt: Object exposing ``vocab_path`` and ``embed_weight``.
    """
    # NOTE: unpickling executes code embedded in the file; only load
    # vocabulary files from a trusted source.
    with open(opt.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    num = len(vocab)
    print (num)
    model = KeyedVectors.load_word2vec_format(opt.embed_weight, binary=True)

    weights_matrix = np.zeros((num, 300))
    words_found = 0
    mask = np.zeros(num, dtype=int)

    # Only the position is used; the word is always fetched as idx2word[i].
    for i, _ in enumerate(vocab.idx2word):
        try:
            weights_matrix[i] = model[vocab.idx2word[i]]
            words_found += 1
            mask[i] = 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.1, size=(300, ))

    print (words_found)

    np.save("./embed/f30kword2vec300dim_3.npy", weights_matrix)
    np.save("./embed/f30kword2vecmask_3.npy", mask)
开发者ID:ZihaoWang-CV,项目名称:CAMP_iccv19,代码行数:26,代码来源:pick_wordvec.py

示例8: get_summary_nlp

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def get_summary_nlp(self):
        """
        Describe the NLP model file ``self.filename`` as a list of text lines.

        The file is loaded as a text-format word2vec model. On success the
        list holds a header followed by the file name, modification time,
        vector dimension and number of vectors, each rendered through the
        corresponding format template on ``self``. On any exception the lines
        collected so far are followed by a failure message and the error text.

        :return: list of summary lines
        """
        summary = []
        try:
            vectors = KeyedVectors.load_word2vec_format(self.filename, binary=False)
            modified = self._get_mtime()
            dimension = vectors.vector_size
            n_vectors = len(vectors.vectors)

            rule = "---------------------------"
            summary += [rule, "Summary for NLP model file:", rule]
            summary.append(self.FILE_NAME_OUTPUT.format(self.filename))
            summary.append(self.LAST_MODIFICATION_TIME.format(modified))
            summary.append(self.FEATURE_DIMENSION.format(dimension))
            summary.append(self.NUM_SENTENCES.format(n_vectors))
            summary.append("\n")
        except Exception as err:
            # Best-effort summary: report the failure instead of raising.
            summary.append("Failed to read NLP model {}.".format(self.filename))
            summary.append("Error: {}".format(err))

        return summary
开发者ID:ibmresilient,项目名称:resilient-community-apps,代码行数:27,代码来源:file_manage.py

示例9: load

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def load(cls, np2vec_model_file, binary=False, word_ngrams=0, word2vec_format=True):
        """
        Load an np2vec model from disk.

        Args:
            np2vec_model_file (str): path of the file holding the np2vec model
            binary (bool): whether the model file is stored in binary format
            word_ngrams (int {1,0}): 1 when the model uses word vectors with subword
            (ngrams) information, in which case it is loaded with ``FastText.load``
            word2vec_format(bool): whether the model was stored in the original
            word2vec format; when False it is loaded with ``KeyedVectors.load``

        Returns:
            the loaded np2vec model, or None when ``word_ngrams`` is neither 0 nor 1
        """
        if word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        if word_ngrams != 0:
            logger.error("invalid value for 'word_ngrams'")
            return None
        if not word2vec_format:
            # Native gensim format, memory-mapped read-only.
            return KeyedVectors.load(np2vec_model_file, mmap="r")
        return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
开发者ID:NervanaSystems,项目名称:nlp-architect,代码行数:25,代码来源:np2vec.py

示例10: _prepare_embedding_index

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def _prepare_embedding_index(self, binary=True):
        """Build an embedding index from the pre-trained embeddings at `self.filepath`.

        An 'embedding index' is a dictionary mapping each word of the loaded
        vocabulary to its embedding. When the instance has a truthy `debug`
        attribute, only the first ten thousand vectors are loaded.

        Args:
            binary (bool): True if the pre-trained embeddings are in C binary format, False if
                they are in C text format. Defaults to True.

        Returns:
            Dictionary mapping words to their pre-trained embeddings.
        """
        # Read the flag from the instance dict so a missing `debug` means "off".
        debug_enabled = vars(self).get("debug", False)
        limit = 10000 if debug_enabled else None
        vectors = KeyedVectors.load_word2vec_format(self.filepath, binary=binary, limit=limit)

        embedding_idx = {}
        for word in vectors.vocab:
            embedding_idx[word] = vectors[word]

        return embedding_idx
开发者ID:BaderLab,项目名称:saber,代码行数:21,代码来源:embeddings.py

示例11: get_word_embeddings

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the pre-trained encoding of the word.

    ``.txt`` files are read line by line as ``word v1 v2 ...`` separated by
    single spaces. ``.bin`` files are loaded as binary word2vec (first
    1,000,000 vectors only) and looked up by the lower-cased word. Any other
    extension prints a message and exits the process with status -1.
    Vocabulary words without a pre-trained vector get an all-zero vector.

    Inputs:
        w2vfile (string) : Path to the file containing (pre-trained) word embeddings
        word_to_index : Container of the vocabulary words (iterated and tested with ``in``)
        embedsize (int) : Length of the zero vector used for missing words
    Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding
    '''
    import sys

    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        # Context manager so the file is closed even if a line fails to parse.
        with open(w2vfile) as f:
            for line in tqdm(f):
                # Drop the trailing newline/whitespace so the last value is clean.
                values = line.rstrip().split(" ")
                word = values[0]
                # Parse the floats only for words we actually keep.
                if word in word_to_index:
                    word_embeddings[word] = np.asarray(values[1:], dtype='float32')
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                pass
    else:
        print ('Can\'t load word embeddings.')
        # sys.exit raises the same SystemExit as the interactive ``exit`` helper
        # but does not depend on the ``site`` module being loaded.
        sys.exit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))

    for word in word_to_index:
        if word not in word_embeddings:
            word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings
开发者ID:AnubhavGupta3377,项目名称:Text-Classification-Models-Pytorch,代码行数:41,代码来源:utils.py

示例12: load_word2vec_model

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def load_word2vec_model(path, binary=True):
    """ Read a pre-trained Word2Vec model from a word2vec-format file.

    :param path: location of the pre-trained Word2Vec model file
    :param binary: whether the file is stored in binary format (Default: True)
    :return: the pre-trained Word2Vec vectors
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    vectors = KeyedVectors.load_word2vec_format(path, binary=binary)
    return vectors
开发者ID:stephenhky,项目名称:PyShortTextCategorization,代码行数:13,代码来源:wordembed.py

示例13: load_text

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def load_text(self):
        """Load the text-format word2vec model at ``self.model_path``.

        ``self.vocab_path`` is passed as the second positional argument of
        ``load_word2vec_format``.

        :return: the loaded vectors, or ``None`` when a required file is missing
        """
        try:
            return KeyedVectors.load_word2vec_format(self.model_path, self.vocab_path, binary=False)
        except FileNotFoundError:
            # A missing file is reported to the caller as "no model".
            return None
开发者ID:msgi,项目名称:nlp-journey,代码行数:8,代码来源:word2vec.py

示例14: load_bin_word2vec

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def load_bin_word2vec(word_index, word2vec_path, max_index):
    """Load a binary word2vec file and select vectors via ``_select_vectors``.

    :param word_index: forwarded to ``_select_vectors``
    :param word2vec_path: path of the binary word2vec file
    :param max_index: forwarded to ``_select_vectors``
    :return: whatever ``_select_vectors`` returns for the loaded vectors
    """
    log.info('Begin load word vector from bin...')
    vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    selected = _select_vectors(vectors, word_index, max_index)
    log.info('End load word vector from bin...')
    return selected


# 加载词向量(获取需要的词向量:自己训练好的文本词向量) 
开发者ID:msgi,项目名称:nlp-journey,代码行数:11,代码来源:loader.py

示例15: load_text_vector

# 需要导入模块: from gensim.models import KeyedVectors [as 别名]
# 或者: from gensim.models.KeyedVectors import load_word2vec_format [as 别名]
def load_text_vector(word_index, word2vec_path, max_index):
    """Load a text-format word2vec file and select vectors via ``_select_vectors``.

    :param word_index: forwarded to ``_select_vectors``
    :param word2vec_path: path of the text-format word2vec file
    :param max_index: forwarded to ``_select_vectors``
    :return: whatever ``_select_vectors`` returns for the loaded vectors
    """
    log.info('Begin load word vector from text...')
    vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)
    selected = _select_vectors(vectors, word_index, max_index)
    log.info('End load word vector from text...')
    return selected
开发者ID:msgi,项目名称:nlp-journey,代码行数:8,代码来源:loader.py


注:本文中的gensim.models.KeyedVectors.load_word2vec_format方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。