当前位置: 首页>>代码示例>>Python>>正文


Python Dictionary.items方法代码示例

本文整理汇总了Python中gensim.corpora.dictionary.Dictionary.items方法的典型用法代码示例。如果您正苦于以下问题:Python Dictionary.items方法的具体用法是什么?Python Dictionary.items怎么用?有哪些Python Dictionary.items的使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类gensim.corpora.dictionary.Dictionary的用法示例。


在下文中一共展示了Dictionary.items方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: create_dictionaries

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import items [as 别名]
def create_dictionaries(train=None,
                        test=None,
                        model=None):
    """Build vocabulary lookups from a word-vector model and index two datasets.

    The function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the training and testing dictionaries

    Args:
        train: dict whose values are text strings. Each value is replaced
            IN PLACE by the list of integer indices of its words.
        test: dict with the same layout as `train`; transformed the same way.
        model: object exposing `vocab` (a mapping keyed by word) and
            `model[word]` vector lookup.

    Returns:
        The tuple `(w2indx, w2vec, train, test)` when all three arguments
        are supplied. Otherwise a message is printed and None is returned.
    """
    if train is None or test is None or model is None:
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(),
                        allow_update=True)
    # Shift every id by one so that 0 stays free for unknown words.
    w2indx = {v: k+1 for k, v in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(data):
        """Replace every text in `data` by its list of word indices."""
        for key in data.keys():
            txt = data[key].lower().replace('\n', '').split()
            # Words missing from the vocabulary map to 0. `dict.get` replaces
            # a bare `except:` that also swallowed KeyboardInterrupt and
            # any unrelated error raised inside the loop.
            data[key] = [w2indx.get(word, 0) for word in txt]
        return data

    train = parse_dataset(train)
    test = parse_dataset(test)
    return w2indx, w2vec, train, test
开发者ID:caomw,项目名称:DeepLearning_MachineLearning,代码行数:36,代码来源:imdb_embedding_w2v.py

示例2: create_dictionaries

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import items [as 别名]
def create_dictionaries(model=None,
                        combined=None):
    """Build vocabulary lookups from a word-vector model and index a corpus.

    The function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Converts every sentence of `combined` to word indices and pads
           the result to a common length

    Args:
        model: object exposing `vocab` (a mapping keyed by word) and
            `model[word]` vector lookup.
        combined: iterable of sentences, each an iterable of words.

    Returns:
        The tuple `(w2indx, w2vec, combined)` when both arguments are
        supplied, where `combined` is the output of
        `sequence.pad_sequences`. Otherwise a message is printed and None
        is returned.
    """
    if combined is None or model is None:
        # Call form of print: valid on Python 3 and prints the same text on
        # Python 2 (the original print statement was a Python 3 SyntaxError).
        print('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.vocab.keys(),
                        allow_update=True)
    # Index of every word in the model vocabulary, shifted by one so that 0
    # stays free. (Original author's note: the vocabulary holds the words
    # whose frequency exceeds 10 - depends on how the model was trained.)
    w2indx = {v: k+1 for k, v in gensim_dict.items()}
    # Word vector of every word in the model vocabulary.
    w2vec = {word: model[word] for word in w2indx}

    def parse_dataset(sentences):
        """Turn each sentence into the list of its word indices."""
        # Words missing from the vocabulary map to 0. `dict.get` replaces a
        # bare `except:` that also swallowed KeyboardInterrupt.
        return [[w2indx.get(word, 0) for word in sentence]
                for sentence in sentences]

    combined = parse_dataset(combined)
    # `sequence` and `maxlen` are module-level names defined outside this
    # function (presumably keras.preprocessing.sequence - TODO confirm).
    # Each row holds the indices of one sentence's words; words absent from
    # the vocabulary carry index 0.
    combined = sequence.pad_sequences(combined, maxlen=maxlen)
    return w2indx, w2vec, combined
开发者ID:BUPTLdy,项目名称:Sentiment-Analysis,代码行数:35,代码来源:Sentiment_lstm.py

示例3: create_mapping_dicts

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import items [as 别名]
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None):
    """Build the word->index, index->word and word->vector lookups.

    Index 0 is reserved for the newline character, so every vocabulary word
    receives its gensim dictionary id plus one.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Restrict the vocabulary to words seen in the bodies/headlines.
        bodies (optional): list of lists
            Required when `filter_corpus` is True.
        headlines (optional): list of lists
            Required when `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict

    Raises:
    ------
        Exception: `filter_corpus` is True but `bodies` or `headlines` is
            missing or empty.
    """

    if filter_corpus:
        if not (bodies and headlines):
            raise Exception(
                "Must pass in bodies and headlines with filter_corpus True!")
        wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    word_idx_dct = {}
    idx_word_dct = {}
    word_vector_dct = {}
    for token_id, token in gensim_dct.items():
        shifted_id = token_id + 1
        word_idx_dct[token] = shifted_id
        idx_word_dct[shifted_id] = token
        word_vector_dct[token] = wrd_embedding[token]

    # Slot 0 belongs to the newline character.
    newline = '\n'
    word_idx_dct[newline] = 0
    idx_word_dct[0] = newline

    # The newline gets an all-zero vector as long as the first word vector.
    first_vector = next(iter(word_vector_dct.values()))
    word_vector_dct[newline] = np.zeros(len(first_vector))

    return word_idx_dct, idx_word_dct, word_vector_dct
开发者ID:sallamander,项目名称:headline-generation,代码行数:44,代码来源:mappings.py

示例4: create_mapping_dicts

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import items [as 别名]
def create_mapping_dicts(wrd_embedding, reviews=None, vocab_size=None):
    """Build the word->index, index->word and word->vector lookups.

    Indices 0, 1 and 2 are reserved (padding mask, 'EOS' and 'UNK'), so
    every vocabulary word receives its gensim dictionary id plus three.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        reviews (optional): np.array (or array-like) of lists of strings
            When given, the vocabulary is filtered through `_filter_corpus`,
            either to the words in `reviews` or to the `vocab_size` most
            common words in `reviews` that are also in `wrd_embedding`.
        vocab_size (optional): int
            Keep only the `vocab_size` most common words from the reviews.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """

    if reviews is not None:
        wrd_embedding = _filter_corpus(wrd_embedding, reviews, vocab_size)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # 0 masks the padding, 1 is the end-of-sequence marker (EOS) and 2
    # stands for unknown words (UNK); real words start at 3.
    first_word_idx = 3
    wrd_idx_dct = {}
    idx_wrd_dct = {}
    wrd_vector_dct = {}
    for token_id, token in gensim_dct.items():
        shifted_id = token_id + first_word_idx
        wrd_idx_dct[token] = shifted_id
        idx_wrd_dct[shifted_id] = token
        wrd_vector_dct[token] = wrd_embedding[token]

    embedding_dim = wrd_embedding.vector_size
    for reserved_idx, marker in ((1, 'EOS'), (2, 'UNK')):
        wrd_idx_dct[marker] = reserved_idx
        idx_wrd_dct[reserved_idx] = marker
        # Each marker gets its own all-zero vector.
        wrd_vector_dct[marker] = np.zeros(embedding_dim)

    return wrd_idx_dct, idx_wrd_dct, wrd_vector_dct
开发者ID:sallamander,项目名称:review-analysis,代码行数:43,代码来源:mappings.py

示例5: create_mapping_dicts

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import items [as 别名]
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None):
    """Build the word->index, index->word and word->vector lookups.

    Every word keeps the id assigned to it by the gensim dictionary.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Restrict the vocabulary to words seen in the articles. Handy for
            faster iteration during the initial building/training phases.
        bodies (optional): list of lists
            Required when `filter_corpus` is True.
        headlines (optional): list of lists
            Required when `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict

    Raises:
    ------
        Exception: `filter_corpus` is True but `bodies` or `headlines` is
            missing or empty.
    """

    if filter_corpus:
        if not (bodies and headlines):
            raise Exception('Must pass in bodies and headlines with filter_corpus as True!')
        wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    word_idx_dct = {}
    idx_word_dct = {}
    word_vector_dct = {}
    for token_id, token in gensim_dct.items():
        word_idx_dct[token] = token_id
        idx_word_dct[token_id] = token
        word_vector_dct[token] = wrd_embedding[token]

    return word_idx_dct, idx_word_dct, word_vector_dct
开发者ID:JostineHo,项目名称:headline-generation,代码行数:38,代码来源:preprocessing.py

示例6: wmdistance

# 需要导入模块: from gensim.corpora.dictionary import Dictionary [as 别名]
# 或者: from gensim.corpora.dictionary.Dictionary import items [as 别名]
    def wmdistance(self, document1, document2):
        """
        Compute the Word Mover's Distance between two documents. When using this
        code, please consider citing the following papers:

        .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
        .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
        .. Matt Kusner et al. "From Word Embeddings To Document Distances".

        Note that if one of the documents have no words that exist in the
        Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned.

        This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).

        Args:
            document1: sized iterable of tokens (e.g. a list of words).
            document2: sized iterable of tokens (e.g. a list of words).

        Returns:
            The distance as computed by `emd`. Special cases:
            `float('inf')` when either document has no in-vocabulary token
            or when the distance matrix is all zeros, and `0.0` when both
            documents consist of one and the same unique token.

        Raises:
            ImportError: if `pyemd` is not available.

        Example:
            >>> # Train word2vec model.
            >>> model = Word2Vec(sentences)

            >>> # Some sentences to test.
            >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
            >>> sentence_president = 'The president greets the press in Chicago'.lower().split()

            >>> # Remove their stopwords.
            >>> from nltk.corpus import stopwords
            >>> stop_words = stopwords.words('english')
            >>> sentence_obama = [w for w in sentence_obama if w not in stop_words]
            >>> sentence_president = [w for w in sentence_president if w not in stop_words]

            >>> # Compute WMD.
            >>> distance = model.wmdistance(sentence_obama, sentence_president)
        """

        if not PYEMD_EXT:
            raise ImportError("Please install pyemd Python package to compute WMD.")

        # Remove out-of-vocabulary words.
        len_pre_oov1 = len(document1)
        len_pre_oov2 = len(document2)
        document1 = [token for token in document1 if token in self]
        document2 = [token for token in document2 if token in self]
        diff1 = len_pre_oov1 - len(document1)
        diff2 = len_pre_oov2 - len(document2)
        if diff1 > 0 or diff2 > 0:
            logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2)

        if not document1 or not document2:
            logger.info(
                "At least one of the documents had no words that were in the vocabulary. "
                "Aborting (returning inf)."
            )
            return float('inf')

        dictionary = Dictionary(documents=[document1, document2])
        vocab_len = len(dictionary)

        if vocab_len == 1:
            # Both documents are composed by a single unique token
            return 0.0

        # Sets for faster look-up.
        docset1 = set(document1)
        docset2 = set(document2)

        # Compute distance matrix. Only cells (i, j) whose row token occurs
        # in document 1 and whose column token occurs in document 2 are
        # filled; every other cell stays zero.
        distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
        for i, t1 in dictionary.items():
            if t1 not in docset1:
                # Loop-invariant check hoisted: the whole row stays zero.
                continue
            for j, t2 in dictionary.items():
                if t2 not in docset2:
                    continue
                # Compute Euclidean distance between word vectors.
                distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2))

        if np_sum(distance_matrix) == 0.0:
            # `emd` gets stuck if the distance matrix contains only zeros.
            logger.info('The distance matrix is all zeros. Aborting (returning inf).')
            return float('inf')

        def nbow(document):
            """Return the normalized bag-of-words vector of `document`."""
            d = zeros(vocab_len, dtype=double)
            bow = dictionary.doc2bow(document)  # Word frequencies.
            doc_len = len(document)
            for idx, freq in bow:
                d[idx] = freq / float(doc_len)  # Normalized word frequencies.
            return d

        # Compute nBOW representation of documents.
        d1 = nbow(document1)
        d2 = nbow(document2)

        # Compute WMD.
        return emd(d1, d2, distance_matrix)
开发者ID:jMonteroMunoz,项目名称:gensim,代码行数:93,代码来源:keyedvectors.py


注:本文中的gensim.corpora.dictionary.Dictionary.items方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。