

Python dictionary.Dictionary Class Code Examples

This article collects typical usage examples of the Python class gensim.corpora.dictionary.Dictionary. If you have been wondering what the Dictionary class does, how to use it, or where to find usage examples, the hand-picked class code examples below may help.


The following shows 15 code examples of the Dictionary class, ordered by popularity by default.
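Before the examples, here is a minimal, self-contained sketch of the core Dictionary workflow that most of the snippets below build on. The toy corpus and the file name are made up for illustration:

from gensim.corpora.dictionary import Dictionary

# Toy corpus: each document is a list of tokens (hypothetical data).
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system'],
         ['system', 'human', 'system', 'eps']]

dictionary = Dictionary(texts)           # builds the id <-> token mappings
print(dictionary.token2id)               # e.g. {'computer': 0, 'human': 1, ...}

bow = dictionary.doc2bow(['human', 'computer', 'computer'])
print(bow)                               # list of (word_id, count) pairs

# Prune very rare and very common words, then persist for later reuse.
dictionary.filter_extremes(no_below=2, no_above=0.9)
dictionary.save('example.dict')          # hypothetical file name
restored = Dictionary.load('example.dict')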

Example 1: EnronCorpus

class EnronCorpus(TextCorpus):
    def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans through all the emails once to determine the
        corpus vocabulary: only the `keep_words` most frequent words that appear in at
        least `no_below` documents are kept.
        """
        self.root_name = root_name
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Walk the file system, strip punctuation, and normalize all numbers to '2'.
        """
        filenames = walk_os(self.root_name)
        opened_files = gen_open(filenames)
        stripped_files = strip_punct(opened_files)
        length = 0
        for email in stripped_files:
            if len(email) > ARTICLE_MIN_CHARS:
                length += 1
                print("Iteration: %i" % length)
                yield tokenize(email)
        self.length = length  # cache corpus length
Developer: aurora1625, Project: EnronTopicModelling, Lines: 28, Source file: enroncorpus.py
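Because get_texts() is a generator, the Dictionary(self.get_texts()) call above streams through the mailbox once without loading it into memory. A hedged usage sketch (the maildir path is hypothetical, and walk_os, gen_open, strip_punct, tokenize, DEFAULT_DICT_SIZE, and ARTICLE_MIN_CHARS all come from the surrounding project):

corpus = EnronCorpus('/path/to/enron/maildir')  # hypothetical path
corpus.dictionary.save('enron.dict')            # persist the pruned vocabulary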

Example 2: create_dictionaries

def create_dictionaries(model=None, combined=None):
    '''Does a number of jobs:
        1. Creates a word-to-index mapping
        2. Creates a word-to-vector mapping
        3. Transforms the training and testing data
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        # index for every word whose frequency exceeds 10
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # word vector for every word whose frequency exceeds 10
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            '''Words become integers.'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # out-of-vocabulary words map to 0
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # pad each sentence's index sequence to a fixed length; words with
        # frequency below 10 keep index 0
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Developer: BUPTLdy, Project: Sentiment-Analysis, Lines: 33, Source file: Sentiment_lstm.py
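Note that model.vocab above relies on the pre-4.0 gensim word2vec API. A hedged sketch of the same index-building step on gensim >= 4.0, where the vocabulary moved to the KeyedVectors object:

# gensim >= 4.0: model.vocab is gone; model.wv.key_to_index maps word -> slot.
gensim_dict = Dictionary()
gensim_dict.doc2bow(model.wv.key_to_index.keys(), allow_update=True)
w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word -> index, 0 reserved for OOV
w2vec = {word: model.wv[word] for word in w2indx}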

Example 3: build_dictionaries_from_splits

def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    '''Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
    be a 3-tuple of the pickle file names in the following order:

        (title, body, tags)

    If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in range(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)

    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print("Before filtering...")
    print("utitledict:", utitledict)
    print("ubodydict:", ubodydict)
    print("utagdict:", utagdict)

    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print("saving utitledict...")
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print("saving ubodydict...")
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print("saving utagdict...")
            utagdict.save(save_pickle_tup[2])

    return (utitledict, ubodydict, utagdict)
Developer: mr1azl, Project: tag_recommender, Lines: 35, Source file: pruning.py

Example 4: doc_to_gensim

def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of (integer word ID, word count)
            2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)

    return (gdict, gdoc)
Developer: GregBowyer, Project: textacy, Lines: 30, Source file: export.py
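A hedged usage sketch, assuming spaCy and an English model are installed (the sample sentence is arbitrary):

import spacy

nlp = spacy.load('en_core_web_sm')   # assumes this model has been downloaded
doc = nlp('The quick brown fox jumps over the lazy dog.')
gdict, bow = doc_to_gensim(doc, lemmatize=True)
print(gdict.token2id)                # word -> integer id
print(bow)                           # [(word_id, count), ...]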

Example 5: loadDictionary

    def loadDictionary(fname, mapping_only=True):
        """
        Load previously stored mapping between words and their ids.

        The result can be used as the `id2word` parameter for input to transformations.
        """
        if mapping_only:
            result = {}
            for lineNo, line in enumerate(open(fname)):
                cols = line[:-1].split('\t')
                if len(cols) == 2:
                    wordId, word = cols
                elif len(cols) == 3:
                    wordId, word, dfs = cols
                else:
                    raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
                result[int(wordId)] = word # dfs not used
        else:
            result = Dictionary()
            for lineNo, line in enumerate(open(fname)):
                cols = line[:-1].split('\t')
                if len(cols) == 3:
                    wordId, word, dfs = cols
                else:
                    raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
                wordId = int(wordId)
                result.token2id[word] = wordId
                result.dfs[wordId] = int(dfs)

        return result
Developer: DavidNemeskey, Project: gensim, Lines: 30, Source file: wikicorpus.py
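The tab-separated format parsed above matches what Dictionary.save_as_text produces. In current gensim releases the round trip can be done directly; a minimal sketch (the file name is illustrative):

dictionary.save_as_text('wordids.txt')              # writes id<TAB>word<TAB>docfreq lines
id2word = Dictionary.load_from_text('wordids.txt')  # rebuilds token2id and dfs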

Example 6: create_dictionaries

def create_dictionaries(train=None, test=None, model=None):
    '''Does a number of jobs:
        1. Creates a word-to-index mapping
        2. Creates a word-to-vector mapping
        3. Transforms the training and testing data
    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            '''Words become integers.'''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)  # out-of-vocabulary words map to 0
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Developer: caomw, Project: DeepLearning_MachineLearning, Lines: 34, Source file: imdb_embedding_w2v.py

Example 7: create_corpus

def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\
    Build the word dictionary, bag-of-words corpus, and TF-IDF model for the
    cables in `src` and save them under `out_dir`.
    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
Developer: Tooa, Project: cablemap, Lines: 25, Source file: defaultcorpus.py
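DictionaryHandler, CorpusHandler, create_filter, and handle_source are specific to the cablemap project. A minimal in-memory sketch of the same dictionary -> bag-of-words -> TF-IDF pipeline in plain gensim (the toy corpus and file names are made up):

from gensim.corpora.dictionary import Dictionary
from gensim.corpora import MmCorpus
from gensim.models import TfidfModel

texts = [['cable', 'embassy', 'report'],
         ['embassy', 'visit'],
         ['report', 'visit', 'cable']]

dct = Dictionary(texts)                              # step 1: word dict
bow_corpus = [dct.doc2bow(text) for text in texts]
MmCorpus.serialize('cables_bow.mm', bow_corpus)      # step 2: vector space on disk

mm = MmCorpus('cables_bow.mm')                       # step 3: stream it back
tfidf = TfidfModel(mm, id2word=dct, normalize=True)  # step 4: TF-IDF model
MmCorpus.serialize('cables_tfidf.mm', tfidf[mm])     # step 5: save transformed corpus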

Example 8: CorpusOfMethodContents

class CorpusOfMethodContents(TextCorpus):

    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)

    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            # index of the freshly appended document (the original computed
            # len(self.mapMethodFQNtoIndex) - 1, which is off by one)
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update=True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())

    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex:
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None

    def get_texts(self):
        for content in self.methodContents:
            yield content
Developer: IFT-SE, Project: pfis3, Lines: 27, Source file: algorithmLexicalBase.py

Example 9: create_dictionary

def create_dictionary(analyzed_items_path, dictionary_path=None):
    dictionary = Dictionary(iter_docs(analyzed_items_path))

    if dictionary_path:
        dictionary.save(dictionary_path)

    return dictionary
Developer: NLeSC, Project: AVResearcherXL, Lines: 7, Source file: tasks.py

Example 10: get_corpus_dictionary

def get_corpus_dictionary():
    """Crafts a toy corpus and the dictionary associated."""
    # Toy corpus.
    corpus = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],

        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car']
    ]

    dictionary = Dictionary(corpus)

    # Transforming corpus with dictionary.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Building reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token

    return corpus, dictionary
Developer: bmabey, Project: pyLDAvis, Lines: 25, Source file: test_gensim_models.py
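Building the reverse index by hand works, but as far as I can tell gensim's Dictionary also fills id2token lazily: looking up any integer id rebuilds the whole mapping. A one-line alternative to the loop above:

_ = dictionary[0]             # any id lookup repopulates dictionary.id2token in full
print(dictionary.id2token)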

Example 11: WordCorpus

class WordCorpus(BaseCorpus):
    """\
    Wrapper around a `gensim.corpora.dictionary.Dictionary`.

    This is a light-weight alternative to `CableCorpus` to create an initial
    word dictionary::

        wd = WordCorpus()
        wd.add_text('ref-1', 'bla bla')
        # add more texts
        wd.dct.filter_extremes()

        corpus = CableCorpus('/my/directory/', wd.dct)
        corpus.add_text('ref-1', 'bla bla')
        # add more texts
        corpus.close()
    """
    def __init__(self, dct=None, tokenizer=None):
        """\
        Initializes the wrapper.

        `dct`
            An existing Dictionary or ``None`` if a new Dictionary should be
            created (default)
        `tokenizer`
            A tokenizer function or ``None``, see `BaseCorpus`
        """
        super(WordCorpus, self).__init__(tokenizer)
        self.dct = Dictionary() if dct is None else dct

    def add_words(self, reference_id, words):
        self.dct.doc2bow(words, allow_update=True)
Developer: Tooa, Project: cablemap, Lines: 32, Source file: corpus.py

Example 12: build_dictionary

    def build_dictionary(self):
        documents = ReadThreads(
            self.board, input_dir=self.input_dir, file_type='phrases',
            return_func=lambda x, y: y.split())
        dictionary = Dictionary(documents)
        dictionary.save(f'{self.board}.dictionary')

        return dictionary
Developer: ffaristocrat, Project: ml-sandbox, Lines: 8, Source file: build_model.py

Example 13: getDictionary

def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
Developer: KshitizSethia, Project: AcroDisam, Lines: 11, Source file: LDAModel.py

Example 14: build_dictionary_from_splits

def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    '''Build dictionary from splits. If `save_pickle` is provided, then save.'''
    unfiltered_dict = Dictionary()
    for eid in range(n):
        # the original hardcoded "../../data/proc_Train_%d.csv" here, ignoring
        # the `splits_template` parameter; use the template as documented
        unfiltered_dict.add_documents(csv_isolator(splits_template % eid, column))
    print("Before filtering,", unfiltered_dict)
    if save_pickle:
        print("\nsaving...")
        unfiltered_dict.save(save_pickle)

    return unfiltered_dict
Developer: mr1azl, Project: tag_recommender, Lines: 11, Source file: pruning.py

Example 15: SublexicalizedCorpus

class SublexicalizedCorpus(TextCorpus):
    def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True,
                 n_proc=1):
        self.order = order

        self.clean_func = clean_func
        self.base_corpus = base_corpus
        self.word_limit = word_limit
        self.n_proc = n_proc

        super(SublexicalizedCorpus, self).__init__()

        self.dictionary = Dictionary()

        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        a_count = 0  # articles processed
        t_count = 0  # tokens produced

        texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts())

        pool = multiprocessing.Pool(self.n_proc)

        # time.clock() was removed in Python 3.8; perf_counter() is the replacement
        start = time.perf_counter()
        prev = start

        for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100):
            for tokens in pool.imap_unordered(process, group):
                a_count += 1

                cur = time.perf_counter()

                if cur - prev > 60:
                    logging.info("Sublexicalized %d in %d seconds, %.0f t/s"
                                 % (t_count, cur - start, t_count*1. / (cur - start)))

                    prev = cur

                t_count += len(tokens)

                yield tokens

                if self.word_limit and t_count > self.word_limit:
                    break

        pool.terminate()

        end = time.perf_counter()
        logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s"
                     % (t_count, end - start, t_count*1. / (end - start)))

        self.length = t_count
Developer: andrely, Project: sublexical-features, Lines: 54, Source file: experiment_runner.py


Note: The gensim.corpora.dictionary.Dictionary class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors. Consult the corresponding project's License before distributing or reusing the code. Do not reproduce without permission.