

Python Dictionary.filter_extremes Method Code Examples

This article compiles typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.filter_extremes. If you are wondering what Dictionary.filter_extremes does or how to call it in practice, the curated examples below should help. They can also serve as a starting point for exploring the containing class, gensim.corpora.dictionary.Dictionary.


The sections below present 11 code examples of Dictionary.filter_extremes, sorted by popularity by default.
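For orientation, here is a minimal, self-contained sketch of what the method does (the toy corpus and parameter values are purely illustrative and not taken from any of the projects below): filter_extremes drops tokens that appear in fewer than no_below documents or in more than no_above (a fraction) of all documents, and then keeps at most the keep_n most frequent of the surviving tokens.

from gensim.corpora.dictionary import Dictionary

# A toy corpus of pre-tokenized documents, used only for illustration.
documents = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["eps", "user", "interface", "system"],
    ["system", "human", "system", "eps"],
]

dictionary = Dictionary(documents)   # build the token -> id mapping
print(len(dictionary))               # vocabulary size before filtering

# Keep tokens that occur in at least 2 documents and in no more than 50% of
# all documents, then retain at most the 100000 most frequent survivors.
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=100000)
dictionary.compactify()              # remove gaps left in the id sequence
print(len(dictionary))               # vocabulary size after filtering

bow = dictionary.doc2bow(["human", "computer", "survey"])  # bag-of-words vector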

Example 1: create_corpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\

    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
Author: Tooa, Project: cablemap, Lines of code: 27, Source file: defaultcorpus.py

Example 2: build_dictionary

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
Author: msushkov, Project: cs224w-wiki, Lines of code: 9, Source file: lda.py
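A dictionary persisted with save() this way can later be reloaded and reused to vectorize new text without rescanning the articles. A short sketch, assuming DICTIONARY_FILE is the same path used in build_dictionary above:

from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary.load(DICTIONARY_FILE)  # DICTIONARY_FILE: path written by build_dictionary above
bow = dictionary.doc2bow("some new wikipedia article text".lower().split())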

Example 3: EnronCorpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class EnronCorpus(TextCorpus):
    def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
    Initialize the corpus. This scans through all the emails once, to determine the corpus
    vocabulary. (only the first `keep_words` most frequent words that appear in at least 
    `no_below` documents are kept).
    """
        self.root_name = root_name
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
    Walk the file system, strip punctuation, normalize all numbers to be '2'.
    """
        filenames = walk_os(self.root_name)
        opened_files = gen_open(filenames)
        stripped_files = strip_punct(opened_files)
        length = 0
        for email in stripped_files:
            if len(email) > ARTICLE_MIN_CHARS:
                length += 1
                print "Iteration: %i" % length
                yield tokenize(email)
        self.length = length  # cache corpus length
Author: aurora1625, Project: EnronTopicModelling, Lines of code: 30, Source file: enroncorpus.py

Example 4: preprocess_corpora

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """


    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: 
    :param stopwords: 
    :param allowed_pos: 
    :param max_doc: 
    :return: 
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
Author: kensk8er, Project: MsTweetAnalysis, Lines of code: 57, Source file: preprocess.py

Example 5: FolderCorpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()

        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
                raw_text = raw_text.lower()
                for filt in self.preprocess:
                    raw_text = filt(raw_text)
                text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
                yield text
Author: wpli, Project: ptr, Lines of code: 23, Source file: utils_gensim.py

Example 6: HNCorpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class HNCorpus(TextCorpus):
    def __init__(self, hn_folder, dictionary=None):
        """
        Takes the HN folder of articles 
        as input and builds the dictionary and corpus
        """
        self.hn_folder = hn_folder
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, 
                    no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary


    def get_texts(self):
        """
        Iterate over the HN articles returning text
        """
        positions, hn_articles = 0, 0

        # ************ HN articles ************
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            hn_text = open(fname).read()
            hn_articles += 1
            if LEMMATIZE:
                result = utils.lemmatize(hn_text)
                positions += len(result)
                yield result
            else:
                result = tokenize(hn_text) # text into tokens here
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions))

        self.length = hn_articles # cache corpus length
Author: imclab, Project: HN_stats, Lines of code: 42, Source file: hn.py

Example 7: ArchiveCorpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class ArchiveCorpus(corpora.TextCorpus):

	def __init__(self, datafile, preprocess=[], dictionary=None):
		self.datafile = datafile
		self.preprocess = preprocess
		self.metadata = None

		if dictionary:
				self.dictionary = dictionary
		else:
				self.dictionary = Dictionary()
				if datafile is not None:
					self.dictionary.add_documents(self.get_texts())
					self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)


	def get_texts(self):
		with utils.smart_open(self.datafile) as inputfile:
			for line in inputfile:
				for f in self.preprocess:
					line = f(line)
				text = list(utils.tokenize(line, deacc=True, lowercase=True))
				yield text
Author: harixxy, Project: latentmodels, Lines of code: 25, Source file: 001_create_gensim_data.py

Example 8: WikiCorpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class WikiCorpus(TextCorpus):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word

    """
    def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that
        appear in at least `no_below` documents are kept).
        """
        self.fname = fname
        if keep_words is None:
            keep_words = DEFAULT_DICT_SIZE
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary


    def get_texts(self, return_raw=False):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print vec
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        if LEMMATIZE:
            lemmatizer = utils.lemmatizer
            yielded = 0

        for _, text in _extract_pages(bz2.BZ2File(self.fname)):
            text = filter_wiki(text)
            articles_all += 1
            if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                articles += 1
                if return_raw:
                    result = text
                    yield result
                else:
                    if LEMMATIZE:
                        _ = lemmatizer.feed(text)
                        while lemmatizer.has_results():
                            _, result = lemmatizer.read() # not necessarily the same text as entered above!
                            positions += len(result)
                            yielded += 1
                            yield result
                    else:
                        result = tokenize(text) # text into tokens here
                        positions += len(result)
                        yield result

        if LEMMATIZE:
            logger.info("all %i articles read; waiting for lemmatizer to finish the %i remaining jobs" %
                        (articles, articles - yielded))
            while yielded < articles:
                _, result = lemmatizer.read()
                positions += len(result)
                yielded += 1
                yield result

        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                     " (total %i articles before pruning)" %
                     (articles, positions, articles_all))
        self.length = articles # cache corpus length
Author: Hrehory, Project: gensim, Lines of code: 82, Source file: wikicorpus.py

Example 9: WikiCorpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class WikiCorpus(TextCorpus):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word

    """

    def __init__(self, fname, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that
        appear in at least `no_below` documents are kept).
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print vec
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith("      <text"):
                intext = True
                line = line[line.find(">") + 1 :]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find("</text>")  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filter_wiki("".join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                    yield result

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles before pruning)" % (articles, positions, articles_all)
        )
        self.length = articles  # cache corpus length
Author: hjanime, Project: gensim, Lines of code: 71, Source file: wikicorpus.py

Example 10: CDS_Corpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class CDS_Corpus(TextCorpus):
    def __init__(self, folder, dictionary=None):
        """
        Takes the list of txt files in a folder from Isabelle 
        as input and builds the dictionary and corpus
        """
        self.folder = folder
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, 
                    no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary


    def get_texts(self):
        """
        Iterate over the "documents" (sessions/places) returning text
        """
        filter_words = set()
        if FILTER_WORDS:
            filter_words = []
            with open(FILTER_WORDS) as f:
                for line in f:
                    filter_words.append(line.rstrip('\n'))
            filter_words = set(filter_words)
            #print "the following words will be filtered", filter_words
        filter_words_add = set()
        if FILTER_WORDS_ADD:
            filter_words_add = []
            with open(FILTER_WORDS_ADD) as f:
                for line in f:
                    filter_words_add.append(line.rstrip('\n'))
            filter_words_add = set(filter_words_add)

        positions, hn_articles = 0, 0
        fnamelist = []
        docs = 0
        for g in glob.iglob(self.folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist):
            with open(fname) as f:
                text = ""
                for line in f:
                    if line[0] != '@':
                        #sentence = re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')).split(' ')
                        sentence = tokenize(re.sub('\d+', '', line.rstrip('\n').strip('\t').replace('\n', '')))
                        for ind, word in enumerate(sentence):
                            w = word.lower().rstrip(' ').strip(' ').strip('\t')
                            sentence[ind] = w
                        if FILTER_WORDS:
                            for ind, word in enumerate(sentence):
                                if word.upper() in filter_words:
                                    sentence[ind] = ''
                        if FILTER_WORDS_ADD:
                            for ind, word in enumerate(sentence):
                                if word in filter_words_add:
                                    sentence[ind] = ''
                        text += ' '.join(sentence) + '\n'
                    else:
                        docs += 1
                        if LEMMATIZE:
                            result = lemmatizer(text)
                            positions += len(result)
                            yield result
                        else:
                            result = tokenize(text) # text into tokens here
                            positions += len(result)
                            yield result
                        text = ""
                docs += 1
                if LEMMATIZE:
                    result = lemmatizer(text)
                    positions += len(result)
                    yield result
                else:
                    result = tokenize(text) # text into tokens here
                    positions += len(result)
                    yield result

        print (">>> finished iterating over the corpus of %i documents with %i positions" % (docs, positions))

        self.length = docs # cache corpus length
Author: syhw, Project: contextual_word_segmentation, Lines of code: 85, Source file: prepare_corpus_tfidf.py

Example 11: WikiHNCorpus

# Module to import: from gensim.corpora.dictionary import Dictionary
# Or equivalently: from gensim.corpora.dictionary.Dictionary import filter_extremes
class WikiHNCorpus(TextCorpus):
    def __init__(self, wiki_file, hn_folder, dictionary=None, processes=None, 
            lemmatize=utils.HAS_PATTERN):
        """
        Takes the wikipedia *articles.xml.bz2 and the HN folder of articles 
        as input and builds the dictionary and corpus
        """
        global outputname
        self.lemmatize = lemmatize
        if self.lemmatize:
            print "We will lemmatize ('you were'->'be/VB')"
            self.outputname = outputname + "_lemmatized"
        else:
            print "We will only tokenize ('you were'->'you','were')"

        self.wiki_file = wiki_file
        self.hn_folder = hn_folder

        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes

        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=NO_BELOW, 
                    no_above=NO_ABOVE, keep_n=VOCAB_SIZE)
        else:
            self.dictionary = dictionary


    def get_texts(self):
        """
        Iterate over the Wikipedia dump and the HN articles returning text
        """
        wiki_articles, hn_articles, articles_all = 0, 0, 0
        positions, positions_all = 0, 0

        # ************ Wikipedia ************
        texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
        pool = multiprocessing.Pool(self.processes)
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1): # otherwise imap puts all the corpus into memory
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

        # ************ HN articles ************
        positions_after_wiki = positions
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist): # TODO parallelize as Wiki
            hn_text = open(fname).read()
            if self.lemmatize:
                result = utils.lemmatize(hn_text) # text into lemmas here
            else:
                result = tokenize(hn_text) # text into tokens here
            articles_all += 1
            positions_all += len(result)
            if len(result) > HN_ARTICLE_MIN_WORDS:
                hn_articles += 1
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
        # ************ /HN articles ************

        self.length = wiki_articles + hn_articles # cache corpus length
Author: imclab, Project: HN_stats, Lines of code: 76, Source file: wiki_and_hn.py


Note: The gensim.corpora.dictionary.Dictionary.filter_extremes examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not republish without permission.