

Python Dictionary.add_documents Method Code Examples

This article collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.add_documents, drawn from open-source projects. If you are unsure what Dictionary.add_documents does or how to call it, the curated examples below should help. You can also explore further usage examples of gensim.corpora.dictionary.Dictionary.


The following presents 12 code examples of Dictionary.add_documents, sorted by popularity by default.
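
As a quick orientation before the collected examples, here is a minimal self-contained sketch of the core add_documents workflow; the toy token lists are invented for illustration:

from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary()
# add_documents() takes an iterable of tokenized documents and extends
# the token-to-id mapping incrementally.
dictionary.add_documents([["human", "machine", "interface"],
                          ["graph", "trees"]])
dictionary.add_documents([["graph", "minors", "survey"]])
print(dictionary)                               # Dictionary(7 unique tokens: ...)
print(dictionary.doc2bow(["graph", "survey"]))  # [(graph_id, 1), (survey_id, 1)]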

Example 1: CorpusOfMethodContents

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class CorpusOfMethodContents(TextCorpus):
    
    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)
        
    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1  # index of the entry just appended
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update = True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())
    
    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex:
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None
    
    def get_texts(self):
        for content in self.methodContents:
            yield content
Developer: IFT-SE; Project: pfis3; Lines: 29; Source: algorithmLexicalBase.py
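
A hypothetical usage sketch for the class above (the FQNs and token lists are invented for illustration):

corpus = CorpusOfMethodContents()
corpus.addDocument("com.example.Foo.bar()", ["open", "file", "read"])
corpus.addDocument("com.example.Foo.baz()", ["write", "file"])
# Re-adding an existing FQN replaces its contents and rebuilds the dictionary:
corpus.addDocument("com.example.Foo.bar()", ["open", "stream"])
print(corpus.getMethodContentsForFqn("com.example.Foo.bar()"))  # ['open', 'stream']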

Example 2: build_dictionary

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
Developer: msushkov; Project: cs224w-wiki; Lines: 9; Source: lda.py
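
As a short follow-up sketch, the saved dictionary can later be reloaded instead of being rebuilt (DICTIONARY_FILE as assumed in the example above):

from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary.load(DICTIONARY_FILE)
print(dictionary)  # e.g. Dictionary(... unique tokens: ...)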

Example 3: SublexicalizedCorpus

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class SublexicalizedCorpus(TextCorpus):
    def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True,
                 n_proc=1):
        self.order = order

        self.clean_func = clean_func
        self.base_corpus = base_corpus
        self.word_limit = word_limit
        self.n_proc = n_proc

        super(SublexicalizedCorpus, self).__init__()

        self.dictionary = Dictionary()

        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        a_count = 0
        t_count = 0

        texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts())

        pool = multiprocessing.Pool(self.n_proc)

        start = time.clock()
        prev = start

        for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100):
            for tokens in pool.imap_unordered(process, group):
                a_count += 1

                cur = time.clock()

                if cur - prev > 60:
                    logging.info("Sublexicalized %d in %d seconds, %.0f t/s"
                                 % (t_count, cur - start, t_count*1. / (cur - start)))

                    prev = cur

                t_count += len(tokens)

                yield tokens

                if self.word_limit and t_count > self.word_limit:
                    break

        pool.terminate()

        end = time.clock()
        logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s"
                     % (t_count, end - start, t_count*1. / (end - start)))

        self.length = t_count
Developer: andrely; Project: sublexical-features; Lines: 56; Source: experiment_runner.py

Example 4: build_dictionary_from_splits

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    ''' Build dictionary from splits. If `save_pickle` is provided, then save. '''
    unfiltered_dict = Dictionary()
    for eid in xrange(n):
        unfiltered_dict.add_documents(csv_isolator("../../data/proc_Train_%d.csv" % eid, column))
    print "Before filtering,", unfiltered_dict
    if save_pickle:
        print "\nsaving..."
        unfiltered_dict.save(save_pickle)
    
    return unfiltered_dict
Developer: mr1azl; Project: tag_recommender; Lines: 13; Source: pruning.py

Example 5: __init__

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
    def __init__(self, fname, dictionary=None):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.
        """
        self.fname = fname
        self.metadata = False

        if dictionary is None:
            dictionary = Dictionary()
            for text in self.get_texts():
                dictionary.add_documents([text])
        self.dictionary = dictionary
Developer: buruzaemon; Project: LDA-reuters; Lines: 15; Source: reuterscorpus.py

Example 6: TextCorpus

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        super(gensim.corpora.TextCorpus, self).__init__()

        self.input = input
        self.metadata = False

        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase

        if dictionary is None:
            self.dictionary = Dictionary()

            if input is not None:
                self.dictionary.add_documents(self.get_texts())
            else:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
        else:
            self.dictionary = dictionary

        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        length = 0

        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            length += 1

            if self.pre_tokenized:
                if not isinstance(line, unicode):
                    line = unicode(line, encoding='utf8', errors='strict')
                yield line
            else:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)

        self.length = length
Developer: hans; Project: deepBLE; Lines: 49; Source: text.py
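
A brief, hypothetical usage sketch for the extended corpus above (the file name and constructor arguments are assumptions):

# One document (sentence) per line in the input file; the dictionary is
# built on construction and persisted alongside it.
corpus = TextCorpus(input="sentences.txt", lowercase=True,
                    dictionary_save_path="corpus.dict")
for bow in corpus:  # bag-of-words vectors via the inherited __iter__
    pass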

Example 7: FolderCorpus

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()

        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
                raw_text = raw_text.lower()
                for filt in self.preprocess:
                    raw_text = filt(raw_text)
                text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
                yield text
Developer: wpli; Project: ptr; Lines: 23; Source: utils_gensim.py

Example 8: ArchiveCorpus

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class ArchiveCorpus(corpora.TextCorpus):

    def __init__(self, datafile, preprocess=[], dictionary=None):
        self.datafile = datafile
        self.preprocess = preprocess
        self.metadata = None

        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            if datafile is not None:
                self.dictionary.add_documents(self.get_texts())
                self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)

    def get_texts(self):
        with utils.smart_open(self.datafile) as inputfile:
            for line in inputfile:
                for f in self.preprocess:
                    line = f(line)
                text = list(utils.tokenize(line, deacc=True, lowercase=True))
                yield text
Developer: harixxy; Project: latentmodels; Lines: 25; Source: 001_create_gensim_data.py

Example 9: DefaultJsonCorpus

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class DefaultJsonCorpus(object):
    """
    A default JSON corpus based on gensim TextCorpus. It assumes a file or list of JSON as input.
    The methods provided by gensim TextCorpus are needed for the GenSim training.
    Any corpus provided to DocumentSimilarity should provide the methods given in this class.
    """
    def __init__(self, input=None,create_dictionary=True):
        super(DefaultJsonCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())


    def __iter__(self):
        for text in self.get_texts():
            yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return utils.file_or_filename(self.input)

    def __len__(self):
        if not hasattr(self, 'length'):
            # cache the corpus length
            self.length = sum(1 for _ in self.get_texts())
        return self.length

    def get_json(self):
        if isinstance(self.input,list):
            for j in self.input:
                yield j
        else:
            with self.getstream() as lines:
                for line in lines:
                    line = line.rstrip()
                    j = json.loads(line)
                    yield j

    def get_texts(self,raw=False):
        """
        yield raw text or tokenized text
        """
        for j in self.get_json():
            text = j["text"]
            if raw:
                yield text
            else:
                yield utils.tokenize(text, deacc=True, lowercase=True)

    def get_meta(self):
        """
        return a json object with meta data for the documents. It must return:
        id - id for this document
        optional title and tags. Tags will be used as base truth used to score document similarity results.
        """
        doc_id = 0
        for j in self.get_json():
            m = copy.deepcopy(j)
            m['id'] = long(m['id'])
            m['corpus_seq_id'] = doc_id
            doc_id += 1
            yield m

    def get_dictionary(self):
        return self.dictionary
Developer: transformersprimeabcxyz; Project: seldon-server-AI-ML; Lines: 68; Source: docsim.py
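
A small usage sketch for the JSON corpus above, with toy records invented for illustration:

docs = [{"id": 1, "text": "the quick brown fox"},
        {"id": 2, "text": "the lazy dog"}]
corpus = DefaultJsonCorpus(input=docs)
print(len(corpus))        # 2, cached after the first pass
for bow in corpus:
    print(bow)            # one sparse bag-of-words vector per document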

Example 10: TextCorpus

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to match
    your particular input.

    Given a filename (or a file-like object) in constructor, the corpus object
    will be automatically initialized with a dictionary in `self.dictionary` and
    will support the `iter` corpus method. You must only provide a correct `get_texts`
    implementation.

    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")


    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            if self.metadata:
                yield (self.dictionary.doc2bow(text[0], allow_update=False), text[1])
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)


    def getstream(self):
        return getstream(self.input)


    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        length = 0
        for lineno, line in enumerate(getstream(self.input)):
            length += 1
            yield utils.tokenize(line, lowercase=True)
        self.length = length


    def __len__(self):
        return self.length # will throw if corpus not initialized
Developer: EricChanBD; Project: gensim; Lines: 66; Source: textcorpus.py

Example 11: LDA

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class LDA(object):

    def __init__(self, topics = 10, 
                 worker = 3, 
                 pretrained_model = None, 
                 dictionary = None):
        """
        lda模型训练初始化。
        Args:
            topics -- 指定主题个数
            worker -- 并行化参数,一般为core数量减一
            pretrained_model -- 预训练的模型,由于支持在线更新,所以可以加载上次训练的模型
            dictionary -- 训练时词需要转换成ID,所以跟模型配套有一个ID映射的词典
        Example:
            >>> lda = LDA(topics = 20, worker = 2, 
                          pretrained_model = model_file, 
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """

        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        保存训练的模型,同时保存对应的词典
        Args:
            model_file -- 模型文件
            dictionary_file -- 词典文件
        Returns:
            无
        """

        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus = [[]]):
        """
        在线更新,在已有模型的基础上在线更新
        Args:
            corpus -- 用于更新的文档列表
        """

        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data =  [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, self._topics)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data =  [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)

    def inference(self, document = []):
        """
        对新文档推断其话题分布
        Args:
            document -- 文档,其实是词列表
        Returns:
            话题分布列表        
        """
        if self._model:
            doc =  [self._common_dictionary.doc2bow(document)]
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
Developer: freygit; Project: 36; Lines: 84; Source: lda.py

Example 12: Similarities

# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]

#......... part of the code omitted here .........
        :return: bool
        """
        return os.path.isfile(self._create_resource_path(resource_file))

    def _run_transformers(self):
        """
        Runs all the transformer methods listed providing the MongoDB client context instance.
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            self._create_dictionary(client)
            self._create_lsi_similarity_index(client)

    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets
        the object's dictionary property.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        if self._resource_exists(self.dictionary_file):
            self.logger().debug(
                    "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
        else:
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()

        documents = []
        for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
            documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))

        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))

    def _create_lsi_similarity_index(self, mongo_client):
        """
        Creates a Similarity index based on LSI model from the available dictionary. Sets the object's lsi_model and
        similarity_index object properties.
        """
        from gensim.models import LsiModel
        from gensim.similarities import MatrixSimilarity

        self._lsi_mapping.clear()
        bow_corpus = []
        for idx, tp in enumerate([(c, di) for c in mongo_client.scrappers_collections() for di in c.find()]):
            self._lsi_mapping[idx] = tp
            bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property]))

        self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary)
        self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus])

    def calculate_similarities(self):
        """
        Find / calculate similarities between documents in the index.
        Returns a defaultdict with the key as the LSI index and the value is a list of tuples with the following values
        (LSI model Index, similarity threshold - numpy.float32)
        tuple
        :return: defaultdict(list)
        """
        similarities = defaultdict(list)
        if not self.lsi_index_mapping:
            return

        for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)):
Developer: nathanIL; Project: openews; Lines: 70; Source: __init__.py


Note: The gensim.corpora.dictionary.Dictionary.add_documents examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by many developers, and copyright of the source code remains with the original authors; consult the corresponding project's license before distributing or using the code. Do not reproduce without permission.