This article collects typical usage examples of the Dictionary.add_documents method from Python's gensim.corpora.dictionary module. If you are wondering what Dictionary.add_documents does, how to call it, or what it looks like in real code, the curated examples below should help. You can also read further about the enclosing class, gensim.corpora.dictionary.Dictionary.
Twelve code examples of Dictionary.add_documents are shown below, sorted by popularity by default.
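Before the examples, here is a minimal sketch of what add_documents itself does (the toy token lists below are invented, not taken from any example): it consumes an iterable of tokenized documents and extends the dictionary's token-to-id mapping in place, so it can be called repeatedly on streamed batches.

from gensim.corpora.dictionary import Dictionary

docs = [["human", "machine", "interface"],
        ["graph", "of", "trees"]]

dictionary = Dictionary()           # start with an empty dictionary
dictionary.add_documents(docs)      # scan the token lists and assign integer ids
print(dictionary.token2id)          # e.g. {'human': 0, 'interface': 1, ...}
print(dictionary.doc2bow(["graph", "graph", "trees"]))   # -> [(id, count), ...]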
Example 1: CorpusOfMethodContents
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class CorpusOfMethodContents(TextCorpus):

    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)

    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            # index of the entry that was just appended
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update=True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            # a document changed, so rebuild the dictionary from scratch
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())

    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex:
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None

    def get_texts(self):
        for content in self.methodContents:
            yield content
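A hypothetical usage sketch of the class above (the fully qualified method names and token lists are invented, and the imports from the example's own module, such as TextCorpus and Dictionary, are assumed to be available): each addDocument call registers one method body as a document, and iterating the corpus yields bag-of-words vectors built against the shared dictionary.

corpus = CorpusOfMethodContents()
corpus.addDocument("com.example.Foo.bar()", ["open", "file", "read", "lines"])
corpus.addDocument("com.example.Foo.baz()", ["write", "file", "close"])
print(corpus.getMethodContentsForFqn("com.example.Foo.bar()"))
for bow in corpus:   # TextCorpus.__iter__ turns get_texts() into doc2bow vectors
    print(bow)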
Example 2: build_dictionary
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
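A hypothetical follow-up to build_dictionary() (DICTIONARY_FILE is given a placeholder value here; in the example it is a module-level constant): reload the saved dictionary and vectorize a query against it.

from gensim.corpora.dictionary import Dictionary

DICTIONARY_FILE = "wiki.dict"    # placeholder for the path used above
dictionary = Dictionary.load(DICTIONARY_FILE)
bow = dictionary.doc2bow("some wikipedia article text".lower().split())
print(bow)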
Example 3: SublexicalizedCorpus
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class SublexicalizedCorpus(TextCorpus):
    def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True,
                 n_proc=1):
        self.order = order
        self.clean_func = clean_func
        self.base_corpus = base_corpus
        self.word_limit = word_limit
        self.n_proc = n_proc

        super(SublexicalizedCorpus, self).__init__()

        self.dictionary = Dictionary()

        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        a_count = 0
        t_count = 0

        texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts())

        pool = multiprocessing.Pool(self.n_proc)

        start = time.clock()
        prev = start

        for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100):
            for tokens in pool.imap_unordered(process, group):
                a_count += 1

                cur = time.clock()
                if cur - prev > 60:
                    logging.info("Sublexicalized %d in %d seconds, %.0f t/s"
                                 % (t_count, cur - start, t_count*1. / (cur - start)))
                    prev = cur

                t_count += len(tokens)

                yield tokens

            if self.word_limit and t_count > self.word_limit:
                break

        pool.terminate()

        end = time.clock()
        logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s"
                     % (t_count, end - start, t_count*1. / (end - start)))

        self.length = t_count
Example 4: build_dictionary_from_splits
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    ''' Build dictionary from splits. If `save_pickle` is provided, then save. '''
    unfiltered_dict = Dictionary()
    for eid in xrange(n):
        unfiltered_dict.add_documents(csv_isolator("../../data/proc_Train_%d.csv" % eid, column))
    print "Before filtering,", unfiltered_dict

    if save_pickle:
        print "\nsaving..."
        unfiltered_dict.save(save_pickle)

    return unfiltered_dict
Example 5: __init__
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
def __init__(self, fname, dictionary=None):
    """
    Initialize the corpus. Unless a dictionary is provided, this scans the
    corpus once, to determine its vocabulary.
    """
    self.fname = fname
    self.metadata = False

    if dictionary is None:
        dictionary = Dictionary()
        for text in self.get_texts():
            dictionary.add_documents([text])

    self.dictionary = dictionary
Example 6: TextCorpus
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        super(gensim.corpora.TextCorpus, self).__init__()

        self.input = input
        self.metadata = False

        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase

        if dictionary is None:
            self.dictionary = Dictionary()
            if input is not None:
                self.dictionary.add_documents(self.get_texts())
            else:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
        else:
            self.dictionary = dictionary

        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        length = 0

        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            length += 1

            if self.pre_tokenized:
                if not isinstance(line, unicode):
                    line = unicode(line, encoding='utf8', errors='strict')
                yield line
            else:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)

        self.length = length
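A hypothetical usage sketch of the wrapper above (file names are placeholders, and the module-level helpers it relies on, such as getstream and Dictionary, are assumed to be importable), contrasting its two construction paths: scanning the input to build a fresh dictionary versus reusing a pre-built one.

# Path 1: scan the input stream, build a new dictionary, and persist it.
corpus = TextCorpus(input="sentences.txt", lowercase=True,
                    dictionary_save_path="sentences.dict")

# Path 2: reuse a dictionary built elsewhere; the input is not rescanned.
prebuilt = Dictionary.load("sentences.dict")
corpus2 = TextCorpus(input="sentences.txt", dictionary=prebuilt)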
Example 7: FolderCorpus
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None

        self.dictionary = Dictionary()

        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
                raw_text = raw_text.lower()
                for filt in self.preprocess:
                    raw_text = filt(raw_text)
                text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
                yield text
Example 8: ArchiveCorpus
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class ArchiveCorpus(corpora.TextCorpus):

    def __init__(self, datafile, preprocess=[], dictionary=None):
        self.datafile = datafile
        self.preprocess = preprocess
        self.metadata = None

        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            if datafile is not None:
                self.dictionary.add_documents(self.get_texts())
                self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)

    def get_texts(self):
        with utils.smart_open(self.datafile) as inputfile:
            for line in inputfile:
                for f in self.preprocess:
                    line = f(line)
                text = list(utils.tokenize(line, deacc=True, lowercase=True))
                yield text
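A hypothetical continuation ("archive.txt.gz" is a made-up path): because the class above builds its dictionary at construction time and, like any gensim TextCorpus, yields bag-of-words vectors when iterated, it can be streamed straight into a topic model.

from gensim.models import LsiModel

corpus = ArchiveCorpus("archive.txt.gz")    # dictionary is built from the file here
lsi = LsiModel(corpus, id2word=corpus.dictionary, num_topics=100)
print(lsi.show_topics(num_topics=5))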
Example 9: DefaultJsonCorpus
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class DefaultJsonCorpus(object):
    """
    A default JSON corpus based on gensim TextCorpus. It assumes a file or list of JSON as input.
    The methods provided by gensim TextCorpus are needed for the GenSim training.
    Any corpus provided to DocumentSimilarity should provide the methods given in this class.
    """

    def __init__(self, input=None, create_dictionary=True):
        super(DefaultJsonCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        for text in self.get_texts():
            yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return utils.file_or_filename(self.input)

    def __len__(self):
        if not hasattr(self, 'length'):
            # cache the corpus length
            self.length = sum(1 for _ in self.get_texts())
        return self.length

    def get_json(self):
        if isinstance(self.input, list):
            for j in self.input:
                yield j
        else:
            with self.getstream() as lines:
                for line in lines:
                    line = line.rstrip()
                    j = json.loads(line)
                    yield j

    def get_texts(self, raw=False):
        """
        yield raw text or tokenized text
        """
        for j in self.get_json():
            text = j["text"]
            if raw:
                yield text
            else:
                yield utils.tokenize(text, deacc=True, lowercase=True)

    def get_meta(self):
        """
        return a json object with meta data for the documents. It must return:

        id - id for this document
        optional title and tags. Tags will be used as base truth used to score document similarity results.
        """
        doc_id = 0
        for j in self.get_json():
            m = copy.deepcopy(j)
            m['id'] = long(m['id'])
            m['corpus_seq_id'] = doc_id
            doc_id += 1
            yield m

    def get_dictionary(self):
        return self.dictionary
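A hypothetical usage sketch with an in-memory list of JSON records (the records are invented; a path to a JSON-lines file works as well). Note that under Python 3 the long() call in get_meta above would need to become int().

docs = [{"id": 1, "text": "the quick brown fox"},
        {"id": 2, "text": "jumped over the lazy dog"}]

corpus = DefaultJsonCorpus(docs)
print(len(corpus))                            # 2 documents
for bow in corpus:                            # bag-of-words vectors
    print(bow)
for meta in corpus.get_meta():                # per-document metadata
    print(meta["id"], meta["corpus_seq_id"])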
Example 10: TextCorpus
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to match
    your particular input.

    Given a filename (or a file-like object) in constructor, the corpus object
    will be automatically initialized with a dictionary in `self.dictionary` and
    will support the `iter` corpus method. You must only provide a correct `get_texts`
    implementation.
    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")

    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            if self.metadata:
                yield (self.dictionary.doc2bow(text[0], allow_update=False), text[1])
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return getstream(self.input)

    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        length = 0
        for lineno, line in enumerate(getstream(self.input)):
            length += 1
            yield utils.tokenize(line, lowercase=True)
        self.length = length

    def __len__(self):
        return self.length  # will throw if corpus not initialized
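A hypothetical subclass of the abstract base class above ("docs.txt" is a made-up path with one document per line): only get_texts needs to be overridden; __init__ then builds self.dictionary automatically via add_documents.

class LineCorpus(TextCorpus):
    def get_texts(self):
        with open(self.input) as f:
            for line in f:
                yield line.lower().split()

corpus = LineCorpus("docs.txt")   # scans the file once to build the dictionary
for bow in corpus:                # then yields one bag-of-words vector per line
    print(bow)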
Example 11: LDA
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
class LDA(object):

    def __init__(self, topics=10,
                 worker=3,
                 pretrained_model=None,
                 dictionary=None):
        """
        Initialize LDA model training.
        Args:
            topics -- number of topics
            worker -- degree of parallelism, usually the number of cores minus one
            pretrained_model -- a previously trained model; online updating is
                                supported, so the model from the last run can be loaded
            dictionary -- words must be mapped to ids for training, so a word-id
                          dictionary accompanies the model
        Example:
            >>> lda = LDA(topics = 20, worker = 2,
                          pretrained_model = model_file,
                          dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """
        Save the trained model together with its dictionary.
        Args:
            model_file -- model file path
            dictionary_file -- dictionary file path
        Returns:
            None
        """
        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=[[]]):
        """
        Online update on top of an existing model.
        Args:
            corpus -- list of documents used for the update
        """
        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, self._topics)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)

    def inference(self, document=[]):
        """
        Infer the topic distribution of a new document.
        Args:
            document -- a document, i.e. a list of words
        Returns:
            the topic distribution, as a list
        """
        if self._model:
            doc = [self._common_dictionary.doc2bow(document)]
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
Example 12: Similarities
# Module to import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import add_documents [as alias]
#......... part of the code is omitted here .........
        :return: bool
        """
        return os.path.isfile(self._create_resource_path(resource_file))

    def _run_transformers(self):
        """
        Runs all the transformer methods listed providing the MongoDB client context instance.
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            self._create_dictionary(client)
            self._create_lsi_similarity_index(client)

    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets
        the object's dictionary property.

        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        if self._resource_exists(self.dictionary_file):
            self.logger().debug(
                "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
        else:
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()
            documents = []
            for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
                documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))
            self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
            self._dictionary.add_documents(documents)
            self._dictionary.save(self._create_resource_path(self.dictionary_file))

    def _create_lsi_similarity_index(self, mongo_client):
        """
        Creates a Similarity index based on LSI model from the available dictionary. Sets the object's lsi_model and
        similarity_index object properties.
        """
        from gensim.models import LsiModel
        from gensim.similarities import MatrixSimilarity

        self._lsi_mapping.clear()
        bow_corpus = []
        for idx, tp in enumerate([(c, di) for c in mongo_client.scrappers_collections() for di in c.find()]):
            self._lsi_mapping[idx] = tp
            bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property]))

        self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary)
        self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus])

    def calculate_similarities(self):
        """
        Find / calculate similarities between documents in the index.
        Returns a defaultdict with the key as the LSI index and the value is a list of tuples with the following values
        (LSI model Index, similarity threshold - numpy.float32) tuple

        :return: defaultdict(list)
        """
        similarities = defaultdict(list)
        if not self.lsi_index_mapping:
            return

        for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)):