This article collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.save. If you have been wondering what Dictionary.save does, how to use it, or what working code with it looks like, the curated examples below may help. You can also explore the other methods of the gensim.corpora.dictionary.Dictionary class.
The following presents 14 code examples of Dictionary.save, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python examples.
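Before the examples, a quick orientation: `save` pickles the entire `Dictionary` object to disk, and the class method `Dictionary.load` restores it. A minimal sketch (the toy corpus and file name below are made up for illustration):

from gensim.corpora.dictionary import Dictionary

# Build a dictionary from a tiny tokenized corpus (illustrative data only).
docs = [["human", "machine", "interface"], ["graph", "minors", "survey"]]
dictionary = Dictionary(docs)

dictionary.save("example.dict")             # pickle the full object
restored = Dictionary.load("example.dict")  # round-trip it back
assert restored.token2id == dictionary.token2id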
Example 1: build_dictionaries_from_splits
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    '''Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
    be a 3-tuple of the pickle file names in the following order:
        (title, body, tags)
    If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in range(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            # doc2bow with allow_update=True grows the vocabulary as it scans
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)
    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print("Before filtering...")
    print("utitledict:", utitledict)
    print("ubodydict:", ubodydict)
    print("utagdict:", utagdict)
    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print("saving utitledict...")
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print("saving ubodydict...")
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print("saving utagdict...")
            utagdict.save(save_pickle_tup[2])
    return (utitledict, ubodydict, utagdict)
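All three returned dictionaries are unfiltered; as Examples 3 and 4 below illustrate, a typical follow-up before saving is `filter_extremes`, which drops very rare and very common tokens.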
Example 2: create_dictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def create_dictionary(analyzed_items_path, dictionary_path=None):
    dictionary = Dictionary(iter_docs(analyzed_items_path))
    if dictionary_path:
        dictionary.save(dictionary_path)
    return dictionary
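Because `save` pickles the whole object, a dictionary written this way can later be restored unchanged with `Dictionary.load(dictionary_path)`.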
Example 3: create_corpus
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\
    Creates the word dictionary, the bag-of-words corpus, and the TF-IDF corpus
    from the provided cable source and writes them to `out_dir`.
    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
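The artifacts written above can be read back with gensim's matching loaders. A minimal sketch, assuming the files were written to a hypothetical directory out/:

from gensim.corpora import MmCorpus
from gensim.corpora.dictionary import Dictionary

dct = Dictionary.load('out/cables_wordids.pickle')  # hypothetical path
tfidf_corpus = MmCorpus('out/cables_tfidf.mm')      # hypothetical path
for doc in tfidf_corpus:
    pass  # each doc is a list of (word_id, tf-idf weight) pairs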
Example 4: build_dictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
Example 5: build_dictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def build_dictionary(self):
    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: y.split())
    dictionary = Dictionary(documents)
    dictionary.save(f'{self.board}.dictionary')
    return dictionary
Example 6: getDictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
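This load-if-present pattern avoids rebuilding the dictionary on every run: `Dictionary.load` is the exact counterpart of `save` and restores the complete object, including its token-to-id mapping and document frequencies.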
Example 7: build_dictionary_from_splits
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    '''Build dictionary from splits. If `save_pickle` is provided, then save.'''
    unfiltered_dict = Dictionary()
    for eid in range(n):
        unfiltered_dict.add_documents(csv_isolator(splits_template % eid, column))
    print("Before filtering,", unfiltered_dict)
    if save_pickle:
        print("\nsaving...")
        unfiltered_dict.save(save_pickle)
    return unfiltered_dict
Example 8: TextCorpus
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        # Skips gensim.corpora.TextCorpus.__init__ (which would eagerly build a
        # dictionary from `input`); the dictionary is set up manually below.
        super(gensim.corpora.TextCorpus, self).__init__()
        self.input = input
        self.metadata = False
        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase
        if dictionary is None:
            self.dictionary = Dictionary()
            if input is not None:
                self.dictionary.add_documents(self.get_texts())
            else:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
        else:
            self.dictionary = dictionary
        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        length = 0
        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            length += 1
            if self.pre_tokenized:
                if not isinstance(line, str):
                    line = line.decode('utf8', errors='strict')
                yield line
            else:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)
        self.length = length
Example 9: main
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--wiki-dump')
    parser.add_argument('-l', '--limit', default=None, type=int)
    parser.add_argument('-p', '--num-procs', default=1, type=int)
    parser.add_argument('-o', '--out', default='vocab')
    opts = parser.parse_args()

    dump_loc = opts.wiki_dump
    limit = opts.limit
    n_procs = opts.num_procs
    out_fn = opts.out

    dump_gen = get_dump_gen(dump_loc, limit=limit, n_procs=n_procs)
    nlp = spacy.en.English()
    vocab = Dictionary(([token.text.lower().strip() for token in doc if token.text.strip() != ""]
                        for doc in nlp.pipe((art['article.text'] for art in dump_gen), n_threads=n_procs,
                                            parse=False, tag=False, entity=False)))
    vocab.save('%s.vocab' % out_fn)
    vocab.save_as_text('%s.txt' % out_fn)
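Note that this example writes the same vocabulary twice: `save` produces a binary pickle that round-trips via `Dictionary.load`, while `save_as_text` produces a plain-text, tab-separated listing of word ids, words, and document frequencies, reloadable with `Dictionary.load_from_text`.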
Example 10: RedisCorpus
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def addCorpusMap(index, postid):  # signature inferred from the call site below
    db.hset('idlookup', index, postid)

class RedisCorpus(object):
    def __init__(self, postids):
        self.postids = postids
        self.numPosts = len(self.postids)

    def __iter__(self):
        count = 0
        for postid in self.postids:
            if count % 100 == 0:
                print("Wrote %d out of %d to corpus: %s" % (count, self.numPosts, time.strftime("%H:%M:%S")))
            addCorpusMap(count, postid)
            count += 1
            yield corpusOfPost(postid, force=True)

def buildCorpus():
    """Returns a corpus object that contains sparse vectors from every post."""
    postids = getPostids()
    corpus = RedisCorpus(postids)
    return corpus

if __name__ == "__main__":
    buildDictionary(force=True)
    globalDict.save(dictName)
    corpus = buildCorpus()
    BleiCorpus.serialize('redditcorpus.lda-c', corpus)
Example 11: Similarities
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
#......... (part of the code omitted here) .........
        """
        return os.path.isfile(self._create_resource_path(resource_file))

    def _run_transformers(self):
        """
        Runs all the transformer methods listed providing the MongoDB client context instance.
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            self._create_dictionary(client)
            self._create_lsi_similarity_index(client)

    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets
        the object's dictionary property.

        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary
        if self._resource_exists(self.dictionary_file):
            self.logger().debug(
                "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
            self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
        else:
            self.logger().debug("Dictionary file not found, creating a new Dictionary file")
            self._dictionary = Dictionary()
            documents = []
            for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
                documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))
            self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
            self._dictionary.add_documents(documents)
            self._dictionary.save(self._create_resource_path(self.dictionary_file))

    def _create_lsi_similarity_index(self, mongo_client):
        """
        Creates a Similarity index based on an LSI model from the available dictionary. Sets the object's lsi_model
        and similarity_index properties.
        """
        from gensim.models import LsiModel
        from gensim.similarities import MatrixSimilarity
        self._lsi_mapping.clear()
        bow_corpus = []
        for idx, tp in enumerate([(c, di) for c in mongo_client.scrappers_collections() for di in c.find()]):
            self._lsi_mapping[idx] = tp
            bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property]))
        self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary)
        self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus])

    def calculate_similarities(self):
        """
        Finds / calculates similarities between documents in the index.
        Returns a defaultdict keyed by LSI index whose values are lists of
        (LSI model index, similarity value - numpy.float32) tuples.

        :return: defaultdict(list)
        """
        similarities = defaultdict(list)
        if not self.lsi_index_mapping:
            return
        for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)):
            sentence = tp[1][self.considerable_doc_property]
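In gensim, a `MatrixSimilarity` index is queried by indexing it with a vector in the model's space, so the truncated loop above would presumably continue along the lines of `sims = self._sim_index[self._lsimodel[self.sentence_to_bow(sentence)]]`, which yields cosine similarities against every indexed document.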
Example 12: saveWords
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def saveWords(words, wordfile):
    from gensim.corpora.dictionary import Dictionary
    dictionary = Dictionary(words)  # avoid shadowing the built-in name `dict`
    dictionary.save(wordfile)
Example 13: saveGensim
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
def saveGensim(self, topic):
    if topic is None:
        # generate all
        self.saveGensim('movie')
        self.saveGensim('celebrity')
        self.saveGensim('syria')
        self.saveGensim('ufo')
        return

    posDocs = []
    negDocs = []
    if topic == 'movie':
        topic = 'movie_reviews'
    elif topic == 'celebrity':
        topic = 'bieber'

    if topic == 'movie_reviews':
        count = 100
        posDocs = self.movieReviews('positive', count)
        negDocs = self.movieReviews('negative', count)
    else:
        posDocs = self.getArticlesHelper('positive', topic)
        negDocs = self.getArticlesHelper('negative', topic)

    listOfTokens = []  # dictionary
    docs = []  # corpus
    for posDoc in posDocs:
        processed = self.processDocForGensim(posDoc)
        tokens = self.tokensFromText(processed)
        listOfTokens.append(tokens)
        docs.append(processed)
    for negDoc in negDocs:
        processed = self.processDocForGensim(negDoc)
        tokens = self.tokensFromText(processed)
        listOfTokens.append(tokens)
        docs.append(processed)

    dictionaryFilename = 'gensim_dictionary.txt'
    corpusFilename = 'gensim_corpus.mm'
    # make destination files if they don't exist
    dictionaryPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'james_data',
        topic,
        dictionaryFilename
    )
    corpusPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'james_data',
        topic,
        corpusFilename
    )
    corpusTempPath = corpusPath + '.tmp'
    if os.path.exists(dictionaryPath):
        os.remove(dictionaryPath)
    if os.path.exists(corpusPath):
        os.remove(corpusPath)
    if os.path.exists(corpusTempPath):
        os.remove(corpusTempPath)
    with open(dictionaryPath, 'w') as f:
        f.write(' ')
    with open(corpusPath, 'w') as f:
        f.write(' ')

    # save dictionary and corpus
    d = Dictionary(listOfTokens)
    d.save(dictionaryPath)
    with open(corpusTempPath, 'w') as f:
        f.write('\n'.join(docs))
    corpus = TextCorpus(corpusTempPath)
    MmCorpus.save_corpus(corpusPath, corpus)
    return
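One design note on the final call: `MmCorpus.save_corpus` writes only the Matrix Market file itself, whereas `MmCorpus.serialize` (used in Example 3) also stores an offset index alongside it so that individual documents can later be accessed randomly; gensim generally recommends `serialize` for that reason.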
Example 14: CableCorpus
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Alternatively: from gensim.corpora.dictionary.Dictionary import save [as alias]
class CableCorpus(BaseCorpus):
    """\
    The cable corpus consists of several files which are written into a directory:

    * a dictionary with a ``<word id> <word> <frequency>`` mapping saved under "wordids.pickle"
    * a JSON file with a ``<cable reference id> <document number>`` mapping under "id2docid.json"
    * a `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`_ vector space model file "bow.mm"

    CAUTION: The corpus overrides any existing files with the same file name in the specified directory.

    By default, the corpus creates the word dictionary and the vector space model, which
    may lead to an unhelpful vector space model. To filter certain words, the corpus may be
    initialized with a pre-generated word dictionary. To make the dictionary immutable, the property
    ``allow_dict_updates`` should be set to ``False`` (updates are allowed by default).
    The resulting vector space model then contains only words which are in the word dictionary.

    Example to reduce the clutter::

        corpus = CableCorpus('/my/directory/')
        # Add some texts here
        corpus.add_text('ref-1', u'bla bla bla')
        corpus.add_text('ref-2', u'bla bla blub')
        ...
        corpus.dct.filter_extremes()
        corpus.close()

        from gensim.corpora.dictionary import Dictionary
        # Load the previously created dict (close() pickles it, so use Dictionary.load)
        dct = Dictionary.load('/my/directory/cables_wordids.pickle')
        # Create another corpus with the previous word dict
        corpus = CableCorpus('/my/directory/', dct, allow_dict_updates=False)
        # Add some texts
        ....
        corpus.close()
    """
    def __init__(self, path, dct=None, tokenizer=None, allow_dict_updates=True, prefix=None):
        """\
        Initializes the cable corpus.

        `path`
            Directory where the generated files are stored.
        `dct`
            An existing `gensim.corpora.dictionary.Dictionary`.
            If it's ``None`` (default), a dictionary will be created.
        `tokenizer`
            A function to tokenize/normalize/clean-up/remove stop words from strings.
            If it's ``None`` (default), a default function will be used to tokenize texts.
        `allow_dict_updates`
            Indicates if unknown words should be added to the dictionary (default ``True``).
        `prefix`
            A prefix for the generated file names.
        """
        super(CableCorpus, self).__init__(tokenizer)
        if not os.path.isdir(path):
            raise IOError('Expected a directory path')
        self.dct = Dictionary() if dct is None else dct
        self._path = path
        self._prefix = prefix or 'cables_'
        self._mw = IncrementalMmWriter(os.path.join(path, self._prefix + 'bow.mm'))
        self.allow_dict_updates = allow_dict_updates
        self._cables = []

    def add_words(self, reference_id, words):
        self._cables.append(reference_id)
        self._mw.add_vector(self.dct.doc2bow(words, self.allow_dict_updates))

    def close(self):
        self._mw.close()
        self.dct.save(os.path.join(self._path, self._prefix + 'wordids.pickle'))
        json_filename = os.path.join(self._path, self._prefix + 'id2docid.json')
        with open(json_filename, 'w') as f:
            json.dump(dict(zip(self._cables, count())), f)