本文整理匯總了Python中gensim.corpora.MmCorpus類的典型用法代碼示例。如果您正苦於以下問題:Python MmCorpus類的具體用法?Python MmCorpus怎麼用?Python MmCorpus使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了MmCorpus類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: test_apply
def test_apply(self):
    """Check that a transformed text corpus can be serialized and reloaded.

    Applies ``self.transformer`` to ``self.vtcorp``, writes the result both
    as a Matrix Market file and as a saved corpus object under the loader's
    corpus directory, then verifies that the loader reports the corpora as
    present and that the reloaded object is a ``TransformedCorpus`` wrapping
    a ``VTextCorpus`` that still carries its dictionary.
    """
    transformed_vtcorp = self.transformer._apply(self.vtcorp)
    self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

    transformed_names = self.loader.layout.required_text_corpus_names(
        self.transformation_label)
    # Index 0 is used for the serialized data and index 2 for the saved
    # object, as in the original code (the layout's ordering is not
    # visible from here).
    text_data_name = os.path.join(self.data_root,
                                  self.loader.layout.corpus_dir,
                                  transformed_names[0])
    text_obj_name = os.path.join(self.data_root,
                                 self.loader.layout.corpus_dir,
                                 transformed_names[2])

    MmCorpus.serialize(text_data_name, transformed_vtcorp)
    transformed_vtcorp.save(text_obj_name)
    self.assertTrue(self.loader.has_text_corpora(self.transformation_label))

    # Record every file name the layout lists for this label in
    # self.temporary_files.
    self.temporary_files.extend([
        os.path.join(self.data_root,
                     self.loader.layout.corpus_dir,
                     transformed_name)
        for transformed_name in transformed_names
    ])

    transformed_vtcorp = TransformedCorpus.load(text_obj_name)
    self.assertIsInstance(transformed_vtcorp, TransformedCorpus)
    self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus)
    self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

    # Parenthesized single-argument form behaves the same on Python 2 and 3.
    print('Transformed corpus dictionary size: %i'
          % len(transformed_vtcorp.corpus.dictionary))
    self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
示例2: main
def main(argv=None):
    """Build the simple-wiki bag-of-words corpus, then TF-IDF and LSI models.

    Downloads the raw dump when it is missing, writes an article-index JSON
    file, the Matrix Market corpus and the text dictionary, and finally
    trains and saves the TF-IDF model, the LSI model and a similarity index.

    :param argv: argument list; defaults to ``sys.argv`` (not otherwise used).
    """
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Only fetch the raw file when it is not already on disk.
    if not os.path.isfile(WIKIFILE):
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)

    # Map a running document number to (article URL, second item of the text).
    url_template = 'https://simple.wikipedia.org/wiki/?curid={}'
    article_dict = {}
    for doc_no, text in enumerate(wiki.get_texts(meta=True)):
        article_dict[doc_no] = (url_template.format(text[0]), text[1])
    with open(ARTICLEDICT, 'w') as article_file:
        json.dump(article_dict, article_file)

    wiki.dictionary.filter_extremes(
        no_below=20,
        no_above=0.1,
        keep_n=DEFAULT_DICT_SIZE,
    )
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Second stage: reload from disk and run TF-IDF followed by LSI.
    dictionary = Dictionary.load_from_text(DICTFILE)
    bow_corpus = MmCorpus(MMFILE)
    tfidf = TfidfModel(bow_corpus, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[bow_corpus], progress_cnt=10000)

    tfidf_corpus = MmCorpus(TDIFFILE)
    lsi = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[tfidf_corpus])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
示例3: extend_corpus
def extend_corpus(self, corpus):
    """
    Append the documents in `corpus` to `self.corpus`.

    Without serialization both corpora are plain lists and `self.corpus` is
    extended in place. With serialization the whole corpus is re-written to
    `self.serialization_path`, old documents first, followed by the new ones.
    """
    if not self.serialized:
        # List mode: the incoming corpus must itself be a list.
        assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
        self.corpus.extend(corpus)
        return

    if isinstance(corpus, MmCorpus):
        # Refuse to read from the very file that is about to be overwritten.
        assert self.corpus.input != corpus.input, \
            'Input corpus cannot have the same file path as the model corpus (serialization_path).'

    tmp_path = self.serialization_path + '.tmp'
    # Lazy chain of the old documents followed by the new ones.
    old_then_new = chain(self.corpus, corpus)
    # Read the old documents from a temporary copy, because the original
    # file is the serialization target.
    copyfile(self.serialization_path, tmp_path)
    self.corpus.input = tmp_path
    MmCorpus.serialize(self.serialization_path, old_then_new)
    # Replace the corpus object with one backed by the freshly written file.
    self.corpus = MmCorpus(self.serialization_path)
    remove(tmp_path)
示例4: _create_bow_representation
def _create_bow_representation(self):
    """Build the bag-of-words corpus of the collection and write it to
    disk in Matrix Market format."""
    print('Create bag-of-words matrix representation.')
    bow_docs = [self.dictionary.doc2bow(doc) for doc in self.articles]
    # Keep the vectors on the instance as well as serializing them to
    # self.bowmm_filepath.
    self.bow_corpus = bow_docs
    MmCorpus.serialize(self.bowmm_filepath, bow_docs)
示例5: createcorpus
def createcorpus(bg_corpus, output_dictionary, output_serialize):
    """Build a background corpus from `bg_corpus` and persist it.

    The corpus dictionary is saved to `output_dictionary` and the corpus is
    serialized in Matrix Market format to `output_serialize`.

    :param bg_corpus: input handed to ``TextCorpus``
    :param output_dictionary: path the dictionary is saved to
    :param output_serialize: path the serialized corpus is written to
    :return: tuple ``(corpus, dictionary)``
    """
    print("Creating corpus and dictionary")
    background_corpus = TextCorpus(input=bg_corpus)
    # Persist the dictionary first, then the corpus.
    vocabulary = background_corpus.dictionary
    vocabulary.save(output_dictionary)
    MmCorpus.serialize(output_serialize, background_corpus)
    return background_corpus, background_corpus.dictionary
開發者ID:AshwiniTokekar,項目名稱:Multi-document-summarization-of-news-articles-using-deep-learning,代碼行數:8,代碼來源:semantic_similarity.py
示例6: load_experts
def load_experts():
    """Load expert data and save it to file.

    Serializes an ``ExpertCorpus`` to ``expert_corpus_new_test.mm`` in
    Matrix Market format, then pickles the expert-to-document map to
    ``expert2doc_new_test.p``.
    """
    expert_corpus = ExpertCorpus()
    MmCorpus.serialize(corpus=expert_corpus, fname='expert_corpus_new_test.mm')

    # Save the expert-to-document map to pickle.
    # NOTE(review): `expert2doc` is not defined in this function; it is
    # presumably a module-level mapping -- confirm where it is populated.
    # The context manager closes the file even if pickling raises.
    with open('expert2doc_new_test.p', 'wb') as map_file:
        pickle.dump(expert2doc, map_file)
示例7: _create_tfidf_matrix
def _create_tfidf_matrix(self):
    """Fit a TF-IDF model on the bag-of-words corpus and write the weighted
    corpus to disk in Matrix Market format."""
    print('Create TF-IDF matrix of collection.')
    model = TfidfModel(self.bow_corpus, id2word=self.dictionary, normalize=True)
    # Indexing the model with a corpus gives the TF-IDF weighted view of it.
    weighted_corpus = model[self.bow_corpus]
    MmCorpus.serialize(self.tfidf_filepath, weighted_corpus)
    print('Number of documents:', model.num_docs)
示例8: init_empty_corpus
def init_empty_corpus(self):
    """
    Set `self.corpus` to an empty corpus.

    In list mode this is simply an empty list. When serialization is used,
    an empty corpus is written to `self.serialization_path` and `self.corpus`
    becomes a `gensim.corpora.MmCorpus` reading from that file.
    """
    if not self.serialized:
        # All input corpora are assumed to be plain lists.
        self.corpus = []
        return

    # Write an empty serialized corpus, then open it as the corpus object.
    MmCorpus.serialize(self.serialization_path, [])
    self.corpus = MmCorpus(self.serialization_path)
示例9: pretrain
def pretrain():
    """Pre-train on the text corpus and build the dictionary.

    Saves the dictionary in binary and text form, serializes the corpus in
    Matrix Market format, then reloads the serialized corpus and prints it.
    """
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)

    # serialize() returns None, so printing its result only ever showed
    # "None". Reopen the written file to report the actual corpus.
    MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    mm = MmCorpus(mm_corpus_file)
    print(mm)
示例10: main
def main():
    """Serialize the Reuters-21578 corpus and its dictionary under ./data.

    Reads the corpus from ``reuters21578.tar.gz``, prunes the dictionary,
    writes the corpus to ``reuters21578.mm`` and the dictionary to
    ``reuters21578.dict.txt``.
    """
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # The corpus is read from within the archive file.
    archive_path = path.join(datadir, "reuters21578.tar.gz")
    rc = ReutersCorpus(archive_path)

    # Drop some of the more common words and some of the less common ones,
    # then compactify the dictionary.
    rc.dictionary.filter_extremes(no_below=20, no_above=0.1)
    rc.dictionary.compactify()

    # Serialize the Reuters 21578 corpus.
    corpus_path = path.join(datadir, "reuters21578.mm")
    MmCorpus.serialize(corpus_path, rc)

    # Save the dictionary to file as text.
    dict_path = path.join(datadir, "reuters21578.dict.txt")
    rc.dictionary.save_as_text(dict_path)
示例11: get_topics_lda
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA.
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only. So for a particular topic modeling task,
    it is a lighter option to install and run. Also it can be run distributed and updated over an existing model

    The dictionary (``resources/deals.dict``) and the bag-of-words corpus
    (``resources/deals.mm``) are built from `tokens` only when the file is
    missing; an existing file is reused as-is, whatever `tokens` contains.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics; each topic is a list of ``(word, score)``
        pairs of strings exactly as split out of ``lda.print_topic``
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        # Drop tokens that occur in exactly one document. `.items()` works
        # on Python 2 and 3, unlike `.iteritems()`.
        freq1 = [token_id for token_id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics,
                   update_every=1, chunksize=1000, passes=1)

    topics = []
    for topic_no in range(n_topics):
        # print_topic yields "score*word + score*word + ..."; split it back
        # into its terms.
        topic = []
        for term in lda.print_topic(topic_no).split('+'):
            score, word = term.split('*')
            topic.append((word, score))
        topics.append(topic)
    return topics
示例12: main
def main():
    """Build and save a TF-IDF model for the serialized Reuters-21578 corpus.

    Reads ``reuters21578.dict.txt`` and ``reuters21578.mm`` from ./data and
    writes ``reuters21578.tfidf.model`` and ``reuters21578.tfidf.mm`` there.
    """
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # Load the id->word mapping back directly from file.
    dict_path = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(dict_path)

    # Load the corpus.
    corpus_path = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(corpus_path)

    # Build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # Save the TfidfModel instance to file.
    model_path = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(model_path)

    # Save TF-IDF vectors in matrix market format.
    vectors_path = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(vectors_path, tfidf[mm], progress_cnt=10000)
示例13: main
def main():
    """Build the artist-page wiki corpus, its dictionary and a TF-IDF model.

    Writes, all prefixed with ``OUT_PREFIX``: a pickled document index
    (``_docindex.p``), the bag-of-words corpus (``_bow.mm``), the dictionary
    (``_wordids.txt.bz2``), the TF-IDF model (``.tfidf_model``) and the
    TF-IDF vectors (``_tfidf.mm``).
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    # The context manager closes the file even if pickling raises.
    with open(outp + '_docindex.p', 'wb') as docindex_file:
        pickle.dump(docindex, docindex_file)

    lemmatize = True  # 'lemma' in program
    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=keep_words)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
    logger.info("finished running %s", program)
示例14: main
def main(argv=None):
    """Build the speech bag-of-words corpus, then TF-IDF and LSI models.

    Reads the raw speeches and their ids from JSON, writes the article-index
    JSON, the text dictionary and the Matrix Market corpus, and finally
    trains and saves the TF-IDF model, the LSI model and a similarity index.

    :param argv: argument list; defaults to ``sys.argv`` (not otherwise used).
    """
    if argv is None:
        argv = sys.argv

    print('Creating speech serialized corpus')
    # The speech corpus is inside the raw file in JSON format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as raw_file:
        speech_dict = json.load(raw_file)
    with open(RAWIDS, 'r') as ids_file:
        id_dict = json.load(ids_file)

    # The article ids must be saved in the format the gensimple engine
    # understands, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    for position, (key, value) in enumerate(speech_dict.items()):
        texts.append(list(value['text']))
        article_dict[str(position)] = [value['url'], id_dict[key]['title']]
    with open(ARTICLEDICT, 'w') as article_file:
        json.dump(article_dict, article_file)

    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    bow_docs = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, bow_docs)
    print('Speech serialized corpus created')

    # Second stage: reload from disk and run LSI on the TF-IDF vectors.
    dictionary = Dictionary.load_from_text(DICTFILE)
    bow_corpus = MmCorpus(MMFILE)
    tfidf = TfidfModel(bow_corpus, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[bow_corpus], progress_cnt=10000)

    tfidf_corpus = MmCorpus(TDIFFILE)
    lsi = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[tfidf_corpus])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
示例15: main
def main(args):
    """Normalize an image corpus, serialize it and re-save the corpus object.

    :param args: object providing ``root``, ``name``, ``img_label`` and
        ``transformation_label`` attributes
    """
    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))
    dloader = MultimodalDatasetLoader(args.root, args.name)

    # Load the source image corpus and apply the normalization transform.
    source_icorp = dloader.load_image_corpus(args.img_label)
    normalizer = NormalizationTransform()
    normalized_icorp = normalizer._apply(source_icorp)

    # The first required corpus name is the serialization target.
    corpus_names = dloader.layout.required_img_corpus_names(args.transformation_label)
    corpus_full_path = os.path.join(args.root, corpus_names[0])

    logging.info('Serializing to file %s' % corpus_full_path)
    MmCorpus.serialize(corpus_full_path, normalized_icorp)

    logging.info('Re-saving original corpus object with infix %s' % args.transformation_label)
    dloader.save_image_corpus(normalized_icorp.corpus, args.transformation_label)