This article collects typical usage examples of the Python class gensim.corpora.dictionary.Dictionary. If you are unsure what the Dictionary class is for or how to use it, the curated examples below should help.
The following 15 code examples of the Dictionary class are shown, sorted by popularity by default.
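Before the collected examples, here is a minimal, self-contained sketch of the core Dictionary API that most of them build on; it assumes only that gensim is installed, and the toy documents are made up for illustration:

from gensim.corpora.dictionary import Dictionary

# Each document is a list of tokens; Dictionary assigns every unique token an integer id.
docs = [["human", "machine", "interface"], ["graph", "of", "trees"], ["human", "trees"]]
dct = Dictionary(docs)

print(dct.token2id)                              # e.g. {'human': 0, 'interface': 1, ...}
print(dct.doc2bow(["human", "trees", "trees"]))  # e.g. [(id_of_human, 1), (id_of_trees, 2)]
print(dct.num_docs, dct.dfs)                     # documents seen; token id -> document frequency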
Example 1: EnronCorpus

class EnronCorpus(TextCorpus):
    def __init__(self, root_name, no_below=20, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans through all the emails once to determine the corpus
        vocabulary: only the `keep_words` most frequent words that appear in at least
        `no_below` documents are kept.
        """
        self.root_name = root_name
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Walk the file system, strip punctuation, and normalize all numbers to '2'.
        """
        filenames = walk_os(self.root_name)
        opened_files = gen_open(filenames)
        stripped_files = strip_punct(opened_files)
        length = 0
        for email in stripped_files:
            if len(email) > ARTICLE_MIN_CHARS:
                length += 1
                print("Iteration: %i" % length)
                yield tokenize(email)
        self.length = length  # cache corpus length
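The corpus above feeds a generator of token lists straight into the Dictionary constructor and then prunes the vocabulary. The same two steps in isolation, on made-up documents and with arbitrary thresholds, might look like this sketch:

from gensim.corpora.dictionary import Dictionary

def stream_tokens():
    # stand-in for get_texts(): yield one tokenized document at a time
    for text in ["please review the meeting notes",
                 "the meeting is at noon",
                 "the report is attached",
                 "please send the report"]:
        yield text.split()

dictionary = Dictionary(stream_tokens())
# keep tokens that occur in at least 2 documents and in no more than 80% of them,
# capped at the 100000 most frequent
dictionary.filter_extremes(no_below=2, no_above=0.8, keep_n=100000)
print(dictionary)  # e.g. Dictionary(4 unique tokens: [...])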
Example 2: create_dictionaries

def create_dictionaries(model=None,
                        combined=None):
    '''This function does a number of jobs:
        1- Creates a word-to-index mapping
        2- Creates a word-to-vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}   # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()} # word vectors of all words with frequency above 10

        def parse_dataset(combined):
            '''Words become integers.'''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Index of every word in each sentence; words with frequency below 10 get index 0.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
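The snippet above relies on the pre-4.0 gensim API (`model.vocab`). A rough equivalent of the word-to-index and word-to-vector mappings against current gensim, sketched on a freshly trained toy Word2Vec model (the sentences are made up and the hyperparameters are arbitrary):

from gensim.corpora.dictionary import Dictionary
from gensim.models import Word2Vec

sentences = [["nice", "movie"], ["terrible", "plot"], ["nice", "plot"]]
model = Word2Vec(sentences, vector_size=16, min_count=1, seed=1)

gensim_dict = Dictionary()
gensim_dict.doc2bow(list(model.wv.key_to_index), allow_update=True)

w2indx = {token: idx + 1 for idx, token in gensim_dict.items()}  # reserve 0 for out-of-vocabulary words
w2vec = {token: model.wv[token] for token in w2indx}

print(w2indx)
print(w2vec["nice"].shape)  # (16,)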
Example 3: build_dictionaries_from_splits

def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    '''Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
    be a 3-tuple of the pickle file names in the following order:
        (title, body, tags)
    If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in range(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)

    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print("Before filtering...")
    print("utitledict:", utitledict)
    print("ubodydict:", ubodydict)
    print("utagdict:", utagdict)

    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print("saving utitledict...")
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print("saving ubodydict...")
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print("saving utagdict...")
            utagdict.save(save_pickle_tup[2])

    return (utitledict, ubodydict, utagdict)
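The key trick above is that `doc2bow(tokens, allow_update=True)` both returns the bag-of-words vector and grows the dictionary in place, so one pass over the rows fills all three vocabularies and keeps their `num_docs` counters in sync. A minimal sketch of that behaviour on made-up rows:

from gensim.corpora.dictionary import Dictionary

titledict, bodydict = Dictionary(), Dictionary()
rows = [("how to sort a list", "use the sorted builtin"),
        ("read a csv file", "use the csv module")]

for title, body in rows:
    titledict.doc2bow(title.split(), allow_update=True)  # updates the vocabulary in place
    bodydict.doc2bow(body.split(), allow_update=True)

assert titledict.num_docs == bodydict.num_docs == len(rows)
print(titledict)  # e.g. Dictionary(8 unique tokens: [...])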
Example 4: doc_to_gensim

def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of (integer word ID, word count)
            2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)
    return (gdict, gdoc)
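A plausible way to call this helper, assuming the `extract` module used above (textacy-style word extraction) is importable alongside it and an English spaCy model is installed; this is a usage sketch, not part of the original snippet:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog. The dog sleeps.")

gdict, bow = doc_to_gensim(doc, lemmatize=True)
print(gdict.token2id)  # token -> integer id
print(bow)             # [(id, count), ...] for this single document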
Example 5: loadDictionary

def loadDictionary(fname, mapping_only=True):
    """
    Load a previously stored mapping between words and their ids.
    The result can be used as the `id2word` parameter for input to transformations.
    """
    if mapping_only:
        result = {}
        for lineNo, line in enumerate(open(fname)):
            cols = line[:-1].split('\t')
            if len(cols) == 2:
                wordId, word = cols
            elif len(cols) == 3:
                wordId, word, dfs = cols
            else:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            result[int(wordId)] = word  # dfs not used
    else:
        result = Dictionary()
        for lineNo, line in enumerate(open(fname)):
            cols = line[:-1].split('\t')
            if len(cols) == 3:
                wordId, word, dfs = cols
            else:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordId = int(wordId)
            result.token2id[word] = wordId
            result.dfs[wordId] = int(dfs)
    return result
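This parses a tab-separated `id<TAB>word[<TAB>docfreq]` file. Recent gensim versions can write and read a very similar text format themselves, which may be a simpler route; a sketch (the path is a placeholder, and newer releases also write a leading num_docs line that `load_from_text` handles):

from gensim.corpora.dictionary import Dictionary

dct = Dictionary([["alpha", "beta"], ["beta", "gamma"]])
dct.save_as_text("/tmp/words.txt")       # one id<TAB>word<TAB>document-frequency entry per line

loaded = Dictionary.load_from_text("/tmp/words.txt")
print(loaded.token2id == dct.token2id)   # True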
Example 6: create_dictionaries

def create_dictionaries(train=None,
                        test=None,
                        model=None):
    '''This function does a number of jobs:
        1- Creates a word-to-index mapping
        2- Creates a word-to-vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            '''Words become integers.'''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example 7: create_corpus

def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """Create the word dictionary, bag-of-words corpus, and TF-IDF model for the cables in `src`."""
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create the word dictionary
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load the corpus
    mm = MmCorpus(bow_filename)
    # 4. Create the TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
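The project-specific handlers above hide the generic gensim pipeline: build a Dictionary, turn documents into bag-of-words vectors, fit a TfidfModel, and serialize the transformed corpus in Matrix Market format. A self-contained sketch of that pipeline on a toy corpus (the /tmp paths are placeholders):

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

docs = [["cable", "berlin", "trade"], ["cable", "paris", "visit"], ["trade", "visit"]]

dct = Dictionary(docs)
bow = [dct.doc2bow(doc) for doc in docs]

MmCorpus.serialize("/tmp/example_bow.mm", bow)          # bag-of-words corpus on disk
mm = MmCorpus("/tmp/example_bow.mm")                    # stream it back lazily

tfidf = TfidfModel(mm, id2word=dct, normalize=True)
MmCorpus.serialize("/tmp/example_tfidf.mm", tfidf[mm])  # TF-IDF weighted corpus on disk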
Example 8: CorpusOfMethodContents

class CorpusOfMethodContents(TextCorpus):

    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)

    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1  # index of the entry just appended
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update=True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())

    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex.keys():
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None

    def get_texts(self):
        for content in self.methodContents:
            yield content
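A rough usage sketch of this corpus wrapper; the fully qualified method names and token lists below are invented, and it assumes `TextCorpus.__init__` leaves an empty `self.dictionary` in place as in stock gensim:

corpus = CorpusOfMethodContents()
corpus.addDocument("com.example.Foo.bar()", ["open", "file", "read", "lines"])
corpus.addDocument("com.example.Foo.baz()", ["close", "file"])

print(corpus.getMethodContentsForFqn("com.example.Foo.bar()"))  # ['open', 'file', 'read', 'lines']
print(corpus.dictionary)  # dictionary grown incrementally via doc2bow

# Re-adding an existing FQN replaces its contents and rebuilds the dictionary from scratch.
corpus.addDocument("com.example.Foo.bar()", ["write", "file"])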
Example 9: create_dictionary

def create_dictionary(analyzed_items_path, dictionary_path=None):
    dictionary = Dictionary(iter_docs(analyzed_items_path))
    if dictionary_path:
        dictionary.save(dictionary_path)
    return dictionary
Example 10: get_corpus_dictionary

def get_corpus_dictionary():
    """Crafts a toy corpus and the dictionary associated with it."""
    # Toy corpus.
    corpus = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],
        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car'],
    ]
    dictionary = Dictionary(corpus)

    # Transforming corpus with dictionary.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Building reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token

    return corpus, dictionary
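The explicit reverse-index loop is needed because `Dictionary.id2token` starts out empty; if I remember the implementation correctly, simply indexing the dictionary by an id also populates it lazily, so a shorter (version-dependent) sketch is:

corpus, dictionary = get_corpus_dictionary()

print(dictionary[0])        # item access fills id2token as a side effect
print(dictionary.id2token)  # now populated, mirroring token2id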
Example 11: WordCorpus

class WordCorpus(BaseCorpus):
    """\
    Wrapper around a `gensim.corpora.dictionary.Dictionary`.

    This is a light-weight alternative to `CableCorpus` to create an initial
    word dictionary::

        wd = WordCorpus()
        wd.add_text('ref-1', 'bla bla')
        # add more texts
        wd.dct.filter_extremes()
        corpus = CableCorpus('/my/directory/', wd.dct)
        corpus.add_text('ref-1', 'bla bla')
        # add more texts
        corpus.close()
    """
    def __init__(self, dct=None, tokenizer=None):
        """\
        Initializes the wrapper.

        `dct`
            An existing Dictionary or ``None`` if a new Dictionary should be
            created (default)
        `tokenizer`
            A tokenizer function or ``None``, see `BaseCorpus`
        """
        super(WordCorpus, self).__init__(tokenizer)
        self.dct = Dictionary() if dct is None else dct

    def add_words(self, reference_id, words):
        self.dct.doc2bow(words, True)
Example 12: build_dictionary

def build_dictionary(self):
    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: y.split())
    dictionary = Dictionary(documents)
    dictionary.save(f'{self.board}.dictionary')
    return dictionary
Example 13: getDictionary

def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
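The caching above relies on `Dictionary.save` and `Dictionary.load`, which persist the object (essentially via pickle) and restore it intact. A minimal round-trip sketch, with a placeholder path:

from gensim.corpora.dictionary import Dictionary

dct = Dictionary([["load", "save", "demo"], ["save", "again"]])
dct.save("/tmp/demo.dict")

restored = Dictionary.load("/tmp/demo.dict")
print(restored.token2id == dct.token2id)  # True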
Example 14: build_dictionary_from_splits

def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    '''Build a dictionary from splits. If `save_pickle` is provided, save it there.'''
    unfiltered_dict = Dictionary()
    for eid in range(n):
        unfiltered_dict.add_documents(csv_isolator("../../data/proc_Train_%d.csv" % eid, column))
    print("Before filtering,", unfiltered_dict)

    if save_pickle:
        print("\nsaving...")
        unfiltered_dict.save(save_pickle)

    return unfiltered_dict
Example 15: SublexicalizedCorpus

class SublexicalizedCorpus(TextCorpus):
    def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True,
                 n_proc=1):
        self.order = order
        self.clean_func = clean_func
        self.base_corpus = base_corpus
        self.word_limit = word_limit
        self.n_proc = n_proc

        super(SublexicalizedCorpus, self).__init__()

        self.dictionary = Dictionary()
        if create_dictionary:
            self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        a_count = 0
        t_count = 0

        texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts())

        pool = multiprocessing.Pool(self.n_proc)

        start = time.perf_counter()
        prev = start

        for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100):
            for tokens in pool.imap_unordered(process, group):
                a_count += 1

                cur = time.perf_counter()
                if cur - prev > 60:
                    logging.info("Sublexicalized %d in %d seconds, %.0f t/s"
                                 % (t_count, cur - start, t_count * 1. / (cur - start)))
                    prev = cur

                t_count += len(tokens)

                yield tokens

                if self.word_limit and t_count > self.word_limit:
                    break

        pool.terminate()

        end = time.perf_counter()
        logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s"
                     % (t_count, end - start, t_count * 1. / (end - start)))

        self.length = t_count