This article collects typical usage examples of the Python method gensim.corpora.dictionary.Dictionary.doc2bow. If you are wondering what Dictionary.doc2bow does and how to use it, the curated code examples below may help. You can also read further about the containing class, gensim.corpora.dictionary.Dictionary.
The sections below present 15 code examples of Dictionary.doc2bow, sorted by popularity by default.
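Before the individual examples, here is a minimal, self-contained sketch of the basic call (the toy token lists are made up for illustration): Dictionary builds the token-to-id mapping, and doc2bow converts a tokenized document into a sparse list of (token_id, count) pairs.
from gensim.corpora.dictionary import Dictionary

# Toy tokenized documents (illustrative only).
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system']]

# Build the token <-> integer id mapping from the documents.
dictionary = Dictionary(texts)

# Convert a new tokenized document into bag-of-words form.
# Tokens unknown to the dictionary ('graph' here) are silently dropped
# unless allow_update=True is passed.
bow = dictionary.doc2bow(['human', 'computer', 'computer', 'graph'])
print(bow)  # [(id_of_computer, 2), (id_of_human, 1)]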
Example 1: build_dictionaries_from_splits
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None):
    ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must
        be a 3-tuple of the picklefile names in the following order:
            (title, body, tags)
        If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved.
    '''
    utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary()
    for eid in xrange(n):
        for row in row_stream(splits_template % eid):
            ID, title, body, tags = row
            utitledict.doc2bow(title.split(), allow_update=True)
            ubodydict.doc2bow(body.split(), allow_update=True)
            utagdict.doc2bow(tags.split(), allow_update=True)

    assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs
    print "Before filtering..."
    print "utitledict:", utitledict
    print "ubodydict:", ubodydict
    print "utagdict:", utagdict

    if save_pickle_tup:
        assert len(save_pickle_tup) == 3
        if save_pickle_tup[0]:
            print "saving utitledict..."
            utitledict.save(save_pickle_tup[0])
        if save_pickle_tup[1]:
            print "saving ubodydict..."
            ubodydict.save(save_pickle_tup[1])
        if save_pickle_tup[2]:
            print "saving utagdict..."
            utagdict.save(save_pickle_tup[2])

    return (utitledict, ubodydict, utagdict)
Example 2: doc_to_gensim
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of (integer word ID, word count)
            2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)
    return (gdict, gdoc)
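A hypothetical usage sketch for doc_to_gensim above, assuming a locally installed spaCy English model (the model name and sample sentence are illustrative, and extract.words is the helper already imported by the surrounding project, e.g. textacy):
import spacy

# Assumption: the small English pipeline is installed (python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')
doc = nlp('The striped bats were hanging on their feet and eating fruit.')

# Returns the per-document dictionary and its bag-of-words vector.
gdict, gdoc = doc_to_gensim(doc, lemmatize=True, filter_stops=True)
print(gdict.token2id)  # token -> integer id
print(gdoc)            # [(token_id, count), ...]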
Example 3: create_dictionaries
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def create_dictionaries(train=None,
                        test=None,
                        model=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            ''' Words become integers
            '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example 4: create_dictionaries
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}   # indices of all words with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}   # word vectors of all words with frequency above 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # pad the index sequence of each sentence; words with frequency below 10 are mapped to index 0
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example 5: WordCorpus
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
class WordCorpus(BaseCorpus):
    """\
    Wrapper around a `gensim.corpora.dictionary.Dictionary`.

    This is a light-weight alternative to `CableCorpus` to create an initial
    word dictionary::

        wd = WordCorpus()
        wd.add_text('ref-1', 'bla bla')
        # add more texts
        wd.dct.filter_extremes()

        corpus = CableCorpus('/my/directory/', wd.dct)
        corpus.add_text('ref-1', 'bla bla')
        # add more texts
        corpus.close()
    """
    def __init__(self, dct=None, tokenizer=None):
        """\
        Initializes the wrapper.

        `dct`
            An existing Dictionary or ``None`` if a new Dictionary should be
            created (default)
        `tokenizer`
            A tokenizer function or ``None``, see `BaseCorpus`
        """
        super(WordCorpus, self).__init__(tokenizer)
        self.dct = Dictionary() if dct is None else dct

    def add_words(self, reference_id, words):
        self.dct.doc2bow(words, True)
Example 6: _load_vocab
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def _load_vocab(self, fname):
    logging.info("loading plain-text file:{}".format(fname))
    src_file = codecs.open(fname, 'rb', 'utf-8')
    dictionary = Dictionary()
    num_instances = 0
    for term in src_file:
        dictionary.doc2bow(term.strip().lower().encode('utf-8').split(), allow_update=True)
        num_instances += 1
    logging.info("processed {} instances".format(num_instances))
    self.dictionary = dictionary
Example 7: get_corpus_dictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def get_corpus_dictionary():
    """Crafts a toy corpus and the associated dictionary."""
    # Toy corpus.
    corpus = [
        ['carrot', 'salad', 'tomato'],
        ['carrot', 'salad', 'dish'],
        ['tomato', 'dish'],
        ['tomato', 'salad'],
        ['car', 'break', 'highway'],
        ['highway', 'accident', 'car'],
        ['moto', 'break'],
        ['accident', 'moto', 'car']
    ]
    dictionary = Dictionary(corpus)

    # Transform the corpus with the dictionary.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Build the reverse index.
    for (token, uid) in dictionary.token2id.items():
        dictionary.id2token[uid] = token

    return corpus, dictionary
Example 8: preprocess_corpora
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """
    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora:
    :param stopwords:
    :param allowed_pos:
    :param max_doc:
    :return:
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []
    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue
        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound words into single tokens
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)
    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)
    return dictionary
Example 9: create_mapping_dicts
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None):
    """Generate word:index, word:vector, index:word dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Filter the corpus to only those words seen in the bodies/headlines.
        bodies (optional): list of lists
            Must be passed in if `filter_corpus` is True.
        headlines (optional): list of lists
            Must be passed in if `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """
    if filter_corpus:
        if (not bodies or not headlines):
            excep_str = "Must pass in bodies and headlines with filter_corpus True!"
            raise Exception(excep_str)
        else:
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for the newline character
    word_idx_dct = {wrd: (idx + 1) for idx, wrd in gensim_dct.items()}
    idx_word_dct = {(idx + 1): wrd for idx, wrd in gensim_dct.items()}
    word_idx_dct['\n'] = 0
    idx_word_dct[0] = '\n'

    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    vec_dim = next(len(value) for value in word_vector_dct.values())
    word_vector_dct['\n'] = np.zeros((vec_dim))

    return word_idx_dct, idx_word_dct, word_vector_dct
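A hypothetical usage sketch for create_mapping_dicts above; it assumes a pre-4.0 gensim Word2Vec model, since the code relies on the legacy model.vocab attribute and item-style vector lookup, and the training sentences are made up:
from gensim.models.word2vec import Word2Vec

# Tiny toy corpus; in practice these would be the tokenized bodies/headlines.
sentences = [['the', 'quick', 'brown', 'fox'],
             ['the', 'lazy', 'dog']]
w2v = Word2Vec(sentences, size=50, min_count=1)  # `size` is the pre-4.0 keyword (vector_size in 4.x)

word_idx_dct, idx_word_dct, word_vector_dct = create_mapping_dicts(w2v)
assert word_idx_dct['\n'] == 0                 # index 0 is reserved for the newline token
assert word_vector_dct['\n'].shape == (50,)    # zero vector with the embedding dimensionality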
Example 10: create_mapping_dicts
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def create_mapping_dicts(wrd_embedding, reviews=None, vocab_size=None):
    """Generate word:index, word:vector, index:word dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        reviews (optional): np.array (or array-like) of lists of strings
            Used to filter the vocabulary, either to only those words in `reviews`
            or the most common `vocab_size` words in `reviews` that are also in
            the `wrd_embedding`.
        vocab_size (optional): int
            Keep only the `vocab_size` most common words from the reviews.

    Return:
    ------
        wrd_idx_dct: dict
        idx_wrd_dct: dict
        wrd_vector_dct: dict
    """
    if reviews is not None:
        wrd_embedding = _filter_corpus(wrd_embedding, reviews, vocab_size)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for masking the padding, 1 for the end-of-sequence
    # character (EOS), and 2 for unknown words (denoted 'UNK')
    wrd_idx_dct = {wrd: (idx + 3) for idx, wrd in gensim_dct.items()}
    idx_wrd_dct = {(idx + 3): wrd for idx, wrd in gensim_dct.items()}
    wrd_idx_dct['EOS'] = 1
    idx_wrd_dct[1] = 'EOS'
    wrd_idx_dct['UNK'] = 2
    idx_wrd_dct[2] = 'UNK'

    wrd_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    embedding_dim = wrd_embedding.vector_size
    wrd_vector_dct['EOS'] = np.zeros((embedding_dim))
    wrd_vector_dct['UNK'] = np.zeros((embedding_dim))

    return wrd_idx_dct, idx_wrd_dct, wrd_vector_dct
Example 11: __init__
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
class tip_rec:
    def __init__(self, num_topics=15):
        self.numtopics = num_topics
        self.topic_dict = dict(enumerate(np.zeros(num_topics)))
        self.user_dict = {}
        self.model = None
        self.worddict = {}
        self.mydict = None

    def train(self, df):
        self.user_dict = {el: self.topic_dict.copy() for el in df.sender.unique()}
        cv = CV(stop_words='english')
        X = cv.fit_transform(df['context'])
        vocab = cv.vocabulary_.keys()
        self.worddict = dict([(i, s) for i, s in enumerate(vocab)])
        self.mydict = Dictionary()
        self.mydict = self.mydict.from_corpus(matutils.Sparse2Corpus(X, documents_columns=False),
                                              id2word=self.worddict)
        self.model = LatentDA.LdaModel(matutils.Sparse2Corpus(X, documents_columns=False),
                                       num_topics=self.numtopics, passes=20, id2word=self.worddict)
        for i in df.iterrows():
            if i[1]['context'] == '':
                continue
            else:
                values = self.model[self.mydict.doc2bow(i[1]['context'].split())]
                for val in values:
                    if val[0] in self.user_dict[i[1].sender].keys():
                        if i[1].amt == '':
                            continue
                        self.user_dict[i[1].sender][val[0]] += val[1] * float(i[1].amt)
                        continue
                    self.user_dict[i[1].sender][val[0]] = val[1]
        for i in self.user_dict.keys():
            norm_const = sum(self.user_dict[i].values())
            for j in self.user_dict[i].keys():
                self.user_dict[i][j] = self.user_dict[i][j] / norm_const

    def predict(self, text, username=''):
        topics = self.model[self.mydict.doc2bow(text.split())]
        doc_aff = np.zeros(self.numtopics)
        for i in topics:
            doc_aff[i[0]] = i[1]
        if username == '':
            returndict = {}
            for user in self.user_dict.keys():
                user_aff = np.array(self.user_dict[user].values())
                score = np.linalg.norm(user_aff - doc_aff)
                returndict[user] = score
            return returndict
        else:
            user_aff = np.array(self.user_dict[username].values())
            score = np.linalg.norm(user_aff - doc_aff)
            return (username, score)
Example 12: create_mapping_dicts
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None):
    """Generate word:index, word:vector, index:word dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Filter the corpus to only those words seen in the articles. Use
            to speed up iteration during initial building/training phases.
        bodies (optional): list of lists
            Must be passed in if `filter_corpus` is True.
        headlines (optional): list of lists
            Must be passed in if `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """
    if filter_corpus:
        if (not bodies or not headlines):
            raise Exception('Must pass in bodies and headlines with filter_corpus as True!')
        else:
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    word_idx_dct = {wrd: idx for idx, wrd in gensim_dct.items()}
    idx_word_dct = {idx: wrd for idx, wrd in gensim_dct.items()}
    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}

    return word_idx_dct, idx_word_dct, word_vector_dct
Example 13: DigestedDocumentCollection
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
class DigestedDocumentCollection(CorpusABC):
    """A bag-of-words representation of a corpus (collection of documents).

    This serves as direct input to modeling functions. It is output from
    preprocessing functions.

    Parameters
    ----------
    corpus: A collection of tokenized documents
        Each document is a list of tokens, tokenized and normalized strings
        (either utf8 or unicode) (e.g. output of topik.SimpleTokenizer)

    Readers iterate over tuples (id, content), but discard id in return
    (for compatibility with Gensim).
    """
    def __init__(self, tokenized_corpus):
        self.corpus = tokenized_corpus
        self.dict = Dictionary(tokenized_corpus.get_generator_without_id())
        super(DigestedDocumentCollection, self).__init__()

    def __iter__(self):
        """Discards id field - for compatibility with Gensim."""
        for _id, doc_tokens in self.corpus:
            yield self.dict.doc2bow(doc_tokens)

    def __len__(self):
        return len(self.corpus)

    def get_id2word_dict(self):
        return self.dict

    def save(self, filename):
        self.corpus.save(filename)

    @classmethod
    def load(cls, filename):
        return cls(load_persisted_corpus(filename))

    @property
    def persistor(self):
        return self.corpus.persistor

    @property
    def filter_string(self):
        return self.corpus.filter_string
Example 14: testing
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)

# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

boolean_document_based = ['u_mass']
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']


def testfile():
    # temporary data will be stored to this file
    return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')


def checkCoherenceMeasure(topics1, topics2, coherence):
    """Check provided topic coherence algorithm on given topics"""
    if coherence in boolean_document_based:
        cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence)
        cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence)
    else:
        cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence)
Example 15: Dictionary
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import doc2bow [as alias]
logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles.iterkeys():
    logging.info('current term: %s' % key)
    term_list = articles[key].keys()
    text_list = [dictionary.doc2bow(article['text'], allowUpdate=False, returnMissingWords=False)
                 for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]

    logging.info('compute similarity matrix')
    for i, par1 in enumerate(text_list):
        for j, par2 in enumerate(text_list):
            sim_matrix[i, j] = matutils.cossim(par1, par2)

    matrices[key] = {}
    matrices[key]['term_list'] = term_list
    matrices[key]['sim_matrix'] = sim_matrix
    assert np.shape(sim_matrix)[0] == len(term_list)