This article collects typical code examples of the Dictionary.items method from gensim.corpora.dictionary in Python. If you are wondering what Dictionary.items does and how to use it, the curated examples below may help. You can also explore other usage examples of the containing class, gensim.corpora.dictionary.Dictionary.
The following presents 6 code examples of Dictionary.items, sorted by popularity by default. You can upvote the examples you find useful; your votes help the system recommend better Python examples.
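All six examples iterate over Dictionary.items(), which yields (token_id, token) pairs. A minimal sketch of that behavior, using toy documents invented here for illustration:

from gensim.corpora.dictionary import Dictionary

# Two toy documents (hypothetical data).
docs = [['human', 'interface', 'computer'],
        ['survey', 'user', 'computer']]
dct = Dictionary(docs)

for token_id, token in dct.items():
    print(token_id, token)  # e.g. 0 computer, 1 human, ...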
Example 1: create_dictionaries
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import items [as alias]
def create_dictionaries(train=None,
                        test=None,
                        model=None):
    ''' Function does a number of jobs:
        1- Creates a word-to-index mapping
        2- Creates a word-to-vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            ''' Words become integers '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:  # out-of-vocabulary words map to index 0
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
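A hypothetical call, assuming an older gensim release whose Word2Vec models still expose model.vocab (newer releases moved the vocabulary to model.wv); the toy data is invented for illustration:

from gensim.models import Word2Vec

sentences = [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]
model = Word2Vec(sentences, min_count=1)

train = {0: 'the cat sat', 1: 'the dog ran'}
test = {0: 'the cat ran'}

w2indx, w2vec, train, test = create_dictionaries(train=train, test=test, model=model)
print(train[0])  # a list of word indices, e.g. [3, 1, 4]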
Example 2: create_dictionaries
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import items [as alias]
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
        1- Creates a word-to-index mapping
        2- Creates a word-to-vector mapping
        3- Transforms the training and testing dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # index of every word with frequency above 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vector of every word with frequency above 10

        def parse_dataset(combined):
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except KeyError:  # out-of-vocabulary words map to index 0
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Indices of the words in each sentence; words with frequency below 10 get index 0.
        # `sequence` comes from keras.preprocessing; `maxlen` is defined elsewhere in the source project.
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
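A hypothetical call, under the same older-gensim assumption; here maxlen is set locally so the snippet is self-contained:

from gensim.models import Word2Vec
from keras.preprocessing import sequence

maxlen = 5  # assumed module-level setting in the original project
sentences = [['the', 'cat', 'sat'], ['the', 'dog', 'ran', 'fast']]
model = Word2Vec(sentences, min_count=1)

w2indx, w2vec, combined = create_dictionaries(model=model, combined=sentences)
print(combined.shape)  # (2, 5): every sentence padded or truncated to maxlen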
Example 3: create_mapping_dicts
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import items [as alias]
# Also needed: import numpy as np (for np.zeros below)
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None):
    """Generate word:index, word:vector, index:word dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Filter the corpus to only those words seen in the bodies/headlines.
        bodies (optional): list of lists
            Must be passed in if `filter_corpus` is True.
        headlines (optional): list of lists
            Must be passed in if `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """
    if filter_corpus:
        if (not bodies or not headlines):
            excep_str = "Must pass in bodies and headlines with filter_corpus True!"
            raise Exception(excep_str)
        else:
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for the newline character.
    word_idx_dct = {wrd: (idx + 1) for idx, wrd in gensim_dct.items()}
    idx_word_dct = {(idx + 1): wrd for idx, wrd in gensim_dct.items()}
    word_idx_dct['\n'] = 0
    idx_word_dct[0] = '\n'

    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    vec_dim = next(len(value) for value in word_vector_dct.values())
    word_vector_dct['\n'] = np.zeros(vec_dim)

    return word_idx_dct, idx_word_dct, word_vector_dct
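A hypothetical round trip through these mappings, again assuming an older gensim with wrd_embedding.vocab (the project-specific _filter_corpus helper is skipped by leaving filter_corpus False):

import numpy as np
from gensim.models import Word2Vec

sentences = [['house', 'prices', 'rise'], ['markets', 'fall']]
wrd_embedding = Word2Vec(sentences, min_count=1)

word_idx_dct, idx_word_dct, word_vector_dct = create_mapping_dicts(wrd_embedding)
print(word_idx_dct['\n'])                   # 0 -- reserved for the newline character
print(idx_word_dct[word_idx_dct['house']])  # 'house'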
Example 4: create_mapping_dicts
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import items [as alias]
# Also needed: import numpy as np (for np.zeros below)
def create_mapping_dicts(wrd_embedding, reviews=None, vocab_size=None):
    """Generate word:index, word:vector, index:word dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        reviews (optional): np.array (or array-like) of lists of strings
            Used to filter the vocabulary, either to only those words in `reviews`
            or the most common `vocab_size` words in `reviews` that are also in
            the `wrd_embedding`.
        vocab_size (optional): int
            Keep only the `vocab_size` most common words from the reviews.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """
    if reviews is not None:
        wrd_embedding = _filter_corpus(wrd_embedding, reviews, vocab_size)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for masking the padding, 1 for the end-of-sequence
    # character (EOS), and 2 for unknown words (denoted 'UNK').
    wrd_idx_dct = {wrd: (idx + 3) for idx, wrd in gensim_dct.items()}
    idx_wrd_dct = {(idx + 3): wrd for idx, wrd in gensim_dct.items()}
    wrd_idx_dct['EOS'] = 1
    idx_wrd_dct[1] = 'EOS'
    wrd_idx_dct['UNK'] = 2
    idx_wrd_dct[2] = 'UNK'

    wrd_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    embedding_dim = wrd_embedding.vector_size
    wrd_vector_dct['EOS'] = np.zeros(embedding_dim)
    wrd_vector_dct['UNK'] = np.zeros(embedding_dim)

    return wrd_idx_dct, idx_wrd_dct, wrd_vector_dct
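A hypothetical encode/decode pass built on these mappings; the UNK fallback below is my own illustration, not part of the original snippet:

import numpy as np
from gensim.models import Word2Vec

corpus = [['great', 'movie'], ['terrible', 'plot']]
wrd_embedding = Word2Vec(corpus, min_count=1)

wrd_idx_dct, idx_wrd_dct, wrd_vector_dct = create_mapping_dicts(wrd_embedding)

review = ['great', 'unseen', 'plot']
encoded = [wrd_idx_dct.get(w, wrd_idx_dct['UNK']) for w in review] + [wrd_idx_dct['EOS']]
print([idx_wrd_dct[i] for i in encoded])  # ['great', 'UNK', 'plot', 'EOS']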
Example 5: create_mapping_dicts
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import items [as alias]
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None):
    """Generate word:index, word:vector, index:word dictionaries.

    Args:
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean
            Filter the corpus to only those words seen in the articles. Use
            to speed up iteration during initial building/training phases.
        bodies (optional): list of lists
            Must be passed in if `filter_corpus` is True.
        headlines (optional): list of lists
            Must be passed in if `filter_corpus` is True.

    Return:
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """
    if filter_corpus:
        if (not bodies or not headlines):
            raise Exception('Must pass in bodies and headlines with filter_corpus as True!')
        else:
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    word_idx_dct = {wrd: idx for idx, wrd in gensim_dct.items()}
    idx_word_dct = {idx: wrd for idx, wrd in gensim_dct.items()}
    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}

    return word_idx_dct, idx_word_dct, word_vector_dct
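Note the design difference from Examples 3 and 4: no indices are reserved here, so id 0 maps to a real vocabulary word and cannot double as a padding or newline marker. A quick hypothetical check:

from gensim.models import Word2Vec

wrd_embedding = Word2Vec([['apple', 'pie'], ['banana', 'split']], min_count=1)
word_idx_dct, idx_word_dct, word_vector_dct = create_mapping_dicts(wrd_embedding)
print(idx_word_dct[0])  # a real word, not a reserved marker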
Example 6: wmdistance
# Required import: from gensim.corpora.dictionary import Dictionary [as alias]
# Or: from gensim.corpora.dictionary.Dictionary import items [as alias]
# Note: this is a method from gensim's own word2vec module; `zeros`, `double`,
# `sqrt`, and `np_sum` are numpy imports and `emd` comes from pyemd at module level.
def wmdistance(self, document1, document2):
    """
    Compute the Word Mover's Distance between two documents. When using this
    code, please consider citing the following papers:

    .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
    .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
    .. Matt Kusner et al. "From Word Embeddings To Document Distances".

    Note that if one of the documents has no words that exist in the
    Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned.

    This method only works if `pyemd` is installed (it can be installed via pip,
    but requires a C compiler).

    Example:
        >>> # Train word2vec model.
        >>> model = Word2Vec(sentences)

        >>> # Some sentences to test.
        >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
        >>> sentence_president = 'The president greets the press in Chicago'.lower().split()

        >>> # Remove their stopwords.
        >>> from nltk.corpus import stopwords
        >>> stopwords = stopwords.words('english')
        >>> sentence_obama = [w for w in sentence_obama if w not in stopwords]
        >>> sentence_president = [w for w in sentence_president if w not in stopwords]

        >>> # Compute WMD.
        >>> distance = model.wmdistance(sentence_obama, sentence_president)
    """
    if not PYEMD_EXT:
        raise ImportError("Please install pyemd Python package to compute WMD.")

    # Remove out-of-vocabulary words.
    len_pre_oov1 = len(document1)
    len_pre_oov2 = len(document2)
    document1 = [token for token in document1 if token in self]
    document2 = [token for token in document2 if token in self]
    diff1 = len_pre_oov1 - len(document1)
    diff2 = len_pre_oov2 - len(document2)
    if diff1 > 0 or diff2 > 0:
        logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).',
                    diff1, diff2)

    if len(document1) == 0 or len(document2) == 0:
        logger.info(
            "At least one of the documents had no words that were in the vocabulary. "
            "Aborting (returning inf)."
        )
        return float('inf')

    dictionary = Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    if vocab_len == 1:
        # Both documents are composed of a single unique token.
        return 0.0

    # Sets for faster look-up.
    docset1 = set(document1)
    docset2 = set(document2)

    # Compute the distance matrix.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            # Compute Euclidean distance between word vectors.
            distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2]) ** 2))

    if np_sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        logger.info('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    def nbow(document):
        d = zeros(vocab_len, dtype=double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(document1)
    d2 = nbow(document2)

    # Compute WMD.
    return emd(d1, d2, distance_matrix)
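A minimal sketch of what emd() receives at the end of this method, assuming pyemd is installed; the histograms and distance matrix below are made-up stand-ins for the nBOW vectors and word-distance matrix computed above:

import numpy as np
from pyemd import emd

d1 = np.array([0.5, 0.5, 0.0], dtype=np.double)  # nBOW of document 1
d2 = np.array([0.0, 0.5, 0.5], dtype=np.double)  # nBOW of document 2
distance_matrix = np.array([[0.0, 1.0, 2.0],
                            [1.0, 0.0, 1.0],
                            [2.0, 1.0, 0.0]], dtype=np.double)

print(emd(d1, d2, distance_matrix))  # minimal cost of moving d1's mass onto d2: 1.0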