This article collects typical usage examples of the Python method keras.preprocessing.text.text_to_word_sequence. If you are wondering what text.text_to_word_sequence does, how to use it, or what it looks like in practice, the curated examples below may help. You can also explore other usages of its parent module, keras.preprocessing.text.
The following shows 11 code examples of text.text_to_word_sequence, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
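Before turning to the project examples, here is a minimal sketch of the basic call (the input sentence is just an illustrative placeholder): text_to_word_sequence lowercases the text, strips the punctuation listed in its filters argument, and splits on the split character (whitespace by default).

from keras.preprocessing.text import text_to_word_sequence

# Minimal usage sketch: lowercase, strip default punctuation, split on whitespace.
words = text_to_word_sequence("Hello, World! Keras makes text preprocessing easy.")
print(words)  # ['hello', 'world', 'keras', 'makes', 'text', 'preprocessing', 'easy']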
Example 1: doc_vec_feature
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def doc_vec_feature(self, data_set, max_sentences=16):
    # Assumes numpy as np, re and a module-level logger are already imported,
    # and that the enclosing class defines self.max_len and self.sentence_symbol.
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_set)
    data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
    sentence_symbols = "".join(self.sentence_symbol)
    split = "[" + sentence_symbols + "]"
    for i, sentence in enumerate(data_set):
        short_sents = re.split(split, sentence)
        for j, sent in enumerate(short_sents):
            if j < max_sentences and sent.strip():
                words = text_to_word_sequence(sent)
                k = 0
                for w in words:
                    if k < self.max_len:
                        if w in tokenizer.word_index:
                            data_feature[i, j, k] = tokenizer.word_index[w]
                        k += 1
    word_index = tokenizer.word_index
    logger.info('Number of Unique Tokens: %d' % len(word_index))
    print('Shape of Data Tensor:', data_feature.shape)
    return data_feature
Example 2: _handle_rare_words
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >=
                         self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
Example 3: texts_to_sequences
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def texts_to_sequences(wordIndex, texts, num_words):
    # Unknown words and indices beyond num_words are mapped to the last
    # in-vocabulary index.
    lastWord = num_words - 1
    sequences = []
    for text in texts:
        seq = text_to_word_sequence(text)
        vect = []
        for w in seq:
            i = wordIndex.get(w)
            if i is not None:
                if num_words and i >= num_words:
                    vect.append(lastWord)
                else:
                    vect.append(i)
            else:
                vect.append(lastWord)
        sequences.append(vect)
    return sequences
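A hedged usage sketch for the helper above (the tokenizer and sentences here are illustrative placeholders, not taken from the original project): the word index of a Tokenizer fitted elsewhere can be passed in directly.

from keras.preprocessing.text import Tokenizer

# Hypothetical inputs, only to show how texts_to_sequences is called.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(["the cat sat", "the dog barked"])
seqs = texts_to_sequences(tokenizer.word_index, ["the cat barked loudly"], num_words=5)
# Unknown words and indices >= num_words both collapse to index num_words - 1.
print(seqs)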
Example 4: preprocessing
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def preprocessing(self):
    """Preprocess the text to make it more suitable for training."""
    # Assumes numpy as np, pandas as pd and nltk's tokenize module are imported,
    # and that the enclosing class provides clean_string(), max_features,
    # max_senten_num, max_senten_len, categories, classes and verbose.
    paras = []
    labels = []
    texts = []
    for idx in range(self.text.shape[0]):
        text = self.clean_string(self.text[idx])
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        paras.append(sentences)
    tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
    tokenizer.fit_on_texts(texts)
    data = np.zeros((len(texts), self.max_senten_num,
                     self.max_senten_len), dtype='int32')
    for i, sentences in enumerate(paras):
        for j, sent in enumerate(sentences):
            if j < self.max_senten_num:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < self.max_senten_len and word in tokenizer.word_index and tokenizer.word_index[word] < self.max_features:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
    self.word_index = tokenizer.word_index
    if self.verbose == 1:
        print('Total %s unique tokens.' % len(self.word_index))
    labels = pd.get_dummies(self.categories)
    if self.verbose == 1:
        print('Shape of data tensor:', data.shape)
        print('Shape of labels tensor:', labels.shape)
    assert (len(self.classes) == labels.shape[1])
    assert (data.shape[0] == labels.shape[0])
    return data, labels
Example 5: __init__
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def __init__(self, max_features, wvs, all_texts, unk=True, unk_symbol="unkunk"):
    '''
    max_features: the upper bound to be placed on the vocabulary size.
    embedding_dims: size of the token embeddings; overridden if pre-trained
        vectors are provided (i.e., if wvs is not None).
    wvs: set of word vectors to be used for initialization.
    '''
    self.unk = unk
    self.unk_symbol = unk_symbol
    self.max_features = max_features
    self.tokenizer = Tokenizer(nb_words=self.max_features)
    self.embedding_dims = wvs.vector_size
    self.word_embeddings = wvs
    self.raw_texts = all_texts
    self.unked_texts = []
    self.fit_tokenizer()
    if self.unk:
        # Rewrite the raw texts with "unked" versions, in which tokens outside
        # the top max_features are replaced by the unk symbol.
        sorted_tokens = sorted(self.tokenizer.word_index, key=self.tokenizer.word_index.get)
        self.known_tokens = sorted_tokens[:self.max_features]
        self.tokens_to_unk = sorted_tokens[self.max_features:]
        for idx, text in enumerate(self.raw_texts):
            cur_text = text_to_word_sequence(text, split=self.tokenizer.split)
            t_or_unk = lambda t: t if t in self.known_tokens else self.unk_symbol
            unked_text = [t_or_unk(t) for t in cur_text]
            unked_text = self.tokenizer.split.join(unked_text)
            self.unked_texts.append(unked_text)
        self.raw_texts = self.unked_texts
        self.fit_tokenizer()
    self.init_word_vectors()
Example 6: normalize_captions
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def normalize_captions(self, captions_txt):
    captions_txt = self._add_eos(captions_txt)
    word_sequences = map(text_to_word_sequence, captions_txt)
    result = map(' '.join, word_sequences)
    return result
Example 7: prep
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def prep(self, data):
    # Assumes re and numpy as np are imported, and that special_tokens and
    # self.sentence_to_idx are defined elsewhere in the project.
    init = True
    for i in range(len(data)):
        reg = re.findall(r"[\w']+", data[i])
        if len(reg) == 0:  # separator line (+++$+++)
            init = True
            continue
        sent = text_to_word_sequence(data[i], lower=True, split=' ')
        if len(sent) > 15 or len(sent) < 2:  # too long or too short
            init = True
            continue
        idx_list = self.sentence_to_idx(sent)
        if len(idx_list) == 0:  # too many <UNK> tokens
            init = True
            continue
        if init:
            _in = idx_list
            init = False
        else:
            _out = idx_list
            # _rev_in = list(reversed(_in))
            # (the first EOS is part of the loss)
            self.data.append([_in, _out + [special_tokens['<EOS>']]])
            _in = idx_list
        if i % 100000 == 0:
            print("building data list: " + str(i) + "/" + str(len(data)) + " done.")
    print('original line num:', len(data))
    print('prep data num: ', len(self.data))
    self.data = np.array(self.data)
    self.perm = np.arange(len(self.data), dtype=np.int)
    self.shuffle_perm()
Example 8: test_text_to_word_sequence
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def test_text_to_word_sequence():
    text = 'hello! ? world!'
    assert text_to_word_sequence(text) == ['hello', 'world']
Example 9: test_text_to_word_sequence_multichar_split
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def test_text_to_word_sequence_multichar_split():
    text = 'hello!stop?world!'
    assert text_to_word_sequence(text, split='stop') == ['hello', 'world']
Example 10: test_text_to_word_sequence_unicode
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def test_text_to_word_sequence_unicode():
    text = u'ali! veli? kırk dokuz elli'
    assert text_to_word_sequence(text) == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']
Example 11: test_text_to_word_sequence_unicode_multichar_split
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import text_to_word_sequence [as alias]
def test_text_to_word_sequence_unicode_multichar_split():
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text_to_word_sequence(text, split='stop') == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']