This page collects typical usage examples of Python's keras.preprocessing.text.Tokenizer. If you have been wondering what text.Tokenizer does, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore other members of the keras.preprocessing.text
module.
A total of 14 code examples of text.Tokenizer are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
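Before diving into the examples, here is a minimal sketch of the basic Tokenizer workflow that the snippets below build on (the sample texts are made up for illustration):

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

texts = ['the cat sat on the mat', 'the dog ate my homework']
tokenizer = Tokenizer(num_words=1000)            # keep at most the 1000 most frequent words
tokenizer.fit_on_texts(texts)                    # build the word -> index vocabulary
sequences = tokenizer.texts_to_sequences(texts)  # map each text to a list of integer ids
padded = pad_sequences(sequences, maxlen=10)     # pad/truncate to a fixed length
print(tokenizer.word_index)
print(padded.shape)  # (2, 10)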
Example 1: doc_vec_feature
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import re, import numpy as np, and a module-level `logger`;
# self.max_len and self.sentence_symbol are attributes of the enclosing class.
def doc_vec_feature(self, data_set, max_sentences=16):
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_set)
    # 3-D feature tensor: (documents, sentences per document, words per sentence)
    data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
    sentence_symbols = "".join(self.sentence_symbol)
    split = "[" + sentence_symbols + "]"
    for i, sentence in enumerate(data_set):
        # split each document into short sentences on the sentence-ending symbols
        short_sents = re.split(split, sentence)
        for j, sent in enumerate(short_sents):
            if j < max_sentences and sent.strip():
                words = text_to_word_sequence(sent)
                k = 0
                for w in words:
                    if k < self.max_len:
                        if w in tokenizer.word_index:
                            data_feature[i, j, k] = tokenizer.word_index[w]
                        k += 1
    word_index = tokenizer.word_index
    logger.info('Number of Unique Tokens: %d' % len(word_index))
    print('Shape of Data Tensor:', data_feature.shape)
    return data_feature
Example 2: _handle_rare_words
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: from keras.preprocessing.text import text_to_word_sequence;
# self._rare_words_handling and self._words_min_occur are attributes of the enclosing class.
def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            # keep only words that occur at least self._words_min_occur times
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >=
                         self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
Example 3: main
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import argparse, import json, import thulac, and from tqdm import tqdm.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_data_path', default='../data/train.json', type=str, required=False,
                        help='path to the raw training corpus')
    parser.add_argument('--vocab_file', default='vocab_processed.txt', type=str, required=False,
                        help='path of the generated vocab file')
    parser.add_argument('--vocab_size', default=50000, type=int, required=False,
                        help='vocabulary size')
    args = parser.parse_args()

    lac = thulac.thulac(seg_only=True)  # THULAC Chinese word segmenter (segmentation only)
    tokenizer = Tokenizer(num_words=args.vocab_size)
    print('args:\n' + args.__repr__())
    print('This script is extremely slow especially for large corpus. Take a break.')

    f = open(args.raw_data_path, 'r')
    lines = json.load(f)
    for i, line in enumerate(tqdm(lines)):
        lines[i] = lac.cut(line, text=True)  # segment each line into space-separated words

    tokenizer.fit_on_texts(lines)
    vocab = list(tokenizer.index_word.values())
    pre = ['[SEP]', '[CLS]', '[MASK]', '[PAD]', '[UNK]']
    vocab = pre + vocab
    with open(args.vocab_file, 'w') as f:
        for word in vocab[:args.vocab_size + 5]:
            f.write(word + '\n')
Example 4: create_embedding_matrix
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import numpy as np.
def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    """
    Create an embedding matrix mapping word indexes to their respective word vectors.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object containing word indexes
        word_vectors (dict): dict mapping words to their respective vectors
        embedding_dim (int): dimension of the word vectors

    Returns:
        embedding_matrix (np.ndarray): matrix of shape (vocabulary size + 1, embedding_dim)
    """
    nb_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    for word, i in word_index.items():
        try:
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            print("vector not found for word - %s" % word)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix
Example 5: word_embed_meta_data
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import gc; train_word2vec() is defined elsewhere in the project,
# and create_embedding_matrix() is the function from Example 4.
def word_embed_meta_data(documents, embedding_dim):
    """
    Fit a tokenizer on the given documents and build the matching embedding matrix.

    Args:
        documents (list): list of documents
        embedding_dim (int): embedding dimension

    Returns:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        embedding_matrix (np.ndarray): embedding matrix aligned with the tokenizer's word indexes
    """
    documents = [x.lower().split() for x in documents]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix
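The tokenizer and embedding matrix returned above are typically wired into a Keras Embedding layer. A hedged usage sketch (the sample sentences and the embedding dimension are assumptions, not part of the original project):

from keras.layers import Embedding

documents = ['what is your name', 'how old are you']
tokenizer, embedding_matrix = word_embed_meta_data(documents, embedding_dim=50)

embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],  # initialize with the pre-trained vectors
                            trainable=False)             # freeze the embeddings during training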
Example 6: create_test_data
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import numpy as np and from keras.preprocessing.sequence import pad_sequences.
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create the padded test dataset from sentence pairs.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuples of sentence pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (np.ndarray): padded input features from the first sentences
        test_data_2 (np.ndarray): padded input features from the second sentences
        leaks_test (np.ndarray): engineered "leak" features (set sizes and overlap) per pair
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test
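A small usage sketch for create_test_data (the sentence pairs and max_sequence_length below are hypothetical), reusing a tokenizer fitted as in Example 5:

test_pairs = [('What is your age?', 'How old are you?'),
              ('Is it raining today?', 'Do you like tea?')]
test_data_1, test_data_2, leaks_test = create_test_data(tokenizer, test_pairs,
                                                        max_sequence_length=20)
print(test_data_1.shape, test_data_2.shape, leaks_test.shape)  # (2, 20) (2, 20) (2, 3)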
Example 7: test_tokenizer
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import numpy as np.
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    # only checks that each vectorization mode runs without error
    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
Example 8: test_sequential_fit
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def test_sequential_fit():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    word_sequences = [
        ['The', 'cat', 'is', 'sitting'],
        ['The', 'dog', 'is', 'standing']
    ]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    assert tokenizer.document_count == 5  # 3 raw texts + 2 pre-tokenized sequences

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
Example 9: test_tokenizer_oov_flag
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def test_tokenizer_oov_flag():
    """
    Test of the Out of Vocabulary (OOV) flag in Tokenizer
    """
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: 'some', 'unknown'

    # Default, without OOV flag
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 4  # discards the 2 OOVs

    # With the OOV feature
    tokenizer = Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 6  # OOVs are marked in place
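To make the behavior tested above concrete, a minimal sketch of the OOV handling (the exact indexes in the output depend on the fitted vocabulary):

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts(['This text has only known words'])
print(tokenizer.texts_to_sequences(['This text has some unknown words']))
# The unknown words 'some' and 'unknown' are replaced by the index of '<unk>'
# instead of being dropped, so the sequence keeps its original length of 6.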
Example 10: train_wordtoVect
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import jieba, from keras.preprocessing import text, sequence,
# and constants MAX_SEQUENCE_LENGTH (and optionally MAX_NB_WORDS) defined elsewhere.
def train_wordtoVect(train_inputTexts):
    """
    Word-vectorization helper: segments the texts, fits a tokenizer,
    and returns padded word-index sequences.
    """
    texts = []
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))  # segment each Chinese document with jieba
        d = " ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()  # word tokenizer; MAX_NB_WORDS could cap the vocabulary here
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)  # affected by num_words when it is set
    word_index = tokenizer.word_index  # word -> index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return word_index, data
Example 11: _transform_request
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import io, import json, import pandas,
# from collections import OrderedDict, and from keras.preprocessing import sequence.
def _transform_request(request):
    request_str = request.decode('utf-8')

    # tokenize the csv request and create json
    X = pandas.read_csv(io.StringIO(request_str), engine='python', quotechar='|', header=None).values[:, 0]
    for index, item in enumerate(X):
        reqJson = json.loads(item, object_pairs_hook=OrderedDict)
        # drop fields that should not influence the model
        del reqJson['http']['timestamp']
        del reqJson['http']['headers']
        del reqJson['http']['source']
        del reqJson['http']['route']
        del reqJson['http']['responsePayload']
        X[index] = json.dumps(reqJson, separators=(',', ':'))

    # character-level tokenizer over the cleaned log entries
    tokenizer = Tokenizer(filters='\t\n', char_level=True)
    tokenizer.fit_on_texts(X)

    # this used to be [log_entry]
    seq = tokenizer.texts_to_sequences([request_str])
    max_log_length = 1024
    log_entry_processed = sequence.pad_sequences(seq, maxlen=max_log_length)

    return log_entry_processed
Example 12: load_retures_keras
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import numpy as np.
def load_retures_keras():
    from keras.preprocessing.text import Tokenizer
    from keras.datasets import reuters
    max_words = 1000

    print('Loading data...')
    (x, y), (_, _) = reuters.load_data(num_words=max_words, test_split=0.)
    print(len(x), 'train sequences')

    num_classes = np.max(y) + 1
    print(num_classes, 'classes')

    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x = tokenizer.sequences_to_matrix(x, mode='binary')
    print('x_train shape:', x.shape)

    return x.astype(float), y
Example 13: load_imdb
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# Also requires: import numpy as np.
def load_imdb():
    from keras.preprocessing.text import Tokenizer
    from keras.datasets import imdb
    max_words = 1000

    print('Loading data...')
    (x1, y1), (x2, y2) = imdb.load_data(num_words=max_words)
    x = np.concatenate((x1, x2))
    y = np.concatenate((y1, y2))
    print(len(x), 'train sequences')

    num_classes = np.max(y) + 1
    print(num_classes, 'classes')

    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x = tokenizer.sequences_to_matrix(x, mode='binary')
    print('x_train shape:', x.shape)

    return x.astype(float), y
Example 14: create_tokenizer
# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
# to_lines() is defined elsewhere in the same project and flattens the captions into a list of strings.
def create_tokenizer(captions):
    lines = to_lines(captions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# Calculate the length of the captions with the most words
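The trailing comment above refers to a helper that is not included in the snippet. A plausible sketch under that assumption (the name max_caption_length is hypothetical, and to_lines() is assumed from the same module):

def max_caption_length(captions):
    lines = to_lines(captions)  # assumed helper from the same module as create_tokenizer
    return max(len(line.split()) for line in lines)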