This article collects typical usage examples of the Python method keras.preprocessing.sequence.pad_sequences. If you have been wondering what exactly sequence.pad_sequences does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples of the containing module, keras.preprocessing.sequence.
The following presents 15 code examples of sequence.pad_sequences, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
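Before diving into the examples, here is a minimal, self-contained sketch of what pad_sequences does with its main parameters (maxlen, padding, truncating, value); the input values are illustrative only:

from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2], [1, 2, 3, 4, 5]]

# Default behavior: pad and truncate at the front ('pre')
print(pad_sequences(seqs, maxlen=4))
# [[0 0 1 2]
#  [2 3 4 5]]

# Pad and truncate at the back, with a custom padding value
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post', value=9))
# [[1 2 9 9]
#  [1 2 3 4]]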
Example 1: create_sequences
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def create_sequences(tokenizer, max_length, captions_list, image):
    # X1 : input for image features
    # X2 : input for text features
    # y  : output word
    X1, X2, y = list(), list(), list()
    vocab_size = len(tokenizer.word_index) + 1
    # Walk through each caption for the image
    for caption in captions_list:
        # Encode the sequence
        seq = tokenizer.texts_to_sequences([caption])[0]
        # Split one sequence into multiple X, y pairs
        for i in range(1, len(seq)):
            # Split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # Pad the input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # One-hot encode the output word
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Store
            X1.append(image)
            X2.append(in_seq)
            y.append(out_seq)
    return X1, X2, y
# Data generator, intended to be used in a call to model.fit_generator()
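The comment above refers to a generator that is not included in this snippet. Below is a minimal sketch of what such a generator might look like, assuming descriptions maps image ids to caption lists and photos maps image ids to feature vectors (both names are assumptions, not part of the original code):

import numpy as np

def data_generator(descriptions, photos, tokenizer, max_length):
    # Loop forever so model.fit_generator() can draw batches indefinitely
    while True:
        for image_id, captions_list in descriptions.items():  # hypothetical structure
            image = photos[image_id]
            X1, X2, y = create_sequences(tokenizer, max_length, captions_list, image)
            yield [np.array(X1), np.array(X2)], np.array(y)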
Example 2: generate_captions
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def generate_captions(model, image, beam_size):
    start = [cg.word_index['<start>']]
    captions = [[start, 0.0]]
    while len(captions[0][0]) < cg.max_cap_len:
        temp_captions = []
        for caption in captions:
            partial_caption = sequence.pad_sequences([caption[0]], maxlen=cg.max_cap_len, padding='post')
            next_words_pred = model.predict([np.asarray([image]), np.asarray(partial_caption)])[0]
            next_words = np.argsort(next_words_pred)[-beam_size:]
            for word in next_words:
                new_partial_caption, new_partial_caption_prob = caption[0][:], caption[1]
                new_partial_caption.append(word)
                new_partial_caption_prob += next_words_pred[word]
                temp_captions.append([new_partial_caption, new_partial_caption_prob])
        captions = temp_captions
        # Keep only the beam_size highest-scoring partial captions
        captions.sort(key=lambda l: l[1])
        captions = captions[-beam_size:]
    return captions
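The returned list is sorted in ascending score order, so the last entry is the best beam. A short sketch of decoding it back to words, building a reverse lookup from the cg.word_index used above (the decoding step is an assumption; it is not shown in the original):

index_word = {idx: w for w, idx in cg.word_index.items()}
best_caption, best_score = generate_captions(model, image, beam_size=3)[-1]
print(' '.join(index_word[idx] for idx in best_caption), best_score)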
Example 3: load_question
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def load_question(params):
    df = pd.read_csv(config.QUESTION_FILE)
    df["words"] = df.words.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    df["chars"] = df.chars.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    Q = {}
    # Store the true (clipped) lengths alongside the padded sequences
    Q["seq_len_word"] = sp.minimum(df["words"].apply(len).values, params["max_seq_len_word"])
    Q["seq_len_char"] = sp.minimum(df["chars"].apply(len).values, params["max_seq_len_char"])
    Q["words"] = pad_sequences(df["words"],
                               maxlen=params["max_seq_len_word"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_WORD)
    Q["chars"] = pad_sequences(df["chars"],
                               maxlen=params["max_seq_len_char"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_CHAR)
    return Q
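For reference, a sketch of the params dict this loader expects; the keys come straight from the code above, while the values here are illustrative assumptions:

params = {
    "max_seq_len_word": 30,
    "max_seq_len_char": 50,
    "pad_sequences_padding": "post",
    "pad_sequences_truncating": "post",
}
Q = load_question(params)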
Example 4: texts_to_sequences
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def texts_to_sequences(self, texts, do_pad=True):
    """Vectorize texts as sequences of indices

    Parameters
    ----------
    texts : list of strings to vectorize into sequences of indices
    do_pad : pad the sequences to `self.maxlen` if true
    """
    self.X = self.tok.texts_to_sequences(texts)
    if do_pad:
        self.X = sequence.pad_sequences(self.X, maxlen=self.maxlen)
        self.word2idx['[0]'], self.idx2word[0] = 0, '[0]'  # add padding token
        self.vocab_size += 1
    return self.X
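This is a method of a tokenizer wrapper class that is not shown in the snippet. A minimal sketch of the surrounding class it might live in, assuming self.tok is a fitted keras Tokenizer and the index attributes are plain dicts (the class name and constructor are assumptions; the texts_to_sequences method above would be defined on this class):

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

class TextVectorizer:
    def __init__(self, texts, maxlen=100):
        self.maxlen = maxlen
        self.tok = Tokenizer()
        self.tok.fit_on_texts(texts)
        # word2idx / idx2word mirror the Tokenizer's vocabulary
        self.word2idx = dict(self.tok.word_index)
        self.idx2word = {i: w for w, i in self.word2idx.items()}
        self.vocab_size = len(self.word2idx) + 1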
Example 5: conv_seq_labels
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def conv_seq_labels(xds, xhs, nflips, model, debug, oov0, glove_idx2idx, vocab_size, nb_unknown_words, idx2word):
    """Convert descriptions and headlines to padded input vectors; headlines are one-hot to label."""
    # NOTE: maxlen, maxlenh, empty and eos are module-level constants defined
    # elsewhere in the source project, as are vocab_fold, lpadd and flip_headline.
    batch_size = len(xhs)
    assert len(xds) == batch_size
    x = [
        vocab_fold(lpadd(xd) + xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words)
        for xd, xh in zip(xds, xhs)]  # the input does not have a 2nd eos
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
    x = flip_headline(x, nflips=nflips, model=model, debug=debug, oov0=oov0, idx2word=idx2word)
    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words) + [eos] + [empty] * maxlenh  # the output does have an eos at the end
        xh = xh[:maxlenh]
        y[i, :, :] = np_utils.to_categorical(xh, vocab_size)
    return x, y
Example 6: load_data
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def load_data(data_path, max_len=200):
    data = []
    l = []
    ids = []
    i = 0
    l_encoder = LabelEncoder()
    with open(data_path, 'rb') as inf:
        for line in inf:
            gzip_fields = line.decode('utf-8').split('\t')
            gzip_id = gzip_fields[0]
            gzip_label = gzip_fields[1]
            elmo_embd_str = gzip_fields[4].strip()
            elmo_embd_list = ast.literal_eval(elmo_embd_str)
            elmo_embd_array = np.array(elmo_embd_list)
            padded_seq = sequence.pad_sequences([elmo_embd_array], maxlen=max_len, dtype='float32')[0]
            data.append(padded_seq)
            l.append(gzip_label)
            ids.append(gzip_id)
            i += 1
            print(i)  # progress indicator: prints the running line count
    label = l_encoder.fit_transform(l)
    return np.array(data), np.array(label), np.array(ids)
Example 7: preprocess_batch
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def preprocess_batch(self, captions_label_encoded):
    captions = keras_seq.pad_sequences(captions_label_encoded,
                                       padding='post')
    # The number of timesteps/words produced by the model is
    # maxlen(captions) + 1, because the first "word" is the image.
    captions_extended1 = keras_seq.pad_sequences(captions,
                                                 maxlen=captions.shape[-1] + 1,
                                                 padding='post')
    captions_one_hot = list(map(self._tokenizer.sequences_to_matrix,
                                np.expand_dims(captions_extended1, -1)))  # list() needed on Python 3
    captions_one_hot = np.array(captions_one_hot, dtype='int')
    # Decrease/shift word index by 1.
    # Shifting `captions_one_hot` makes the padding word
    # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
    # so its cross-entropy loss will be zero.
    captions_decreased = captions.copy()
    captions_decreased[captions_decreased > 0] -= 1
    captions_one_hot_shifted = captions_one_hot[:, :, 1:]
    captions_input = captions_decreased
    captions_output = captions_one_hot_shifted
    return captions_input, captions_output
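A tiny self-contained illustration of the shift trick described in the comments (illustrative values only): dropping the first one-hot column turns the padding row into all zeros, so it contributes nothing to the cross-entropy loss.

import numpy as np

one_hot = np.eye(4, dtype='int')[[0, 2, 3]]  # rows for padding (0) and words 2, 3
shifted = one_hot[:, 1:]                     # drop the padding column
print(shifted)
# [[0 0 0]    <- padding row is now all zeros
#  [0 1 0]    <- word 2 becomes index 1
#  [0 0 1]]   <- word 3 becomes index 2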
Example 8: load_tagged_data
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def load_tagged_data(tagged_data_filepath, vocab, tag2id):
    """
    Load the input data to the model
    :param tagged_data_filepath: the file path to the tagged data file
    :param vocab: the dictionary mapping from word to id
    :param tag2id: the dictionary mapping from tag to id
    :return: Numpy arrays: `train_x, train_y`
    """
    seg_samples_list = __get_seg_sample_list(tagged_data_filepath, mode="tagged")
    words_list = [[word2tag[0] for word2tag in sample] for sample in seg_samples_list]
    sample2id = [[vocab.get(word, 0) for word in sample] for sample in words_list]
    max_seq_len = max(len(sample) for sample in sample2id)
    train_x = pad_sequences(sample2id, max_seq_len, padding="post", value=0)
    tags_list = [[word2tag[1] for word2tag in sample] for sample in seg_samples_list]
    tag_ids = [[tag2id.get(tag, 0) for tag in sample] for sample in tags_list]  # renamed to avoid shadowing the tag2id argument
    train_y = pad_sequences(tag_ids, max_seq_len, padding="post", value=0)
    train_y = np.expand_dims(train_y, 2)
    return train_x, train_y
Example 9: preprocess
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def preprocess(fn_list, max_len):
    '''
    Return processed data (ndarray) and original file lengths (list)
    '''
    corpus = []
    for fn in fn_list:
        if not os.path.isfile(fn):
            print(fn, 'does not exist')
        else:
            with open(fn, 'rb') as f:
                corpus.append(f.read())
    corpus = [[byte for byte in doc] for doc in corpus]
    len_list = [len(doc) for doc in corpus]
    seq = pad_sequences(corpus, maxlen=max_len, padding='post', truncating='post')
    return seq, len_list
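A minimal usage sketch for the function above, padding or truncating each binary file to a fixed length (the file paths and max_len are hypothetical):

fn_list = ['samples/a.bin', 'samples/b.bin']  # hypothetical paths
seq, len_list = preprocess(fn_list, max_len=4096)
print(seq.shape)   # (number_of_files, 4096)
print(len_list)    # original byte lengths before padding/truncation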
Example 10: next_batch
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def next_batch(self):
    inverse_vocabulary = self.inverse_vocabulary
    if self.stream:
        q = [[inverse_vocabulary[word] for word in next(self.questions).strip().split()] for i in range(self.batch_size)]
        a = [[inverse_vocabulary[word] for word in next(self.answers).strip().split()] for i in range(self.batch_size)]
    else:
        n_example = len(self.answers)
        # NOTE: `random` here is presumably numpy.random; the stdlib
        # random.randint does not accept a `size` argument
        indices = random.randint(0, n_example, size=(self.batch_size))
        q = [[inverse_vocabulary[word] for word in self.questions[i].split()] for i in indices]
        a = [[inverse_vocabulary[word] for word in self.answers[i].split()] for i in indices]
    X = pad_sequences(q, maxlen=self.sequence_length)
    y = pad_sequences(a, maxlen=self.sequence_length)
    if self.one_hot_target:
        return (X, self.to_one_hot(y))
    else:
        return (X, y)
Example 11: _process_data
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]
    x = pad_sequences(x, maxlen)  # left padding
    # left-padded with -1; any integer works here, as these positions will be masked
    y_pos = pad_sequences(y_pos, maxlen, value=-1)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
    if onehot:
        # the -1 padding picks the last row via negative indexing,
        # which is harmless since those timesteps are masked
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk
Example 12: vectorize_stories
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # Only the index of the correct answer word is set to 1
        y = np.zeros(len(word_idx) + 1)  # index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    # Pad the sequences (pre-padding by default):
    # >>> pad_sequences([[1,2], [1,2,3], [1], [1,2,3,4,5]], 5)
    # array([[0, 0, 0, 1, 2],
    #        [0, 0, 1, 2, 3],
    #        [0, 0, 0, 0, 1],
    #        [1, 2, 3, 4, 5]], dtype=int32)
    return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y)
Example 13: create_test_data
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create test dataset

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuples of sentence pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (list): list of input features for the test set from sentences1
        test_data_2 (list): list of input features for the test set from sentences2
        leaks_test (np.ndarray): array of leaky features for each sentence pair
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]
    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]
    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)
    return test_data_1, test_data_2, leaks_test
Example 14: make_pre_padding
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def make_pre_padding(X_not_padded, nb_timesteps):
    X_padded = sequence.pad_sequences(X_not_padded, maxlen=nb_timesteps)
    return X_padded
Example 15: generate_caption
# Required module: from keras.preprocessing import sequence [as alias]
# Or: from keras.preprocessing.sequence import pad_sequences [as alias]
def generate_caption(model, tokenizer, image, max_length):
    # Seed the generation process
    in_text = 'startseq'
    # Iterate over the whole length of the sequence
    for _ in range(max_length):
        # Integer-encode the input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # Pad the input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # Predict the next word: the model outputs a probability
        # distribution over all words in the vocabulary
        yhat = model.predict([image, sequence], verbose=0)
        # Take the output class with maximum probability and convert to an integer
        yhat = np.argmax(yhat)
        # Map the integer back to a word
        word = int_to_word(yhat, tokenizer)
        # Stop if we cannot map the word
        if word is None:
            break
        # Append as input for generating the next word
        in_text += ' ' + word
        # Stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text
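int_to_word is not shown in this snippet; a minimal sketch of the usual implementation, reverse-searching the tokenizer's word index (an assumption about the original helper):

def int_to_word(integer, tokenizer):
    # Reverse lookup: find the word whose index matches the predicted integer
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None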