当前位置: 首页>>代码示例>>Python>>正文

Python sequence.pad_sequences方法代码示例

本文整理汇总了 Python 中 keras.preprocessing.sequence.pad_sequences 方法的典型用法代码示例。如果您正苦于以下问题:Python sequence.pad_sequences 方法的具体用法?Python sequence.pad_sequences 怎么用?Python sequence.pad_sequences 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 keras.preprocessing.sequence 的用法示例。


示例1: create_sequences

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def create_sequences(tokenizer, max_length, captions_list, image):
    """Expand the captions of one image into next-word training samples.

    Each caption is integer-encoded with ``tokenizer`` and split into
    (prefix, next word) pairs: for every position ``i >= 1`` the prefix
    ``seq[:i]`` is padded to ``max_length`` and ``seq[i]`` is one-hot
    encoded over the vocabulary.

    Args:
        tokenizer: fitted tokenizer exposing ``word_index`` and
            ``texts_to_sequences``.
        max_length: length every input prefix is padded to.
        captions_list: captions describing ``image``.
        image: image features, stored once per generated sample.

    Returns:
        Tuple ``(X1, X2, y)`` of parallel lists: image features, padded
        text prefixes and one-hot encoded next words.
    """
    X1, X2, y = [], [], []
    # +1 leaves room for index 0, the default padding value of pad_sequences.
    vocab_size = len(tokenizer.word_index) + 1
    for caption in captions_list:
        # Encode the caption as a sequence of word indices.
        seq = tokenizer.texts_to_sequences([caption])[0]
        # Split one sequence into multiple (prefix, next word) pairs.
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Store the sample; without this step the function returned
            # three empty lists.
            X1.append(image)
            X2.append(in_seq)
            y.append(out_seq)
    return X1, X2, y

# Data generator, intended to be used in a call to model.fit_generator() 

示例2: generate_captions

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def generate_captions(model, image, beam_size):
    """Generate candidate captions for ``image`` with beam search.

    Starts from the ``<start>`` token and repeatedly extends every kept
    caption with its ``beam_size`` highest-scoring next words until the
    captions reach ``cg.max_cap_len`` tokens.

    Args:
        model: model whose ``predict`` takes ``[images, partial_captions]``
            and returns one score per vocabulary word.
        image: image features of the picture to caption.
        beam_size: number of candidates kept after every step.

    Returns:
        List of ``[token_ids, score]`` pairs sorted by ascending score, so
        the best candidate is last.
    """
    start = [cg.word_index['<start>']]
    captions = [[start, 0.0]]
    while len(captions[0][0]) < cg.max_cap_len:
        temp_captions = []
        for caption in captions:
            partial_caption = sequence.pad_sequences(
                [caption[0]], maxlen=cg.max_cap_len, padding='post')
            next_words_pred = model.predict(
                [np.asarray([image]), np.asarray(partial_caption)])[0]
            # argsort is ascending, so the tail holds the best words.
            next_words = np.argsort(next_words_pred)[-beam_size:]
            for word in next_words:
                # Copy the token list so sibling candidates do not share it.
                new_partial_caption, new_partial_caption_prob = caption[0][:], caption[1]
                new_partial_caption.append(word)
                new_partial_caption_prob += next_words_pred[word]
                # Without collecting the extended candidates the beam
                # emptied after one step and captions[0] raised IndexError.
                temp_captions.append([new_partial_caption, new_partial_caption_prob])
        captions = temp_captions
        captions.sort(key=lambda l: l[1])
        captions = captions[-beam_size:]

    return captions

示例3: load_question

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def load_question(params):
    """Load the question file and build padded word/char index matrices.

    Args:
        params: mapping that provides ``max_seq_len_word`` and
            ``max_seq_len_char``.

    Returns:
        Dict with the clipped sequence lengths (``seq_len_word``,
        ``seq_len_char``) and the padded index matrices (``words``,
        ``chars``).
    """
    df = pd.read_csv(config.QUESTION_FILE)
    df["words"] = df.words.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    df["chars"] = df.chars.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    Q = {}
    # Lengths are clipped to the same maximum the sequences are padded to.
    Q["seq_len_word"] = sp.minimum(df["words"].apply(len).values, params["max_seq_len_word"])
    Q["seq_len_char"] = sp.minimum(df["chars"].apply(len).values, params["max_seq_len_char"])
    # NOTE(review): both calls were unterminated. Only ``maxlen`` can be
    # derived from this function; padding side, truncating side and the
    # padding value fall back to the Keras defaults -- confirm against the
    # project configuration.
    Q["words"] = pad_sequences(df["words"], maxlen=params["max_seq_len_word"])
    Q["chars"] = pad_sequences(df["chars"], maxlen=params["max_seq_len_char"])
    return Q

示例4: texts_to_sequences

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def texts_to_sequences(self, texts, do_pad=True):
    """Vectorize texts as sequences of indices.

    Args:
        texts: list of strings to vectorize into sequences of indices.
        do_pad: pad the sequences to ``self.maxlen`` if true.

    Returns:
        The vectorized texts, which are also stored in ``self.X``.
    """
    self.X = self.tok.texts_to_sequences(texts)

    if do_pad:
        self.X = sequence.pad_sequences(self.X, maxlen=self.maxlen)
        # Register the padding token only once, so repeated calls do not
        # keep inflating vocab_size.
        if 0 not in self.idx2word:
            self.word2idx['[0]'], self.idx2word[0] = 0, '[0]'  # add padding token
            self.vocab_size += 1

    return self.X

示例5: conv_seq_labels

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def conv_seq_labels(xds, xhs, nflips, model, debug, oov0, glove_idx2idx, vocab_size, nb_unknown_words, idx2word):
    """Build padded input vectors and one-hot headline labels for a batch.

    Each input row is the left-padded description followed by the headline
    (without a trailing eos), vocabulary-folded, post-padded/truncated to
    ``maxlen`` and passed through ``flip_headline``. Each label row is the
    folded headline followed by ``eos``, filled with ``empty`` up to
    ``maxlenh`` and one-hot encoded over ``vocab_size`` classes.
    """
    batch_size = len(xhs)
    assert len(xds) == batch_size

    def fold(tokens):
        # Shared vocabulary folding for both inputs and labels.
        return vocab_fold(tokens, oov0, glove_idx2idx, vocab_size, nb_unknown_words)

    # The input does not carry the second eos.
    inputs = [fold(lpadd(desc) + head) for desc, head in zip(xds, xhs)]
    inputs = sequence.pad_sequences(
        inputs, maxlen=maxlen, value=empty, padding='post', truncating='post')
    inputs = flip_headline(
        inputs, nflips=nflips, model=model, debug=debug, oov0=oov0, idx2word=idx2word)

    labels = np.zeros((batch_size, maxlenh, vocab_size))
    for row, head in enumerate(xhs):
        # The output does end with an eos; fill with `empty`, then cut to size.
        target = (fold(head) + [eos] + [empty] * maxlenh)[:maxlenh]
        labels[row, :, :] = np_utils.to_categorical(target, vocab_size)

    return inputs, labels

示例6: load_data

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def load_data(data_path, max_len=200):
    """Load ids, labels and padded embedding sequences from a TSV file.

    Every line is tab-separated: field 0 is the sample id, field 1 the
    label and field 4 a Python literal holding the embedding sequence.

    Args:
        data_path: path of the UTF-8 encoded TSV file.
        max_len: length every embedding sequence is padded/truncated to.

    Returns:
        Tuple ``(data, label, ids)`` of numpy arrays; ``label`` holds the
        integer classes produced by ``LabelEncoder``.
    """
    data = []
    labels = []
    ids = []
    l_encoder = LabelEncoder()
    with open(data_path, 'rb') as inf:
        for line in inf:
            gzip_fields = line.decode('utf-8').split('\t')
            gzip_id = gzip_fields[0]
            gzip_label = gzip_fields[1]
            elmo_embd_str = gzip_fields[4].strip()
            # literal_eval only accepts Python literals, unlike eval.
            elmo_embd_list = ast.literal_eval(elmo_embd_str)
            elmo_embd_array = np.array(elmo_embd_list)
            padded_seq = sequence.pad_sequences(
                [elmo_embd_array], maxlen=max_len, dtype='float32')[0]
            # Keep the parsed row; previously nothing was collected and all
            # three results came back empty.
            data.append(padded_seq)
            labels.append(gzip_label)
            ids.append(gzip_id)
    label = l_encoder.fit_transform(labels)
    return np.array(data), np.array(label), np.array(ids)

示例7: preprocess_batch

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def preprocess_batch(self, captions_label_encoded):
    """Turn label-encoded captions into model inputs and one-hot targets.

    Args:
        captions_label_encoded: batch of captions as lists of word indices.

    Returns:
        Tuple ``(captions_input, captions_output)``: the padded captions
        with every non-zero index decreased by one, and the one-hot targets
        of the captions extended by one timestep, with the padding class
        column removed.
    """
    # NOTE(review): the padding arguments of both calls were cut off.
    # Post-padding is assumed so that word t stays at timestep t -- confirm.
    captions = keras_seq.pad_sequences(captions_label_encoded,
                                       padding='post')
    # Because the number of timesteps/words resulted by the model is
    # maxlen(captions) + 1 (because the first "word" is the image).
    captions_extended1 = keras_seq.pad_sequences(captions,
                                                 maxlen=captions.shape[-1] + 1,
                                                 padding='post')
    # Build a real list: np.array() cannot consume a lazy map object on
    # Python 3.
    captions_one_hot = [self._tokenizer.sequences_to_matrix(caption)
                        for caption in np.expand_dims(captions_extended1, -1)]
    captions_one_hot = np.array(captions_one_hot, dtype='int')

    # Decrease/shift word index by 1.
    # Shifting `captions_one_hot` makes the padding word
    # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
    # so its cross entropy loss will be zero.
    captions_decreased = captions.copy()
    captions_decreased[captions_decreased > 0] -= 1
    captions_one_hot_shifted = captions_one_hot[:, :, 1:]

    captions_input = captions_decreased
    captions_output = captions_one_hot_shifted
    return captions_input, captions_output

示例8: load_tagged_data

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def load_tagged_data(tagged_data_filepath, vocab, tag2id):
    """
    Load the input data to the model
    :param tagged_data_filepath: the file path to the tagged data file
    :param vocab: the dictionary mapping from word to id
    :param tag2id: the dictionary mapping from tag to id
    :return: Numpy arrays: `train_x, train_y`
    """
    seg_samples_list = __get_seg_sample_list(tagged_data_filepath, mode="tagged")

    # Every sample is a sequence of (word, tag) pairs.
    words_list = [[word2tag[0] for word2tag in sample] for sample in seg_samples_list]
    # Unknown words map to 0, which is also the padding value.
    sample2id = [[vocab.get(word, 0) for word in sample] for sample in words_list]
    max_seq_len = max(len(sample) for sample in sample2id)
    train_x = pad_sequences(sample2id, max_seq_len, padding="post", value=0)

    tags_list = [[word2tag[1] for word2tag in sample] for sample in seg_samples_list]
    # Use a separate name so the `tag2id` mapping is not replaced by a list.
    tag_ids = [[tag2id.get(tag, 0) for tag in sample] for sample in tags_list]
    train_y = pad_sequences(tag_ids, max_seq_len, padding="post", value=0)
    # Add a trailing axis of size one to the tag ids.
    train_y = np.expand_dims(train_y, 2)

    return train_x, train_y

示例9: preprocess

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def preprocess(fn_list, max_len):
    """
    Return processed data (ndarray) and original file length (list)

    Files that do not exist are reported on stdout and skipped. Every
    remaining file is read as raw bytes, converted to a list of byte
    values and post-padded/truncated to ``max_len``.
    """
    corpus = []
    for fn in fn_list:
        if not os.path.isfile(fn):
            print(fn, 'not exist')
        else:
            with open(fn, 'rb') as f:
                corpus.append(f.read())

    # Iterating over bytes yields ints, i.e. one value per byte.
    corpus = [list(doc) for doc in corpus]
    # Lengths are taken before padding/truncation.
    len_list = [len(doc) for doc in corpus]
    seq = pad_sequences(corpus, maxlen=max_len, padding='post', truncating='post')
    return seq, len_list

示例10: next_batch

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def next_batch(self):
    """Return the next batch of padded question/answer index sequences.

    In stream mode the next ``batch_size`` lines are consumed from the
    ``questions`` and ``answers`` iterators; otherwise ``batch_size``
    examples are drawn at random (with replacement) from the stored lists.

    Returns:
        Tuple ``(X, y)``; ``y`` is passed through ``self.to_one_hot`` when
        ``self.one_hot_target`` is set.
    """
    inverse_vocabulary = self.inverse_vocabulary
    if self.stream:
        q = [[inverse_vocabulary[word] for word in next(self.questions).strip().split()]
             for _ in range(self.batch_size)]
        a = [[inverse_vocabulary[word] for word in next(self.answers).strip().split()]
             for _ in range(self.batch_size)]
    else:
        # Random sampling needs len() and indexing, so it must not run in
        # stream mode.
        n_example = len(self.answers)
        # The `size=` keyword implies `random` is numpy.random here.
        indices = random.randint(0, n_example, size=(self.batch_size))
        q = [[inverse_vocabulary[word] for word in self.questions[i].split()] for i in indices]
        a = [[inverse_vocabulary[word] for word in self.answers[i].split()] for i in indices]

    X = pad_sequences(q, maxlen=self.sequence_length)
    y = pad_sequences(a, maxlen=self.sequence_length)

    if self.one_hot_target:
        return (X, self.to_one_hot(y))
    else:
        return (X, y)

示例11: _process_data

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    """Convert tagged sentences into padded index arrays.

    Args:
        data: sentences, each a sequence of ``(word, pos_tag, chunk_tag)``
            items.
        vocab: known words; a word's position is its index.
        pos_tags: known POS tags; a tag's position is its class index.
        chunk_tags: known chunk tags; a tag's position is its class index.
        maxlen: padded length; defaults to the longest sentence.
        onehot: return one-hot label arrays instead of integer labels with
            a trailing axis of size one.

    Returns:
        Tuple ``(x, y_pos, y_chunk)``.
    """
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]

    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding

    # left padded with -1. Indeed, any integer works as it will be masked
    y_pos = pad_sequences(y_pos, maxlen, value=-1)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        # Index the identity matrix with each label array itself; the
        # previous code used an undefined name `y`.
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        # Integer labels get a trailing axis of size one.
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk

示例12: vectorize_stories

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    """Vectorize (story, query, answer) triples into padded arrays.

    Args:
        data: iterable of ``(story, query, answer)`` where story and query
            are word sequences and answer is a single word.
        word_idx: mapping from word to index.
        story_maxlen: length every story is padded to.
        query_maxlen: length every query is padded to.

    Returns:
        Tuple ``(stories, queries, answers)``: two padded index arrays and
        an array of one-hot answer vectors of size ``len(word_idx) + 1``.
    """
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # Only the index of the correct answer word is set to 1.
        y = np.zeros(len(word_idx) + 1)  # index 0 is reserved
        y[word_idx[answer]] = 1
        # Collect the sample; previously the lists stayed empty.
        X.append(x)
        Xq.append(xq)
        Y.append(y)

    # Pad the sequence data.
    # >>> pad_sequences([[1,2], [1,2,3], [1], [1,2,3,4,5]], 5)
    # array([[0, 0, 0, 1, 2],
    #        [0, 0, 1, 2, 3],
    #        [0, 0, 0, 0, 1],
    #        [1, 2, 3, 4, 5]], dtype=int32)
    return pad_sequences(X, maxlen=story_maxlen), pad_sequences(Xq, maxlen=query_maxlen), np.array(Y)

示例13: create_test_data

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """Create the test dataset for a list of sentence pairs.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuple of sentences pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1: padded index sequences built from sentences1
        test_data_2: padded index sequences built from sentences2
        leaks_test: array with one row per pair holding the number of
            unique tokens in sentence 1, in sentence 2 and shared by both
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test

示例14: make_pre_padding

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def make_pre_padding(X_not_padded, nb_timesteps):
    """Pad ``X_not_padded`` to ``nb_timesteps`` using the pad_sequences defaults."""
    return sequence.pad_sequences(X_not_padded, maxlen=nb_timesteps)

示例15: generate_caption

# 需要导入模块: from keras.preprocessing import sequence [as 别名]
# 或者: from keras.preprocessing.sequence import pad_sequences [as 别名]
def generate_caption(model, tokenizer, image, max_length):
    """Greedily generate a caption for ``image`` one word at a time.

    Generation starts from ``'startseq'`` and appends the most probable
    next word until ``'endseq'`` is produced, a predicted index cannot be
    mapped to a word, or ``max_length`` words have been generated.

    Args:
        model: model whose ``predict`` takes ``[image, sequence]`` and
            returns scores over the vocabulary.
        tokenizer: tokenizer used to encode the text generated so far.
        image: image features passed to the model.
        max_length: padded input length and maximum number of steps.

    Returns:
        The generated text, including the ``'startseq'`` seed and, when it
        was predicted, the ``'endseq'`` marker.
    """
    # Seed the generation process.
    in_text = 'startseq'
    for _ in range(max_length):
        # Integer-encode and pad the text generated so far. A distinct
        # local name avoids shadowing a `sequence` module name.
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_length)
        # Predict the next word and take the index of the highest score.
        yhat = model.predict([image, encoded], verbose=0)
        yhat = np.argmax(yhat)
        # Map the integer back to a word.
        word = int_to_word(yhat, tokenizer)
        # Stop if the index cannot be mapped to a word.
        if word is None:
            break
        # Append as input for generating the next word.
        in_text += ' ' + word
        # Stop once the end of the sequence is predicted.
        if word == 'endseq':
            break
    return in_text
