本文整理汇总了Python中utils.Vocab.construct方法的典型用法代码示例。如果您正苦于以下问题：Python Vocab.construct方法的具体用法？Python Vocab.construct怎么用？Python Vocab.construct使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类utils.Vocab的用法示例。
在下文中一共展示了Vocab.construct方法的12个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: WhoseLineModel
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class WhoseLineModel(object):
    """Load word vectors, a vocabulary and speaker-labelled lines for a model.

    Construction reads everything named in ``config`` (``wordvecpath``,
    ``datasplitpath``, ``datapath``, ``speaker_count``, ``batch_size``) and
    exposes the embedding matrix as a TensorFlow constant.
    """

    def __init__(self, config):
        """Keep ``config``, load all data, then build the shared graph constants."""
        self.config = config
        self.load_data(debug=False)
        self.add_common_model_vars()

    def load_data(self, debug=False):
        """Populate vocabulary, embeddings, speakers and the three data splits.

        Each line of ``config.datapath`` must hold exactly three tab-separated
        fields: chapter id, speaker, text.  ``chapter_split`` maps a chapter id
        to 0 (train), 1 (dev) or anything else (test).  When
        ``config.batch_size == "chapter"`` consecutive lines of the same
        chapter are grouped into one list; otherwise every encoded line is
        stored on its own.

        Args:
            debug: accepted for interface compatibility; not used here.
        """
        self.wordvecs = gensim.models.Word2Vec.load_word2vec_format(
            self.config.wordvecpath, binary=False)
        self.vocab = Vocab()
        self.vocab.construct(self.wordvecs.index2word)
        self.embedding_matrix = np.vstack(
            [self.wordvecs[self.vocab.index_to_word[i]]
             for i in range(len(self.vocab))])
        # "unk" surgery: row 0 is overwritten with the mean of all rows, cf.
        # https://groups.google.com/forum/#!searchin/globalvectors/unknown/globalvectors/9w8ZADXJclA/X6f0FgxUnMgJ
        self.embedding_matrix[0, :] = np.mean(self.embedding_matrix, axis=0)

        chapter_split = load_chapter_split(self.config.datasplitpath)

        # First pass: register only speakers that occur in training chapters.
        self.speakers = Speakers()
        with open(self.config.datapath) as datafile:
            for ln in datafile:
                ch, speaker, _ = ln.split("\t")
                if chapter_split[ch] == 0:
                    self.speakers.add_speaker(speaker)
        self.speakers.prune(self.config.speaker_count - 1)  # -1 for OTHER

        # Second pass: encode every line and route it to its split.
        self.train_data = []
        self.dev_data = []
        self.test_data = []
        group_by_chapter = self.config.batch_size == "chapter"
        oldch = None
        with open(self.config.datapath) as datafile:
            for ln in datafile:
                ch, speaker, line = ln.split("\t")
                word_ids = np.array(
                    [self.vocab.encode(word) for word in line.split()],
                    dtype=np.int32)
                encoded_line = (word_ids, self.speakers.encode(speaker))
                if chapter_split[ch] == 0:
                    dataset = self.train_data
                elif chapter_split[ch] == 1:
                    dataset = self.dev_data
                else:
                    dataset = self.test_data
                if group_by_chapter:
                    if ch == oldch:
                        # Same chapter as the previous line: extend its group.
                        dataset[-1].append(encoded_line)
                    else:
                        dataset.append([encoded_line])
                else:
                    dataset.append(encoded_line)
                oldch = ch

    def add_common_model_vars(self):
        """Expose the embedding matrix as a constant in scope ``word_vectors``."""
        with tf.variable_scope("word_vectors"):
            self.tf_embedding_matrix = tf.constant(self.embedding_matrix, name="embedding")
示例2: RNN_Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class RNN_Model():
def load_data(self):
    """Load the train/dev/test trees and build the vocabulary from training words."""
    splits = tr.simplified_data(700, 100, 200)
    self.train_data, self.dev_data, self.test_data = splits
    # Only words that appear in the training trees enter the vocabulary.
    self.vocab = Vocab()
    train_words = []
    for tree in self.train_data:
        train_words.extend(tree.get_words())
    self.vocab.construct(train_words)
def inference(self, tree, predict_only_root=False):
    """Build the graph for one tree up to the logits.

    Args:
        tree: a Tree object whose ``root`` is handed to ``add_model``.
        predict_only_root: when true, project only the root's tensor;
            otherwise project every node whose ``label`` is not 2.
    Returns:
        The result of ``add_projections`` on the selected node tensors.
    """
    tensors_by_node = self.add_model(tree.root)
    if predict_only_root:
        selected = tensors_by_node[tree.root]
    else:
        selected = []
        for node, tensor in tensors_by_node.iteritems():
            if node.label != 2:
                selected.append(tensor)
    stacked = tf.concat(0, selected)
    return self.add_projections(stacked)
def add_model_vars(self):
    '''
    Create the model parameters and the optimizer.

    Variables created:
    Composition/embedding: (vocab_size, embed_size)
    Composition/W1: (2 * embed_size, embed_size)
    Composition/b1: (1, embed_size)
    Projection/U: (embed_size, output_size)
    Projection/bs: (1, output_size)
    Other methods fetch these again by re-entering the scopes with reuse.
    '''
    embed_size = self.config.embed_size
    vocab_size = len(self.vocab)
    output_size = self.config.label_size

    with tf.variable_scope('Composition'):
        tf.get_variable("embedding", shape=(vocab_size, embed_size))
        tf.get_variable("W1", shape=(2 * embed_size, embed_size))
        tf.get_variable("b1", shape=(1, embed_size))
    with tf.variable_scope('Projection'):
        tf.get_variable("U", shape=(embed_size, output_size))
        tf.get_variable("bs", shape=(1, output_size))

    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.lr)
    # The dummy sum over all trainable variables exists only so that
    # minimize() creates the Adam slot variables now, before variables are
    # initialized or restored.  It should never actually get executed.
    dummy_total = tf.constant(0.0)
    for variable in tf.trainable_variables():
        dummy_total = dummy_total + tf.reduce_sum(variable)
    self.dummy_minimizer = self.optimizer.minimize(dummy_total)
def add_model(self, node):
"""Recursively build the model to compute the phrase embeddings in the tree
Hint: Refer to tree.py and vocab.py before you start. Refer to
the model's vocab with self.vocab
Hint: Reuse the "Composition" variable_scope here
--Hint: Store a node's vector representation in node.tensor so it can be
used by it's parent--
Hint: If node is a leaf node, it's vector representation is just that of the
word vector (see tf.gather()).
Args:
node: a Node object
Returns:
node_tensors: Dict: key = Node, value = tensor(1, embed_size)
"""
with tf.variable_scope('Composition', reuse=True):
### YOUR CODE HERE
embedding = tf.get_variable("embedding")
W1 = tf.get_variable("W1")
b1 = tf.get_variable("b1")
### END YOUR CODE
# THOUGHT: Batch together all leaf nodes and all non leaf nodes
node_tensors = OrderedDict()
curr_node_tensor = None
if node.isLeaf:
### YOUR CODE HERE
curr_node_tensor = tf.gather(embedding, tf.constant([node.label]), name="leaf_lookup")
### END YOUR CODE
else:
node_tensors.update(self.add_model(node.left))
node_tensors.update(self.add_model(node.right))
#.........这里部分代码省略.........
示例3: RNNLM_Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class RNNLM_Model(LanguageModel):
def load_data(self, debug=False):
    """Build the vocabulary from the PTB train split and encode all three splits.

    Args:
        debug: when true, keep only the first 1024 ids of each split.
    """
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))

    def encode_split(split_name):
        # One int32 id per word of the requested split.
        ids = [self.vocab.encode(word) for word in get_ptb_dataset(split_name)]
        return np.array(ids, dtype=np.int32)

    self.encoded_train = encode_split('train')
    self.encoded_valid = encode_split('valid')
    self.encoded_test = encode_split('test')
    if debug:
        num_debug = 1024
        self.encoded_train = self.encoded_train[:num_debug]
        self.encoded_valid = self.encoded_valid[:num_debug]
        self.encoded_test = self.encoded_test[:num_debug]
def add_placeholders(self):
    """Create the placeholders that feed the model.

    A ``None`` entry in a placeholder's shape leaves that dimension flexible.

    Sets the following instance attributes (names are part of the interface):
        self.input_placeholder: shape (None, num_steps), type tf.int32
        self.labels_placeholder: shape (None, num_steps), type tf.float32
        self.dropout_placeholder: no fixed shape, type tf.float32
    """
    num_steps = self.config.num_steps
    self.input_placeholder = tf.placeholder(tf.int32, shape=[None, num_steps], name='Input')
    self.labels_placeholder = tf.placeholder(tf.float32, shape=[None, num_steps], name='Target')
    # Dropout is fed a fractional value, so the placeholder must be a float
    # type (it was tf.int64, contradicting the documented tf.float32).
    self.dropout_placeholder = tf.placeholder(tf.float32, name='Dropout')
def add_embedding(self):
    """Look up embeddings for ``self.input_placeholder`` and split them by step.

    Creates the trainable variable 'Embedding' of shape
    (len(self.vocab), embed_size).

    Returns:
        inputs: list with one tensor per step (``num_steps`` entries), each
            obtained by splitting the lookup along axis 1 and squeezing it.
    """
    # The embedding lookup is pinned to the CPU.
    with tf.device('/cpu:0'):
        table_shape = [len(self.vocab), self.config.embed_size]
        table = tf.get_variable('Embedding', table_shape, trainable=True)
        looked_up = tf.nn.embedding_lookup(table, self.input_placeholder)
        inputs = []
        for step_slice in tf.split(1, self.config.num_steps, looked_up):
            inputs.append(tf.squeeze(step_slice, [1]))
        return inputs
def add_projection(self, rnn_outputs):
"""Adds a projection layer.
The projection layer transforms the hidden representation to a distribution
over the vocabulary.
Hint: Here are the dimensions of the variables you will need to
create
U: (hidden_size, len(vocab))
b_2: (len(vocab),)
Args:
rnn_outputs: List of length num_steps, each of whose elements should be
a tensor of shape (batch_size, embed_size).
Returns:
outputs: List of length num_steps, each a tensor of shape
(batch_size, len(vocab)
"""
### YOUR CODE HERE
with tf.name_scope('Projection Layer'):
U = tf.get_variable('U', [self.config.hidden_size, len(self.vocab)])
b2 = tf.get_variable('b2', len(self.vocab))
#.........这里部分代码省略.........
示例4: RNN_Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class RNN_Model():
def load_data(self):
    """Fetch the three tree splits, then build the vocabulary from training words."""
    train, dev, test = tr.simplified_data(700, 100, 200)
    self.train_data, self.dev_data, self.test_data = train, dev, test
    self.vocab = Vocab()
    # Flatten the per-tree word lists of the training split into one list.
    words = []
    for tree in self.train_data:
        words += list(tree.get_words())
    self.vocab.construct(words)
def inference(self, tree, predict_only_root=False):
    """Build the computation graph of one tree up to the logits.

    Args:
        tree: a Tree object; its ``root`` is passed to ``add_model``.
        predict_only_root: if true only the root tensor is projected,
            otherwise all nodes with ``label != 2`` are.
    Returns:
        Whatever ``add_projections`` returns for the concatenated tensors.
    """
    per_node = self.add_model(tree.root)
    if not predict_only_root:
        chosen = [tensor
                  for node, tensor in per_node.iteritems()
                  if node.label != 2]
    else:
        chosen = per_node[tree.root]
    return self.add_projections(tf.concat(0, chosen))
def add_model_vars(self):
    '''
    Create the model parameters.

    Composition/embedding: (len(vocab), embed_size)
    Composition/W1: (2 * embed_size, embed_size)
    Composition/b1: (1, embed_size)
    Projection/U: (embed_size, label_size)
    Projection/bs: (1, label_size)
    All five use ``initializer=None``, i.e. the library default.
    '''
    # A uniform initializer in [-0.4, 0.4] was tried here and left disabled.
    initializer = None
    with tf.variable_scope('Composition'):
        embed_size = self.config.embed_size
        tf.get_variable('embedding', [len(self.vocab), self.config.embed_size],
                        initializer=initializer)
        tf.get_variable("W1", [2 * embed_size, embed_size], initializer=initializer)
        tf.get_variable("b1", [1, embed_size], initializer=initializer)
    with tf.variable_scope('Projection'):
        label_size = self.config.label_size
        tf.get_variable("U", [embed_size, label_size], initializer=initializer)
        tf.get_variable("bs", [1, label_size], initializer=initializer)
def add_model(self, node):
    """Recursively build the tensors for ``node`` and everything below it.

    A leaf becomes the embedding row of its encoded word; an inner node is
    relu(concat(left, right) * W1 + b1) over its children's tensors.

    Args:
        node: a Node object
    Returns:
        OrderedDict mapping every node of the subtree to its tensor, children
        inserted before their parent.
    """
    with tf.variable_scope('Composition', reuse=True):
        embedding = tf.get_variable("embedding")
        W1 = tf.get_variable("W1")
        b1 = tf.get_variable("b1")

    node_tensors = OrderedDict()
    if node.isLeaf:
        word_id = self.vocab.encode(node.word)
        vector = tf.gather(embedding, [word_id])
    else:
        for child in (node.left, node.right):
            node_tensors.update(self.add_model(child))
        children = [node_tensors[node.left], node_tensors[node.right]]
        pre_activation = tf.matmul(tf.concat(1, children), W1) + b1
        vector = tf.nn.relu(pre_activation)
    node_tensors[node] = vector
    return node_tensors
def add_projections(self, node_tensors):
"""Add projections to the composition vectors to compute the raw sentiment scores
Hint: Reuse the "Projection" variable_scope here
#.........这里部分代码省略.........
示例5: Model_RNN
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class Model_RNN(LanguageModel):
def load_data(self):
    """Read lyric pairs, one-hot their labels, and encode both sides as id rows.

    Also stores ``self.word_counts`` and overwrites ``self.config.max_steps``
    with the value returned by ``get_data``.
    """
    pair_fname = '../lastfm_train_mappings.txt'
    lyrics_path = '../lyrics/data/lyrics/train/'
    # X_train: one entry per example; entry[0] and entry[1] are word lists.
    # word_counts: dict whose keys are the words that seed the vocabulary.
    X_train, l_train, self.word_counts, self.config.max_steps = get_data(
        pair_fname, lyrics_path, threshold=100, n_class=self.config.n_class)
    n_examples = len(X_train)

    self.labels_train = np.zeros((n_examples, self.config.n_class))
    self.labels_train[range(n_examples), l_train] = 1

    self.vocab = Vocab()
    self.vocab.construct(self.word_counts.keys())

    # Rows are zero-padded up to max_steps.
    self.encoded_train_1 = np.zeros((n_examples, self.config.max_steps))  # need to handle this better.
    self.encoded_train_2 = np.zeros((n_examples, self.config.max_steps))
    for row, pair in enumerate(X_train):
        first, second = pair[0], pair[1]
        self.encoded_train_1[row, :len(first)] = [self.vocab.encode(word) for word in first]
        self.encoded_train_2[row, :len(second)] = [self.vocab.encode(word) for word in second]
def add_placeholders(self):
    """Create placeholders for both id sequences, the labels and the lengths."""
    steps = self.config.max_steps
    self.X1 = tf.placeholder(tf.int32, shape=(None, steps), name='X1')
    self.X2 = tf.placeholder(tf.int32, shape=(None, steps), name='X2')
    self.labels = tf.placeholder(tf.float32, shape=(None, self.config.n_class), name='labels')
    # Lengths of the variable-length sequences; shape is left unspecified.
    self.seq_len1 = tf.placeholder(tf.int32, shape=None, name='seq_len1')
    self.seq_len2 = tf.placeholder(tf.int32, shape=None, name='seq_len2')
def add_embedding(self):
    """Embed ``self.X1`` and ``self.X2`` and split each into per-step tensors.

    Creates the float32 variable 'L' with one row per key of
    ``self.word_counts`` and ``embed_size`` columns.

    Returns:
        (inputs1, inputs2): two lists of ``max_steps`` tensors, each slice
        taken along axis 1 of the lookup with that axis removed.
    """
    # NOTE(review): rows are sized from word_counts while ids come from
    # self.vocab; confirm the vocabulary adds no extra tokens.
    L = tf.get_variable('L', shape=(len(self.word_counts), self.config.embed_size), dtype=tf.float32)
    embedded1 = tf.nn.embedding_lookup(L, self.X1)  # self.X1 is batch_size x max_steps
    embedded2 = tf.nn.embedding_lookup(L, self.X2)
    # Squeeze only the step axis: an unrestricted squeeze would also drop the
    # batch axis whenever the batch holds a single example.
    inputs1 = [tf.squeeze(x, [1]) for x in tf.split(1, self.config.max_steps, embedded1)]
    inputs2 = [tf.squeeze(x, [1]) for x in tf.split(1, self.config.max_steps, embedded2)]
    return inputs1, inputs2
def add_model(self, inputs1, inputs2, seq_len1, seq_len2):
    """Run one shared sigmoid RNN over both sequences and pair up the states.

    Both sequences use the same 'rnn' variables (Whh, Wxh, bhx) and start
    from an all-zero state of shape (batch_size, hidden_size).

    Args:
        inputs1, inputs2: lists indexed by step, at least ``max_steps`` long.
        seq_len1, seq_len2: accepted but not used by this implementation.
    Returns:
        List of ``max_steps`` tensors; entry i concatenates the step-i hidden
        states of sequence 1 and sequence 2 along axis 1.
    """
    hidden_size = self.config.hidden_size
    self.initial_state = tf.constant(
        np.zeros((self.config.batch_size, hidden_size)), dtype=tf.float32)
    rnn_outputs1 = []
    rnn_outputs2 = []
    h_curr1 = self.initial_state
    h_curr2 = self.initial_state
    with tf.variable_scope('rnn'):
        Whh = tf.get_variable('Whh', shape=(hidden_size, hidden_size), dtype=tf.float32)
        Wxh = tf.get_variable('Wxh', shape=(self.config.embed_size, hidden_size), dtype=tf.float32)
        b1 = tf.get_variable('bhx', shape=(hidden_size,), dtype=tf.float32)
        for i in range(self.config.max_steps):
            h_curr2 = tf.sigmoid(tf.matmul(h_curr2, Whh) + tf.matmul(inputs2[i], Wxh) + b1)
            h_curr1 = tf.sigmoid(tf.matmul(h_curr1, Whh) + tf.matmul(inputs1[i], Wxh) + b1)
            rnn_outputs1.append(h_curr1)
            rnn_outputs2.append(h_curr2)
    return [tf.concat(1, [out1, out2])
            for out1, out2 in zip(rnn_outputs1, rnn_outputs2)]
def add_projection(self, rnn_states):
    """Project the last entry of ``rnn_states`` onto ``n_class`` scores.

    Creates 'Whc' of shape (2 * hidden_size, n_class) and 'bhc' of shape
    (n_class,).
    """
    n_class = self.config.n_class
    Whc = tf.get_variable('Whc', shape=(2 * self.config.hidden_size, n_class))
    bhc = tf.get_variable('bhc', shape=(n_class,))
    # Only the final step is used, regardless of the true sequence lengths.
    last_state = rnn_states[-1]
    return tf.matmul(last_state, Whc) + bhc
def add_loss_op(self, y):
    """Return the summed softmax cross-entropy of ``y`` against ``self.labels``."""
    per_example = tf.nn.softmax_cross_entropy_with_logits(y, self.labels)
    return tf.reduce_sum(per_example)
def add_training_op(self, loss):
    """Return a plain gradient-descent step on ``loss`` with rate ``config.lr``."""
    # An Adam optimizer was tried here and left disabled.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.lr)
    return optimizer.minimize(loss)
def __init__(self, config):
self.config = config
self.load_data()
self.add_placeholders()
print 'adsf __init__'
print self.X1.get_shape
self.inputs1, self.inputs2 = self.add_embedding()
self.rnn_states = self.add_model(self.inputs1, self.inputs2, self.seq_len1, self.seq_len2)
self.projections = self.add_projection(self.rnn_states)
self.loss = self.add_loss_op(self.projections)
#.........这里部分代码省略.........
示例6: Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class Model():
def __init__(self, config):
    """Store ``config``, load the non-debug data, then build the model."""
    self.config = config
    # Data must be loaded first: build_model() runs right after it.
    self.load_data(debug=False)
    self.build_model()
def load_vocab(self,debug):
    """Build the vocabulary and its embedding matrix.

    Args:
        debug: if true the words come from the 'dev' dataset, else 'train'.
    """
    split = 'dev' if debug else 'train'
    self.vocab = Vocab()
    self.vocab.construct(get_words_dataset(split))
    self.vocab.build_embedding_matrix(self.config.word_embed_size)
    self.embedding_matrix = self.vocab.embedding_matrix
def load_data(self, debug=False):
    """
    Load the vocabulary and the train/dev/test sentence-pair data.

    For each split sets ``sent1_*``, ``sent2_*``, ``len1_*``, ``len2_*`` and
    ``y_*`` (suffix ``train``/``dev``/``test``) as NumPy arrays and prints the
    number of examples.

    Args:
        debug: if true, 'dev' is used as training data and 'test' as
            validation data; otherwise 'train' and 'dev'.  The test
            attributes always come from 'test'.
    """
    self.load_vocab(debug)
    config = self.config

    def load_split(split):
        # Read one split and return (sent1, sent2, len1, len2, y) as arrays.
        rows = list(get_sentences_dataset(self.vocab, config.sent_len, split, 'post'))
        sent1, sent2, len1, len2, y = zip(*rows)
        return (np.vstack(sent1), np.vstack(sent2),
                np.array(len1), np.array(len2), np.array(y))

    # NOTE(review): in debug mode validation and test both read 'test';
    # kept as in the original -- confirm it is intended.
    if debug:
        train_split, dev_split = 'dev', 'test'
    else:
        train_split, dev_split = 'train', 'dev'

    # Load the training set
    (self.sent1_train, self.sent2_train, self.len1_train,
     self.len2_train, self.y_train) = load_split(train_split)
    print('# training examples: %d' %len(self.y_train))

    # Load the validation set
    (self.sent1_dev, self.sent2_dev, self.len1_dev,
     self.len2_dev, self.y_dev) = load_split(dev_split)
    print('# dev examples: %d' %len(self.y_dev))

    # Load the test set
    (self.sent1_test, self.sent2_test, self.len1_test,
     self.len2_test, self.y_test) = load_split('test')
    print('# test examples: %d' %len(self.y_test))

    print('min len: ', np.min(self.len2_train))
def build_model(self):
config = self.config
k = config.sentence_embed_size
L = config.sent_len
#.........这里部分代码省略.........
示例7: RNN_Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class RNN_Model():
def __init__(self, config):
    """Load data, declare the per-node placeholders and create the variables.

    The graph outputs (logits, losses, training op, ...) are only declared
    here, as ``None``.
    """
    self.config = config
    self.load_data()
    self.merged_summaries = self.summary_writer = None

    def int_vector(name):
        # 1-D int32 placeholder of unspecified length.
        return tf.placeholder(tf.int32, [None], name=name)

    self.is_a_leaf = tf.placeholder(tf.bool, [None], name="is_a_leaf")
    self.left_child = int_vector("lchild")
    self.right_child = int_vector("rchild")
    self.word_index = int_vector("word_index")
    self.labelholder = int_vector("labels_holder")
    self.add_model_vars()
    # The tensor array stores the node vectors (embedded or composed).
    self.tensor_array = tf.TensorArray(
        tf.float32, size=0, dynamic_size=True,
        clear_after_read=False, infer_shape=False)
    # tensor_array_op is the operation on the TensorArray.
    self.tensor_array_op = None
    self.prediction = self.logits = None
    self.root_logits = self.root_predict = None
    self.root_loss = self.full_loss = None
    self.training_op = None
# Private helper used while constructing the graph.
def _embed_word(self, word_index):
    """Return the embedding row for ``word_index`` with a leading axis added.

    Prints the scope name and the variable name as a side effect.
    """
    with tf.variable_scope("Composition", reuse=True) as scope:
        print(scope.name)
        embedding = tf.get_variable("embedding")
        print(embedding.name)
        row = tf.gather(embedding, word_index)
        return tf.expand_dims(row, 0)
# Private helper used while constructing the graph.
def _combine_children(self, left_index, right_index):
    """Compose two stored child vectors: relu(concat(left, right) * W1 + b1)."""
    left_tensor = self.tensor_array.read(left_index)
    right_tensor = self.tensor_array.read(right_index)
    with tf.variable_scope('Composition', reuse=True):
        W1 = tf.get_variable('W1')
        b1 = tf.get_variable('b1')
        joined = tf.concat(1, [left_tensor, right_tensor])
        return tf.nn.relu(tf.matmul(joined, W1) + b1)
# Loop body for tf.while_loop: i indexes the data fed to the placeholders and
# the outputs have the same types as the inputs.  Each iteration writes the
# vector of node i to the tensor array and increments i: leaves are embedded,
# other nodes are composed from children already stored in the array.
def _loop_over_tree(self, tensor_array, i):
    """Write the vector of node ``i`` into ``tensor_array``; return (array, i + 1)."""
    def at_i(values):
        return tf.gather(values, i)

    is_leaf = at_i(self.is_a_leaf)
    word_idx = at_i(self.word_index)
    left_child = at_i(self.left_child)
    right_child = at_i(self.right_child)
    node_tensor = tf.cond(
        is_leaf,
        lambda: self._embed_word(word_idx),
        lambda: self._combine_children(left_child, right_child))
    updated_array = tensor_array.write(i, node_tensor)
    return updated_array, tf.add(i, 1)
def construct_tensor_array(self):
    """Run ``_loop_over_tree`` over every fed node; return the final tensor array."""
    def has_more_nodes(tensor_array, i):
        # Continue while i is below the length of the is_a_leaf placeholder.
        return tf.less(i, tf.squeeze(tf.shape(self.is_a_leaf)))

    # Iterate over all leaves and compositions, one node at a time.
    loop_result = tf.while_loop(cond=has_more_nodes,
                                body=self._loop_over_tree,
                                loop_vars=[self.tensor_array, 0],
                                parallel_iterations=1)
    return loop_result[0]
def inference_op(self, predict_only_root=False):
    """Return ``root_logits_op()`` if ``predict_only_root`` is true, else ``logits_op()``."""
    build_logits = self.root_logits_op if predict_only_root else self.logits_op
    return build_logits()
def load_data(self):
    """Load the tree splits and construct the vocabulary from the training trees."""
    loaded = tr.simplified_data(700, 100, 200)
    self.train_data, self.dev_data, self.test_data = loaded
    self.vocab = Vocab()
    # Gather the words of every training tree into one flat list.
    flat_words = []
    for train_tree in self.train_data:
        for word in train_tree.get_words():
            flat_words.append(word)
    self.vocab.construct(flat_words)
def add_model_vars(self):
'''
You model contains the following parameters:
embedding: tensor(vocab_size, embed_size)
W1: tensor(2* embed_size, embed_size)
b1: tensor(1, embed_size)
U: tensor(embed_size, output_size)
bs: tensor(1, output_size)
Hint: Add the tensorflow variables to the graph here and *reuse* them while building
#.........这里部分代码省略.........
示例8: Model_RNN
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class Model_RNN(LanguageModel):
def load_data(self):
    """Read lyric pairs, build labels, vocabulary, word vectors and id matrices.

    Side effects: prints the example count of every class, writes
    'word_hist.csv' ("word,count" per line) and overwrites
    ``self.config.max_steps`` with the value returned by ``get_data``.
    """
    pair_fname = '../lastfm_train_mappings.txt'
    lyrics_path = '../data/lyrics/train/'
    # X_train: one entry per example; entry[0] and entry[1] are word lists.
    # NOTE(review): only the debug call passes the GloVe path to get_data --
    # confirm that argument is optional.
    if self.config.debug:
        loaded = get_data(pair_fname, lyrics_path, '../glove.6B.50d.txt', threshold_down=0, threshold_up=float('inf'), npos=100, nneg=100)
    else:
        loaded = get_data(pair_fname, lyrics_path, threshold_down=100, threshold_up=4000, npos=10000, nneg=10000)
    X_train, l_train, self.word_counts, seq_len1, seq_len2, self.config.max_steps = loaded
    n_examples = len(X_train)

    self.labels_train = np.zeros((n_examples, self.config.n_class))
    self.labels_train[range(n_examples), l_train] = 1

    x = collections.Counter(l_train)
    for k in x.keys():
        print(' '.join(['class:', str(k), str(x[k])]))
    print('')

    self.vocab = Vocab()
    self.vocab.construct(self.word_counts.keys())
    self.wv = self.vocab.get_wv('../glove.6B.50d.txt')

    with open('word_hist.csv', 'w') as f:
        for word in self.word_counts.keys():
            f.write(word + ',' + str(self.word_counts[word]) + '\n')

    # Rows are zero-padded up to max_steps.
    self.encoded_train_1 = np.zeros((n_examples, self.config.max_steps))  # need to handle this better.
    self.encoded_train_2 = np.zeros((n_examples, self.config.max_steps))
    for row, pair in enumerate(X_train):
        first, second = pair[0], pair[1]
        self.encoded_train_1[row, :len(first)] = [self.vocab.encode(w) for w in first]
        self.encoded_train_2[row, :len(second)] = [self.vocab.encode(w) for w in second]
    self.sequence_len1 = np.array(seq_len1)
    self.sequence_len2 = np.array(seq_len2)
def add_placeholders(self):
    """Declare placeholders for the two id matrices, the labels and the lengths."""
    id_shape = (None, self.config.max_steps)
    self.X1 = tf.placeholder(tf.int32, shape=id_shape, name='X1')
    self.X2 = tf.placeholder(tf.int32, shape=id_shape, name='X2')
    self.labels = tf.placeholder(tf.float32, shape=(None, self.config.n_class), name='labels')
    # Sequence lengths for the variable-length inputs; shape left unspecified.
    self.seq_len1 = tf.placeholder(tf.int32, shape=None, name='seq_len1')
    self.seq_len2 = tf.placeholder(tf.int32, shape=None, name='seq_len2')
def add_embedding(self):
    """Embed ``self.X1``/``self.X2`` with a variable initialised from ``self.wv``.

    Returns:
        (inputs1, inputs2): two lists of ``max_steps`` tensors, one per step,
        each split off axis 1 of the lookup with that axis squeezed away.
    """
    # The table starts from the loaded vectors instead of a fresh variable.
    L = tf.Variable(tf.convert_to_tensor(self.wv, dtype=tf.float32), name='L')
    embedded1 = tf.nn.embedding_lookup(L, self.X1)  # self.X1 is batch_size x max_steps
    embedded2 = tf.nn.embedding_lookup(L, self.X2)

    inputs1 = []
    for step_slice in tf.split(1, self.config.max_steps, embedded1):
        inputs1.append(tf.squeeze(step_slice, squeeze_dims=[1]))
    inputs2 = []
    for step_slice in tf.split(1, self.config.max_steps, embedded2):
        inputs2.append(tf.squeeze(step_slice, squeeze_dims=[1]))
    return inputs1, inputs2
def add_model_rnn(self, inputs1, inputs2, seq_len1, seq_len2):
    """Run one shared sigmoid RNN over each sequence and join the final states.

    Both sequences use the same 'rnn' variables (Whh, Wxh, bhx) and start
    from an all-zero state of shape (batch_size, hidden_size).

    Args:
        inputs1, inputs2: lists of per-step input tensors.
        seq_len1, seq_len2: sequence lengths; only element 0 is read, and
            only when ``config.batch_size == 1``, to stop at that step.
    Returns:
        The last hidden states of both sequences concatenated along axis 1.
    """
    hidden_size = self.config.hidden_size
    self.initial_state = tf.constant(
        np.zeros((self.config.batch_size, hidden_size)), dtype=tf.float32)
    with tf.variable_scope('rnn'):
        Whh = tf.get_variable('Whh', shape=(hidden_size, hidden_size), dtype=tf.float32)
        Wxh = tf.get_variable('Wxh', shape=(self.config.embed_size, hidden_size), dtype=tf.float32)
        # One bias per hidden unit; a 4*hidden_size bias cannot be added to a
        # (batch_size, hidden_size) tensor.
        b1 = tf.get_variable('bhx', shape=(hidden_size,), dtype=tf.float32)

    def final_state(inputs, seq_len):
        # Unroll the recurrence; start from the initial state so that an
        # immediate stop still yields a valid tensor.
        h_curr = self.initial_state
        for i in range(self.config.max_steps):
            # NOTE(review): if seq_len is a graph tensor this Python-level
            # comparison may never be true -- confirm what callers pass.
            if self.config.batch_size == 1 and i == seq_len[0]:
                break
            h_curr = tf.sigmoid(tf.matmul(h_curr, Whh) + tf.matmul(inputs[i], Wxh) + b1)
        return h_curr

    last1 = final_state(inputs1, seq_len1)
    last2 = final_state(inputs2, seq_len2)
    rnn_final_states = tf.concat(1, [last1, last2])
    return rnn_final_states
def add_model_lstm(self, inputs1, inputs2, seq_len1, seq_len2):
#self.initial_state = tf.constant(np.zeros(()), dtype=tf.float32)
self.initial_state = tf.constant(np.zeros((self.config.batch_size,self.config.hidden_size)), dtype=tf.float32)
lstm_outputs1 = []
lstm_outputs2 = []
h_curr1 = self.initial_state
h_curr2 = self.initial_state
cell1 = self.initial_state
cell2 = self.initial_state
with tf.variable_scope('lstm'):
#.........这里部分代码省略.........
示例9: RNNLM_Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class RNNLM_Model(LanguageModel):
def load_data(self, debug=False):
    """Build the vocabulary from the PTB train split and encode train/valid.

    The test split is not encoded here (that code is disabled), so
    ``self.encoded_test`` is only touched if something else already set it.

    Args:
        debug: when true, keep only the first 1024 ids of each encoded split.
    """
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))
    self.encoded_train = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('train')],
        dtype=np.int32)
    self.encoded_valid = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('valid')],
        dtype=np.int32)
    if debug:
        num_debug = 1024
        self.encoded_train = self.encoded_train[:num_debug]  # truncated training data
        self.encoded_valid = self.encoded_valid[:num_debug]
        # Guarded: encoded_test is never assigned in this method, and slicing
        # it unconditionally raised AttributeError whenever debug was set.
        if hasattr(self, 'encoded_test'):
            self.encoded_test = self.encoded_test[:num_debug]
def add_placeholders(self):
    """Create the input (int32), label (float32) and dropout (float32) placeholders."""
    batch_by_steps = (None, self.config.num_steps)
    self.input_placeholder = tf.placeholder(tf.int32, batch_by_steps)
    self.labels_placeholder = tf.placeholder(tf.float32, batch_by_steps)
    self.dropout_placeholder = tf.placeholder(tf.float32)
def add_embedding(self):
    """Turn the input word ids into a list of per-step embedding tensors.

    Creates the variable 'Embedding' of shape (len(self.vocab), embed_size).

    Returns:
        A list with ``num_steps`` tensors.
    """
    with tf.device('/cpu:0'):
        table = tf.get_variable("Embedding", (len(self.vocab), self.config.embed_size))
        looked_up = tf.nn.embedding_lookup(table, self.input_placeholder)
        per_step = tf.split(1, self.config.num_steps, looked_up)
        inputs = [tf.squeeze(step_slice, [1]) for step_slice in per_step]
        return inputs  # a Python list, one tensor per step
def add_projection(self, rnn_outputs):
    """Map each hidden state to vocabulary-sized scores (no softmax applied).

    Creates 'projection/U' of shape (hidden_size, len(vocab)) and
    'projection/b_2' of shape (len(vocab),).
    """
    vocab_size = len(self.vocab)
    with tf.variable_scope("projection"):
        U = tf.get_variable("U", shape=(self.config.hidden_size, vocab_size))
        b_2 = tf.get_variable("b_2", shape=(vocab_size,))
        outputs = []
        for hidden in rnn_outputs:
            outputs.append(tf.matmul(hidden, U) + b_2)
        return outputs
def add_loss_op(self, output):
    """Build the loss by passing ``output`` and the labels to ``sequence_loss``.

    The labels placeholder is flattened to one dimension, and every
    position gets weight 1 through a ones vector of length
    ``config.batch_size * config.num_steps``.

    Args:
        output: Tensor handed to ``sequence_loss`` as the single logits entry.

    Returns:
        Whatever ``sequence_loss`` returns for these arguments.
    """
    flat_labels = tf.reshape(self.labels_placeholder, [-1])
    weights = tf.ones([self.config.batch_size * self.config.num_steps])
    return sequence_loss([output], [flat_labels], [weights])
def add_training_op(self, loss):
    """Return an op that minimizes ``loss`` with Adam.

    Args:
        loss: Tensor to minimize.

    Returns:
        The op produced by ``AdamOptimizer(config.lr).minimize(loss)``.
    """
    return tf.train.AdamOptimizer(self.config.lr).minimize(loss)
def __init__(self, config):
    """Store ``config``, load the data and build the graph.

    The steps run in this order: load data, create placeholders, embed the
    inputs, run the recurrent model, project the outputs, then derive the
    predictions, the loss and the training op.

    Args:
        config: Configuration object; it is read here and by the builder
            methods called below.
    """
    self.config = config
    self.load_data(debug=False)
    self.add_placeholders()
    self.inputs = self.add_embedding()
    self.rnn_outputs = self.add_model(self.inputs)
    self.outputs = self.add_projection(self.rnn_outputs)

    # Softmax is computed on a float64 cast of each projected output.
    probabilities = []
    for logits in self.outputs:
        probabilities.append(tf.nn.softmax(tf.cast(logits, 'float64')))
    self.predictions = probabilities

    # Legacy tf.concat argument order: (concat_dim, values).
    joined = tf.concat(1, self.outputs)
    output = tf.reshape(joined, [-1, len(self.vocab)])
    self.calculate_loss = self.add_loss_op(output)
    self.train_step = self.add_training_op(self.calculate_loss)
def add_model(self, inputs):
    """Unroll a sigmoid recurrent layer over ``inputs``.

    Each step computes ``sigmoid(prev @ H + x @ I + b_1)``.
    ``tf.nn.dropout`` with ``self.dropout_placeholder`` as its second
    argument is applied to every step input and to every emitted output.
    The state carried to the next step is not passed through dropout.

    Side effects:
        Sets ``self.initial_state`` to zeros of shape
        ``(config.batch_size, config.hidden_size)`` and
        ``self.final_state`` to the last carried state.

    Args:
        inputs: Iterable of per-step input tensors.

    Returns:
        A list with one dropout-applied hidden tensor per step.
    """
    cfg = self.config
    hidden_size = cfg.hidden_size
    embed_size = cfg.embed_size
    batch_size = cfg.batch_size
    # NOTE(review): the extent of the variable scope is reconstructed; the
    # original indentation was lost.
    with tf.variable_scope("RNN"):
        H = tf.get_variable("H", shape=(hidden_size, hidden_size))
        I = tf.get_variable("I", shape=(embed_size, hidden_size))
        b_1 = tf.get_variable("b_1", shape=(hidden_size,))
        self.initial_state = tf.zeros([batch_size, hidden_size])
        state = self.initial_state
        rnn_outputs = []
        for step_input in inputs:
            dropped_input = tf.nn.dropout(step_input, self.dropout_placeholder)
            state = tf.sigmoid(
                tf.matmul(state, H) + tf.matmul(dropped_input, I) + b_1)
            rnn_outputs.append(tf.nn.dropout(state, self.dropout_placeholder))
        self.final_state = state
    return rnn_outputs
def run_epoch(self, session, data, train_op=None, verbose=10):
config = self.config
dp = config.dropout
if not train_op:
#.........这里部分代码省略.........
示例10: RNNLM_Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class RNNLM_Model(LanguageModel):
def load_data(self, debug=False):
    """Build the vocabulary and encode the PTB train/valid/test splits.

    The vocabulary is constructed from ``get_ptb_dataset('train')``. Every
    word of each split is encoded with ``self.vocab.encode`` and stored as
    an int32 array in ``self.encoded_train``, ``self.encoded_valid`` and
    ``self.encoded_test``.

    Args:
        debug: When true, keep only the first 1024 entries of each split.
    """
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))

    def encode_split(name):
        # Encode one named split word by word into an int32 array.
        return np.array(
            [self.vocab.encode(word) for word in get_ptb_dataset(name)],
            dtype=np.int32)

    self.encoded_train = encode_split('train')
    self.encoded_valid = encode_split('valid')
    self.encoded_test = encode_split('test')

    if debug:
        limit = 1024
        for attr in ('encoded_train', 'encoded_valid', 'encoded_test'):
            setattr(self, attr, getattr(self, attr)[:limit])
def add_placeholders(self):
    """Create the placeholders that feed data into the graph.

    A ``None`` entry in a placeholder shape leaves that dimension
    unspecified.

    Adds the following instance attributes (names must not change):
        self.input_placeholder: int32, shape ``(None, config.num_steps)``.
        self.labels_placeholder: int32, shape ``(None, config.num_steps)``.
        self.dropout_placeholder: float32, created with ``shape=None``.
    """
    batch_by_steps = (None, self.config.num_steps)
    self.input_placeholder = tf.placeholder(tf.int32, shape=batch_by_steps)
    self.labels_placeholder = tf.placeholder(tf.int32, shape=batch_by_steps)
    self.dropout_placeholder = tf.placeholder(tf.float32, shape=None)
def add_embedding(self):
    """Embed the input ids and return one tensor per split piece.

    Creates a variable ``L`` of shape ``(len(self.vocab), config.embed_size)``
    initialised uniformly in ``[-1.0, 1.0)``. ``self.input_placeholder`` is
    transposed with ``perm=[1, 0]`` before the lookup, so the result can be
    split along dimension 0 into ``config.num_steps`` pieces, each with
    that dimension squeezed out.

    Returns:
        inputs: List of the squeezed pieces. The original assignment text
            describes this as ``num_steps`` tensors of shape
            ``(batch_size, embed_size)``.
    """
    # The placement on CPU is kept from the original, which noted that the
    # embedding lookup was only implemented for the CPU.
    # NOTE(review): the extent of the device block is reconstructed; the
    # original indentation was lost.
    with tf.device('/cpu:0'):
        initial_values = tf.random_uniform(
            [len(self.vocab), self.config.embed_size], -1.0, 1.0)
        L = tf.Variable(initial_values, name="L")
        transposed_ids = tf.transpose(self.input_placeholder, perm=[1, 0])
        embed = tf.nn.embedding_lookup(L, transposed_ids)
        # Legacy tf.split argument order: (split_dim, num_split, value).
        pieces = tf.split(0, self.config.num_steps, embed)
        inputs = []
        for piece in pieces:
            inputs.append(tf.squeeze(piece, [0]))
    return inputs
def add_projection(self, rnn_outputs):
"""Adds a projection layer.
The projection layer transforms the hidden representation to a distribution
over the vocabulary.
Hint: Here are the dimensions of the variables you will need to
create
U: (hidden_size, len(vocab))
b_2: (len(vocab),)
Args:
rnn_outputs: List of length num_steps, each of whose elements should be
a tensor of shape (batch_size, hidden_size(LIBIN edited)).
Returns:
outputs: List of length num_steps, each a tensor of shape
(batch_size, len(vocab))
"""
### YOUR CODE HERE
with tf.variable_scope("projection", initializer = xavier_weight_init(), reuse=None):
#.........这里部分代码省略.........
示例11: main
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
"""
self.node_list = []
root_node = self.walk_tree(x.root)
all_nodes = torch.cat(self.node_list)
#now I need to project out
return all_nodes
def main():
    """Print a fixed placeholder message; performs no other work."""
    message = "do nothing"
    print(message)
if __name__ == '__main__':
train_data, dev_data, test_data = tr.simplified_data(train_size, 100, 200)
vocab = Vocab()
train_sents = [t.get_words() for t in train_data]
vocab.construct(list(itertools.chain.from_iterable(train_sents)))
model = RNN_Model(vocab, embed_size=50)
main()
lr = 0.01
loss_history = []
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, dampening=0.0)
# params (iterable): iterable of parameters to optimize or dicts defining
# parameter groups
# lr (float): learning rate
# momentum (float, optional): momentum factor (default: 0)
# weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
#torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, dampening=0, weight_decay=0)
# print(model.fcl._parameters['weight'])
for epoch in range(max_epochs):
示例12: Model
# 需要导入模块: from utils import Vocab [as 别名]
# 或者: from utils.Vocab import construct [as 别名]
class Model():
def __init__(self, config):
    """Keep ``config``, then load the data and build the model.

    Args:
        config: Configuration object stored on ``self.config`` before
            ``load_data`` and ``build_model`` are invoked.
    """
    self.config = config
    # Data is loaded before the model is built.
    self.load_data()
    self.build_model()
def load_vocab(self, debug):
    """Construct the vocabulary and its embedding matrix.

    The vocabulary is built from the 'dev' words when ``debug`` is true
    and from the 'train' words otherwise. The embedding matrix is built
    with ``config.word_embed_size`` and also exposed as
    ``self.embedding_matrix``.

    Args:
        debug: Selects which word dataset the vocabulary is built from.
    """
    self.vocab = Vocab()
    source_split = 'dev' if debug else 'train'
    self.vocab.construct(get_words_dataset(source_split))
    self.vocab.build_embedding_matrix(self.config.word_embed_size)
    self.embedding_matrix = self.vocab.embedding_matrix
def _load_split(self, split):
    """Read one split with ``get_sentences_dataset`` and return it as arrays.

    Args:
        split: Split name forwarded to ``get_sentences_dataset``.

    Returns:
        A 5-tuple ``(sent1, sent2, len1, len2, y)``. ``sent1`` and ``sent2``
        are row-stacked with ``np.vstack``; the other three are wrapped
        with ``np.array``.
    """
    rows = list(get_sentences_dataset(
        self.vocab, self.config.sent_len, split, 'post'))
    sent1, sent2, len1, len2, labels = zip(*rows)
    return (np.vstack(sent1), np.vstack(sent2),
            np.array(len1), np.array(len2), np.array(labels))

def load_data(self, debug=False):
    """Load the vocabulary and the train/dev/test sentence-pair data.

    Calls ``self.load_vocab(debug)`` first, then fills the
    ``sent1_*``, ``sent2_*``, ``len1_*``, ``len2_*`` and ``y_*`` attributes
    for the train, dev and test sets, printing the size of each set as it
    is loaded.

    Args:
        debug: When true, the training set is read from the 'dev' split
            and the dev set from the 'test' split. Otherwise they are read
            from 'train' and 'dev'. The test set always comes from 'test'.
    """
    self.load_vocab(debug)

    # NOTE(review): in debug mode the dev and test sets are both read from
    # the 'test' split, as in the original code. Confirm this is intended.
    if debug:
        train_split, dev_split = 'dev', 'test'
    else:
        train_split, dev_split = 'train', 'dev'

    # Load the training set
    (self.sent1_train, self.sent2_train, self.len1_train,
     self.len2_train, self.y_train) = self._load_split(train_split)
    print('# training examples: %d' % len(self.y_train))

    # Load the validation set
    (self.sent1_dev, self.sent2_dev, self.len1_dev,
     self.len2_dev, self.y_dev) = self._load_split(dev_split)
    print('# dev examples: %d' % len(self.y_dev))

    # Load the test set
    (self.sent1_test, self.sent2_test, self.len1_test,
     self.len2_test, self.y_test) = self._load_split('test')
    print('# test examples: %d' % len(self.y_test))

    # NOTE(review): the original indentation was lost; this print is
    # assumed to run for both the debug and non-debug paths.
    print('min len: ', np.min(self.len2_train))
def build_model(self):
config = self.config
k = config.sentence_embed_size
L = config.sent_len
#.........这里部分代码省略.........