本文整理汇总了 Python 中 utils.Vocab 类的典型用法代码示例。如果您正苦于以下问题:Python Vocab 类的具体用法?Python Vocab 怎么用?Python Vocab 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 Vocab 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: WhoseLineModel
class WhoseLineModel(object):
    """Base model that loads word vectors and speaker-labelled lines.

    Construction is eager: it loads the word2vec vectors, builds the
    vocabulary and embedding matrix, collects the speaker inventory, splits
    the data into train/dev/test and registers the embedding matrix as a
    TensorFlow constant.
    """

    def __init__(self, config):
        """Keep ``config`` and run the data-loading and graph-setup steps."""
        self.config = config
        self.load_data(debug=False)
        self.add_common_model_vars()

    def load_data(self, debug=False):
        """Load embeddings, speakers and the train/dev/test line sets.

        The file at ``config.datapath`` is read twice: once to register the
        speakers that occur in training chapters, and once to encode every
        line. Each row must hold exactly three tab-separated fields
        (chapter, speaker, text); any other field count raises ValueError.

        Args:
            debug: Unused here; kept so existing callers keep working.
        """
        self.wordvecs = gensim.models.Word2Vec.load_word2vec_format(
            self.config.wordvecpath, binary=False)
        self.vocab = Vocab()
        self.vocab.construct(self.wordvecs.index2word)
        self.embedding_matrix = np.vstack(
            [self.wordvecs[self.vocab.index_to_word[i]]
             for i in range(len(self.vocab))])
        # "unk" surgery: overwrite row 0 with the mean of all rows, cf.
        # https://groups.google.com/forum/#!searchin/globalvectors/unknown/globalvectors/9w8ZADXJclA/X6f0FgxUnMgJ
        # NOTE(review): presumably index 0 is the unknown-word slot of Vocab
        # -- confirm against utils.Vocab.
        self.embedding_matrix[0, :] = np.mean(self.embedding_matrix, axis=0)

        chapter_split = load_chapter_split(self.config.datasplitpath)

        # Pass 1: only speakers from training chapters (split value 0) are
        # registered. The file is opened in a `with` block so the handle is
        # closed deterministically.
        self.speakers = Speakers()
        with open(self.config.datapath) as data_file:
            for row in data_file:
                ch, speaker, _ = row.split("\t")
                if chapter_split[ch] == 0:
                    self.speakers.add_speaker(speaker)
        self.speakers.prune(self.config.speaker_count - 1)  # -1 for OTHER

        # Pass 2: encode every line and route it to its split.
        self.train_data = []
        self.dev_data = []
        self.test_data = []
        oldch = None
        with open(self.config.datapath) as data_file:
            for row in data_file:
                ch, speaker, text = row.split("\t")
                encoded_line = (
                    np.array([self.vocab.encode(word) for word in text.split()],
                             dtype=np.int32),
                    self.speakers.encode(speaker))
                # Split value 0 -> train, 1 -> dev, anything else -> test.
                if chapter_split[ch] == 0:
                    dataset = self.train_data
                elif chapter_split[ch] == 1:
                    dataset = self.dev_data
                else:
                    dataset = self.test_data
                if self.config.batch_size == "chapter":
                    # Chapter batching: consecutive lines of the same chapter
                    # are grouped into one list.
                    if ch == oldch:
                        dataset[-1].append(encoded_line)
                    else:
                        dataset.append([encoded_line])
                else:
                    dataset.append(encoded_line)
                oldch = ch

    def add_common_model_vars(self):
        """Expose the embedding matrix as a constant in scope "word_vectors"."""
        with tf.variable_scope("word_vectors"):
            self.tf_embedding_matrix = tf.constant(self.embedding_matrix, name="embedding")
示例2: load_data
def load_data(self):
    """Load lyric pairs, one-hot labels, word vectors and padded encodings.

    Reads the pair mapping and lyrics through ``get_data`` (a small sample
    when ``config.debug`` is set), prints the per-class example counts,
    builds the vocabulary, writes the word histogram to ``word_hist.csv``
    and stores both lyric sequences of every example as zero-padded rows.
    """
    pair_fname = '../lastfm_train_mappings.txt'
    lyrics_path = '../data/lyrics/train/'
    # X_train is a list of all examples; each example is a 2-element list
    # whose elements are lists of lyric words.
    if self.config.debug:
        loaded = get_data(pair_fname, lyrics_path, '../glove.6B.50d.txt',
                          threshold_down=0, threshold_up=float('inf'),
                          npos=100, nneg=100)
    else:
        loaded = get_data(pair_fname, lyrics_path,
                          threshold_down=100, threshold_up=4000,
                          npos=10000, nneg=10000)
    (X_train, l_train, self.word_counts,
     seq_len1, seq_len2, self.config.max_steps) = loaded

    num_examples = len(X_train)
    self.labels_train = np.zeros((num_examples, self.config.n_class))
    self.labels_train[range(num_examples), l_train] = 1

    class_counts = collections.Counter(l_train)
    for label in class_counts.keys():
        print('class: %s %s' % (label, class_counts[label]))
    print('')

    self.vocab = Vocab()
    self.vocab.construct(self.word_counts.keys())
    self.wv = self.vocab.get_wv('../glove.6B.50d.txt')

    with open('word_hist.csv', 'w') as hist_file:
        for word in self.word_counts.keys():
            hist_file.write(word + ',' + str(self.word_counts[word]) + '\n')

    # TODO: need to handle this better (fixed-width zero padding).
    padded_shape = (num_examples, self.config.max_steps)
    self.encoded_train_1 = np.zeros(padded_shape)
    self.encoded_train_2 = np.zeros(padded_shape)
    for row, pair in enumerate(X_train):
        first, second = pair[0], pair[1]
        self.encoded_train_1[row, :len(first)] = [self.vocab.encode(w) for w in first]
        self.encoded_train_2[row, :len(second)] = [self.vocab.encode(w) for w in second]

    self.sequence_len1 = np.array(seq_len1)
    self.sequence_len2 = np.array(seq_len2)
示例3: load_vocab
def load_vocab(self, debug):
    """Build the vocabulary and its embedding matrix.

    Args:
        debug: When truthy the vocabulary is built from the 'dev' words,
            otherwise from the 'train' words.
    """
    split_name = 'dev' if debug else 'train'
    self.vocab = Vocab()
    self.vocab.construct(get_words_dataset(split_name))
    self.vocab.build_embedding_matrix(self.config.word_embed_size)
    self.embedding_matrix = self.vocab.embedding_matrix
示例4: load_data
def load_data(self):
    """Load the train/dev/test trees and build the vocabulary from training words."""
    splits = tr.simplified_data(700, 100, 200)
    self.train_data, self.dev_data, self.test_data = splits

    # The vocabulary only sees words that occur in the training trees.
    self.vocab = Vocab()
    sentences = [tree.get_words() for tree in self.train_data]
    training_words = []
    for sentence in sentences:
        training_words.extend(sentence)
    self.vocab.construct(training_words)
示例5: prep_data
def prep_data(trees, X_vocab=None, y_vocab=None):
    """Turn trees into encoded feature lists and encoded decisions.

    For every tree with at least two tokens, each (state, decision) pair
    yielded by ``tree.iter_oracle_states()`` contributes one row to ``X``
    (the encoded features of the state) and one entry to ``y`` (the encoded
    decision).

    Args:
        trees: Iterable of tree objects.
        X_vocab: Feature vocabulary. When None, fresh vocabularies are
            created for both features and decisions and are grown while
            iterating.
        y_vocab: Decision vocabulary; replaced by a fresh one whenever
            ``X_vocab`` is None.

    Returns:
        Tuple ``(X, y, X_vocab, y_vocab)``; ``X`` and ``y`` always have the
        same length.
    """
    update_vocab = False
    if X_vocab is None:
        X_vocab, y_vocab = Vocab(), Vocab()
        update_vocab = True
    X, y = [], []
    for tree in tqdm(trees):
        if len(tree.tokens) < 2:
            continue
        # TODO accumulate features without iterating over all states
        try:
            for state, decision in tree.iter_oracle_states():
                feats = state.extract_features()
                if update_vocab:
                    X_vocab.add_words(feats)
                    y_vocab.add_word(decision)
                # Encode both sides before appending either, so a failure
                # can never leave X one element longer than y.
                encoded_feats = [X_vocab.encode(f) for f in feats]
                encoded_decision = y_vocab.encode(decision)
                X.append(encoded_feats)
                y.append(encoded_decision)
        except Exception:
            # Best effort: a tree that fails part-way is abandoned and the
            # remaining trees are still processed. Unlike a bare `except`,
            # this lets KeyboardInterrupt and SystemExit propagate.
            continue
    return X, y, X_vocab, y_vocab
示例6: load_data
def load_data(self, debug=False):
    """Build the vocabulary from the PTB training words and encode all splits.

    Args:
        debug: When truthy, every encoded split is cut to its first 1024 ids.
    """
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))

    def encode_split(split_name):
        # One int32 id per word of the requested split.
        ids = [self.vocab.encode(token) for token in get_ptb_dataset(split_name)]
        return np.array(ids, dtype=np.int32)

    self.encoded_train = encode_split('train')
    self.encoded_valid = encode_split('valid')
    self.encoded_test = encode_split('test')

    if not debug:
        return
    num_debug = 1024
    self.encoded_train = self.encoded_train[:num_debug]
    self.encoded_valid = self.encoded_valid[:num_debug]
    self.encoded_test = self.encoded_test[:num_debug]
示例7: load_data
def load_data(self):
    """Load lyric pairs, one-hot labels and zero-padded encoded sequences."""
    pair_fname = '../lastfm_train_mappings.txt'
    lyrics_path = '../lyrics/data/lyrics/train/'
    # X_train is a list of all examples; each example is a 2-element list
    # whose elements are lists of lyric words.
    loaded = get_data(pair_fname, lyrics_path, threshold=100,
                      n_class=self.config.n_class)
    X_train, l_train, self.word_counts, self.config.max_steps = loaded

    num_examples = len(X_train)
    self.labels_train = np.zeros((num_examples, self.config.n_class))
    self.labels_train[range(num_examples), l_train] = 1

    self.vocab = Vocab()
    self.vocab.construct(self.word_counts.keys())

    # TODO: need to handle this better (fixed-width zero padding).
    padded_shape = (num_examples, self.config.max_steps)
    self.encoded_train_1 = np.zeros(padded_shape)
    self.encoded_train_2 = np.zeros(padded_shape)
    for row, pair in enumerate(X_train):
        first, second = pair[0], pair[1]
        self.encoded_train_1[row, :len(first)] = [self.vocab.encode(w) for w in first]
        self.encoded_train_2[row, :len(second)] = [self.vocab.encode(w) for w in second]
示例8: set
import sys
import os
from utils import Vocab
import numpy as np
import pickle
# Script entry point: collects every token from the three *_tokens.txt files
# into a set and a Vocab, then starts reading a GloVe vector file.
# NOTE(review): this listing lost its indentation and is cut off after the
# first statement of the GloVe loop, so the nesting below has to be inferred
# and the rest of the loop is not shown.
if __name__ == "__main__":
#Create a set of all words
all_words = set()
vocab = Vocab()
# count_files is initialised here but never updated in the visible part.
count_files = 0
for name in ['test', 'train', 'val']:
filename = name + '_tokens.txt'
f = open(filename, 'r')
for line in f:
# Tokens are whitespace-separated.
sp_line = line.strip().split()
for token in sp_line:
all_words.add(token)
vocab.add_word(token)
f.close()
# Hard-coded, machine-specific location of the GloVe vector file.
glove_dir = '/media/sf_kickstarter/CS224D/Project/glove.840B.300d'
# NOTE(review): no matching close() for glove_f is visible in this excerpt.
glove_f = open(os.path.join(glove_dir, 'glove.840B.300d.txt'), 'r')
# One zero-initialised row of width 300 per entry of vocab.word_to_index.
embedding_matrix = np.zeros((len(vocab.word_to_index),300))
count = 0
for line in glove_f:
line_sp = line.strip().split()
示例9: RNN_Model
class RNN_Model():
def load_data(self):
    """Load the train/dev/test trees and build the vocabulary from training words."""
    splits = tr.simplified_data(700, 100, 200)
    self.train_data, self.dev_data, self.test_data = splits

    # The vocabulary only sees words that occur in the training trees.
    self.vocab = Vocab()
    sentences = [tree.get_words() for tree in self.train_data]
    training_words = []
    for sentence in sentences:
        training_words.extend(sentence)
    self.vocab.construct(training_words)
def inference(self, tree, predict_only_root=False):
    """Build the computation graph for ``tree`` up to the logits.

    Args:
        tree: a Tree object on which to build the computation graph.
        predict_only_root: when truthy only the root node's tensor is
            projected; otherwise the tensors of all nodes whose label is
            not 2 are concatenated along axis 0 and projected together.
    Returns:
        Whatever ``self.add_projections`` returns for the selected tensors
        (the logits).
    """
    tensors_by_node = self.add_model(tree.root)
    if predict_only_root:
        return self.add_projections(tensors_by_node[tree.root])

    selected = []
    for node, tensor in tensors_by_node.iteritems():
        if node.label != 2:
            selected.append(tensor)
    return self.add_projections(tf.concat(0, selected))
def add_model_vars(self):
    """Create the model variables, the optimizer and a dummy minimizer.

    Variables created (so later graph-building code can reuse them):
        scope "Composition":
            embedding: (vocab_size, embed_size)
            W1:        (2 * embed_size, embed_size)
            b1:        (1, embed_size)
        scope "Projection":
            U:         (embed_size, output_size)
            bs:        (1, output_size)
    """
    embed_size = self.config.embed_size
    vocab_size = len(self.vocab)
    output_size = self.config.label_size

    composition_shapes = [
        ("embedding", (vocab_size, embed_size)),
        ("W1", (2 * embed_size, embed_size)),
        ("b1", (1, embed_size)),
    ]
    projection_shapes = [
        ("U", (embed_size, output_size)),
        ("bs", (1, output_size)),
    ]
    with tf.variable_scope('Composition'):
        for var_name, var_shape in composition_shapes:
            tf.get_variable(var_name, shape=var_shape)
    with tf.variable_scope('Projection'):
        for var_name, var_shape in projection_shapes:
            tf.get_variable(var_name, shape=var_shape)

    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.config.lr)
    # dummy_total is a plain sum over all trainable variables. Minimizing it
    # makes the AdamOptimizer create its own variables now, so they exist for
    # initialization and before variables are restored later. The resulting
    # op should never actually be executed.
    dummy_total = tf.constant(0.0)
    for variable in tf.trainable_variables():
        dummy_total += tf.reduce_sum(variable)
    self.dummy_minimizer = self.optimizer.minimize(dummy_total)
    # Variables are initialized afterwards; because of self.dummy_minimizer
    # all necessary variable/slot pairs are added and included in the saver
    # variables.
def add_model(self, node):
"""Recursively build the model to compute the phrase embeddings in the tree
Hint: Refer to tree.py and vocab.py before you start. Refer to
the model's vocab with self.vocab
Hint: Reuse the "Composition" variable_scope here
--Hint: Store a node's vector representation in node.tensor so it can be
used by it's parent--
Hint: If node is a leaf node, it's vector representation is just that of the
word vector (see tf.gather()).
Args:
node: a Node object
Returns:
node_tensors: Dict: key = Node, value = tensor(1, embed_size)
"""
# NOTE(review): indentation was lost in this listing and the method is cut
# off below, so the non-leaf composition and the return are not visible.
# Fetch the existing "Composition" variables (reuse=True) by name.
with tf.variable_scope('Composition', reuse=True):
### YOUR CODE HERE
embedding = tf.get_variable("embedding")
W1 = tf.get_variable("W1")
b1 = tf.get_variable("b1")
### END YOUR CODE
# THOUGHT: Batch together all leaf nodes and all non leaf nodes
node_tensors = OrderedDict()
curr_node_tensor = None
if node.isLeaf:
### YOUR CODE HERE
# NOTE(review): the row is selected by node.label, while the docstring says
# a leaf's representation is its word vector -- confirm that node.label is
# really the intended embedding index here.
curr_node_tensor = tf.gather(embedding, tf.constant([node.label]), name="leaf_lookup")
### END YOUR CODE
else:
# Recurse into both children and merge their node -> tensor maps.
node_tensors.update(self.add_model(node.left))
node_tensors.update(self.add_model(node.right))
#......... part of the code is omitted here .........
示例10: RNNLM_Model
class RNNLM_Model(LanguageModel):
def load_data(self, debug=False):
    """Build the vocabulary from the PTB training words and encode all splits.

    Args:
        debug: When truthy, every encoded split is cut to its first 1024 ids.
    """
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))

    def encode_split(split_name):
        # One int32 id per word of the requested split.
        ids = [self.vocab.encode(token) for token in get_ptb_dataset(split_name)]
        return np.array(ids, dtype=np.int32)

    self.encoded_train = encode_split('train')
    self.encoded_valid = encode_split('valid')
    self.encoded_test = encode_split('test')

    if not debug:
        return
    num_debug = 1024
    self.encoded_train = self.encoded_train[:num_debug]
    self.encoded_valid = self.encoded_valid[:num_debug]
    self.encoded_test = self.encoded_test[:num_debug]
def add_placeholders(self):
    """Create the placeholders that feed data into the graph.

    When None appears in a placeholder's shape, that dimension is flexible.

    Adds the following attributes (names must not change, other code
    relies on them):
        self.input_placeholder: shape (None, num_steps), type tf.int32
        self.labels_placeholder: shape (None, num_steps), type tf.float32
        self.dropout_placeholder: dropout value (scalar), type tf.float32
    """
    num_steps = self.config.num_steps
    self.input_placeholder = tf.placeholder(
        tf.int32, shape=[None, num_steps], name='Input')
    self.labels_placeholder = tf.placeholder(
        tf.float32, shape=[None, num_steps], name='Target')
    # The documented contract is tf.float32. The previous tf.int64 dtype
    # could not carry a fractional dropout value.
    self.dropout_placeholder = tf.placeholder(tf.float32, name='Dropout')
def add_embedding(self):
    """Add the embedding layer.

    Looks up ``self.input_placeholder`` in a trainable 'Embedding' variable
    of shape (len(self.vocab), embed_size) and splits the result along
    axis 1 into one tensor per time step.

    Returns:
        inputs: List of length num_steps, each of whose elements should be
            a tensor of shape (batch_size, embed_size).
    """
    # The embedding lookup is currently only implemented for the CPU
    with tf.device('/cpu:0'):
        table_shape = [len(self.vocab), self.config.embed_size]
        embedding_table = tf.get_variable('Embedding', table_shape, trainable=True)
        looked_up = tf.nn.embedding_lookup(embedding_table, self.input_placeholder)
        per_step = tf.split(1, self.config.num_steps, looked_up)
        # Drop the size-1 time axis left behind by the split.
        inputs = [tf.squeeze(step, [1]) for step in per_step]
    return inputs
def add_projection(self, rnn_outputs):
"""Adds a projection layer.
The projection layer transforms the hidden representation to a distribution
over the vocabulary.
Hint: Here are the dimensions of the variables you will need to
create
U: (hidden_size, len(vocab))
b_2: (len(vocab),)
Args:
rnn_outputs: List of length num_steps, each of whose elements should be
a tensor of shape (batch_size, embed_size).
Returns:
outputs: List of length num_steps, each a tensor of shape
(batch_size, len(vocab)
"""
### YOUR CODE HERE
# NOTE(review): the scope name contains a space; TensorFlow restricts the
# characters allowed in scope names, so this may be rejected -- verify.
with tf.name_scope('Projection Layer'):
# Projection weights U and bias b2, sized by hidden_size and len(self.vocab).
U = tf.get_variable('U', [self.config.hidden_size, len(self.vocab)])
b2 = tf.get_variable('b2', len(self.vocab))
# The listing is cut off here; the projection itself and the return value
# are not visible.
#......... part of the code is omitted here .........
示例11: RNN_Model
class RNN_Model():
def load_data(self):
    """Load the train/dev/test trees and build the vocabulary from training words."""
    splits = tr.simplified_data(700, 100, 200)
    self.train_data, self.dev_data, self.test_data = splits

    # The vocabulary only sees words that occur in the training trees.
    self.vocab = Vocab()
    sentences = [tree.get_words() for tree in self.train_data]
    training_words = []
    for sentence in sentences:
        training_words.extend(sentence)
    self.vocab.construct(training_words)
def inference(self, tree, predict_only_root=False):
    """Build the computation graph for ``tree`` up to the logits.

    Args:
        tree: a Tree object on which to build the computation graph.
        predict_only_root: when truthy only the root node's tensor is
            projected; otherwise the tensors of all nodes whose label is
            not 2 are concatenated along axis 0 and projected together.
    Returns:
        Whatever ``self.add_projections`` returns for the selected tensors
        (the logits).
    """
    tensors_by_node = self.add_model(tree.root)
    if predict_only_root:
        return self.add_projections(tensors_by_node[tree.root])

    selected = []
    for node, tensor in tensors_by_node.iteritems():
        if node.label != 2:
            selected.append(tensor)
    return self.add_projections(tf.concat(0, selected))
def add_model_vars(self):
    """Create the model variables so later graph-building code can reuse them.

    Variables created:
        scope "Composition":
            embedding: (vocab_size, embed_size)
            W1:        (2 * embed_size, embed_size)
            b1:        (1, embed_size)
        scope "Projection":
            U:         (embed_size, label_size)
            bs:        (1, label_size)
    """
    embed_size = self.config.embed_size
    # None defers to tf.get_variable's default initializer. A uniform
    # initializer over (-0.4, 0.4) was tried here before and left disabled.
    initializer = None

    with tf.variable_scope('Composition'):
        tf.get_variable('embedding', [len(self.vocab), embed_size],
                        initializer=initializer)
        tf.get_variable("W1", [2 * embed_size, embed_size],
                        initializer=initializer)
        tf.get_variable("b1", [1, embed_size], initializer=initializer)

    with tf.variable_scope('Projection'):
        tf.get_variable("U", [embed_size, self.config.label_size],
                        initializer=initializer)
        tf.get_variable("bs", [1, self.config.label_size],
                        initializer=initializer)
def add_model(self, node):
    """Recursively build the phrase-embedding tensors for the subtree at ``node``.

    A leaf is represented by the embedding row of its word (looked up via
    ``self.vocab``). An inner node is relu([left, right] * W1 + b1), where
    left/right are the tensors of its children.

    Args:
        node: a Node object
    Returns:
        node_tensors: OrderedDict: key = Node, value = tensor(1, embed_size),
            covering ``node`` and every node below it.
    """
    with tf.variable_scope('Composition', reuse=True):
        embedding = tf.get_variable("embedding")
        W1 = tf.get_variable("W1")
        b1 = tf.get_variable("b1")

    subtree_tensors = OrderedDict()
    if node.isLeaf:
        word_id = self.vocab.encode(node.word)
        subtree_tensors[node] = tf.gather(embedding, [word_id])
        return subtree_tensors

    # Children first, so their tensors are available for the composition.
    subtree_tensors.update(self.add_model(node.left))
    subtree_tensors.update(self.add_model(node.right))
    children = [subtree_tensors[node.left], subtree_tensors[node.right]]
    pre_activation = tf.matmul(tf.concat(1, children), W1) + b1
    subtree_tensors[node] = tf.nn.relu(pre_activation)
    return subtree_tensors
def add_projections(self, node_tensors):
"""Add projections to the composition vectors to compute the raw sentiment scores
Hint: Reuse the "Projection" variable_scope here
#.........这里部分代码省略.........
示例12: Model_RNN
class Model_RNN(LanguageModel):
def load_data(self):
    """Load lyric pairs, one-hot labels and zero-padded encoded sequences."""
    pair_fname = '../lastfm_train_mappings.txt'
    lyrics_path = '../lyrics/data/lyrics/train/'
    # X_train is a list of all examples; each example is a 2-element list
    # whose elements are lists of lyric words.
    loaded = get_data(pair_fname, lyrics_path, threshold=100,
                      n_class=self.config.n_class)
    X_train, l_train, self.word_counts, self.config.max_steps = loaded

    num_examples = len(X_train)
    self.labels_train = np.zeros((num_examples, self.config.n_class))
    self.labels_train[range(num_examples), l_train] = 1

    self.vocab = Vocab()
    self.vocab.construct(self.word_counts.keys())

    # TODO: need to handle this better (fixed-width zero padding).
    padded_shape = (num_examples, self.config.max_steps)
    self.encoded_train_1 = np.zeros(padded_shape)
    self.encoded_train_2 = np.zeros(padded_shape)
    for row, pair in enumerate(X_train):
        first, second = pair[0], pair[1]
        self.encoded_train_1[row, :len(first)] = [self.vocab.encode(w) for w in first]
        self.encoded_train_2[row, :len(second)] = [self.vocab.encode(w) for w in second]
def add_placeholders(self):
    """Create the input, label and sequence-length placeholders.

    Sets ``self.X1``/``self.X2`` (int32, (None, max_steps)), ``self.labels``
    (float32, (None, n_class)) and ``self.seq_len1``/``self.seq_len2``
    (int32, no shape given).
    """
    token_shape = (None, self.config.max_steps)
    self.X1 = tf.placeholder(tf.int32, shape=token_shape, name='X1')
    self.X2 = tf.placeholder(tf.int32, shape=token_shape, name='X2')
    self.labels = tf.placeholder(
        tf.float32, shape=(None, self.config.n_class), name='labels')
    # For variable length sequences; shape=None leaves the shape unspecified.
    self.seq_len1 = tf.placeholder(tf.int32, shape=None, name='seq_len1')
    self.seq_len2 = tf.placeholder(tf.int32, shape=None, name='seq_len2')
def add_embedding(self):
    """Embed both token inputs and split them into per-time-step tensors.

    Creates the embedding variable 'L' with one row per key of
    ``self.word_counts`` and looks up ``self.X1`` and ``self.X2`` in it.

    Returns:
        (inputs1, inputs2): two lists of length ``config.max_steps``; each
        element is the embedding of one time step with the time axis
        removed.
    """
    # NOTE(review): L has len(self.word_counts) rows; if Vocab adds extra
    # entries (e.g. an unknown-word token) ids could exceed that -- confirm.
    L = tf.get_variable('L', shape=(len(self.word_counts.keys()), self.config.embed_size), dtype=tf.float32)
    inputs1 = tf.nn.embedding_lookup(L, self.X1)  # self.X1 is batch_size x max_steps
    inputs2 = tf.nn.embedding_lookup(L, self.X2)  # result is batch_size x max_steps x embed_size
    # Squeeze only axis 1 (the size-1 time axis left by the split). A bare
    # tf.squeeze(x) would also drop the batch or embedding axis whenever
    # that axis happens to have size 1.
    inputs1 = [tf.squeeze(x, [1]) for x in tf.split(1, self.config.max_steps, inputs1)]
    inputs2 = [tf.squeeze(x, [1]) for x in tf.split(1, self.config.max_steps, inputs2)]
    print('onh')
    print(inputs1[0].get_shape)
    return inputs1, inputs2
def add_model(self, inputs1, inputs2, seq_len1, seq_len2):
    """Run one shared sigmoid RNN over both input sequences.

    Both sequences start from the same all-zero state of shape
    (batch_size, hidden_size) and share the variables Whh, Wxh and bhx in
    scope 'rnn'.

    Args:
        inputs1: per-time-step tensors of the first sequence.
        inputs2: per-time-step tensors of the second sequence.
        seq_len1: not used by this implementation.
        seq_len2: not used by this implementation.

    Returns:
        List of length ``config.max_steps``; element i is the concatenation
        along axis 1 of both hidden states after step i.
    """
    print('adsf add_model')
    hidden_size = self.config.hidden_size
    zero_state = np.zeros((self.config.batch_size, hidden_size))
    self.initial_state = tf.constant(zero_state, dtype=tf.float32)
    state1 = self.initial_state
    state2 = self.initial_state
    outputs1, outputs2 = [], []
    print('nthgnghn')
    with tf.variable_scope('rnn'):
        Whh = tf.get_variable('Whh', shape=(hidden_size, hidden_size), dtype=tf.float32)
        Wxh = tf.get_variable('Wxh', shape=(self.config.embed_size, hidden_size), dtype=tf.float32)
        bias = tf.get_variable('bhx', shape=(hidden_size,), dtype=tf.float32)
        print(Wxh.get_shape)
        print(inputs1[0].get_shape)
        print(inputs2[0].get_shape)
        for step in range(self.config.max_steps):
            # The second sequence is advanced first so graph ops are created
            # in the same order as before.
            state2 = tf.sigmoid(
                tf.matmul(state2, Whh) + tf.matmul(inputs2[step], Wxh) + bias)
            state1 = tf.sigmoid(
                tf.matmul(state1, Whh) + tf.matmul(inputs1[step], Wxh) + bias)
            outputs1.append(state1)
            outputs2.append(state2)
    rnn_states = [tf.concat(1, [first, second])
                  for first, second in zip(outputs1, outputs2)]
    return rnn_states
def add_projection(self, rnn_states):
    """Project the last concatenated RNN state onto ``n_class`` scores.

    Args:
        rnn_states: list of per-time-step state tensors; only the last
            element is used.

    Returns:
        ``rnn_states[-1] * Whc + bhc``.
    """
    n_class = self.config.n_class
    Whc = tf.get_variable('Whc', shape=(2*self.config.hidden_size, n_class))
    bhc = tf.get_variable('bhc', shape=(n_class,))
    # Only the final step is projected. If short sequences are ever stopped
    # early, the state in later time steps should stay unchanged.
    final_state = rnn_states[-1]
    return tf.matmul(final_state, Whc) + bhc
def add_loss_op(self, y):
    """Return the summed softmax cross-entropy of ``y`` against ``self.labels``."""
    per_example = tf.nn.softmax_cross_entropy_with_logits(y, self.labels)
    return tf.reduce_sum(per_example)
def add_training_op(self, loss):
    """Return a gradient-descent op that minimizes ``loss`` with rate ``config.lr``."""
    # Plain gradient descent is used; an Adam optimizer was tried here
    # before and left disabled.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.lr)
    return optimizer.minimize(loss)
# Constructor: stores the config, loads the data and wires the graph pieces
# together in order (placeholders -> embedding -> RNN -> projection -> loss).
# NOTE(review): the listing is cut off after the loss is created, so any
# further setup (e.g. the training op) is not visible here.
def __init__(self, config):
self.config = config
self.load_data()
self.add_placeholders()
# Debug output; the second print shows the bound get_shape method because it
# is not called.
print 'adsf __init__'
print self.X1.get_shape
self.inputs1, self.inputs2 = self.add_embedding()
self.rnn_states = self.add_model(self.inputs1, self.inputs2, self.seq_len1, self.seq_len2)
self.projections = self.add_projection(self.rnn_states)
self.loss = self.add_loss_op(self.projections)
#......... part of the code is omitted here .........
示例13: Model
class Model():
def __init__(self, config):
    """Keep ``config``, load the data (non-debug) and build the model."""
    self.config = config
    self.load_data(debug=False)
    self.build_model()
def load_vocab(self, debug):
    """Build the vocabulary and its embedding matrix.

    Args:
        debug: When truthy the vocabulary is built from the 'dev' words,
            otherwise from the 'train' words.
    """
    split_name = 'dev' if debug else 'train'
    self.vocab = Vocab()
    self.vocab.construct(get_words_dataset(split_name))
    self.vocab.build_embedding_matrix(self.config.word_embed_size)
    self.embedding_matrix = self.vocab.embedding_matrix
def load_data(self, debug=False):
    """Load the vocabulary and the train/dev/test sentence-pair arrays.

    In debug mode the 'dev' split is loaded as training data and the 'test'
    split is loaded for both validation and testing. Otherwise the 'train',
    'dev' and 'test' splits are used respectively.

    For each of train/dev/test this sets ``self.sent1_*``, ``self.sent2_*``,
    ``self.len1_*``, ``self.len2_*`` and ``self.y_*``.

    Args:
        debug: Selects the debug split assignment described above.
    """
    self.load_vocab(debug)
    if debug:
        train_split, dev_split = 'dev', 'test'
    else:
        train_split, dev_split = 'train', 'dev'

    # Load the training set
    (self.sent1_train, self.sent2_train, self.len1_train,
     self.len2_train, self.y_train) = self._load_split(train_split)
    print('# training examples: %d' % len(self.y_train))

    # Load the validation set
    (self.sent1_dev, self.sent2_dev, self.len1_dev,
     self.len2_dev, self.y_dev) = self._load_split(dev_split)
    print('# dev examples: %d' % len(self.y_dev))

    # Load the test set
    (self.sent1_test, self.sent2_test, self.len1_test,
     self.len2_test, self.y_test) = self._load_split('test')
    print('# test examples: %d' % len(self.y_test))

    print('min len: ', np.min(self.len2_train))


def _load_split(self, split):
    """Read one dataset split and return it as five NumPy arrays.

    Returns:
        (sent1, sent2, len1, len2, y): the two sentence fields stacked
        row-wise with np.vstack, and the two length fields and the labels
        converted with np.array.
    """
    rows = list(get_sentences_dataset(self.vocab, self.config.sent_len,
                                      split, 'post'))
    sent1, sent2, len1, len2, labels = zip(*rows)
    return (np.vstack(sent1), np.vstack(sent2),
            np.array(len1), np.array(len2), np.array(labels))
# Builds the model from the configuration.
# NOTE(review): the listing is cut off right after the first local
# assignments, so the actual graph construction is not visible here.
def build_model(self):
config = self.config
# Local shorthands for two configuration values.
k = config.sentence_embed_size
L = config.sent_len
#......... part of the code is omitted here .........
示例14: Model
class Model():
def __init__(self, config):
    """Keep ``config``, load the data and build the model."""
    self.config = config
    self.load_data()
    self.build_model()
def load_vocab(self, debug):
    """Build the vocabulary and its embedding matrix.

    Args:
        debug: When truthy the vocabulary is built from the 'dev' words,
            otherwise from the 'train' words.
    """
    split_name = 'dev' if debug else 'train'
    self.vocab = Vocab()
    self.vocab.construct(get_words_dataset(split_name))
    self.vocab.build_embedding_matrix(self.config.word_embed_size)
    self.embedding_matrix = self.vocab.embedding_matrix
def load_data(self, debug=False):
    """Load the vocabulary and the train/dev/test sentence-pair arrays.

    In debug mode the 'dev' split is loaded as training data and the 'test'
    split is loaded for both validation and testing. Otherwise the 'train',
    'dev' and 'test' splits are used respectively.

    For each of train/dev/test this sets ``self.sent1_*``, ``self.sent2_*``,
    ``self.len1_*``, ``self.len2_*`` and ``self.y_*``.

    Args:
        debug: Selects the debug split assignment described above.
    """
    self.load_vocab(debug)
    if debug:
        train_split, dev_split = 'dev', 'test'
    else:
        train_split, dev_split = 'train', 'dev'

    # Load the training set
    (self.sent1_train, self.sent2_train, self.len1_train,
     self.len2_train, self.y_train) = self._load_split(train_split)
    print('# training examples: %d' % len(self.y_train))

    # Load the validation set
    (self.sent1_dev, self.sent2_dev, self.len1_dev,
     self.len2_dev, self.y_dev) = self._load_split(dev_split)
    print('# dev examples: %d' % len(self.y_dev))

    # Load the test set
    (self.sent1_test, self.sent2_test, self.len1_test,
     self.len2_test, self.y_test) = self._load_split('test')
    print('# test examples: %d' % len(self.y_test))

    print('min len: ', np.min(self.len2_train))


def _load_split(self, split):
    """Read one dataset split and return it as five NumPy arrays.

    Returns:
        (sent1, sent2, len1, len2, y): the two sentence fields stacked
        row-wise with np.vstack, and the two length fields and the labels
        converted with np.array.
    """
    rows = list(get_sentences_dataset(self.vocab, self.config.sent_len,
                                      split, 'post'))
    sent1, sent2, len1, len2, labels = zip(*rows)
    return (np.vstack(sent1), np.vstack(sent2),
            np.array(len1), np.array(len2), np.array(labels))
# Builds the model from the configuration.
# NOTE(review): the listing is cut off right after the first local
# assignments, so the actual graph construction is not visible here.
def build_model(self):
config = self.config
# Local shorthands for two configuration values.
k = config.sentence_embed_size
L = config.sent_len
#......... part of the code is omitted here .........
示例15: main
"""
Forward function accepts input data and returns a Variable of output data
"""
self.node_list = []
root_node = self.walk_tree(x.root)
all_nodes = torch.cat(self.node_list)
#now I need to project out
return all_nodes
def main():
    """Placeholder entry point; only prints a fixed message."""
    message = "do nothing"
    print(message)
# Script entry point: loads the tree data, builds the vocabulary from the
# training words, creates the model and an SGD optimizer.
# NOTE(review): indentation was lost in this listing, so which statements
# belong to the `if` body has to be inferred. `train_size` is not defined in
# the visible code -- presumably set elsewhere in the file; verify.
if __name__ == '__main__':
train_data, dev_data, test_data = tr.simplified_data(train_size, 100, 200)
vocab = Vocab()
# Flatten the per-tree word lists into one list for the vocabulary.
train_sents = [t.get_words() for t in train_data]
vocab.construct(list(itertools.chain.from_iterable(train_sents)))
model = RNN_Model(vocab, embed_size=50)
main()
lr = 0.01
# loss_history starts empty; nothing is appended in the visible part.
loss_history = []
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, dampening=0.0)
# params (iterable): iterable of parameters to optimize or dicts defining
# parameter groups
# lr (float): learning rate
# momentum (float, optional): momentum factor (default: 0)
# weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
#torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, dampening=0, weight_decay=0)
# print(model.fcl._parameters['weight'])