This article collects typical usage examples of the Python method allennlp.data.Vocabulary.get_vocab_size. If you are unsure what Vocabulary.get_vocab_size does or how to call it, the hand-picked code examples below should help. You can also read further about its containing class, allennlp.data.Vocabulary.
The following 14 code examples of Vocabulary.get_vocab_size are shown, sorted by popularity by default.
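Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below, and assuming the AllenNLP 0.x API these examples target) showing what get_vocab_size returns for different namespaces:

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("cat", namespace="tokens")
vocab.add_token_to_namespace("dog", namespace="tokens")
vocab.add_token_to_namespace("entailment", namespace="labels")

# Padded namespaces such as "tokens" reserve two extra entries (padding and OOV),
# so their size is the number of added tokens plus two.
print(vocab.get_vocab_size("tokens"))   # 4
# Namespaces matching *labels or *tags are non-padded by default, so only the
# added tokens are counted.
print(vocab.get_vocab_size("labels"))   # 1

Most of the examples below use this size as the num_embeddings of an Embedding layer, so that every vocabulary index has a row in the embedding matrix.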
Example 1: __init__
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def __init__(self,
vocab: Vocabulary,
question_embedder: TextFieldEmbedder,
action_embedding_dim: int,
encoder: Seq2SeqEncoder,
entity_encoder: Seq2VecEncoder,
max_decoding_steps: int,
use_neighbor_similarity_for_linking: bool = False,
dropout: float = 0.0,
num_linking_features: int = 10,
rule_namespace: str = 'rule_labels',
tables_directory: str = '/wikitables/') -> None:
super(WikiTablesSemanticParser, self).__init__(vocab)
self._question_embedder = question_embedder
self._encoder = encoder
self._entity_encoder = TimeDistributed(entity_encoder)
self._max_decoding_steps = max_decoding_steps
self._use_neighbor_similarity_for_linking = use_neighbor_similarity_for_linking
if dropout > 0:
self._dropout = torch.nn.Dropout(p=dropout)
else:
self._dropout = lambda x: x
self._rule_namespace = rule_namespace
self._denotation_accuracy = WikiTablesAccuracy(tables_directory)
self._action_sequence_accuracy = Average()
self._has_logical_form = Average()
self._action_padding_index = -1 # the padding value used by IndexField
num_actions = vocab.get_vocab_size(self._rule_namespace)
self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
self._action_biases = Embedding(num_embeddings=num_actions, embedding_dim=1)
# This is what we pass as input in the first step of decoding, when we don't have a
# previous action, or a previous question attention.
self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
self._first_attended_question = torch.nn.Parameter(torch.FloatTensor(encoder.get_output_dim()))
torch.nn.init.normal_(self._first_action_embedding)
torch.nn.init.normal_(self._first_attended_question)
check_dimensions_match(entity_encoder.get_output_dim(), question_embedder.get_output_dim(),
"entity word average embedding dim", "question embedding dim")
self._num_entity_types = 4 # TODO(mattg): get this in a more principled way somehow?
self._num_start_types = 5 # TODO(mattg): get this in a more principled way somehow?
self._embedding_dim = question_embedder.get_output_dim()
self._type_params = torch.nn.Linear(self._num_entity_types, self._embedding_dim)
self._neighbor_params = torch.nn.Linear(self._embedding_dim, self._embedding_dim)
if num_linking_features > 0:
self._linking_params = torch.nn.Linear(num_linking_features, 1)
else:
self._linking_params = None
if self._use_neighbor_similarity_for_linking:
self._question_entity_params = torch.nn.Linear(1, 1)
self._question_neighbor_params = torch.nn.Linear(1, 1)
else:
self._question_entity_params = None
self._question_neighbor_params = None
Example 2: __init__
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def __init__(self, vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
attend_feedforward: FeedForward,
similarity_function: SimilarityFunction,
compare_feedforward: FeedForward,
aggregate_feedforward: FeedForward,
premise_encoder: Optional[Seq2SeqEncoder] = None,
hypothesis_encoder: Optional[Seq2SeqEncoder] = None,
initializer: InitializerApplicator = InitializerApplicator(),
regularizer: Optional[RegularizerApplicator] = None) -> None:
super(DecomposableAttention, self).__init__(vocab, regularizer)
self._text_field_embedder = text_field_embedder
self._attend_feedforward = TimeDistributed(attend_feedforward)
self._matrix_attention = LegacyMatrixAttention(similarity_function)
self._compare_feedforward = TimeDistributed(compare_feedforward)
self._aggregate_feedforward = aggregate_feedforward
self._premise_encoder = premise_encoder
self._hypothesis_encoder = hypothesis_encoder or premise_encoder
self._num_labels = vocab.get_vocab_size(namespace="labels")
check_dimensions_match(text_field_embedder.get_output_dim(), attend_feedforward.get_input_dim(),
"text field embedding dim", "attend feedforward input dim")
check_dimensions_match(aggregate_feedforward.get_output_dim(), self._num_labels,
"final output dimension", "number of labels")
self._accuracy = CategoricalAccuracy()
self._loss = torch.nn.CrossEntropyLoss()
initializer(self)
Example 3: _read_embeddings_from_hdf5
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def _read_embeddings_from_hdf5(embeddings_filename: str,
embedding_dim: int,
vocab: Vocabulary,
namespace: str = "tokens") -> torch.FloatTensor:
"""
Reads from a hdf5 formatted file. The embedding matrix is assumed to
be keyed by 'embedding' and of size ``(num_tokens, embedding_dim)``.
"""
with h5py.File(embeddings_filename, 'r') as fin:
embeddings = fin['embedding'][...]
if list(embeddings.shape) != [vocab.get_vocab_size(namespace), embedding_dim]:
raise ConfigurationError(
"Read shape {0} embeddings from the file, but expected {1}".format(
list(embeddings.shape), [vocab.get_vocab_size(namespace), embedding_dim]))
return torch.FloatTensor(embeddings)
Example 4: get_vocab
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def get_vocab(word2freq, max_v_sizes):
'''Build vocabulary'''
vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word'])
words_by_freq = [(word, freq) for word, freq in word2freq.items()]
words_by_freq.sort(key=lambda x: x[1], reverse=True)
for word, _ in words_by_freq[:max_v_sizes['word']]:
vocab.add_token_to_namespace(word, 'tokens')
log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens'))
return vocab
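A hypothetical call to the get_vocab helper above (the word2freq contents and the max_v_sizes cap are made up for illustration, and log is assumed to be the module's logger):

word2freq = {"the": 100, "cat": 20, "sat": 5}
max_v_sizes = {"word": 2}
vocab = get_vocab(word2freq, max_v_sizes)
# Only the two most frequent words survive the cut-off; the padded "tokens"
# namespace additionally holds the reserved padding and OOV entries.
print(vocab.get_vocab_size("tokens"))   # 4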
Example 5: __init__
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def __init__(self,
vocab: Vocabulary,
utterance_embedder: TextFieldEmbedder,
action_embedding_dim: int,
encoder: Seq2SeqEncoder,
decoder_beam_search: BeamSearch,
max_decoding_steps: int,
input_attention: Attention,
add_action_bias: bool = True,
dropout: float = 0.0,
initializer: InitializerApplicator = InitializerApplicator(),
regularizer: Optional[RegularizerApplicator] = None) -> None:
super().__init__(vocab, regularizer)
self._utterance_embedder = utterance_embedder
self._encoder = encoder
self._max_decoding_steps = max_decoding_steps
self._add_action_bias = add_action_bias
self._dropout = torch.nn.Dropout(p=dropout)
self._exact_match = Average()
self._valid_sql_query = Average()
self._action_similarity = Average()
self._denotation_accuracy = Average()
# the padding value used by IndexField
self._action_padding_index = -1
num_actions = vocab.get_vocab_size("rule_labels")
input_action_dim = action_embedding_dim
if self._add_action_bias:
input_action_dim += 1
self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=input_action_dim)
self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
# This is what we pass as input in the first step of decoding, when we don't have a
# previous action, or a previous utterance attention.
self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
self._first_attended_utterance = torch.nn.Parameter(torch.FloatTensor(encoder.get_output_dim()))
torch.nn.init.normal_(self._first_action_embedding)
torch.nn.init.normal_(self._first_attended_utterance)
self._beam_search = decoder_beam_search
self._decoder_trainer = MaximumMarginalLikelihood(beam_size=1)
self._transition_function = BasicTransitionFunction(encoder_output_dim=self._encoder.get_output_dim(),
action_embedding_dim=action_embedding_dim,
input_attention=input_attention,
predict_start_type_separately=False,
add_action_bias=self._add_action_bias,
dropout=dropout)
initializer(self)
Example 6: test_read_hdf5_raises_on_invalid_shape
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def test_read_hdf5_raises_on_invalid_shape(self):
vocab = Vocabulary()
vocab.add_token_to_namespace("word")
embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
with h5py.File(embeddings_filename, 'w') as fout:
_ = fout.create_dataset(
'embedding', embeddings.shape, dtype='float32', data=embeddings
)
params = Params({
'pretrained_file': embeddings_filename,
'embedding_dim': 5,
})
with pytest.raises(ConfigurationError):
_ = Embedding.from_params(vocab, params)
Example 7: test_read_hdf5_format_file
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def test_read_hdf5_format_file(self):
vocab = Vocabulary()
vocab.add_token_to_namespace("word")
vocab.add_token_to_namespace("word2")
embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
with h5py.File(embeddings_filename, 'w') as fout:
_ = fout.create_dataset(
'embedding', embeddings.shape, dtype='float32', data=embeddings
)
params = Params({
'pretrained_file': embeddings_filename,
'embedding_dim': 5,
})
embedding_layer = Embedding.from_params(vocab, params)
assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Example 8: __init__
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def __init__(self, vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
encoder: Seq2SeqEncoder,
similarity_function: SimilarityFunction,
projection_feedforward: FeedForward,
inference_encoder: Seq2SeqEncoder,
output_feedforward: FeedForward,
output_logit: FeedForward,
dropout: float = 0.5,
initializer: InitializerApplicator = InitializerApplicator(),
regularizer: Optional[RegularizerApplicator] = None) -> None:
super().__init__(vocab, regularizer)
self._text_field_embedder = text_field_embedder
self._encoder = encoder
self._matrix_attention = LegacyMatrixAttention(similarity_function)
self._projection_feedforward = projection_feedforward
self._inference_encoder = inference_encoder
if dropout:
self.dropout = torch.nn.Dropout(dropout)
self.rnn_input_dropout = InputVariationalDropout(dropout)
else:
self.dropout = None
self.rnn_input_dropout = None
self._output_feedforward = output_feedforward
self._output_logit = output_logit
self._num_labels = vocab.get_vocab_size(namespace="labels")
check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
"text field embedding dim", "encoder input dim")
check_dimensions_match(encoder.get_output_dim() * 4, projection_feedforward.get_input_dim(),
"encoder output dim", "projection feedforward input")
check_dimensions_match(projection_feedforward.get_output_dim(), inference_encoder.get_input_dim(),
"proj feedforward output dim", "inference lstm input dim")
self._accuracy = CategoricalAccuracy()
self._loss = torch.nn.CrossEntropyLoss()
initializer(self)
Example 9: from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding':
"""
We need the vocabulary here to know how many items we need to embed, and we look for a
``vocab_namespace`` key in the parameter dictionary to know which vocabulary to use. If
you know beforehand exactly how many embeddings you need, or aren't using a vocabulary
mapping for the things getting embedded here, then you can pass in the ``num_embeddings``
key directly, and the vocabulary will be ignored.
"""
num_embeddings = params.pop_int('num_embeddings', None)
vocab_namespace = params.pop("vocab_namespace", "tokens")
if num_embeddings is None:
num_embeddings = vocab.get_vocab_size(vocab_namespace)
embedding_dim = params.pop_int('embedding_dim')
pretrained_file = params.pop("pretrained_file", None)
projection_dim = params.pop_int("projection_dim", None)
trainable = params.pop_bool("trainable", True)
padding_index = params.pop_int('padding_index', None)
max_norm = params.pop_float('max_norm', None)
norm_type = params.pop_float('norm_type', 2.)
scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
sparse = params.pop_bool('sparse', False)
params.assert_empty(cls.__name__)
if pretrained_file:
# If we're loading a saved model, we don't want to actually read a pre-trained
# embedding file - the embeddings will just be in our saved weights, and we might not
# have the original embedding file anymore, anyway.
weight = _read_pretrained_embedding_file(pretrained_file,
embedding_dim,
vocab,
vocab_namespace)
else:
weight = None
return cls(num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
projection_dim=projection_dim,
weight=weight,
padding_index=padding_index,
trainable=trainable,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
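A hedged usage sketch for the from_params method above (vocab is any populated Vocabulary, and embedding_dim=100 is illustrative): when num_embeddings is omitted, the size of vocab_namespace is used instead.

from allennlp.common import Params

params = Params({"embedding_dim": 100, "vocab_namespace": "tokens"})
embedding = Embedding.from_params(vocab, params)
# embedding.weight has shape (vocab.get_vocab_size("tokens"), 100)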
Example 10: _read_embeddings_from_text_file
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def _read_embeddings_from_text_file(file_uri: str,
embedding_dim: int,
vocab: Vocabulary,
namespace: str = "tokens") -> torch.FloatTensor:
"""
Read pre-trained word vectors from a text file, optionally compressed, possibly contained
inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
space-separated fields: [word] [dim 1] [dim 2] ...
Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.
The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
"""
tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
vocab_size = vocab.get_vocab_size(namespace)
embeddings = {}
# First we read the embeddings from the file, only keeping vectors for the words we need.
logger.info("Reading pretrained embeddings from file")
with EmbeddingsTextFile(file_uri) as embeddings_file:
for line in Tqdm.tqdm(embeddings_file):
token = line.split(' ', 1)[0]
if token in tokens_to_keep:
fields = line.rstrip().split(' ')
if len(fields) - 1 != embedding_dim:
# Sometimes there are funny unicode parsing problems that lead to different
# fields lengths (e.g., a word with a unicode space character that splits
# into more than one column). We skip those lines. Note that if you have
# some kind of long header, this could result in all of your lines getting
# skipped. It's hard to check for that here; you just have to look in the
# embedding_misses_file and at the model summary to make sure things look
# like they are supposed to.
logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
embedding_dim, len(fields) - 1, line)
continue
vector = numpy.asarray(fields[1:], dtype='float32')
embeddings[token] = vector
if not embeddings:
raise ConfigurationError("No embeddings of correct dimension found; you probably "
"misspecified your embedding_dim parameter, or didn't "
"pre-populate your Vocabulary")
all_embeddings = numpy.asarray(list(embeddings.values()))
embeddings_mean = float(numpy.mean(all_embeddings))
embeddings_std = float(numpy.std(all_embeddings))
# Now we initialize the weight matrix for an embedding layer, starting with random vectors,
# then filling in the word vectors we just read.
logger.info("Initializing pre-trained embedding layer")
embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
embeddings_std)
num_tokens_found = 0
index_to_token = vocab.get_index_to_token_vocabulary(namespace)
for i in range(vocab_size):
token = index_to_token[i]
# If we don't have a pre-trained vector for this word, we'll just leave this row alone,
# so the word has a random initialization.
if token in embeddings:
embedding_matrix[i] = torch.FloatTensor(embeddings[token])
num_tokens_found += 1
else:
logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)
logger.info("Pretrained embeddings were found for %d out of %d tokens",
num_tokens_found, vocab_size)
return embedding_matrix
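A hypothetical end-to-end use of the text-file reader above; the path and vectors are invented, only the format matters (one utf-8 line per word: [word] [dim 1] [dim 2] ...):

vocab = Vocabulary()
vocab.add_token_to_namespace("cat")
with open("/tmp/toy_embeddings.txt", "w", encoding="utf-8") as fout:
    fout.write("cat 0.1 0.2 0.3\n")
    fout.write("dog 0.4 0.5 0.6\n")   # not in the vocabulary, so it is skipped
weight = _read_embeddings_from_text_file("/tmp/toy_embeddings.txt",
                                         embedding_dim=3,
                                         vocab=vocab)
# weight has shape (vocab.get_vocab_size("tokens"), 3); rows for tokens that were
# not found in the file keep their random initialization.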
Example 11: from_params
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def from_params(cls, vocab: Vocabulary, params: Params) -> 'Embedding': # type: ignore
"""
We need the vocabulary here to know how many items we need to embed, and we look for a
``vocab_namespace`` key in the parameter dictionary to know which vocabulary to use. If
you know beforehand exactly how many embeddings you need, or aren't using a vocabulary
mapping for the things getting embedded here, then you can pass in the ``num_embeddings``
key directly, and the vocabulary will be ignored.
In the configuration file, a file containing pretrained embeddings can be specified
using the parameter ``"pretrained_file"``.
It can be the path to a local file or an URL of a (cached) remote file.
Two formats are supported:
* hdf5 file - containing an embedding matrix in the form of a torch.Tensor;
* text file - an utf-8 encoded text file with space separated fields::
[word] [dim 1] [dim 2] ...
The text file can optionally be compressed with gzip, bz2, lzma or zip.
You can even select a single file inside an archive containing multiple files
using the URI::
"(archive_uri)#file_path_inside_the_archive"
where ``archive_uri`` can be a file system path or a URL. For example::
"(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt"
"""
# pylint: disable=arguments-differ
num_embeddings = params.pop_int('num_embeddings', None)
vocab_namespace = params.pop("vocab_namespace", "tokens")
if num_embeddings is None:
num_embeddings = vocab.get_vocab_size(vocab_namespace)
embedding_dim = params.pop_int('embedding_dim')
pretrained_file = params.pop("pretrained_file", None)
projection_dim = params.pop_int("projection_dim", None)
trainable = params.pop_bool("trainable", True)
padding_index = params.pop_int('padding_index', None)
max_norm = params.pop_float('max_norm', None)
norm_type = params.pop_float('norm_type', 2.)
scale_grad_by_freq = params.pop_bool('scale_grad_by_freq', False)
sparse = params.pop_bool('sparse', False)
params.assert_empty(cls.__name__)
if pretrained_file:
# If we're loading a saved model, we don't want to actually read a pre-trained
# embedding file - the embeddings will just be in our saved weights, and we might not
# have the original embedding file anymore, anyway.
weight = _read_pretrained_embeddings_file(pretrained_file,
embedding_dim,
vocab,
vocab_namespace)
else:
weight = None
return cls(num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
projection_dim=projection_dim,
weight=weight,
padding_index=padding_index,
trainable=trainable,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
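A hedged configuration sketch for the pretrained_file option documented above. The archive URI is the one given in the docstring; embedding_dim=200 matches those vectors, and trainable=False is an illustrative choice to freeze the pretrained weights:

params = Params({
    "embedding_dim": 200,
    "vocab_namespace": "tokens",
    "trainable": False,
    "pretrained_file": "(http://nlp.stanford.edu/data/glove.twitter.27B.zip)#glove.twitter.27B.200d.txt",
})
embedding = Embedding.from_params(vocab, params)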
Example 12: _read_pretrained_word2vec_format_embedding_file
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def _read_pretrained_word2vec_format_embedding_file(embeddings_filename: str, # pylint: disable=invalid-name
embedding_dim: int,
vocab: Vocabulary,
namespace: str = "tokens") -> torch.FloatTensor:
"""
Read from a gzipped-word2vec format file. The embeddings file is assumed to be gzipped and
space delimited, e.g. [word] [dim 1] [dim 2] ...
The remainder of the docstring is identical to ``_read_pretrained_embedding_file``.
"""
words_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
vocab_size = vocab.get_vocab_size(namespace)
embeddings = {}
# First we read the embeddings from the file, only keeping vectors for the words we need.
logger.info("Reading embeddings from file")
with gzip.open(cached_path(embeddings_filename), 'rb') as embeddings_file:
for line in embeddings_file:
fields = line.decode('utf-8').strip().split(' ')
if len(fields) - 1 != embedding_dim:
# Sometimes there are funny unicode parsing problems that lead to different
# fields lengths (e.g., a word with a unicode space character that splits
# into more than one column). We skip those lines. Note that if you have
# some kind of long header, this could result in all of your lines getting
# skipped. It's hard to check for that here; you just have to look in the
# embedding_misses_file and at the model summary to make sure things look
# like they are supposed to.
logger.warning("Found line with wrong number of dimensions (expected %d, was %d): %s",
embedding_dim, len(fields) - 1, line)
continue
word = fields[0]
if word in words_to_keep:
vector = numpy.asarray(fields[1:], dtype='float32')
embeddings[word] = vector
if not embeddings:
raise ConfigurationError("No embeddings of correct dimension found; you probably "
"misspecified your embedding_dim parameter, or didn't "
"pre-populate your Vocabulary")
all_embeddings = numpy.asarray(list(embeddings.values()))
embeddings_mean = float(numpy.mean(all_embeddings))
embeddings_std = float(numpy.std(all_embeddings))
# Now we initialize the weight matrix for an embedding layer, starting with random vectors,
# then filling in the word vectors we just read.
logger.info("Initializing pre-trained embedding layer")
embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
embeddings_std)
for i in range(0, vocab_size):
word = vocab.get_token_from_index(i, namespace)
# If we don't have a pre-trained vector for this word, we'll just leave this row alone,
# so the word has a random initialization.
if word in embeddings:
embedding_matrix[i] = torch.FloatTensor(embeddings[word])
else:
logger.debug("Word %s was not found in the embedding file. Initialising randomly.", word)
# The weight matrix is initialized, so we construct and return the actual Embedding.
return embedding_matrix
Example 13: __init__
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def __init__(self,
vocab: Vocabulary,
utterance_embedder: TextFieldEmbedder,
action_embedding_dim: int,
encoder: Seq2SeqEncoder,
decoder_beam_search: BeamSearch,
max_decoding_steps: int,
input_attention: Attention,
add_action_bias: bool = True,
training_beam_size: int = None,
decoder_num_layers: int = 1,
dropout: float = 0.0,
rule_namespace: str = 'rule_labels',
database_file='/atis/atis.db') -> None:
# Atis semantic parser init
super().__init__(vocab)
self._utterance_embedder = utterance_embedder
self._encoder = encoder
self._max_decoding_steps = max_decoding_steps
self._add_action_bias = add_action_bias
if dropout > 0:
self._dropout = torch.nn.Dropout(p=dropout)
else:
self._dropout = lambda x: x
self._rule_namespace = rule_namespace
self._exact_match = Average()
self._valid_sql_query = Average()
self._action_similarity = Average()
self._denotation_accuracy = Average()
self._executor = SqlExecutor(database_file)
self._action_padding_index = -1 # the padding value used by IndexField
num_actions = vocab.get_vocab_size(self._rule_namespace)
if self._add_action_bias:
input_action_dim = action_embedding_dim + 1
else:
input_action_dim = action_embedding_dim
self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=input_action_dim)
self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
# This is what we pass as input in the first step of decoding, when we don't have a
# previous action, or a previous utterance attention.
self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
self._first_attended_utterance = torch.nn.Parameter(torch.FloatTensor(encoder.get_output_dim()))
torch.nn.init.normal_(self._first_action_embedding)
torch.nn.init.normal_(self._first_attended_utterance)
self._num_entity_types = 2 # TODO(kevin): get this in a more principled way somehow?
self._entity_type_decoder_embedding = Embedding(self._num_entity_types, action_embedding_dim)
self._decoder_num_layers = decoder_num_layers
self._beam_search = decoder_beam_search
self._decoder_trainer = MaximumMarginalLikelihood(training_beam_size)
self._transition_function = LinkingTransitionFunction(encoder_output_dim=self._encoder.get_output_dim(),
action_embedding_dim=action_embedding_dim,
input_attention=input_attention,
predict_start_type_separately=False,
add_action_bias=self._add_action_bias,
dropout=dropout,
num_layers=self._decoder_num_layers)
Example 14: __init__
# Required import: from allennlp.data import Vocabulary [as alias]
# Or: from allennlp.data.Vocabulary import get_vocab_size [as alias]
def __init__(self,
vocab: Vocabulary,
question_embedder: TextFieldEmbedder,
action_embedding_dim: int,
encoder: Seq2SeqEncoder,
decoder_beam_search: BeamSearch,
max_decoding_steps: int,
attention: Attention,
mixture_feedforward: FeedForward = None,
add_action_bias: bool = True,
dropout: float = 0.0,
num_linking_features: int = 0,
num_entity_bits: int = 0,
entity_bits_output: bool = True,
use_entities: bool = False,
denotation_only: bool = False,
# Deprecated parameter to load older models
entity_encoder: Seq2VecEncoder = None, # pylint: disable=unused-argument
entity_similarity_mode: str = "dot_product",
rule_namespace: str = 'rule_labels') -> None:
super(QuarelSemanticParser, self).__init__(vocab)
self._question_embedder = question_embedder
self._encoder = encoder
self._beam_search = decoder_beam_search
self._max_decoding_steps = max_decoding_steps
if dropout > 0:
self._dropout = torch.nn.Dropout(p=dropout)
else:
self._dropout = lambda x: x
self._rule_namespace = rule_namespace
self._denotation_accuracy = Average()
self._action_sequence_accuracy = Average()
self._has_logical_form = Average()
self._embedding_dim = question_embedder.get_output_dim()
self._use_entities = use_entities
# Note: there's only one non-trivial entity type in QuaRel for now, so most of the
# entity_type stuff is irrelevant
self._num_entity_types = 4 # TODO(mattg): get this in a more principled way somehow?
self._num_start_types = 1 # Hardcoded until we feed lf syntax into the model
self._entity_type_encoder_embedding = Embedding(self._num_entity_types, self._embedding_dim)
self._entity_type_decoder_embedding = Embedding(self._num_entity_types, action_embedding_dim)
self._entity_similarity_layer = None
self._entity_similarity_mode = entity_similarity_mode
if self._entity_similarity_mode == "weighted_dot_product":
self._entity_similarity_layer = \
TimeDistributed(torch.nn.Linear(self._embedding_dim, 1, bias=False))
# Center initial values around unweighted dot product
self._entity_similarity_layer._module.weight.data += 1 # pylint: disable=protected-access
elif self._entity_similarity_mode == "dot_product":
pass
else:
raise ValueError("Invalid entity_similarity_mode: {}".format(self._entity_similarity_mode))
if num_linking_features > 0:
self._linking_params = torch.nn.Linear(num_linking_features, 1)
else:
self._linking_params = None
self._decoder_trainer = MaximumMarginalLikelihood()
self._encoder_output_dim = self._encoder.get_output_dim()
if entity_bits_output:
self._encoder_output_dim += num_entity_bits
self._entity_bits_output = entity_bits_output
self._debug_count = 10
self._num_denotation_cats = 2 # Hardcoded for simplicity
self._denotation_only = denotation_only
if self._denotation_only:
self._denotation_accuracy_cat = CategoricalAccuracy()
self._denotation_classifier = torch.nn.Linear(self._encoder_output_dim,
self._num_denotation_cats)
# Rest of init not needed for denotation only where no decoding to actions needed
return
self._action_padding_index = -1 # the padding value used by IndexField
num_actions = vocab.get_vocab_size(self._rule_namespace)
self._num_actions = num_actions
self._action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
# We are tying the action embeddings used for input and output
# self._output_action_embedder = Embedding(num_embeddings=num_actions, embedding_dim=action_embedding_dim)
self._output_action_embedder = self._action_embedder # tied weights
self._add_action_bias = add_action_bias
if self._add_action_bias:
self._action_biases = Embedding(num_embeddings=num_actions, embedding_dim=1)
# This is what we pass as input in the first step of decoding, when we don't have a
# previous action, or a previous question attention.
self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
self._first_attended_question = torch.nn.Parameter(torch.FloatTensor(self._encoder_output_dim))
torch.nn.init.normal_(self._first_action_embedding)
torch.nn.init.normal_(self._first_attended_question)
self._decoder_step = LinkingTransitionFunction(encoder_output_dim=self._encoder_output_dim,
action_embedding_dim=action_embedding_dim,
#......... part of the code is omitted here .........