This page collects typical usage examples of the Python attribute allennlp.common.util.START_SYMBOL. If you are wondering what util.START_SYMBOL is, what it is used for, or how to use it, the curated examples below should help; you can also read more about the containing module, allennlp.common.util.
The following 15 code examples of util.START_SYMBOL are shown, sorted by popularity by default.
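For reference, START_SYMBOL and END_SYMBOL in allennlp.common.util are plain string constants used as sentinel tokens at sequence boundaries; in the AllenNLP versions these examples target they are "@start@" and "@end@". A minimal check:

# Minimal sketch: inspect the sentinel constants (values assumed to be "@start@"/"@end@").
from allennlp.common.util import START_SYMBOL, END_SYMBOL

print(START_SYMBOL)  # typically "@start@"
print(END_SYMBOL)    # typically "@end@"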
Example 1: __init__
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35
             ) -> None:
    super().__init__(lazy)

    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
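As context for Example 1, the sketch below (which assumes an AllenNLP 0.x environment, where WordTokenizer and its start_tokens/end_tokens arguments are available) shows that a tokenizer configured this way wraps its output in the sentinels:

# Sketch, assuming AllenNLP 0.x where WordTokenizer is still available.
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer(start_tokens=[START_SYMBOL], end_tokens=[END_SYMBOL])
tokens = tokenizer.tokenize("The quick brown fox.")
assert tokens[0].text == START_SYMBOL and tokens[-1].text == END_SYMBOL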
Example 2: text_to_instance
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
    if sentences is None:
        if self._language == "ru":
            sentences = [s.text for s in razdel.sentenize(text)]
        else:
            sentences = nltk.tokenize.sent_tokenize(text)

    sentences_tokens = []
    for sentence in sentences[:self._max_sentences_count]:
        sentence = sentence.lower() if self._lowercase else sentence
        tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        indexed_tokens = TextField(tokens, self._source_token_indexers)
        sentences_tokens.append(indexed_tokens)

    sentences_tokens_indexed = ListField(sentences_tokens)
    result = {'source_sentences': sentences_tokens_indexed}

    if tags:
        result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)
    return Instance(result)
Example 3: detokenize
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array of token indices
    :param vocab: vocabulary used to map indices back to tokens
    :return: a string (or a list of strings for a multidimensional input), with padding and
             start/end symbols stripped
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)])
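A hedged usage sketch for detokenize: the stub vocabulary below is hypothetical and only mimics the two Vocabulary members the function touches (get_token_from_index and _padding_token), but it shows padding and the start/end symbols being stripped from the output:

# Hypothetical stand-in for allennlp.data.Vocabulary, for illustration only.
import numpy as np
from allennlp.common.util import START_SYMBOL, END_SYMBOL

class _StubVocab:
    _padding_token = "@@PADDING@@"
    _index_to_token = {0: "@@PADDING@@", 1: START_SYMBOL, 2: END_SYMBOL, 3: "hello", 4: "world"}

    def get_token_from_index(self, index):
        return self._index_to_token[index]

print(detokenize(np.array([1, 3, 4, 2, 0]), _StubVocab()))  # -> "hello world"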
Example 4: text_to_instance
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({"source_tokens": source_field, "target_tokens": target_field})
    else:
        return Instance({"source_tokens": source_field})
Example 5: __str__
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def __str__(self):
    # TODO (pradeep): This limits the number of basic types we can have to 26. We may want to
    # change this in the future if we extend to domains where we have more than 26 basic types.
    if self._string_rep == START_SYMBOL:
        return START_SYMBOL
    else:
        return self._string_rep.lower()[0]
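Outside the class, the same mapping can be sketched as a standalone helper (the function name is made up for illustration):

from allennlp.common.util import START_SYMBOL

def shorten_type(string_rep: str) -> str:  # hypothetical helper mirroring __str__ above
    return string_rep if string_rep == START_SYMBOL else string_rep.lower()[0]

print(shorten_type(START_SYMBOL))  # the start symbol is kept as-is
print(shorten_type("ROW"))         # "r"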
Example 6: __init__
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def __init__(self,
             vocab,
             source_embedder,
             encoder,
             max_decoding_steps,
             target_namespace="tokens",
             target_embedding_dim=None,
             attention_function=None,
             scheduled_sampling_ratio=0.0) -> None:
    super(SimpleSeq2Seq, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = LegacyAttention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
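The start/end index lookups above assume the sentinels are actually present in the target namespace. A small sketch using the real Vocabulary.add_token_to_namespace and get_token_index calls on a hand-built vocabulary:

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data import Vocabulary

vocab = Vocabulary()
for token in [START_SYMBOL, END_SYMBOL, "hello", "world"]:
    vocab.add_token_to_namespace(token, namespace="tokens")

start_index = vocab.get_token_index(START_SYMBOL, "tokens")
end_index = vocab.get_token_index(END_SYMBOL, "tokens")
print(start_index, end_index)  # indices assigned after the default padding/OOV entries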
Example 7: get_nonterminal_productions
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def get_nonterminal_productions(self) -> Dict[str, List[str]]:
    """
    Induces a grammar from the defined collection of predicates in this language and returns
    all productions in that grammar, keyed by the non-terminal they are expanding.

    This includes terminal productions implied by each predicate as well as productions for the
    `return type` of each defined predicate. For example, defining a "multiply" predicate adds
    a "<int,int:int> -> multiply" terminal production to the grammar, and `also` a "int ->
    [<int,int:int>, int, int]" non-terminal production, because I can use the "multiply"
    predicate to produce an int.
    """
    if not self._nonterminal_productions:
        actions: Dict[str, Set[str]] = defaultdict(set)
        # If you didn't give us a set of valid start types, we'll assume all types we know
        # about (including functional types) are valid start types.
        if self._start_types:
            start_types = self._start_types
        else:
            start_types = set()
            for type_list in self._function_types.values():
                start_types.update(type_list)
        for start_type in start_types:
            actions[START_SYMBOL].add(f"{START_SYMBOL} -> {start_type}")
        for name, function_type_list in self._function_types.items():
            for function_type in function_type_list:
                actions[str(function_type)].add(f"{function_type} -> {name}")
                if isinstance(function_type, FunctionType):
                    return_type = function_type.return_type
                    arg_types = function_type.argument_types
                    right_side = f"[{function_type}, {', '.join(str(arg_type) for arg_type in arg_types)}]"
                    actions[str(return_type)].add(f"{return_type} -> {right_side}")
        self._nonterminal_productions = {key: sorted(value) for key, value in actions.items()}
    return self._nonterminal_productions
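To make the production format concrete without the semantic-parsing machinery, here is a toy, pure-Python reconstruction of the same idea; the function signatures below are invented for illustration. The start symbol expands to each valid start type, and every function contributes a terminal production plus a non-terminal production for its return type:

from collections import defaultdict
from allennlp.common.util import START_SYMBOL

# Invented toy signatures: name -> (return_type, argument_types)
function_types = {"multiply": ("int", ["int", "int"]), "negate": ("int", ["int"])}

actions = defaultdict(set)
for return_type, _ in function_types.values():
    actions[START_SYMBOL].add(f"{START_SYMBOL} -> {return_type}")
for name, (return_type, arg_types) in function_types.items():
    function_type = f"<{','.join(arg_types)}:{return_type}>"
    actions[function_type].add(f"{function_type} -> {name}")
    right_side = f"[{function_type}, {', '.join(arg_types)}]"
    actions[return_type].add(f"{return_type} -> {right_side}")

print({key: sorted(value) for key, value in actions.items()})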
Example 8: text_to_instance
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    # doc_tokenized.insert(0, Token(START_SYMBOL))
    # doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]

    doc_field = TextField(doc_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field,
        "query_length": query_length,
        "doc_length": doc_length})
Example 9: text_to_instance
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    # doc_pos_tokenized.insert(0, Token(START_SYMBOL))
    # doc_pos_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    # doc_neg_tokenized.insert(0, Token(START_SYMBOL))
    # doc_neg_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
    doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field,
        "query_length": query_length,
        "doc_pos_length": doc_pos_length,
        "doc_neg_length": doc_neg_length})
Example 10: text_to_instance
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = MetadataField(query_id)
    doc_id_field = MetadataField(doc_id)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
        query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    # doc_tokenized.insert(0, Token(START_SYMBOL))
    # doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
        doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example 11: test_cnn_dailymail_reader
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def test_cnn_dailymail_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
    dataset = reader.read(TEST_URLS_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example 12: test_ria_reader
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def test_ria_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example 13: _walk
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def _walk(self) -> None:
    """
    Walk over action space to collect completed paths of at most ``self._max_path_length`` steps.
    """
    # Buffer of NTs to expand, previous actions
    incomplete_paths = [([str(type_)], [f"{START_SYMBOL} -> {type_}"]) for type_ in
                        self._world.get_valid_starting_types()]
    self._completed_paths = []
    actions = self._world.get_valid_actions()
    # Overview: We keep track of the buffer of non-terminals to expand, and the action history
    # for each incomplete path. At every iteration in the while loop below, we iterate over all
    # incomplete paths, expand one non-terminal from the buffer in a depth-first fashion, get
    # all possible next actions triggered by that non-terminal and add to the paths. Then, we
    # check the expanded paths, to see if they are 1) complete, in which case they are
    # added to completed_paths, 2) longer than max_path_length, in which case they are
    # discarded, or 3) neither, in which case they are used to form the incomplete_paths for the
    # next iteration of this while loop.
    # While the non-terminal expansion is done in a depth-first fashion, note that the search over
    # the action space itself is breadth-first.
    while incomplete_paths:
        next_paths = []
        for nonterminal_buffer, history in incomplete_paths:
            # Taking the last non-terminal added to the buffer. We're going depth-first.
            nonterminal = nonterminal_buffer.pop()
            # Iterating over all possible next actions.
            for action in actions[nonterminal]:
                new_history = history + [action]
                new_nonterminal_buffer = nonterminal_buffer[:]
                # Since we expand the last action added to the buffer, the left child should be
                # added after the right child.
                for right_side_part in reversed(self._get_right_side_parts(action)):
                    if types.is_nonterminal(right_side_part):
                        new_nonterminal_buffer.append(right_side_part)
                next_paths.append((new_nonterminal_buffer, new_history))
        incomplete_paths = []
        for nonterminal_buffer, path in next_paths:
            # An empty buffer means that we've completed this path.
            if not nonterminal_buffer:
                # Indexing completed paths by the nonterminals they contain.
                next_path_index = len(self._completed_paths)
                for action in path:
                    for value in self._get_right_side_parts(action):
                        if not types.is_nonterminal(value):
                            self._terminal_path_index[action].add(next_path_index)
                self._completed_paths.append(path)
            # We're adding to incomplete_paths for the next iteration, only those paths that are
            # shorter than the max_path_length. The remaining paths will be discarded.
            elif len(path) <= self._max_path_length:
                incomplete_paths.append((nonterminal_buffer, path))
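A stripped-down, pure-Python version of the same walk, over an invented two-rule grammar, shows the mechanics: pop one non-terminal per step (depth-first within a path), enumerate its productions, and keep only paths that either complete or stay within the length bound:

from allennlp.common.util import START_SYMBOL

# Invented toy action space: non-terminal -> list of productions.
actions = {
    START_SYMBOL: [f"{START_SYMBOL} -> int"],
    "int": ["int -> 1", "int -> [<int:int>, int]"],
    "<int:int>": ["<int:int> -> negate"],
}
max_path_length = 6

def right_side_parts(action):
    right = action.split(" -> ")[1]
    return right.strip("[]").split(", ") if right.startswith("[") else [right]

incomplete_paths = [([START_SYMBOL], [])]
completed_paths = []
while incomplete_paths:
    next_paths = []
    for buffer, history in incomplete_paths:
        nonterminal = buffer.pop()
        for action in actions[nonterminal]:
            new_buffer = buffer[:]
            for part in reversed(right_side_parts(action)):
                if part in actions:  # anything with productions is treated as a non-terminal
                    new_buffer.append(part)
            next_paths.append((new_buffer, history + [action]))
    incomplete_paths = [(b, p) for b, p in next_paths if b and len(p) <= max_path_length]
    completed_paths.extend(p for b, p in next_paths if not b)

print(completed_paths)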
Example 14: __init__
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def __init__(
    self,
    vocab: Vocabulary,
    decoder_net: DecoderNet,
    max_decoding_steps: int,
    target_embedder: Embedding,
    target_namespace: str = "tokens",
    tie_output_embedding: bool = False,
    scheduled_sampling_ratio: float = 0,
    label_smoothing_ratio: Optional[float] = None,
    beam_size: int = 4,
    tensor_based_metric: Metric = None,
    token_based_metric: Metric = None,
) -> None:
    super().__init__(target_embedder)

    self._vocab = vocab
    self._decoder_net = decoder_net
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._label_smoothing_ratio = label_smoothing_ratio

    self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

    target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

    if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
        raise ConfigurationError("Target Embedder output_dim doesn't match decoder module's input.")

    self._output_projection_layer = Linear(self._decoder_net.get_output_dim(), target_vocab_size)

    if tie_output_embedding:
        if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
            raise ConfigurationError("Can't tie embeddings with output linear layer, due to shape mismatch")
        self._output_projection_layer.weight = self.target_embedder.weight

    self._tensor_based_metric = tensor_based_metric
    self._token_based_metric = token_based_metric
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
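What the start index is ultimately for: at the first decoding step the model feeds a whole batch of start-symbol ids into the decoder. A minimal torch sketch of that seeding step (the batch size and index value are invented):

import torch

start_index = 2   # e.g. what vocab.get_token_index(START_SYMBOL, "tokens") might return above
batch_size = 4
last_predictions = torch.full((batch_size,), start_index, dtype=torch.long)
print(last_predictions)  # tensor([2, 2, 2, 2]) -- the decoder's input at timestep 0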
Example 15: text_to_instance
# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import START_SYMBOL [as alias]
def text_to_instance(self, source: str, target: str = None) -> Instance:
    def prepare_text(text, max_tokens):
        text = text.lower() if self._lowercase else text
        tokens = self._tokenizer.tokenize(text)[:max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        return tokens

    source_tokens = prepare_text(source, self._source_max_tokens)
    source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
    result = {'source_tokens': source_tokens_indexed}
    meta_fields = {}

    if self._save_copy_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

    if self._save_pgn_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens]

    if target:
        target_tokens = prepare_text(target, self._target_max_tokens)
        target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
        result['target_tokens'] = target_tokens_indexed

        if self._save_pgn_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens):]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

        if self._save_copy_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens) - 2]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(source_tokens) - 2:]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids))
    elif self._save_copy_fields:
        source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))
    elif self._save_pgn_fields:
        source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    if self._save_copy_fields or self._save_pgn_fields:
        result["metadata"] = MetadataField(meta_fields)
    return Instance(result)
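Note how the copy-field branch strips the sentinel wrapper with source_tokens[1:-1] while the pointer-generator branch keeps the full sequence; a tiny sketch of that slicing, with plain strings standing in for Token objects:

from allennlp.common.util import START_SYMBOL, END_SYMBOL

source_tokens = [START_SYMBOL, "the", "cat", "sat", END_SYMBOL]
print(source_tokens[1:-1])  # ['the', 'cat', 'sat'] -- what the copy fields see
print(source_tokens)        # full sequence, sentinels included -- what the PGN fields see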