This article collects typical usage examples of the Python attribute allennlp.common.util.END_SYMBOL. If you have been wondering what util.END_SYMBOL is, how to use it, or simply want to see it in context, the curated examples below should help. You can also explore the containing module, allennlp.common.util, for related usage.
A total of 13 code examples of util.END_SYMBOL are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
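Before the examples, a minimal sketch (not taken from any of the projects below) showing what these constants actually are: in AllenNLP, START_SYMBOL and END_SYMBOL are plain string constants used as sentence-boundary markers.

from allennlp.common.util import START_SYMBOL, END_SYMBOL

# Ordinary string constants used as boundary markers around token sequences.
print(START_SYMBOL)  # @start@
print(END_SYMBOL)    # @end@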
Example 1: get_generator

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def get_generator(self,
                  model_path: str,
                  token_vocab_path: str,
                  stress_vocab_dump_path: str) -> Generator:
    if self.generator is None:
        assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
        vocabulary = Vocabulary.from_files(token_vocab_path)

        stress_vocabulary = StressVocabulary()
        if not os.path.isfile(stress_vocab_dump_path):
            stress_vocabulary = inflate_stress_vocabulary(vocabulary, self.get_stress_predictor())
            stress_vocabulary.save(stress_vocab_dump_path)
        else:
            stress_vocabulary.load(stress_vocab_dump_path)

        # Look up the END_SYMBOL and OOV indices and exclude them via ExcludeTransform.
        eos_index = vocabulary.get_token_index(END_SYMBOL)
        unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
        exclude_transform = ExcludeTransform((unk_index, eos_index))

        model = LanguageModel.load(model_path, vocabulary_dir=token_vocab_path,
                                   transforms=[exclude_transform, ])
        self.generator = Generator(model, vocabulary, stress_vocabulary, eos_index)
    return self.generator
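As a standalone illustration of the index lookups above (not part of the project code): a freshly built AllenNLP Vocabulary already contains the OOV token, and END_SYMBOL can be registered and looked up the same way. The vocabulary contents here are assumptions for the sketch.

from allennlp.common.util import END_SYMBOL
from allennlp.data.vocabulary import Vocabulary, DEFAULT_OOV_TOKEN

vocabulary = Vocabulary()                      # contains padding and OOV tokens by default
vocabulary.add_token_to_namespace(END_SYMBOL)  # register "@end@" in the "tokens" namespace

eos_index = vocabulary.get_token_index(END_SYMBOL)
unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
print(eos_index, unk_index)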
Example 2: __init__

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35
             ) -> None:
    super().__init__(lazy)

    # The default tokenizer wraps every sequence in START_SYMBOL ... END_SYMBOL.
    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
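A hedged usage sketch of the default tokenizer configured above (allennlp 0.x-style WordTokenizer; SimpleWordSplitter is used here only to avoid the spaCy dependency):

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter(),
                          start_tokens=[START_SYMBOL],
                          end_tokens=[END_SYMBOL])
tokens = tokenizer.tokenize("the cat sat")
print([t.text for t in tokens])  # ['@start@', 'the', 'cat', 'sat', '@end@']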
Example 3: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
    if sentences is None:
        if self._language == "ru":
            sentences = [s.text for s in razdel.sentenize(text)]
        else:
            sentences = nltk.tokenize.sent_tokenize(text)

    sentences_tokens = []
    for sentence in sentences[:self._max_sentences_count]:
        sentence = sentence.lower() if self._lowercase else sentence
        tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
        # Each sentence is wrapped in START_SYMBOL ... END_SYMBOL before indexing.
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        indexed_tokens = TextField(tokens, self._source_token_indexers)
        sentences_tokens.append(indexed_tokens)

    sentences_tokens_indexed = ListField(sentences_tokens)
    result = {'source_sentences': sentences_tokens_indexed}

    if tags:
        result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)
    return Instance(result)
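The same sentence-wrapping step, pulled out of the reader as a self-contained sketch (the field names and indexers here are assumptions, not the reader's own configuration):

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, ListField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
sentence_fields = []
for sentence in ["first sentence", "second sentence"]:
    tokens = [Token(w) for w in sentence.split()]
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))
    sentence_fields.append(TextField(tokens, indexers))

source_sentences = ListField(sentence_fields)
print(len(source_sentences.field_list))  # 2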
Example 4: detokenize

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return: a string (or a list of strings for a multidimensional input)
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    # Padding, START_SYMBOL and END_SYMBOL are dropped from the output string.
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)])
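A hedged usage sketch for the helper above, on a toy vocabulary built in place (assumes detokenize is in scope and numpy/allennlp are installed; the token contents are made up):

import numpy as np
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
indices = [vocab.add_token_to_namespace(t)
           for t in (START_SYMBOL, "hello", "world", END_SYMBOL)]
print(detokenize(np.array(indices), vocab))  # hello world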
Example 5: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, source_string, target_string=None):  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    # END_SYMBOL is always appended to the source; START_SYMBOL is optional.
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({"source_tokens": source_field, "target_tokens": target_field})
    else:
        return Instance({"source_tokens": source_field})
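This mirrors AllenNLP's stock Seq2SeqDatasetReader, so (depending on the installed version) an equivalent instance can be produced directly. The sketch below assumes an allennlp 0.x install and uses SimpleWordSplitter to avoid the spaCy model download:

from allennlp.data.dataset_readers import Seq2SeqDatasetReader
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

reader = Seq2SeqDatasetReader(
    source_tokenizer=WordTokenizer(word_splitter=SimpleWordSplitter()))
instance = reader.text_to_instance("one two three", "un deux trois")
print(list(instance.fields.keys()))  # ['source_tokens', 'target_tokens']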
Example 6: __init__

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def __init__(self,
             vocab,
             source_embedder,
             encoder,
             max_decoding_steps,
             target_namespace="tokens",
             target_embedding_dim=None,
             attention_function=None,
             scheduled_sampling_ratio=0.0):
    super(SimpleSeq2Seq, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # the end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # The decoder output dim needs to be the same as the encoder output dim since we initialize
    # the hidden state of the decoder with the final hidden state of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = LegacyAttention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
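At decoding time, _end_index is what lets the model know where a prediction stops. A toy, model-free illustration of that convention (the numbers are made up):

end_index = 3                      # index of END_SYMBOL in the target vocabulary
predicted_indices = [7, 12, 5, 3, 0, 0]

# Keep everything up to (but not including) the first END_SYMBOL prediction.
if end_index in predicted_indices:
    predicted_indices = predicted_indices[:predicted_indices.index(end_index)]
print(predicted_indices)  # [7, 12, 5]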
Example 7: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    # doc_tokenized.insert(0, Token(START_SYMBOL))
    # doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    doc_field = TextField(doc_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field,
        "query_length": query_length,
        "doc_length": doc_length})
Example 8: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    # doc_pos_tokenized.insert(0, Token(START_SYMBOL))
    # doc_pos_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]
    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    # doc_neg_tokenized.insert(0, Token(START_SYMBOL))
    # doc_neg_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]
    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
    doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field,
        "query_length": query_length,
        "doc_pos_length": doc_pos_length,
        "doc_neg_length": doc_neg_length})
Example 9: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_id_field = MetadataField(query_id)
    doc_id_field = MetadataField(doc_id)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
        query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    # doc_tokenized.insert(0, Token(START_SYMBOL))
    # doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
        doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))
    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example 10: test_cnn_dailymail_reader

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def test_cnn_dailymail_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
    dataset = reader.read(TEST_URLS_FILE)

    for sample in dataset:
        # Every source and target sequence must start with START_SYMBOL and end with END_SYMBOL.
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example 11: test_ria_reader

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def test_ria_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer)
    dataset = reader.read(RIA_EXAMPLE_FILE)

    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
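The boundary check these two tests perform does not depend on the specific readers; a hand-built TextField shows the same invariant (the field contents here are illustrative):

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

source_tokens = TextField([Token(START_SYMBOL), Token("headline"), Token(END_SYMBOL)],
                          {"tokens": SingleIdTokenIndexer()})
assert source_tokens[0].text == START_SYMBOL
assert source_tokens[-1].text == END_SYMBOL
assert len(source_tokens) > 2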
Example 12: __init__

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def __init__(
    self,
    vocab: Vocabulary,
    decoder_net: DecoderNet,
    max_decoding_steps: int,
    target_embedder: Embedding,
    target_namespace: str = "tokens",
    tie_output_embedding: bool = False,
    scheduled_sampling_ratio: float = 0,
    label_smoothing_ratio: Optional[float] = None,
    beam_size: int = 4,
    tensor_based_metric: Metric = None,
    token_based_metric: Metric = None,
) -> None:
    super().__init__(target_embedder)

    self._vocab = vocab
    self._decoder_net = decoder_net
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._label_smoothing_ratio = label_smoothing_ratio

    # The start symbol is fed as the first decoder input; the end symbol marks the end
    # of a decoded sequence and terminates beam search hypotheses.
    self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

    target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

    if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
        raise ConfigurationError("Target Embedder output_dim doesn't match decoder module's input.")
    self._output_projection_layer = Linear(self._decoder_net.get_output_dim(), target_vocab_size)

    if tie_output_embedding:
        if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
            raise ConfigurationError("Can't tie embeddings with output linear layer, due to shape mismatch")
        self._output_projection_layer.weight = self.target_embedder.weight

    self._tensor_based_metric = tensor_based_metric
    self._token_based_metric = token_based_metric
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
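A hedged sketch of the BeamSearch construction in isolation (this uses allennlp.nn.beam_search.BeamSearch as of the 0.9/1.x line; only the end index and the search sizes are required):

from allennlp.common.util import END_SYMBOL
from allennlp.data.vocabulary import Vocabulary
from allennlp.nn.beam_search import BeamSearch

vocab = Vocabulary()
vocab.add_token_to_namespace(END_SYMBOL, namespace="tokens")
end_index = vocab.get_token_index(END_SYMBOL, "tokens")

# Hypotheses are terminated once they emit end_index.
beam_search = BeamSearch(end_index, max_steps=20, beam_size=4)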
Example 13: text_to_instance

# Required import: from allennlp.common import util [as alias]
# Or: from allennlp.common.util import END_SYMBOL [as alias]
def text_to_instance(self, source: str, target: str = None) -> Instance:
    def prepare_text(text, max_tokens):
        # Lowercase (optionally), truncate, and wrap in START_SYMBOL ... END_SYMBOL.
        text = text.lower() if self._lowercase else text
        tokens = self._tokenizer.tokenize(text)[:max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        return tokens

    source_tokens = prepare_text(source, self._source_max_tokens)
    source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
    result = {'source_tokens': source_tokens_indexed}
    meta_fields = {}

    if self._save_copy_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

    if self._save_pgn_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens]

    if target:
        target_tokens = prepare_text(target, self._target_max_tokens)
        target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
        result['target_tokens'] = target_tokens_indexed

        if self._save_pgn_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens):]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

        if self._save_copy_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens) - 2]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(source_tokens) - 2:]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids))

    elif self._save_copy_fields:
        source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    elif self._save_pgn_fields:
        source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    if self._save_copy_fields or self._save_pgn_fields:
        result["metadata"] = MetadataField(meta_fields)

    return Instance(result)
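The _tokens_to_ids helper is project-specific and not shown above; the illustrative stand-in below only conveys the idea of assigning shared, order-of-appearance ids to source and target tokens for a copy/pointer-generator mechanism (the function name and inputs are assumptions):

def tokens_to_ids_sketch(token_texts, lowercase=True):
    # Assign each distinct token the next free id, shared across the whole list.
    ids = {}
    out = []
    for text in token_texts:
        text = text.lower() if lowercase else text
        out.append(ids.setdefault(text, len(ids)))
    return out

print(tokens_to_ids_sketch(["Moscow", "hosts", "moscow", "summit"]))  # [0, 1, 0, 2]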