本文整理汇总了 Python 中 allennlp.data.fields.TextField 类的典型用法代码示例。如果您正苦于以下问题:Python fields.TextField 的具体用法?Python fields.TextField 怎么用?Python fields.TextField 使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 allennlp.data.fields 的用法示例。
在下文中一共展示了fields.TextField方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: text_to_instance
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    Build an ``Instance`` from a pre-tokenized utterance and its context.

    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    context_tokens : tokens wrapped into the ``context_tokens`` text field.
    tokens : tokens wrapped into the ``tokens`` text field; their texts are
        also stored under ``words`` in the metadata.
    tags : optional per-token labels; when not ``None`` they become a
        ``SequenceLabelField`` tied to the ``tokens`` field.
    intents : optional labels; when not ``None`` they become a
        ``MultiLabelField`` in the ``intent_labels`` namespace.
    dialog_act : optional dict stored under ``dialog_act`` in the metadata;
        an empty dict is stored when it is ``None``.

    Returns
    -------
    An ``Instance`` holding the fields described above.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    # The metadata is built exactly once (previously a words-only field was
    # created here and then always overwritten further down). It is inserted
    # at the same point so the key order of ``fields`` is unchanged.
    fields["metadata"] = MetadataField({
        "words": [x.text for x in tokens],
        'dialog_act': dialog_act if dialog_act is not None else {},
    })
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    return Instance(fields)
示例2: text_to_instance
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    Turn a pre-tokenized utterance into an ``Instance``.

    The input is already tokenized because this class has no tokenizer.
    ``tags``, ``domain`` and ``intent`` are only added when truthy. The
    metadata always carries the token texts under ``words`` and the given
    ``dialog_act`` (an empty dict when it is ``None``).
    """
    # pylint: disable=arguments-differ
    token_field = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {"tokens": token_field}

    if tags:
        instance_fields["tags"] = SequenceLabelField(tags, token_field)
    if domain:
        instance_fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        instance_fields["intent"] = LabelField(intent, label_namespace="intent_labels")

    acts = {} if dialog_act is None else dialog_act
    instance_fields["metadata"] = MetadataField(
        {"words": [token.text for token in tokens], 'dialog_act': acts}
    )
    return Instance(instance_fields)
示例3: text_to_instance
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    """
    Build an ``Instance`` whose ``sentence`` field encodes two entity spans.

    Every token is first passed through ``OpenAISplitter._standardize``. The
    resulting sequence is laid out as::

        __start__ <entity 1 span> __del1__ <entity 2 span> __del2__ <all tokens> __clf__

    ``entity_1`` and ``entity_2`` are indexed as ``(start, end)`` with ``end``
    inclusive. A ``label`` field is added only when ``label`` is truthy.
    """
    # pylint: disable=arguments-differ
    normalized = [OpenAISplitter._standardize(token) for token in tokens]
    first_span = normalized[entity_1[0]:entity_1[1] + 1]
    second_span = normalized[entity_2[0]:entity_2[1] + 1]
    pieces = [
        '__start__', *first_span,
        '__del1__', *second_span,
        '__del2__', *normalized,
        '__clf__',
    ]

    sentence = TextField([Token(text=piece) for piece in pieces], self._token_indexers)
    fields: Dict[str, Field] = {'sentence': sentence}
    #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
    if label:
        fields['label'] = LabelField(label)
    return Instance(fields)
示例4: _read
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Lazily yield one ``Instance`` per sentence found in ``file_path``.

    Lines are grouped with ``_is_divider``; divider groups are skipped. Each
    remaining group is split on whitespace into four columns, of which the
    first (tokens) and the last (PICO tags) are passed to
    ``text_to_instance``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        # Consecutive lines are grouped into divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            if is_divider:
                # Only non-divider chunks hold the words of a sentence.
                continue
            rows = [line.strip().split() for line in lines]
            # Transpose rows into columns; zip yields tuples but the fields
            # need lists. Exactly four columns are expected.
            words, _, _, pico_tags = (list(column) for column in zip(*rows))
            # TextField requires ``Token`` objects
            yield self.text_to_instance([Token(word) for word in words], pico_tags)
示例5: text_to_instance
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    """
    Build an ``Instance`` from a query, a positive and a negative document.

    Each text is tokenized with ``self._tokenizer``. The query is cut to
    ``self.max_query_length`` tokens and both documents to
    ``self.max_doc_length`` tokens, unless the respective limit is ``-1`` or
    lower, in which case no truncation happens.
    """
    # pylint: disable=arguments-differ

    def _clip(token_list, limit):
        # A limit greater than -1 enables truncation.
        return token_list[:limit] if limit > -1 else token_list

    tokenize = self._tokenizer.tokenize
    query_field = TextField(
        _clip(tokenize(query_sequence), self.max_query_length), self._token_indexers
    )
    doc_pos_field = TextField(
        _clip(tokenize(doc_pos_sequence), self.max_doc_length), self._token_indexers
    )
    doc_neg_field = TextField(
        _clip(tokenize(doc_neg_sequence), self.max_doc_length), self._token_indexers
    )

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field,
    })
示例6: text_to_instance
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     answer_id: int
                     ) -> Instance:
    """
    Build a multiple-choice ``Instance``.

    The question and every choice are tokenized with ``self._tokenizer``.
    The instance holds the question text field, a list field with one text
    field per choice, the answer id as an un-indexed label, and a metadata
    field with the raw texts, the token texts and ``item_id``.
    """
    # pylint: disable=arguments-differ
    tokenize = self._tokenizer.tokenize
    question_tokens = tokenize(question_text)
    per_choice_tokens = [tokenize(choice) for choice in choice_text_list]

    choice_fields = [TextField(choice, self._token_indexers) for choice in per_choice_tokens]
    fields: Dict[str, Field] = {
        'question': TextField(question_tokens, self._token_indexers),
        'choices_list': ListField(choice_fields),
        # answer_id is already an integer index, so the vocabulary is bypassed.
        'label': LabelField(answer_id, skip_indexing=True),
    }

    fields["metadata"] = MetadataField({
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "question_tokens": [token.text for token in question_tokens],
        "choice_tokens_list": [
            [token.text for token in choice] for choice in per_choice_tokens
        ],
    })
    return Instance(fields)
示例7: preprocess
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def preprocess(self, token_batch):
    """
    Convert a batch of token lists into one indexed ``Batch`` per indexer.

    Returns an empty list when ``token_batch`` holds no non-empty sequence.
    Otherwise every sequence is cut to the length of the longest sequence,
    capped at ``self.max_len``, and prefixed with a ``$START`` token. For
    each entry of ``self.indexers`` a ``Batch`` of single-field instances is
    built and indexed against ``self.vocab``.
    """
    lengths = [len(sequence) for sequence in token_batch if sequence]
    if not lengths:
        return []
    limit = min(max(lengths), self.max_len)

    def _to_instance(sequence, indexer):
        # '$START' is prepended after truncation, so it does not count
        # towards the length limit.
        words = ['$START'] + sequence[:limit]
        return Instance({'tokens': TextField([Token(word) for word in words], indexer)})

    batches = []
    for indexer in self.indexers:
        batch = Batch([_to_instance(sequence, indexer) for sequence in token_batch])
        batch.index_instances(self.vocab)
        batches.append(batch)
    return batches
示例8: _read
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Lazily yield one ``Instance`` per sentence found in ``file_path``.

    Lines are grouped with ``_is_divider``; divider groups are skipped. Each
    remaining group is split on whitespace into four columns (tokens, POS
    tags, chunk tags, NER tags) which are handed to ``text_to_instance``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        # Consecutive lines are grouped into divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            if is_divider:
                # Only non-divider chunks hold the words of a sentence.
                continue
            rows = [line.strip().split() for line in lines]
            # Transpose rows into columns; zip yields tuples but the fields
            # need lists. Exactly four columns are expected.
            words, pos_tags, chunk_tags, ner_tags = (
                list(column) for column in zip(*rows)
            )
            # TextField requires `Token` objects
            sentence = [Token(word) for word in words]
            yield self.text_to_instance(sentence, pos_tags, chunk_tags, ner_tags)
示例9: get_instances
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def get_instances(self):
    """
    Return two fixed instances, each with a ``text1`` and a ``text2`` field.

    All four text fields are built from hard-coded word lists and use
    ``self.token_indexer``.
    """

    def make_field(words):
        return TextField([Token(word) for word in words], self.token_indexer)

    first = Instance({
        "text1": make_field(["this", "is", "a", "sentence", "."]),
        "text2": make_field(["this", "is", "a", "different", "sentence", "."]),
    })
    second = Instance({
        "text1": make_field(["here", "is", "a", "sentence", "."]),
        "text2": make_field(["this", "is", "short"]),
    })
    return [first, second]
示例10: test_duplicate
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def test_duplicate(self):
    """Check that ``Instance.duplicate()`` yields an equal but independent copy."""
    # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
    # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
    indexers = {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
    original = Instance({"words": TextField([Token("hello")], indexers)})

    clone = original.duplicate()
    assert clone == original

    # Adding new fields to the original instance should not affect the duplicate.
    original.add_field("labels", LabelField("some_label"))
    assert "labels" not in clone.fields
    assert clone != original  # sanity check on the '__eq__' method.
示例11: test_saving_and_loading_works_with_byte_encoding
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def test_saving_and_loading_works_with_byte_encoding(self):
    """
    Check that byte-encoded character indices survive a vocabulary round trip.

    A vocabulary is built from a ``TextField`` that uses byte encoding, the
    field is indexed, the vocabulary is saved and reloaded, and an identical
    field indexed with the reloaded vocabulary must give the same indices.
    """
    char_indexer = TokenCharactersIndexer(
        character_tokenizer=CharacterTokenizer(byte_encoding="utf-8"),
        min_padding_length=2,
    )
    words = [Token(text) for text in ["Øyvind", "für", "汉字"]]

    def index_and_snapshot(field, vocabulary):
        # Deep-copy so the snapshot does not alias the field's internal state.
        field.index(vocabulary)
        return deepcopy(field._indexed_tokens)

    original_field = TextField(words, {"characters": char_indexer})
    vocab = Vocabulary.from_instances(Batch([Instance({"sentence": original_field})]))
    indices_before = index_and_snapshot(original_field, vocab)

    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab.save_to_files(vocab_dir)
    reloaded_vocab = Vocabulary.from_files(vocab_dir)

    reloaded_field = TextField(words, {"characters": char_indexer})
    indices_after = index_and_snapshot(reloaded_field, reloaded_vocab)

    assert indices_before == indices_after
示例12: test_from_params_extend_config
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def test_from_params_extend_config(self):
    """Check that an incomplete ``extend`` vocabulary config raises ``ConfigurationError``."""
    vocab_dir = self.TEST_DIR / "vocab_save"
    saved_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    saved_vocab.add_token_to_namespace("a", namespace="tokens")
    saved_vocab.save_to_files(vocab_dir)

    tokens = [Token(text) for text in ["a", "b"]]
    field = TextField(tokens, {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params_without_instances = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        Vocabulary.from_params(params_without_instances)

    # If you ask to extend vocab, `directory` key must be present in params,
    # or else there is nothing to extend from.
    params_without_directory = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        Vocabulary.from_params(params_without_directory, instances=instances)
示例13: test_max_vocab_size_partial_dict
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def test_max_vocab_size_partial_dict(self):
    """
    Check ``max_vocab_size`` given as a dict that names only one namespace.

    Only ``tokens`` is limited (to 1); ``token_characters`` is not listed in
    the dict and is expected to keep all of its entries.
    """
    indexers = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    words = "Abc def ghi jkl mno pqr stu vwx yz".split(" ")
    text_field = TextField([Token(word) for word in words], indexers)
    dataset = Batch([Instance({"text": text_field})])

    size_limits = Params({"max_vocab_size": {"tokens": 1}})
    vocab = Vocabulary.from_params(params=size_limits, instances=dataset)

    token_entries = vocab.get_index_to_token_vocabulary("tokens").values()
    assert len(token_entries) == 3  # 1 + 2
    char_entries = vocab.get_index_to_token_vocabulary("token_characters").values()
    assert len(char_entries) == 28  # 26 + 2
示例14: test_equality
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def test_equality(self):
    """
    Check ``IndexField`` equality.

    Two index fields with the same index over ``self.text`` compare equal
    (also to the bare integer), while one built over a different text field
    does not.
    """
    same_a = IndexField(4, self.text)
    same_b = IndexField(4, self.text)

    other_sequence = TextField(
        [Token(word) for word in ["AllenNLP", "is", "the", "bomb", "!"]],
        {"words": SingleIdTokenIndexer("words")},
    )
    different = IndexField(4, other_sequence)

    assert same_a == 4
    assert same_a == same_a
    assert same_a == same_b
    assert same_a != different
    assert same_b != different
    assert different == different
示例15: test_as_tensor_handles_characters
# 需要导入模块: from allennlp.data import fields [as 别名]
# 或者: from allennlp.data.fields import TextField [as 别名]
def test_as_tensor_handles_characters(self):
    """
    Check the character-level tensor produced by ``TextField.as_tensor``.

    The field is indexed with ``self.vocab`` and padded to its own padding
    lengths. The ``token_characters`` tensor under the ``characters`` key
    must match the hard-coded 5 x 8 array of character ids.
    """
    char_indexer = TokenCharactersIndexer("characters", min_padding_length=1)
    words = ["This", "is", "a", "sentence", "."]
    field = TextField(
        [Token(word) for word in words],
        token_indexers={"characters": char_indexer},
    )
    field.index(self.vocab)
    tensors = field.as_tensor(field.get_padding_lengths())

    actual = tensors["characters"]["token_characters"].detach().cpu().numpy()
    # One row per token, zero-padded to the longest token ("sentence").
    expected = numpy.array(
        [
            [1, 1, 1, 3, 0, 0, 0, 0],
            [1, 3, 0, 0, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0, 0, 0],
            [3, 4, 5, 6, 4, 5, 7, 4],
            [1, 0, 0, 0, 0, 0, 0, 0],
        ]
    )
    numpy.testing.assert_array_almost_equal(actual, expected)