This article collects typical usage examples of the Python allennlp.data.Token class. If you are wondering how to use data.Token in practice, the curated code examples below should help; you can also explore the wider allennlp.data module for further usage examples.
The following shows 15 code examples of data.Token, sorted by popularity by default.
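Before the examples, here is a minimal, self-contained sketch (assuming an AllenNLP 1.x-style install; the field names are for illustration only) of the basic workflow the examples below share: wrap raw strings in Token objects, put them in a TextField with a token indexer, build a Vocabulary from instances, and index the field.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token(t) for t in "this is a sentence .".split()]
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"text": field})
vocab = Vocabulary.from_instances([instance])  # builds the "tokens" namespace
field.index(vocab)                             # converts tokens to ids in place
print(field.get_padding_lengths())             # e.g. {'tokens___tokens': 5}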
Example 1: _tokens_distances
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def _tokens_distances(self, tokens):
    e1_loc = []
    e2_loc = []
    while len(tokens) < 5:  # a hack to make sure all sentences are at least 5 tokens; the CNN breaks otherwise
        tokens.append(Token(text='.'))
    for i, token in enumerate(tokens):
        if token.text.startswith('<e1>'):
            e1_loc.append((i, 'start'))
            token.text = token.text[4:]
        if token.text.endswith('</e1>'):
            e1_loc.append((i, 'end'))
            token.text = token.text[:-5]
        if token.text.startswith('<e2>'):
            e2_loc.append((i, 'start'))
            token.text = token.text[4:]
        if token.text.endswith('</e2>'):
            e2_loc.append((i, 'end'))
            token.text = token.text[:-5]
    positions1 = self._positions(len(tokens), e1_loc)
    positions2 = self._positions(len(tokens), e2_loc)
    return tokens, positions1, positions2
Example 2: test_start_and_end_tokens
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_start_and_end_tokens(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace="characters")  # 2
    vocab.add_token_to_namespace("s", namespace="characters")  # 3
    vocab.add_token_to_namespace("e", namespace="characters")  # 4
    vocab.add_token_to_namespace("n", namespace="characters")  # 5
    vocab.add_token_to_namespace("t", namespace="characters")  # 6
    vocab.add_token_to_namespace("c", namespace="characters")  # 7
    vocab.add_token_to_namespace("<", namespace="characters")  # 8
    vocab.add_token_to_namespace(">", namespace="characters")  # 9
    vocab.add_token_to_namespace("/", namespace="characters")  # 10
    indexer = TokenCharactersIndexer(
        "characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1
    )
    indices = indexer.tokens_to_indices([Token("sentential")], vocab)
    assert indices == {
        "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
    }
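A hedged follow-up to Example 2: the 1s in the middle list are the default @@UNKNOWN@@ index. The characters of "sentential" that were never added to the "characters" namespace ("i", "a", "l") fall back to it, while index 0 is reserved for @@PADDING@@. Continuing with the vocab built above (exact indices assume AllenNLP's default padded-namespace behavior):

assert vocab.get_token_index("s", namespace="characters") == 3  # known character
assert vocab.get_token_index("i", namespace="characters") == 1  # out-of-vocabulary -> @@UNKNOWN@@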
Example 3: test_count_vocab_items_with_non_default_feature_name
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_count_vocab_items_with_non_default_feature_name(self):
    tokenizer = SpacyTokenizer(parse=True)
    tokens = tokenizer.tokenize("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = SingleIdTokenIndexer(
        namespace="dep_labels", feature_name="dep_", default_value="NONE"
    )
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["dep_labels"] == {
        "ROOT": 1,
        "nsubj": 1,
        "det": 1,
        "NONE": 2,
        "attr": 1,
        "punct": 1,
    }
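A hedged aside on Example 3: with feature_name="dep_", the indexer counts each token's dep_ attribute (the spaCy dependency label) instead of its text, and the manually added <S>/</S> tokens carry no dependency label, so they are counted under default_value, which is why "NONE" appears twice. A small sketch of that attribute lookup, using hypothetical tokens:

from allennlp.data import Token

parsed = Token(text="sentence", dep_="attr")  # as a parsing tokenizer would produce
marker = Token(text="<S>")                    # dep_ defaults to None
print(getattr(parsed, "dep_"))                # 'attr'
print(getattr(marker, "dep_") or "NONE")      # 'NONE' fallback, roughly what the indexer does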
Example 4: get_instances
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def get_instances(self):
    field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
    )
    field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
        self.token_indexer,
    )
    field3 = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
    )
    field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
    instances = [
        Instance({"text1": field1, "text2": field2}),
        Instance({"text1": field3, "text2": field4}),
    ]
    return instances
Example 5: test_saving_and_loading_works_with_byte_encoding
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding="utf-8")
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)
    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)
    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
    assert indexed_tokens == indexed_tokens2
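A hedged note on Example 5: with byte_encoding="utf-8", the CharacterTokenizer splits each token into its UTF-8 bytes rather than its characters, storing byte + 1 in Token.text_id so that 0 remains free for padding; multi-byte characters such as "汉" therefore expand to several ids. A small sketch, assuming current AllenNLP behavior:

from allennlp.data.tokenizers import CharacterTokenizer

tokenizer = CharacterTokenizer(byte_encoding="utf-8")
chars = tokenizer.tokenize("汉字")
print(len(chars))        # 6 -- two characters, three UTF-8 bytes each
print(chars[0].text_id)  # first byte of "汉", offset by 1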
Example 6: test_from_params_extend_config
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField(
        [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
    )
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend the vocab from `directory`, instances must be passed to the
    # Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend the vocab, the `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
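A hedged complement to Example 6 (a sketch only, reusing vocab_dir and instances from the test above): when both the saved directory and the instances are supplied, extension should succeed and tokens from the instances are merged into the loaded vocabulary.

params = Params({"type": "extend", "directory": vocab_dir})
extended_vocab = Vocabulary.from_params(params, instances=instances)
assert "a" in extended_vocab.get_token_to_index_vocabulary("tokens")  # from the saved files
assert "b" in extended_vocab.get_token_to_index_vocabulary("tokens")  # newly added from instances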
Example 7: test_max_vocab_size_partial_dict
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_max_vocab_size_partial_dict(self):
    indexers = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    instance = Instance(
        {
            "text": TextField(
                [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
            )
        }
    )
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})
    vocab = Vocabulary.from_params(params=params, instances=dataset)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
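A hedged note on the "+2" in Example 7: every padded namespace automatically contains @@PADDING@@ and @@UNKNOWN@@, so restricting "tokens" with max_vocab_size=1 leaves 1 + 2 = 3 entries, while the unrestricted "token_characters" namespace keeps all 26 distinct characters plus those two. A quick way to inspect this (which token survives depends on counting order, so it is shown as a placeholder):

print(vocab.get_index_to_token_vocabulary("tokens"))
# e.g. {0: '@@PADDING@@', 1: '@@UNKNOWN@@', 2: '<the single retained token>'}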
Example 8: test_equality
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_equality(self):
    index_field1 = IndexField(4, self.text)
    index_field2 = IndexField(4, self.text)
    index_field3 = IndexField(
        4,
        TextField(
            [Token(t) for t in ["AllenNLP", "is", "the", "bomb", "!"]],
            {"words": SingleIdTokenIndexer("words")},
        ),
    )
    assert index_field1 == 4
    assert index_field1 == index_field1
    assert index_field1 == index_field2
    assert index_field1 != index_field3
    assert index_field2 != index_field3
    assert index_field3 == index_field3
Example 9: test_as_tensor_handles_characters
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_as_tensor_handles_characters(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={
            "characters": TokenCharactersIndexer("characters", min_padding_length=1)
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    expected_character_array = numpy.array(
        [
            [1, 1, 1, 3, 0, 0, 0, 0],
            [1, 3, 0, 0, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0, 0, 0],
            [3, 4, 5, 6, 4, 5, 7, 4],
            [1, 0, 0, 0, 0, 0, 0, 0],
        ]
    )
    numpy.testing.assert_array_almost_equal(
        tensor_dict["characters"]["token_characters"].detach().cpu().numpy(),
        expected_character_array,
    )
Example 10: test_token_padding_lengths_are_computed_correctly
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_token_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
            "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
            "characters": TokenCharactersIndexer(
                "characters", min_padding_length=1, token_min_padding_length=3
            ),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 3,
        "words___tokens": 3,
        "characters___token_characters": 3,
        "characters___num_token_characters": 8,
    }
    tensors = field.as_tensor(padding_lengths)
    assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
    assert tensors["words"]["tokens"].tolist()[-1] == 0
    assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8
Example 11: text_to_instance
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypothesis: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    premise_tokens = [Token(t) for t in premise.split(' ')]  # removing code for parentheses in NLI
    hypothesis_tokens = [Token(t) for t in hypothesis.split(' ')]
    if self.max_l is not None:
        premise_tokens = premise_tokens[:self.max_l]
        hypothesis_tokens = hypothesis_tokens[:self.max_l]
    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields['selection_label'] = LabelField(label, label_namespace='selection_labels')
    if pid:
        fields['pid'] = IdField(pid)
    return Instance(fields)
Example 12: text_to_instance
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypothesis: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    premise_tokens = [Token(t) for t in premise.split(' ')]  # removing code for parentheses in NLI
    hypothesis_tokens = [Token(t) for t in hypothesis.split(' ')]
    if self.max_l is not None:
        premise_tokens = premise_tokens[:self.max_l]
        hypothesis_tokens = hypothesis_tokens[:self.max_l]
    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields['label'] = LabelField(label, label_namespace='labels')
    if pid:
        fields['pid'] = IdField(pid)
    return Instance(fields)
Example 13: get_vocab_and_both_elmo_indexed_ids
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def get_vocab_and_both_elmo_indexed_ids(batch):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {u'character_ids': indexer,
                           u'tokens': indexer2})
        instance = Instance({u"elmo": field})
        instances.append(instance)
    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()[u"elmo"]
Example 14: test_bpe
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_bpe(self):
    # [e, w, o, e</w>] -> best pair (e, w)
    # [ew, o, e</w>] -> best pair (o, e</w>)
    # [ew, oe</w>] -> done
    token = Token(u"ewoe")
    assert self.indexer.byte_pair_encode(token) == [u'ew', u'oe</w>']

    # Prefer "ew" to "we"
    token = Token(u"ewe")
    assert self.indexer.byte_pair_encode(token) == [u'ew', u'e</w>']

    # Prefer ending a word
    token = Token(u"eee")
    assert self.indexer.byte_pair_encode(token) == [u'e', u'ee</w>']

    # Encodes up to a single symbol when appropriate
    token = Token(u"woe")
    assert self.indexer.byte_pair_encode(token) == [u'woe</w>']
Example 15: test_tokens_to_indices
# Required import: from allennlp import data [as alias]
# Or: from allennlp.data import Token [as alias]
def test_tokens_to_indices(self):
    tokens = [Token(u'ewoe'), Token(u'woe'), Token(u'ewe'), Token(u'ee')]
    indices = self.indexer.tokens_to_indices(tokens, None, u'test')
    assert set(indices.keys()) == set([u"test", u"test-offsets", u"mask"])
    text_tokens = indices[u'test']
    offsets = indices[u'test-offsets']
    assert text_tokens[:6] == [
        self.indexer.encoder.get(symbol, 0)
        for symbol in [u'ew', u'oe</w>'] + [u'woe</w>'] + [u'ew', u'e</w>'] + [u'ee</w>']
    ]
    assert offsets == [
        1,  # end of first word
        2,  # end of second word
        4,  # end of third word
        5,  # end of last word
    ]