本文整理汇总了Python中torchtext.data.Field方法的典型用法代码示例。如果您正苦于以下问题：Python data.Field方法的具体用法？Python data.Field怎么用？Python data.Field使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类torchtext.data的用法示例。
在下文中一共展示了data.Field方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_process
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_process(self):
    """Check ``process`` on ``RawField`` and ``Field`` for tensor-like and opaque batches."""
    plain_field = data.RawField()
    numeric_field = data.Field(sequential=True, use_vocab=False, batch_first=True)

    # Tensor-like batch data is accepted by both RawField and Field.
    rows = [[1, 2, 3], [2, 3, 4]]
    expected_tensor = torch.LongTensor(rows)
    from_raw = plain_field.process(rows)
    from_field = numeric_field.process(rows, device=-1, train=False)
    assert from_raw == rows
    assert from_field.data.equal(expected_tensor)

    # Non-tensor data is only accepted by RawField; Field must raise TypeError.
    opaque_items = [object() for _ in range(5)]
    from_raw = plain_field.process(opaque_items)
    assert opaque_items == from_raw
    with pytest.raises(TypeError):
        numeric_field.process(opaque_items)
示例2: from_list
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def from_list(src_list, tgt_list=None, share_fields_from=None, **kwargs):
    """Build a ``Seq2SeqDataset`` from in-memory source (and optional target) lists.

    Args:
        src_list: source sequences.
        tgt_list: optional target sequences; zipped with ``src_list`` when given.
        share_fields_from: optional object whose ``fields`` mapping supplies the
            source/target fields instead of creating new ones.
        **kwargs: forwarded unchanged to ``Seq2SeqDataset``.

    Returns:
        The ``Seq2SeqDataset`` built from the zipped corpus and the two fields
        (the target field is ``None`` when ``tgt_list`` is ``None``).
    """
    has_target = tgt_list is not None
    corpus = zip(src_list, tgt_list) if has_target else zip(src_list)

    # The target field stays None unless a target list was supplied.
    tgt_field = None
    if share_fields_from is not None:
        src_field = share_fields_from.fields[src_field_name]
        if has_target:
            tgt_field = share_fields_from.fields[tgt_field_name]
    else:
        # tokenize by character
        src_field = Field(batch_first=True, include_lengths=True, tokenize=list,
                          init_token=SOS, eos_token=EOS, unk_token=None)
        if has_target:
            tgt_field = Field(batch_first=True, tokenize=list,
                              init_token=SOS, eos_token=EOS, unk_token=None)

    return Seq2SeqDataset(corpus, src_field, tgt_field, **kwargs)
示例3: test_preprocess
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_preprocess(self):
    """Exercise ``Field.preprocess`` with default, lowercasing and pipeline settings."""

    def append_bang(text):
        # Custom preprocessing step: suffix every processed item with "!".
        return text + "!"

    # Default case.
    assert data.Field().preprocess("Test string.") == ["Test", "string."]

    # Test that lowercase is properly applied.
    lowered = data.Field(lower=True)
    assert lowered.preprocess("Test string.") == ["test", "string."]

    # Test that custom preprocessing pipelines are properly applied.
    bang_pipeline = data.Pipeline(append_bang)
    with_pipeline = data.Field(preprocessing=bang_pipeline, lower=True)
    assert with_pipeline.preprocess("Test string.") == ["test!", "string.!"]

    # Test that non-sequential data is properly handled.
    whole_string = data.Field(sequential=False, lower=True,
                              preprocessing=bang_pipeline)
    assert whole_string.preprocess("Test string.") == "test string.!"

    # Non-regression test that we do not try to decode unicode strings to unicode
    whole_string = data.Field(sequential=False, lower=True,
                              preprocessing=bang_pipeline)
    assert whole_string.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
示例4: test_numericalize_include_lengths
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_numericalize_include_lengths(self):
    """Numericalize a padded batch together with its lengths and verify the result."""
    self.write_test_ppid_dataset(data_format="tsv")
    q_field = data.Field(sequential=True, include_lengths=True)
    # Both question columns share one field; "id" and "label" are mapped to None.
    columns = [("id", None), ("q1", q_field),
               ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=columns)
    q_field.build_vocab(dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    lengths = [8, 3, 7]

    # Test with include_lengths: numericalize receives a (data, lengths) tuple.
    numericalized = q_field.numericalize((padded_batch, lengths), device=-1)
    verify_numericalized_example(q_field, padded_batch, numericalized, lengths)
示例5: test_numericalize_batch_first
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_numericalize_batch_first(self):
    """Numericalize a padded batch with a ``batch_first=True`` field and verify it."""
    self.write_test_ppid_dataset(data_format="tsv")
    q_field = data.Field(sequential=True, batch_first=True)
    # Both question columns share one field; "id" and "label" are mapped to None.
    columns = [("id", None), ("q1", q_field),
               ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=columns)
    q_field.build_vocab(dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]

    # Test with batch_first
    batch_first_numericalized = q_field.numericalize(padded_batch, device=-1)
    verify_numericalized_example(q_field, padded_batch,
                                 batch_first_numericalized,
                                 batch_first=True)
示例6: test_errors
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_errors(self):
    """Check that ``numericalize`` rejects a bare batch when ``include_lengths=True``.

    Passing a non-tuple (i.e. data without lengths) to ``numericalize`` on a
    field created with ``include_lengths=True`` is expected to raise
    ``ValueError``.
    """
    # Fixture setup is kept outside the assertRaises block so that a
    # ValueError raised while writing or loading the dataset fails the test
    # instead of being mistaken for the expected error.
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)
    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]

    # Only the call under test may raise the expected ValueError.
    with self.assertRaises(ValueError):
        question_field.numericalize(
            test_example_data, device=-1)
示例7: __init__
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def __init__(self, emb_dim=50, mbsize=32):
    """Load SST, build vocabularies and create train/validation batch iterators.

    Args:
        emb_dim: dimension passed to the ``GloVe('6B', ...)`` vectors; also
            stored as ``self.emb_dim``.
        mbsize: batch size handed to ``BucketIterator.splits``.
    """
    self.TEXT = data.Field(init_token='<start>', eos_token='<eos>', lower=True,
                           tokenize='spacy', fix_length=16)
    self.LABEL = data.Field(sequential=False, unk_token=None)

    def keep_example(ex):
        # Only take sentences with length <= 15, and drop 'neutral' labels.
        return len(ex.text) <= 15 and ex.label != 'neutral'

    train, val, test = datasets.SST.splits(
        self.TEXT, self.LABEL,
        fine_grained=False,
        train_subtrees=False,
        filter_pred=keep_example,
    )

    self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
    self.LABEL.build_vocab(train)

    self.n_vocab = len(self.TEXT.vocab.itos)
    self.emb_dim = emb_dim

    # The test iterator is created by splits() but deliberately discarded.
    train_batches, val_batches, _ = data.BucketIterator.splits(
        (train, val, test), batch_size=mbsize, device=-1,
        shuffle=True, repeat=True
    )
    # Store plain Python iterators so callers can pull batches with next().
    self.train_iter = iter(train_batches)
    self.val_iter = iter(val_batches)
示例8: get_sst
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def get_sst():
    """Build SST fields, vocabularies and single-example bucket iterators.

    Returns:
        Tuple ``(inputs, answers, train_iter, dev_iter)``; the test iterator
        is created but not returned.
    """
    # NOTE(review): ``lower`` is given a non-empty (truthy) string here, so a
    # truthiness check inside Field would presumably enable lowercasing
    # despite the 'preserve-case' wording -- confirm the intended behavior.
    inputs = data.Field(lower='preserve-case')
    answers = data.Field(sequential=False, unk_token=None)

    def not_neutral(ex):
        # Examples labelled 'neutral' are filtered out of every split.
        return ex.label != 'neutral'

    # build with subtrees so inputs are right
    subtree_splits = datasets.SST.splits(inputs, answers, fine_grained=False,
                                         train_subtrees=True,
                                         filter_pred=not_neutral)
    train_s, dev_s, test_s = subtree_splits
    inputs.build_vocab(train_s, dev_s, test_s)
    answers.build_vocab(train_s)

    # rebuild without subtrees to get longer sentences
    train, dev, test = datasets.SST.splits(inputs, answers, fine_grained=False,
                                           train_subtrees=False,
                                           filter_pred=not_neutral)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test), batch_size=1, device=0)

    return inputs, answers, train_iter, dev_iter
示例9: evaluate_predictions
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def evaluate_predictions(snapshot_file):
    """Load a saved model and run it on the first SST training batch.

    Args:
        snapshot_file: path handed to ``torch.load``.

    Returns:
        Tuple ``(batch, out, target)``: the first batch from the training
        iterator, the model output for it, and ``batch.label``.
    """
    print('loading', snapshot_file)
    try:  # load onto gpu
        model = torch.load(snapshot_file)
    except Exception:  # load onto cpu
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        model = torch.load(snapshot_file, map_location=lambda storage, loc: storage)

    inputs = data.Field()
    answers = data.Field(sequential=False, unk_token=None)
    train, dev, test = datasets.SST.splits(inputs, answers, fine_grained=False, train_subtrees=False,
                                           filter_pred=lambda ex: ex.label != 'neutral')
    inputs.build_vocab(train)
    answers.build_vocab(train)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test), batch_size=1, device=0)
    train_iter.init_epoch()

    # Only the first batch is evaluated; the loop exits right after it.
    for batch_idx, batch in enumerate(train_iter):
        print('batch_idx', batch_idx)
        out = model(batch)
        target = batch.label
        break
    return batch, out, target
# batch of [start, stop) with unigrams working
示例10: load_mr
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def load_mr(text_field, label_field, batch_size):
    """Load the MR splits, build both vocabularies and return batch iterators.

    Args:
        text_field: field passed to ``MR.splits`` for the text column.
        label_field: field passed to ``MR.splits`` for the label column.
        batch_size: batch size of the training iterator.

    Returns:
        Tuple ``(train_iter, dev_iter, test_iter)``.
    """
    print('loading data')
    train_data, dev_data, test_data = MR.splits(text_field, label_field)
    all_splits = (train_data, dev_data, test_data)

    # Both vocabularies are built over all three splits.
    text_field.build_vocab(*all_splits)
    label_field.build_vocab(*all_splits)

    print('building batches')
    # Dev and test use their full length as batch size (one batch each).
    sizes = (batch_size, len(dev_data), len(test_data))
    train_iter, dev_iter, test_iter = data.Iterator.splits(
        all_splits, batch_sizes=sizes, repeat=False,
        device=-1
    )
    return train_iter, dev_iter, test_iter
#
# text_field = data.Field(lower=True)
# label_field = data.Field(sequential=False)
# train_iter, dev_iter , test_iter = load_mr(text_field, label_field, batch_size=50)
示例11: test_batch_iter
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_batch_iter(self):
    """Check that a batch unpacks into ``(inputs, target)`` with the expected values."""
    self.write_test_numerical_features_dataset()
    FLOAT = data.Field(use_vocab=False, sequential=False,
                       dtype=torch.float)
    INT = data.Field(use_vocab=False, sequential=False, is_target=True)
    TEXT = data.Field(sequential=False)

    dst = data.TabularDataset(path=self.test_numerical_features_dataset_path,
                              format="tsv", skip_header=False,
                              fields=[("float", FLOAT),
                                      ("int", INT),
                                      ("text", TEXT)])
    TEXT.build_vocab(dst)
    itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)

    # Names of the non-target fields, used to locate "float" among the inputs.
    fld_order = [k for k, v in dst.fields.items() if
                 v is not None and not v.is_target]
    batch = next(iter(itr))
    (x1, x2), y = batch
    x = (x1, x2)[fld_order.index("float")]

    # assertEqual replaces the deprecated alias assertEquals, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(y.data[0], 1)
    self.assertEqual(y.data[1], 12)
    self.assertAlmostEqual(x.data[0], 0.1, places=4)
    self.assertAlmostEqual(x.data[1], 0.5, places=4)
示例12: test_input_with_newlines_in_text
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_input_with_newlines_in_text(self):
    """Smoke test: ``TabularDataset`` loads CSV/TSV files with newlines inside quoted text."""
    example_with_newlines = [("\"hello \n world\"", "1"),
                             ("\"there is a \n newline\"", "0"),
                             ("\"there is no newline\"", "1")]
    fields = [("text", data.Field(lower=True)),
              ("label", data.Field(sequential=False))]

    for delim in [",", "\t"]:
        # Rewrite the fixture file with the current delimiter.
        with open(self.test_newline_dataset_path, "wt") as f:
            for line in example_with_newlines:
                f.write("{}\n".format(delim.join(line)))

        format_ = "csv" if delim == "," else "tsv"
        dataset = data.TabularDataset(
            path=self.test_newline_dataset_path, format=format_, fields=fields)
        # if the newline is not parsed correctly, this should raise an error
        for example in dataset:
            # assertTrue replaces the deprecated alias assert_, which was
            # removed from unittest in Python 3.12.
            self.assertTrue(hasattr(example, "text"))
            self.assertTrue(hasattr(example, "label"))
示例13: test_process
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_process(self):
    """Check ``process`` on ``RawField`` and ``Field`` for tensor-like and opaque batches."""
    plain_field = data.RawField()
    numeric_field = data.Field(sequential=True, use_vocab=False, batch_first=True)

    # Tensor-like batch data is accepted by both RawField and Field.
    rows = [[1, 2, 3], [2, 3, 4]]
    expected_tensor = torch.LongTensor(rows)
    from_raw = plain_field.process(rows)
    from_field = numeric_field.process(rows)
    assert from_raw == rows
    assert from_field.data.equal(expected_tensor)

    # Non-tensor data is only accepted by RawField; Field must raise TypeError.
    opaque_items = [object() for _ in range(5)]
    from_raw = plain_field.process(opaque_items)
    assert opaque_items == from_raw
    with pytest.raises(TypeError):
        numeric_field.process(opaque_items)
示例14: test_numericalize_basic
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_numericalize_basic(self):
    """Numericalize a padded batch with a default sequential field and verify it."""
    self.write_test_ppid_dataset(data_format="tsv")
    q_field = data.Field(sequential=True)
    # Both question columns share one field; "id" and "label" are mapped to None.
    columns = [("id", None), ("q1", q_field),
               ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=columns)
    q_field.build_vocab(dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]

    # Test default
    numericalized = q_field.numericalize(padded_batch)
    verify_numericalized_example(q_field, padded_batch, numericalized)
示例15: test_numericalize_include_lengths
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Field [as 别名]
def test_numericalize_include_lengths(self):
    """Numericalize a padded batch together with its lengths and verify the result."""
    self.write_test_ppid_dataset(data_format="tsv")
    q_field = data.Field(sequential=True, include_lengths=True)
    # Both question columns share one field; "id" and "label" are mapped to None.
    columns = [("id", None), ("q1", q_field),
               ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=columns)
    q_field.build_vocab(dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    lengths = [8, 3, 7]

    # Test with include_lengths: numericalize receives a (data, lengths) tuple.
    numericalized = q_field.numericalize((padded_batch, lengths))
    verify_numericalized_example(q_field, padded_batch, numericalized, lengths)