本文整理汇总了 Python 中 torchtext.data.TabularDataset 类的典型用法代码示例。如果您正苦于以下问题:Python data.TabularDataset 的具体用法?Python data.TabularDataset 怎么用?Python data.TabularDataset 使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 torchtext.data 的用法示例。
在下文中一共展示了 data.TabularDataset 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: test_numericalize_include_lengths
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_numericalize_include_lengths(self):
    """Numericalize a ``(data, lengths)`` pair with ``include_lengths=True``.

    A TSV dataset is loaded with the same field bound to ``q1`` and ``q2``,
    the vocabulary is built from it, and the numericalized output is handed
    to ``verify_numericalized_example`` together with the lengths.
    """
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    # "id" and "label" are mapped to None, i.e. no field is attached to them.
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path,
        format="tsv",
        fields=[
            ("id", None),
            ("q1", question_field),
            ("q2", question_field),
            ("label", None),
        ],
    )
    question_field.build_vocab(tsv_dataset)

    # Already-padded batch: every row has eight tokens.
    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    # Number of non-pad tokens in each row above.
    row_lengths = [8, 3, 7]

    # Test with include_lengths
    batch_with_lengths = (padded_batch, row_lengths)
    numericalized = question_field.numericalize(batch_with_lengths, device=-1)
    verify_numericalized_example(
        question_field, padded_batch, numericalized, row_lengths)
示例2: test_errors
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_errors(self):
    """Check that ``numericalize`` rejects bare data when lengths are required.

    The field is created with ``include_lengths=True``; passing a plain
    batch instead of a ``(data, lengths)`` tuple is expected to raise
    ``ValueError``.
    """
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)
    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]
    # Only the call under test sits inside assertRaises, so a ValueError
    # raised by the setup above cannot make the test pass by accident.
    with self.assertRaises(ValueError):
        question_field.numericalize(test_example_data, device=-1)
示例3: test_batch_iter
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_batch_iter(self):
    """Iterate one batch of the numerical-features TSV and check its values.

    The dataset has a float column, an int column marked as target and a
    text column. The first batch (size 2, no shuffling) is unpacked into
    inputs and target, and the float and int values are compared with the
    expected numbers.
    """
    self.write_test_numerical_features_dataset()
    FLOAT = data.Field(use_vocab=False, sequential=False,
                       dtype=torch.float)
    INT = data.Field(use_vocab=False, sequential=False, is_target=True)
    TEXT = data.Field(sequential=False)
    dst = data.TabularDataset(path=self.test_numerical_features_dataset_path,
                              format="tsv", skip_header=False,
                              fields=[("float", FLOAT),
                                      ("int", INT),
                                      ("text", TEXT)])
    TEXT.build_vocab(dst)
    itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
    # Names of the non-target fields, in the iteration order of dst.fields;
    # used to locate the "float" column among the batch inputs.
    fld_order = [k for k, v in dst.fields.items() if
                 v is not None and not v.is_target]
    batch = next(iter(itr))
    (x1, x2), y = batch
    x = (x1, x2)[fld_order.index("float")]
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(y.data[0], 1)
    self.assertEqual(y.data[1], 12)
    self.assertAlmostEqual(x.data[0], 0.1, places=4)
    self.assertAlmostEqual(x.data[1], 0.5, places=4)
示例4: test_numericalize_basic
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_numericalize_basic(self):
    """Numericalize a padded batch with a default sequential field.

    The vocabulary is built from the TSV dataset (columns ``q1`` and
    ``q2``) and the result of ``numericalize`` is checked by
    ``verify_numericalized_example``.
    """
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True)
    column_spec = [
        ("id", None),
        ("q1", question_field),
        ("q2", question_field),
        ("label", None),
    ]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv", fields=column_spec)
    question_field.build_vocab(tsv_dataset)

    # Three rows, each already padded to eight tokens.
    first_row = ["When", "do", "you", "use", "シ", "instead", "of", "し?"]
    second_row = ["What", "is", "2+2"] + ["<pad>"] * 5
    third_row = ["Here", "is", "a", "sentence", "with", "some", "oovs",
                 "<pad>"]
    padded_batch = [first_row, second_row, third_row]

    # Test default
    numericalized = question_field.numericalize(padded_batch)
    verify_numericalized_example(question_field, padded_batch, numericalized)
示例5: test_numericalize_include_lengths
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_numericalize_include_lengths(self):
    """Numericalize a ``(data, lengths)`` tuple for an include_lengths field.

    Uses the TSV dataset to build the vocabulary, then verifies the
    numericalized batch and its lengths with
    ``verify_numericalized_example``.
    """
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    column_spec = [("id", None),
                   ("q1", question_field),
                   ("q2", question_field),
                   ("label", None)]
    tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                      format="tsv",
                                      fields=column_spec)
    question_field.build_vocab(tsv_dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    # Count of non-pad tokens per row.
    row_lengths = [8, 3, 7]

    # Test with include_lengths
    numericalized = question_field.numericalize((padded_batch, row_lengths))
    verify_numericalized_example(question_field,
                                 padded_batch,
                                 numericalized,
                                 row_lengths)
示例6: test_numericalize_batch_first
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_numericalize_batch_first(self):
    """Numericalize a padded batch with a ``batch_first=True`` field.

    The vocabulary comes from the TSV dataset; the numericalized output is
    verified with ``verify_numericalized_example(..., batch_first=True)``.
    """
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, batch_first=True)
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path,
        format="tsv",
        fields=[
            ("id", None),
            ("q1", question_field),
            ("q2", question_field),
            ("label", None),
        ],
    )
    question_field.build_vocab(tsv_dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]

    # Test with batch_first
    batch_first_numericalized = question_field.numericalize(padded_batch)
    verify_numericalized_example(
        question_field,
        padded_batch,
        batch_first_numericalized,
        batch_first=True,
    )
示例7: test_errors
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_errors(self):
    """Check that ``numericalize`` rejects bare data when lengths are required.

    With ``include_lengths=True`` the field expects a ``(data, lengths)``
    tuple; passing only the batch is expected to raise ``ValueError``.
    """
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)
    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]
    # Keep the setup outside assertRaises: a ValueError from writing the
    # fixture, loading the dataset or building the vocab must fail the
    # test instead of satisfying it.
    with self.assertRaises(ValueError):
        question_field.numericalize(test_example_data)
示例8: test_vocab_size
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_vocab_size(self):
    """Check the vocabulary a ``LabelField`` builds from the TSV labels.

    Asserts the label frequencies, the string-to-index mapping (which is
    expected to contain no ``<unk>`` entry) and the matching
    index-to-string list.
    """
    # Set up fields
    question_field = data.Field(sequential=True)
    label_field = data.LabelField()

    # Copied from test_build_vocab with minor changes
    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path,
        format="tsv",
        fields=[
            ("id", None),
            ("q1", question_field),
            ("q2", question_field),
            ("label", label_field),
        ],
    )

    # Skipping json dataset as we can rely on the original build vocab test
    label_field.build_vocab(tsv_dataset)
    label_vocab = label_field.vocab

    assert label_vocab.freqs == Counter({'1': 2, '0': 1})

    expected_stoi = {'1': 0, '0': 1}  # No <unk>
    assert dict(label_vocab.stoi) == expected_stoi

    # Turn the stoi dictionary into an itos list: the labels ordered by
    # the index each one maps to.
    expected_itos = sorted(expected_stoi, key=expected_stoi.get)
    assert label_vocab.itos == expected_itos
示例9: init_train_set
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def init_train_set(self):
    """Load the training CSV and create the training iterator.

    Calls ``set_all_random_seed`` with ``config['random_seed']``, reads the
    file path from ``config['train_file']``, and stores the results on
    ``self.train_set`` and ``self.train_iter``.
    """
    set_all_random_seed(self.config['random_seed'])
    train_file_path = self.config['train_file']
    print('Loading train set from {}'.format(train_file_path))

    # Column name -> field pairs; both position columns share self.POS.
    column_fields = [
        ('Id', self.ID),
        ('Text', self.TEXT),
        ('Pos1', self.POS),
        ('Pos2', self.POS),
        ('Label', self.TRAIN_LABEL),
    ]
    self.train_set = tt_data.TabularDataset(
        path=train_file_path,
        format='csv',
        fields=column_fields,
        skip_header=False,
    )

    def text_length(example):
        # Sort key: length of the example's Text attribute.
        return len(example.Text)

    self.train_iter = tt_data.Iterator(
        self.train_set,
        sort_key=text_length,
        batch_size=self.config['train_batch_size'],
        train=True,
        repeat=False,
        sort_within_batch=True,
        device=self.device,
    )
示例10: init_dev_set
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def init_dev_set(self):
    """Load the dev CSV and create the dev iterator.

    Reads the file path from ``config['dev_file']`` and stores the results
    on ``self.dev_set`` and ``self.dev_iter``. The iterator uses
    ``config['test_batch_size']`` as its batch size.
    """
    dev_file_path = self.config['dev_file']
    print('Loading dev set from {}'.format(dev_file_path))

    # Column name -> field pairs; both position columns share self.POS.
    column_fields = [
        ('Id', self.ID),
        ('Text', self.TEXT),
        ('Pos1', self.POS),
        ('Pos2', self.POS),
        ('Label', self.LABEL),
    ]
    self.dev_set = tt_data.TabularDataset(
        path=dev_file_path,
        format='csv',
        fields=column_fields,
        skip_header=False,
    )

    def text_length(example):
        # Sort key: length of the example's Text attribute.
        return len(example.Text)

    self.dev_iter = tt_data.Iterator(
        self.dev_set,
        sort_key=text_length,
        batch_size=self.config['test_batch_size'],
        train=False,
        repeat=False,
        sort_within_batch=True,
        device=self.device,
    )
示例11: init_test_set
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def init_test_set(self):
    """Load the test CSV and create the test iterator.

    Reads the file path from ``config['test_file']`` and stores the results
    on ``self.test_set`` and ``self.test_iter``. The iterator uses
    ``config['test_batch_size']`` as its batch size.
    """
    test_file_path = self.config['test_file']
    # Message wording matches the train/dev loaders ("... set from <path>").
    print('Loading test set from {}'.format(test_file_path))
    self.test_set = tt_data.TabularDataset(path=test_file_path,
                                           format='csv',
                                           fields=[('Id', self.ID),
                                                   ('Text', self.TEXT),
                                                   ('Pos1', self.POS),
                                                   ('Pos2', self.POS),
                                                   ('Label', self.LABEL)],
                                           skip_header=False)
    self.test_iter = tt_data.Iterator(self.test_set,
                                      sort_key=lambda x: len(x.Text),
                                      batch_size=self.config['test_batch_size'],
                                      train=False,
                                      repeat=False,
                                      sort_within_batch=True,
                                      device=self.device)
示例12: test_json_dataset_one_key_multiple_fields
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_json_dataset_one_key_multiple_fields(self):
    """Map each JSON question key to two fields and compare the tokens.

    ``question1`` and ``question2`` are each bound to a default-tokenized
    field and a ``tokenize="spacy"`` field; every example's five attributes
    are compared with the expected values.
    """
    self.write_test_ppid_dataset(data_format="json")
    question_field = data.Field(sequential=True)
    spacy_tok_question_field = data.Field(sequential=True, tokenize="spacy")
    label_field = data.Field(sequential=False)

    # One JSON key may feed several (attribute name, field) pairs.
    fields = {
        "question1": [("q1", question_field),
                      ("q1_spacy", spacy_tok_question_field)],
        "question2": [("q2", question_field),
                      ("q2_spacy", spacy_tok_question_field)],
        "label": ("label", label_field),
    }
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="json", fields=fields)

    # Each tuple: (q1, q1_spacy, q2, q2_spacy, label).
    expected_examples = [
        (["When", "do", "you", "use", "シ", "instead", "of", "し?"],
         ["When", "do", "you", "use", "シ", "instead", "of", "し", "?"],
         ["When", "do", "you", "use", "\"&\"",
          "instead", "of", "\"and\"?"],
         ["When", "do", "you", "use", "\"", "&", "\"",
          "instead", "of", "\"", "and", "\"", "?"],
         "0"),
        (["Where", "was", "Lincoln", "born?"],
         ["Where", "was", "Lincoln", "born", "?"],
         ["Which", "location", "was", "Abraham", "Lincoln", "born?"],
         ["Which", "location", "was", "Abraham", "Lincoln", "born", "?"],
         "1"),
        (["What", "is", "2+2"],
         ["What", "is", "2", "+", "2"],
         ["2+2=?"],
         ["2", "+", "2=", "?"],
         "1"),
    ]

    for index, example in enumerate(dataset):
        want_q1, want_q1_spacy, want_q2, want_q2_spacy, want_label = \
            expected_examples[index]
        self.assertEqual(example.q1, want_q1)
        self.assertEqual(example.q1_spacy, want_q1_spacy)
        self.assertEqual(example.q2, want_q2)
        self.assertEqual(example.q2_spacy, want_q2_spacy)
        self.assertEqual(example.label, want_label)
示例13: test_errors
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_errors(self):
    """Ensure that requesting a key absent from the JSON data raises.

    Constructing a ``TabularDataset`` with the field mapping below is
    expected to raise ``ValueError``.
    """
    self.write_test_ppid_dataset(data_format="json")
    question_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)

    # NOTE(review): "qeustion1" looks deliberately misspelled so that it
    # does not match any key in the JSON data -- do not "fix" it.
    fields = {
        "qeustion1": ("q1", question_field),
        "question2": ("q2", question_field),
        "label": ("label", label_field),
    }
    dataset_kwargs = {
        "path": self.test_ppid_dataset_path,
        "format": "json",
        "fields": fields,
    }
    with self.assertRaises(ValueError):
        data.TabularDataset(**dataset_kwargs)
示例14: test_numericalize_basic
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_numericalize_basic(self):
    """Numericalize a padded batch with default and ``train=False`` settings.

    The vocabulary is built from the TSV dataset; both numericalized
    results are checked with ``verify_numericalized_example``.
    """
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True)
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path,
        format="tsv",
        fields=[
            ("id", None),
            ("q1", question_field),
            ("q2", question_field),
            ("label", None),
        ],
    )
    question_field.build_vocab(tsv_dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]

    # Test default
    default_result = question_field.numericalize(padded_batch, device=-1)
    verify_numericalized_example(question_field, padded_batch, default_result)

    # Test with train=False
    no_train_result = question_field.numericalize(
        padded_batch, device=-1, train=False)
    verify_numericalized_example(
        question_field, padded_batch, no_train_result, train=False)
示例15: test_numericalize_postprocessing
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import TabularDataset [as 别名]
def test_numericalize_postprocessing(self):
    """Numericalize with a postprocessing hook that reverses each sentence.

    The numericalized output is verified against the input batch with
    every row reversed.
    """
    self.write_test_ppid_dataset(data_format="tsv")

    def reverse_postprocess(arr, vocab, train):
        # Reverse the order of the items in every sentence; vocab and train
        # are accepted but not used.
        flipped = []
        for sentence in arr:
            flipped.append(list(reversed(sentence)))
        return flipped

    question_field = data.Field(sequential=True,
                                postprocessing=reverse_postprocess)
    column_spec = [
        ("id", None),
        ("q1", question_field),
        ("q2", question_field),
        ("label", None),
    ]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv", fields=column_spec)
    question_field.build_vocab(tsv_dataset)

    padded_batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    # Expected token order after the postprocessing hook has run.
    reversed_batch = [list(reversed(row)) for row in padded_batch]

    numericalized = question_field.numericalize(padded_batch, device=-1)
    verify_numericalized_example(question_field,
                                 reversed_batch,
                                 numericalized)