本文整理汇总了Python中torchtext.data.Pipeline方法的典型用法代码示例。如果您正苦于以下问题:Python data.Pipeline方法的具体用法?Python data.Pipeline怎么用?Python data.Pipeline使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块torchtext.data的用法示例。
在下文中一共展示了data.Pipeline方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_composition
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def test_composition(self):
    """Check that pipelines composed with add_before/add_after give the expected output."""
    identity = data.Pipeline()
    composed = data.Pipeline(TestPipeline.repeat_n)
    # Wrap the repeat step in identity pipelines, then in lower/capitalize.
    composed.add_before(identity)
    composed.add_after(identity)
    composed.add_before(six.text_type.lower)
    composed.add_after(six.text_type.capitalize)

    swapping = data.Pipeline(six.text_type.swapcase)
    swapping.add_before(composed)

    # The composed pipeline must still give proper results after being
    # added to another pipeline (its own pipes must not have been modified).
    composed_cases = [
        ("teST", "Testtesttest"),
        (["ElE1", "eLe2"], ["Ele1ele1ele1", "Ele2ele2ele2"]),
    ]
    for raw, wanted in composed_cases:
        assert composed(raw) == wanted

    # The pipeline that `composed` was added to must give proper results too.
    swapping_cases = [
        ("teST", "tESTTESTTEST"),
        (["ElE1", "eLe2"], ["eLE1ELE1ELE1", "eLE2ELE2ELE2"]),
    ]
    for raw, wanted in swapping_cases:
        assert swapping(raw) == wanted
示例2: test_preprocess
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def test_preprocess(self):
    """Exercise Field.preprocess with default, lowercase and custom-pipeline settings."""
    sample = "Test string."

    def append_bang(token):
        # Custom preprocessing step: mark every processed item with "!".
        return token + "!"

    # Default case.
    default_field = data.Field()
    assert default_field.preprocess(sample) == ["Test", "string."]

    # Lowercasing is applied when requested.
    lowered_field = data.Field(lower=True)
    assert lowered_field.preprocess(sample) == ["test", "string."]

    # A custom preprocessing pipeline is applied as well.
    bang_pipeline = data.Pipeline(append_bang)
    piped_field = data.Field(preprocessing=bang_pipeline, lower=True)
    assert piped_field.preprocess(sample) == ["test!", "string.!"]

    # Non-sequential data is handled as a single string.
    whole_string_field = data.Field(sequential=False, lower=True,
                                    preprocessing=bang_pipeline)
    assert whole_string_field.preprocess(sample) == "test string.!"

    # Non-regression check: unicode strings must not be decoded to unicode again.
    unicode_field = data.Field(sequential=False, lower=True,
                               preprocessing=bang_pipeline)
    assert unicode_field.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
示例3: __init__
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Build an MR dataset from the polarity files under `path`.

    Arguments:
        text_field: Field used for the text column.
        label_field: Field used for the label column.
        path: Directory holding 'rt-polarity.neg' and 'rt-polarity.pos';
            falls back to self.dirname when None.
        examples: Pre-built examples; when given, no files are read.
        Remaining keyword arguments: Forwarded to the parent constructor.
    """
    # NOTE: the clean_str preprocessing pipeline is left disabled here:
    # text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        if path is None:
            path = self.dirname
        examples = []
        # Each line of a polarity file becomes one example with that file's label.
        labelled_files = (('rt-polarity.neg', 'negative'),
                          ('rt-polarity.pos', 'positive'))
        for file_name, label in labelled_files:
            with codecs.open(os.path.join(path, file_name), 'r', 'utf8') as f:
                examples.extend(
                    [data.Example.fromlist([line, label], fields) for line in f])

    super(MR, self).__init__(examples, fields, **kwargs)
示例4: test_composition
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def test_composition(self):
    """Check that pipelines composed with add_before/add_after give the expected output."""
    passthrough = data.Pipeline()
    repeater = data.Pipeline(TestPipeline.repeat_n)
    # Surround the repeat step with identity pipelines, then lower/capitalize.
    repeater.add_before(passthrough)
    repeater.add_after(passthrough)
    repeater.add_before(str.lower)
    repeater.add_after(str.capitalize)

    swapper = data.Pipeline(str.swapcase)
    swapper.add_before(repeater)

    # `repeater` must still behave correctly after being added to `swapper`
    # (its own pipes must not have been modified).
    single = repeater("teST")
    assert single == "Testtesttest"
    batch = repeater(["ElE1", "eLe2"])
    assert batch == ["Ele1ele1ele1", "Ele2ele2ele2"]

    # The pipeline that `repeater` was added to must behave correctly too.
    swapped_single = swapper("teST")
    assert swapped_single == "tESTTESTTEST"
    swapped_batch = swapper(["ElE1", "eLe2"])
    assert swapped_batch == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"]
示例5: test_pipeline
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def test_pipeline(self):
    """Check identity, lowercasing and extra-argument pipelines on strings and lists."""
    # A pipeline built without a callable hands its input back unchanged.
    identity = data.Pipeline()
    identity_cases = [
        ("Test STring", "Test STring"),
        ("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T", "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"),
        (["1241", "Some String"], ["1241", "Some String"]),
    ]
    for raw, wanted in identity_cases:
        assert identity(raw) == wanted

    # Lowercasing applies to a single string and to each element of a list.
    lowering = data.Pipeline(six.text_type.lower)
    lowering_cases = [
        ("Test STring", "test string"),
        ("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T", "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"),
        (["1241", "Some String"], ["1241", "some string"]),
    ]
    for raw, wanted in lowering_cases:
        assert lowering(raw) == wanted

    # Extra positional arguments are forwarded to the wrapped callable.
    repeating = data.Pipeline(TestPipeline.repeat_n)
    assert repeating("test", 5) == "testtesttesttesttest"
    assert repeating(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"]
示例6: test_exceptions
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def test_exceptions(self):
    """Constructing a Pipeline from a non-callable must raise ValueError."""
    # Callable form of assertRaises: invokes data.Pipeline("Not Callable").
    self.assertRaises(ValueError, data.Pipeline, "Not Callable")
示例7: test_pipeline
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def test_pipeline(self):
    """Check identity, lowercasing and extra-argument pipelines on strings and lists."""
    # A pipeline built without a callable hands its input back unchanged.
    passthrough = data.Pipeline()
    plain = passthrough("Test STring")
    assert plain == "Test STring"
    syllabics = passthrough("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T")
    assert syllabics == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
    listed = passthrough(["1241", "Some String"])
    assert listed == ["1241", "Some String"]

    # Lowercasing applies to a single string and to each element of a list.
    lowercase = data.Pipeline(str.lower)
    plain = lowercase("Test STring")
    assert plain == "test string"
    syllabics = lowercase("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T")
    assert syllabics == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
    listed = lowercase(["1241", "Some String"])
    assert listed == ["1241", "some string"]

    # Extra positional arguments are forwarded to the wrapped callable.
    with_args = data.Pipeline(TestPipeline.repeat_n)
    assert with_args("test", 5) == "testtesttesttesttest"
    assert with_args(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"]
示例8: __init__
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def __init__(self, args):
    """Seed the RNGs, build the fields and vocabularies, and load a saved model.

    Arguments:
        args: Options object; this method reads `cuda`, `gpu`, `seed`,
            `dataset` and `model` from it, and sets `gpu` to -1 when
            `cuda` is falsy.
    """
    if not args.cuda:
        args.gpu = -1

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        if args.cuda:
            print("Note: You are using GPU for training")
            torch.cuda.set_device(args.gpu)
            torch.cuda.manual_seed(args.seed)
        else:
            print("Warning: You have Cuda but do not use it. You are using CPU for training")

    # Seed every random number generator with the same value.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(args.seed)

    def to_floats(arr, _, train):
        # Postprocessing step: turn every entry of `arr` into a float.
        return [float(y) for y in arr]

    self.QID = data.Field(sequential=False)
    self.QUESTION = data.Field(batch_first=True)
    self.ANSWER = data.Field(batch_first=True)
    self.LABEL = data.Field(sequential=False)
    self.EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor,
                               batch_first=True, use_vocab=False,
                               postprocessing=data.Pipeline(to_floats))

    # Pick the dataset class from the dataset name; anything else aborts.
    if 'TrecQA' in args.dataset:
        dataset_cls = TrecDataset
    elif 'WikiQA' in args.dataset:
        dataset_cls = WikiDataset
    else:
        print("Unsupported dataset")
        exit()
    train, dev, test = dataset_cls.splits(self.QID, self.QUESTION, self.ANSWER,
                                          self.EXTERNAL, self.LABEL)

    # Vocabularies are built over all three splits.
    for vocab_field in (self.QID, self.QUESTION, self.ANSWER, self.LABEL):
        vocab_field.build_vocab(train, dev, test)

    if args.cuda:
        def relocate(storage, location):
            # Move the loaded storages onto the selected GPU.
            return storage.cuda(args.gpu)
    else:
        def relocate(storage, location):
            # Leave the loaded storages as they are.
            return storage
    self.model = torch.load(args.model, map_location=relocate)

    self.gpu = args.gpu
示例9: get_E2E_loaders
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    """Build train/valid/test bucket iterators from the TSV files under `path`.

    Arguments:
        path: Directory containing 'train.txt', 'valid.txt' and 'test.txt'.
        valid: NOTE(review): never read inside this function.
        batch_size: Batch size given to every iterator.

    Returns:
        A tuple (train_iter, valid_iter, test_iter, vectors, label_itos),
        where `vectors` are the utterance vocabulary vectors (moved with
        .cuda() when args.cuda is truthy) and `label_itos` is
        label.vocab.itos.
    """
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label = data.Field(sequential=False,
                       postprocessing=Pipeline(convert_token=convert_token))
    id_field = data.Field(use_vocab=False, sequential=False)
    fields = [
        ('id', id_field),
        ('turn1', utterance),
        ('turn2', utterance),
        ('turn3', utterance),
        ('label', label),
    ]

    # One tab-separated file per split, each with a header row to skip.
    train_set, valid_set, test_set = [
        data.TabularDataset(template.format(path),
                            format='tsv',
                            fields=fields,
                            skip_header=True)
        for template in ('{}/train.txt', '{}/valid.txt', '{}/test.txt')
    ]

    vectors = vocab.Vectors(name='emojiplusglove.txt',
                            cache='/media/backup/nlp-cic/DialogueRNN/')
    utterance.build_vocab(train_set, valid_set, test_set, vectors=vectors)
    # Alternative kept from the original: build with vectors='glove.840B.300d'.
    label.build_vocab(train_set)

    def turn3_length(example):
        # Batches are bucketed by the length of the third turn.
        return len(example.turn3)

    device = torch.device(0)
    train_iter = BucketIterator(train_set,
                                train=True,
                                batch_size=batch_size,
                                sort_key=turn3_length,
                                device=device)
    valid_iter = BucketIterator(valid_set,
                                batch_size=batch_size,
                                sort_key=turn3_length,
                                device=device)
    test_iter = BucketIterator(test_set,
                               batch_size=batch_size,
                               sort_key=turn3_length,
                               device=device)

    # `args` is not a parameter; it is resolved from a scope not shown here.
    embeddings = utterance.vocab.vectors
    if args.cuda:
        embeddings = embeddings.cuda()
    return train_iter, valid_iter, test_iter, embeddings, label.vocab.itos
示例10: __init__
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def __init__(self, text_field, label_field, path=None, text_cnt=1000, examples=None, **kwargs):
    """Create a NEWS_20 dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data. Its
            `preprocessing` attribute is replaced by a clean_str pipeline.
        label_field: The field that will be used for label data.
        path: Directory containing one sub-directory per category;
            defaults to self.dirname when None.
        text_cnt: NOTE(review): accepted but never used by this method;
            kept so existing callers keep working.
        examples: The examples contain all the data. When given, nothing
            is read from disk.
        Remaining keyword arguments: Passed to the parent constructor.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        # (pattern, replacement) pairs applied in order. Replacements are raw
        # strings so that " \( ", " \) " and " \? " are no longer invalid
        # escape sequences; re.sub leaves these unknown escapes untouched, so
        # the output still has the backslash, as before.
        substitutions = (
            (r"[^A-Za-z0-9(),!?\'\`]", " "),
            (r"\'s", " 's"),
            (r"\'ve", " 've"),
            (r"n\'t", " n't"),
            (r"\'re", " 're"),
            (r"\'d", " 'd"),
            (r"\'ll", " 'll"),
            (r",", " , "),
            (r"!", " ! "),
            (r"\(", r" \( "),
            (r"\)", r" \) "),
            (r"\?", r" \? "),
            (r"\s{2,}", " "),
        )
        for pattern, replacement in substitutions:
            string = re.sub(pattern, replacement, string)
        return string.strip().lower()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        for category in categories:
            category_dir = os.path.join(path, category)
            # Every file in a category directory becomes one example whose
            # label is the category name.
            for file_name in os.listdir(category_dir):
                file_path = os.path.join(category_dir, file_name)
                with open(file_path, encoding="utf8", errors='ignore') as f:
                    text = f.read()
                examples.append(data.Example.fromlist([text, category], fields))

    super(NEWS_20, self).__init__(examples, fields, **kwargs)
示例11: get_input_processor_words
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def get_input_processor_words(vocab_word, vocab_char=None, convert_digits=True):
    """
    Build a function that converts text into a processed batch. Required
    during inference.

    Parameters:
        vocab_word: Instance of torchtext.Vocab for input word vocabulary
        vocab_char[optional]: Instance of torchtext.Vocab for input per-word
                              character vocabulary
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A function taking one input or a list of inputs and returning a
        data.Batch that holds all of them.
    """
    def normalize_token(w):
        # Collapse all-digit tokens to '0' when digit conversion is on.
        if convert_digits and w.isdigit():
            return '0'
        return w

    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>",
                             batch_first=True, lower=True,
                             preprocessing=data.Pipeline(normalize_token))
    # The vocab is assigned directly rather than built from a training dataset.
    inputs_word.vocab = vocab_word

    if vocab_char is None:
        fields = [('inputs_word', inputs_word)]
    else:
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                         eos_token="<eos>", batch_first=True)
        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>", eos_token="<eos>")
        # Both the nested field and its inner field share the given char vocab.
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]

    def input_processor_fn(inputs):
        # Accept a single input as well as a list of inputs.
        lines = inputs if isinstance(inputs, list) else [inputs]
        examples = [data.Example.fromlist([line], fields) for line in lines]
        dataset = data.Dataset(examples, fields)
        # Entire input in one batch, on the GPU when one is available.
        device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
        return data.Batch(data=dataset,
                          dataset=dataset,
                          device=torch.device(device_name))

    return input_processor_fn
示例12: __init__
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data. Its
            `preprocessing` attribute is replaced by a clean_str pipeline.
        label_field: The field that will be used for label data.
        path: Directory containing 'rt-polarity.neg' and 'rt-polarity.pos';
            defaults to self.dirname when None.
        examples: The examples contain all the data. When given, nothing
            is read from disk.
        Remaining keyword arguments: Passed to the parent constructor.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        # (pattern, replacement) pairs applied in order. Replacements are raw
        # strings so that " \( ", " \) " and " \? " are no longer invalid
        # escape sequences; re.sub leaves these unknown escapes untouched, so
        # the output still has the backslash, as before.
        substitutions = (
            (r"[^A-Za-z0-9(),!?\'\`]", " "),
            (r"\'s", " 's"),
            (r"\'ve", " 've"),
            (r"n\'t", " n't"),
            (r"\'re", " 're"),
            (r"\'d", " 'd"),
            (r"\'ll", " 'll"),
            (r",", " , "),
            (r"!", " ! "),
            (r"\(", r" \( "),
            (r"\)", r" \) "),
            (r"\?", r" \? "),
            (r"\s{2,}", " "),
        )
        for pattern, replacement in substitutions:
            string = re.sub(pattern, replacement, string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        # Each line of a polarity file becomes one example with that file's label.
        # NOTE(review): no encoding is passed to open(), so the platform
        # default is used; undecodable bytes are dropped by errors='ignore'.
        for file_name, label in (('rt-polarity.neg', 'negative'),
                                 ('rt-polarity.pos', 'positive')):
            with open(os.path.join(path, file_name), errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, label], fields) for line in f]

    super(MR, self).__init__(examples, fields, **kwargs)
示例13: __init__
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import Pipeline [as 别名]
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data. Its
            `preprocessing` attribute is replaced by a clean_str pipeline.
        label_field: The field that will be used for label data.
        path: Directory containing 'rt-polarity.neg' and 'rt-polarity.pos';
            defaults to self.dirname when None.
        examples: The examples contain all the data. When given, nothing
            is read from disk.
        Remaining keyword arguments: Passed to the parent constructor.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        # (pattern, replacement) pairs applied in order. Replacements are raw
        # strings so that " \( ", " \) " and " \? " are no longer invalid
        # escape sequences; re.sub leaves these unknown escapes untouched, so
        # the output still has the backslash, as before.
        substitutions = (
            (r"[^A-Za-z0-9(),!?\'\`]", " "),
            (r"\'s", " 's"),
            (r"\'ve", " 've"),
            (r"n\'t", " n't"),
            (r"\'re", " 're"),
            (r"\'d", " 'd"),
            (r"\'ll", " 'll"),
            (r",", " , "),
            (r"!", " ! "),
            (r"\(", r" \( "),
            (r"\)", r" \) "),
            (r"\?", r" \? "),
            (r"\s{2,}", " "),
        )
        for pattern, replacement in substitutions:
            string = re.sub(pattern, replacement, string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        # Each line of a polarity file becomes one example with that file's
        # label; bytes that are not valid UTF-8 are dropped.
        for file_name, label in (('rt-polarity.neg', 'negative'),
                                 ('rt-polarity.pos', 'positive')):
            with codecs.open(os.path.join(path, file_name),
                             encoding='utf-8', errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, label], fields) for line in f]

    super(MR, self).__init__(examples, fields, **kwargs)