本文整理汇总了Python中torchtext.data.LabelField类的典型用法代码示例。如果您正苦于以下问题:Python data.LabelField的具体用法?Python data.LabelField怎么用?Python data.LabelField使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块torchtext.data的用法示例。
在下文中一共展示了data.LabelField类的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_vocab_size
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import LabelField [as 别名]
def test_vocab_size(self):
    """Check that a LabelField vocabulary contains only the labels themselves.

    Builds a TSV dataset via the test fixture helper, builds the label
    vocabulary from it, and verifies the frequency counter, the
    string-to-index map (which must not contain an ``<unk>`` entry), and
    the matching index-to-string list.
    """
    # One text field is shared by both question columns; labels get their own.
    text_field = data.Field(sequential=True)
    label_field = data.LabelField()

    # Adapted from test_build_vocab with minor changes: write the TSV
    # fixture to disk and wrap it in a TabularDataset.
    self.write_test_ppid_dataset(data_format="tsv")
    columns = [
        ("id", None),
        ("q1", text_field),
        ("q2", text_field),
        ("label", label_field),
    ]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path,
        format="tsv",
        fields=columns,
    )

    # The JSON variant is skipped here; the original build-vocab test covers it.
    label_field.build_vocab(dataset)
    assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})

    stoi_expected = {'1': 0, '0': 1}  # No <unk>
    assert dict(label_field.vocab.stoi) == stoi_expected

    # Invert the stoi mapping into an index-ordered itos list.
    itos_expected = sorted(stoi_expected, key=stoi_expected.get)
    assert label_field.vocab.itos == itos_expected
示例2: create
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import LabelField [as 别名]
def create(model_config, batch_size, vectors=None):
    """Create an IMDB dataset.

    Args:
        model_config: Configuration object; this function calls its
            ``data_dir('imdb')`` and ``torch_device()`` methods.
        batch_size: Batch size handed to the bucket iterators.
        vectors: Optional pretrained vectors forwarded to the text
            vocabulary build.

    Returns:
        A ``TextData`` holding the train/test datasets, the train/test
        iterators, and the text and label fields (in that order).
    """
    imdb_root = model_config.data_dir('imdb')

    text_field = data.Field(lower=True, tokenize='spacy', batch_first=True)
    label_field = data.LabelField(is_target=True)

    sources = IMDBCached.splits(
        root=imdb_root,
        text_field=text_field,
        label_field=label_field,
    )
    train_source, test_source = sources

    # Both vocabularies are derived from the training split only.
    text_field.build_vocab(train_source, max_size=25_000, vectors=vectors)
    label_field.build_vocab(train_source)

    iterators = data.BucketIterator.splits(
        (train_source, test_source),
        batch_size=batch_size,
        device=model_config.torch_device(),
        shuffle=True,
    )
    train_iterator, test_iterator = iterators

    return TextData(
        train_source,
        test_source,
        train_iterator,
        test_iterator,
        text_field,
        label_field,
    )
示例3: load_data
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import LabelField [as 别名]
def load_data(batch_size=32):
    """Load IMDB with GloVe vectors and return fields, iterators and embeddings.

    Args:
        batch_size: Batch size used for all three iterators.

    Returns:
        A 4-tuple of:
          * ``(text_field, label_field)``
          * ``(train_iter, test_iter, valid_iter)``
          * ``text_field.vocab.vectors`` (the GloVe vectors)
          * ``len(text_field.vocab)``

    Note:
        The vocabularies are built before the training data is split into
        train/validation, so validation examples contribute to the vocab.
    """
    def tokenize(s):
        # Plain whitespace tokenizer; nltk.word_tokenize(s) would be an
        # alternative.
        return s.split()

    # Fields: (text_field, label_field).
    print(':: creating fields')
    text_field = data.Field(
        sequential=True,
        tokenize=tokenize,
        lower=True,
        include_lengths=True,
        batch_first=True,
        fix_length=200,
    )
    label_field = data.LabelField(sequential=False)

    # Fetch the IMDB train/test splits.
    print(':: fetching IMDB data')
    train_data, test_data = datasets.IMDB.splits(text_field, label_field)

    # Build the vocabulary for each field from the (unsplit) training data.
    text_field.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    label_field.build_vocab(train_data)

    # Carve a validation set out of the training data.
    train_data, valid_data = train_data.split()
    print(':: labels :', label_field.vocab.stoi)

    # Iterators, in the same order as the datasets passed in.
    iterators = data.BucketIterator.splits(
        (train_data, test_data, valid_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True,
    )
    train_iter, test_iter, valid_iter = iterators

    fields = (text_field, label_field)
    glove_vectors = text_field.vocab.vectors
    vocab_size = len(text_field.vocab)
    return (fields, (train_iter, test_iter, valid_iter), glove_vectors, vocab_size)
示例4: test_init
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import LabelField [as 别名]
def test_init(self):
    """Verify LabelField ends up non-sequential with no unknown token."""
    # Default construction.
    default_field = data.LabelField()
    assert default_field.sequential is False
    assert default_field.unk_token is None

    # Construction with the preset arguments explicitly overridden: the
    # test expects the resulting attributes to be unchanged.
    overridden_field = data.LabelField(sequential=True, unk_token="<unk>")
    assert overridden_field.sequential is False
    assert overridden_field.unk_token is None
示例5: load_dataset
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import LabelField [as 别名]
def load_dataset(test_sen=None):
    """Load IMDB, build GloVe-backed vocabularies, and create bucket iterators.

    Concepts used below:
        tokenizer: Breaks a sentence into a list of words. If
            ``sequential=False``, no tokenization is applied.
        Field: Stores the preprocessing configuration for a column.
        fix_length: TorchText can pad each sequence dynamically to the
            longest sequence in its batch; here ``fix_length=200`` pads
            every sequence to a fixed length of 200 instead.
        build_vocab: Maps every unique word in ``train_data`` to an index,
            then uses the GloVe embeddings to map each index to its vector.
        vocab.vectors: Tensor of shape (vocab_size x embedding_dim) holding
            the pre-trained word embeddings.
        BucketIterator: Batches examples of similar lengths together to
            minimize the amount of padding needed.

    Args:
        test_sen: Not used by this function.

    Returns:
        ``(TEXT, vocab_size, word_embeddings, train_iter, valid_iter,
        test_iter)``.
    """
    def tokenize(x):
        # Whitespace tokenizer.
        return x.split()

    TEXT = data.Field(
        sequential=True,
        tokenize=tokenize,
        lower=True,
        include_lengths=True,
        batch_first=True,
        fix_length=200,
    )
    # NOTE(review): newer torchtext releases appear to use ``dtype`` in place
    # of ``tensor_type`` -- confirm against the pinned torchtext version.
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)

    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Split the training data again into a new training set and a
    # validation set.
    train_data, valid_data = train_data.split()
    iterators = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True,
    )
    train_iter, valid_iter, test_iter = iterators

    # Alternatively the default configuration could be used:
    #     train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)
    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
示例6: create_fields
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import LabelField [as 别名]
def create_fields(self, seq_input=True, seq_ner=True, seq_cat=False):
    """Build the sentence, NER-label and category-label fields.

    Args:
        seq_input: ``sequential`` flag for the sentence field.
        seq_ner: ``sequential`` flag for the NER label field.
        seq_cat: ``sequential`` flag passed to the category LabelField.

    Returns:
        ``(sentence_field, ner_label_field, category_label_field)``.

    Raises:
        KeyError: If ``self.level`` is neither ``"word"`` nor ``"char"``.
    """
    level = self.level
    if level not in ("word", "char"):
        raise KeyError("Sentence_field is undefined!")

    if level == "word":
        # Word level: custom preprocessing plus start/end markers.
        sentence_field = data.Field(
            sequential=seq_input,
            preprocessing=self.preprocessor,
            fix_length=self.fix_length,
            init_token="<start>",
            eos_token="<end>",
        )
    else:
        # Character level: custom tokenizer with a hard-coded length of 1014.
        sentence_field = data.Field(
            sequential=seq_input,
            tokenize=self.evil_workaround_tokenizer,
            fix_length=1014,
        )

    ner_label_field = data.Field(
        sequential=seq_ner,
        init_token="<start>",
        eos_token="<end>",
        unk_token=None,
    )
    category_label_field = data.LabelField(sequential=seq_cat)
    return sentence_field, ner_label_field, category_label_field
示例7: __init__
# 需要导入模块: from torchtext import data [as 别名]
# 或者: from torchtext.data import LabelField [as 别名]
def __init__(self, root_dir='data', batch_size=64, use_vector=True):
    """Load the train/dev/test TSV splits and create one iterator per split.

    Args:
        root_dir: Directory containing ``train.tsv``, ``dev.tsv`` and
            ``test.tsv``.
        batch_size: Batch size used for every iterator.
        use_vector: When true, attach the vectors from ``mr_vocab.txt``
            (cached in ``./``) to the text vocabulary.

    Attributes set:
        TEXT, LABEL: The text and label fields.
        dataset: Dict mapping split name to its ``TabularDataset``.
        dataloader: Dict mapping split name to its ``Iterator``.
    """
    self.TEXT = Field(sequential=True, use_vocab=True,
                      tokenize='spacy', lower=True, batch_first=True)
    self.LABEL = LabelField(tensor_type=torch.FloatTensor)

    dataset_path = os.path.join(root_dir, '{}.tsv')
    targets = ('train', 'dev', 'test')

    # Load every split first so the vocabulary can be built exactly once.
    self.dataset = {}
    for target in targets:
        self.dataset[target] = TabularDataset(
            path=dataset_path.format(target),
            format='tsv',
            fields=[('text', self.TEXT), ('label', self.LABEL)]
        )

    # Build the vocabularies from the training split only. Previously
    # build_vocab was re-run for each split, so each call replaced the
    # field's vocab and the final vocabulary (and label mapping) came from
    # the test split alone, while all iterators share the same fields.
    if use_vector:
        # Load the vectors file only when it is actually requested.
        vectors = Vectors(name='mr_vocab.txt', cache='./')
        self.TEXT.build_vocab(self.dataset['train'], max_size=25000, vectors=vectors)
    else:
        self.TEXT.build_vocab(self.dataset['train'], max_size=25000)
    # NOTE(review): this assumes every label in dev/test also occurs in
    # train -- confirm for the dataset in use.
    self.LABEL.build_vocab(self.dataset['train'])

    self.dataloader = {}
    for target in targets:
        self.dataloader[target] = Iterator(self.dataset[target],
                                           batch_size=batch_size,
                                           device=None,
                                           repeat=False,
                                           sort_key=lambda x: len(x.text),
                                           shuffle=True)