本文整理汇总了Python中torchtext.data模块的典型用法代码示例。如果您正苦于以下问题:Python torchtext.data的具体用法?Python torchtext.data怎么用?Python torchtext.data使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该模块所在的包torchtext的用法示例。
在下文中一共展示了torchtext.data方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_fields
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Delegate field construction to the dataset class matching ``data_type``.

    Args:
        data_type: type of the source input.
            Handled values are [text|img|video|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.

    Returns:
        The result of the selected dataset class's ``get_fields`` (a
        dictionary whose keys are strings and whose values are the
        corresponding Field objects), or ``None`` when ``data_type`` is
        not one of the handled values.
    """
    # Only the class for the matching type is looked up.
    if data_type == 'text':
        dataset_cls = TextDataset
    elif data_type == 'img':
        dataset_cls = ImageDataset
    elif data_type == 'video':
        dataset_cls = VideoDataset
    elif data_type == 'audio':
        dataset_cls = AudioDataset
    else:
        return None
    return dataset_cls.get_fields(n_src_features, n_tgt_features)
示例2: create_batches
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def create_batches(self):
    """Populate ``self.batches`` for the current mode.

    When ``self.train`` is false, ``self.batches`` becomes a list: the data
    is cut into batches with ``torchtext.data.batch`` and each batch is
    sorted by ``self.sort_key``.

    When ``self.train`` is true, ``self.batches`` becomes a lazy generator.
    It reads the data in chunks of ``self.batch_size * 100`` examples, sorts
    each chunk by ``self.sort_key``, re-batches the sorted chunk, and yields
    those batches in the order produced by ``self.random_shuffler``.
    """
    if not self.train:
        self.batches = []
        eval_batches = torchtext.data.batch(
            self.data(), self.batch_size, self.batch_size_fn)
        for chunk in eval_batches:
            self.batches.append(sorted(chunk, key=self.sort_key))
        return

    def _pool(examples, shuffle):
        # Nothing here runs until the generator is first iterated.
        for bucket in torchtext.data.batch(examples, self.batch_size * 100):
            ordered = sorted(bucket, key=self.sort_key)
            minibatches = torchtext.data.batch(
                ordered, self.batch_size, self.batch_size_fn)
            for minibatch in shuffle(list(minibatches)):
                yield minibatch

    self.batches = _pool(self.data(), self.random_shuffler)
示例3: _merge_field_vocabs
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def _merge_field_vocabs(knl_field, src_field, tgt_field, vocab_size, min_freq):
    """Give the three fields one shared vocabulary.

    The ``vocab.freqs`` counters of all three fields are added together and
    a single ``Vocab`` is built from the total, using the target field's
    unk/pad/init/eos tokens as specials. That vocabulary is then assigned to
    every field.

    Args:
        knl_field: field whose ``vocab.freqs`` is merged and whose ``vocab``
            is replaced.
        src_field: same role as ``knl_field``.
        tgt_field: same role; it also supplies the special tokens.
        vocab_size: passed to ``Vocab`` as ``max_size``.
        min_freq: passed to ``Vocab`` as ``min_freq``.

    Returns:
        None. The fields are modified in place.
    """
    # in the long run, shouldn't it be possible to do this by calling
    # build_vocab with both the src and tgt data?
    special_tokens = [
        tgt_field.unk_token,
        tgt_field.pad_token,
        tgt_field.init_token,
        tgt_field.eos_token,
    ]
    all_fields = (knl_field, src_field, tgt_field)

    total_freqs = Counter()
    for field in all_fields:
        total_freqs = total_freqs + field.vocab.freqs

    shared_vocab = Vocab(
        total_freqs,
        specials=special_tokens,
        max_size=vocab_size,
        min_freq=min_freq,
    )
    for field in all_fields:
        field.vocab = shared_vocab
    assert len(src_field.vocab) == len(tgt_field.vocab) == len(knl_field.vocab)
示例4: load_dataloaders
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def load_dataloaders(args):
    """Build the training iterator and its two text fields from ./data/df.csv.

    If the CSV file is missing, ``tokenize_data(args)`` is called first
    (presumably it creates the file; confirm against its definition).

    Args:
        args: object providing ``batch_size``; it is also passed to
            ``tokenize_data``.

    Returns:
        Tuple ``(train_iter, FR, EN, train_length)``: the ``BucketIterator``
        over the dataset, the "FR" field, the "EN" field, and the number of
        examples in the dataset.
    """
    logger.info("Preparing dataloaders...")

    # The FR field is the only one given start/end-of-sequence tokens.
    fr_field = torchtext.data.Field(
        tokenize=dum_tokenizer,
        lower=True,
        init_token="<sos>",
        eos_token="<eos>",
        batch_first=True,
    )
    en_field = torchtext.data.Field(
        tokenize=dum_tokenizer, lower=True, batch_first=True)

    csv_path = os.path.join("./data/", "df.csv")
    if not os.path.isfile(csv_path):
        tokenize_data(args)

    dataset = torchtext.data.TabularDataset(
        csv_path,
        format="csv",
        fields=[("EN", en_field), ("FR", fr_field)],
    )
    for field in (fr_field, en_field):
        field.build_vocab(dataset)

    def _length_key(example):
        # NOTE(review): subscripts the example by column name; confirm that
        # the example type supports item access if this key is ever invoked.
        return (len(example["EN"]), len(example["FR"]))

    batches = BucketIterator(
        dataset,
        batch_size=args.batch_size,
        repeat=False,
        sort_key=_length_key,
        shuffle=True,
        train=True,
    )
    n_examples = len(dataset)
    logger.info("Loaded dataloaders.")
    return batches, fr_field, en_field, n_examples
示例5: get_fields
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Return the fields built by the dataset class for ``data_type``.

    Args:
        data_type: type of the source input. Options are [text|img|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.

    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects, as produced by the matching dataset
        class. ``None`` if ``data_type`` is not one of the options.
    """
    if data_type == 'text':
        return TextDataset.get_fields(n_src_features, n_tgt_features)
    if data_type == 'img':
        return ImageDataset.get_fields(n_src_features, n_tgt_features)
    if data_type == 'audio':
        return AudioDataset.get_fields(n_src_features, n_tgt_features)
    # Unrecognised types yield no fields.
    return None
示例6: _old_style_vocab
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def _old_style_vocab(vocab):
"""Detect old-style vocabs (``List[Tuple[str, torchtext.data.Vocab]]``).
Args:
vocab: some object loaded from a *.vocab.pt file
Returns:
Whether ``vocab`` is a list of pairs where the second object
is a :class:`torchtext.vocab.Vocab` object.
This exists because previously only the vocab objects from the fields
were saved directly, not the fields themselves, and the fields needed to
be reconstructed at training and translation time.
"""
return isinstance(vocab, list) and \
any(isinstance(v[1], Vocab) for v in vocab)
示例7: _merge_field_vocabs
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def _merge_field_vocabs(src_field, tgt_field, vocab_size, min_freq,
                        vocab_size_multiple):
    """Make the source and target fields share one vocabulary.

    The two fields' ``vocab.freqs`` counters are added, and a ``Vocab`` is
    built from the sum with the target field's unk/pad/init/eos tokens as
    specials. When ``vocab_size_multiple`` is greater than 1 the new vocab
    is passed to ``_pad_vocab_to_multiple`` before being assigned to both
    fields.

    Args:
        src_field: field whose ``vocab.freqs`` is merged and whose ``vocab``
            is replaced.
        tgt_field: same role; it also supplies the special tokens.
        vocab_size: passed to ``Vocab`` as ``max_size``.
        min_freq: passed to ``Vocab`` as ``min_freq``.
        vocab_size_multiple: padding is applied only when this exceeds 1.

    Returns:
        None. Both fields are modified in place.
    """
    # in the long run, shouldn't it be possible to do this by calling
    # build_vocab with both the src and tgt data?
    special_tokens = [
        tgt_field.unk_token,
        tgt_field.pad_token,
        tgt_field.init_token,
        tgt_field.eos_token,
    ]
    combined_freqs = Counter() + src_field.vocab.freqs + tgt_field.vocab.freqs
    shared_vocab = Vocab(
        combined_freqs,
        specials=special_tokens,
        max_size=vocab_size,
        min_freq=min_freq,
    )
    if vocab_size_multiple > 1:
        _pad_vocab_to_multiple(shared_vocab, vocab_size_multiple)
    src_field.vocab = tgt_field.vocab = shared_vocab
    assert len(src_field.vocab) == len(tgt_field.vocab)
示例8: create_batches
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def create_batches(self):
    """Populate ``self.batches`` for the current mode.

    When ``self.train`` is false, ``self.batches`` becomes a list: the data
    is cut into batches with ``batch_iter`` and each batch is sorted by
    ``self.sort_key``.

    When ``self.train`` is true, ``self.batches`` becomes a lazy generator.
    It reads the data in chunks of ``self.batch_size * 100`` examples, sorts
    each chunk by ``self.sort_key``, re-batches it with ``batch_iter``, and
    yields those batches in the order produced by ``self.random_shuffler``.
    """
    if not self.train:
        self.batches = []
        eval_batches = batch_iter(
            self.data(),
            self.batch_size,
            batch_size_fn=self.batch_size_fn,
            batch_size_multiple=self.batch_size_multiple)
        for chunk in eval_batches:
            self.batches.append(sorted(chunk, key=self.sort_key))
        return

    def _pool(examples, shuffle):
        # Nothing here runs until the generator is first iterated.
        for bucket in torchtext.data.batch(examples, self.batch_size * 100):
            ordered = sorted(bucket, key=self.sort_key)
            minibatches = batch_iter(
                ordered,
                self.batch_size,
                batch_size_fn=self.batch_size_fn,
                batch_size_multiple=self.batch_size_multiple)
            for minibatch in shuffle(list(minibatches)):
                yield minibatch

    self.batches = _pool(self.data(), self.random_shuffler)
示例9: get_fields
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Delegate field construction to the dataset class matching ``data_type``.

    Args:
        data_type: type of the source input.
            Handled values are [text|img|audio|gcn].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.

    Returns:
        The result of the selected dataset class's ``get_fields`` (a
        dictionary whose keys are strings and whose values are the
        corresponding Field objects), or ``None`` when ``data_type`` is
        not one of the handled values.
    """
    # Only the class for the matching type is looked up.
    if data_type == 'text':
        dataset_cls = TextDataset
    elif data_type == 'img':
        dataset_cls = ImageDataset
    elif data_type == 'audio':
        dataset_cls = AudioDataset
    elif data_type == 'gcn':
        dataset_cls = GCNDataset
    else:
        return None
    return dataset_cls.get_fields(n_src_features, n_tgt_features)
示例10: get_morph
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def get_morph(batch):
    """Reshape the batch's morphology tensor and build its mask.

    Args:
        batch: object with a ``src`` attribute whose first element is a
            tensor with at least two dimensions, and a ``morph`` tensor
            holding ``src.size(0) * 6 * src.size(1)`` elements.

    Returns:
        Tuple ``(morph_index, morph_mask)``:
        ``morph_index`` is ``batch.morph`` viewed as
        ``(src.size(0), 6, src.size(1))`` and permuted to a contiguous
        ``(src.size(1), src.size(0), 6)`` tensor.
        ``morph_mask`` is a float tensor of the same shape, 0.0 where
        ``morph_index`` equals 1 and 1.0 elsewhere (presumably 1 is the
        padding index; confirm against the morph vocabulary).
    """
    src = batch.src[0]
    # Not very nice, but the value of the opt.gpuid command line parameter
    # is not accessible here, so the device is inferred from the source.
    use_cuda = src.is_cuda

    src_size = src.data.size()
    morph_index = batch.morph.view((src_size[0], 6, src_size[1]))
    morph_index = morph_index.permute(2, 0, 1).contiguous()

    # Same result as the former lt(eq(x, 1), 1): 1.0 wherever the value
    # is not 1.
    morph_mask = torch.ne(morph_index, 1).float()

    if use_cuda:
        # Target the GPU that holds the source tensor; a bare .cuda() would
        # use the current default device, which may be a different GPU.
        gpu = src.get_device()
        morph_index = morph_index.cuda(gpu)
        morph_mask = morph_mask.cuda(gpu)
    return morph_index, morph_mask
示例11: make_features
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def make_features(batch, side):
    """
    Stack the main tensor of one side with its feature tensors.

    Args:
        batch: object whose attributes (read through ``__dict__``) hold the
            ``src``/``tgt`` data and any ``<side>_feat_*`` tensors.
        side (str): for source or for target; must be 'src' or 'tgt'.

    Returns:
        A tensor built by unsqueezing the side's data and each feature
        tensor at dim 2 and concatenating them along dim 2. The data comes
        first, followed by the features ordered by attribute name. For
        (len x batch) inputs the result is (len x batch x n_levels).
    """
    assert side in ['src', 'tgt']
    attrs = batch.__dict__

    data = attrs[side]
    if isinstance(data, tuple):
        # A tuple-valued side keeps the data tensor in its first slot.
        data = data[0]

    feat_start = side + "_feat_"
    # Order the features by attribute name. Calling sorted() on the tensors
    # themselves compares their values, which raises as soon as two
    # multi-element feature tensors are present.
    feat_keys = sorted(k for k in attrs if feat_start in k)
    levels = [data] + [attrs[k] for k in feat_keys]
    return torch.cat([level.unsqueeze(2) for level in levels], 2)
示例12: collapse_copy_scores
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def collapse_copy_scores(self, scores, batch, tgt_vocab):
    """Fold copy scores into the matching target-vocabulary scores.

    ``scores`` is indexed as ``[:, example, vocab_position]``; positions at
    or beyond ``len(tgt_vocab)`` belong to the example's own source
    vocabulary. For every source word (index 0 excluded) that
    ``tgt_vocab.stoi`` maps to a non-zero index, its copy score is added to
    that target index and the copy position is then filled with 1e-20.

    Args:
        scores: tensor modified in place.
        batch: provides ``batch_size`` and ``indices.data``, which selects
            the entry of ``self.src_vocabs`` for each example.
        tgt_vocab: target vocabulary with ``stoi`` and a length.

    Returns:
        The same ``scores`` tensor that was passed in.
    """
    n_tgt = len(tgt_vocab)
    for row in range(batch.batch_size):
        example_vocab = self.src_vocabs[batch.indices.data[row]]
        for src_idx in range(1, len(example_vocab)):
            word = example_vocab.itos[src_idx]
            tgt_idx = tgt_vocab.stoi[word]
            if tgt_idx == 0:
                # Index 0 means no usable target entry for this word.
                continue
            copy_slot = n_tgt + src_idx
            scores[:, row, tgt_idx] += scores[:, row, copy_slot]
            scores[:, row, copy_slot].fill_(1e-20)
    return scores
示例13: predict
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def predict(test_mode, dataset_iter):
    """Score every batch of ``dataset_iter`` and log the resulting metrics.

    Puts the module-level ``model`` in eval mode, resets the iterator, and
    collects ids, scores and labels across all batches. The collected lists
    are passed to ``get_map_mrr`` and the two values it returns are logged.

    Args:
        test_mode: not used by this function.
        dataset_iter: iterator with ``init_epoch()`` whose batches expose
            ``id`` and ``label`` tensors.

    Returns:
        None. The metrics are only logged.
    """
    def _as_numpy(tensor):
        return tensor.cpu().data.numpy()

    model.eval()
    dataset_iter.init_epoch()

    qids, predictions, labels = [], [], []
    for dev_batch in dataset_iter:
        batch_qids = np.transpose(_as_numpy(dev_batch.id))
        batch_labels = np.transpose(_as_numpy(dev_batch.label))

        features = model.convModel(dev_batch)
        batch_scores = _as_numpy(model.linearLayer(features)).reshape(-1)

        qids.extend(batch_qids.tolist())
        predictions.extend(batch_scores.tolist())
        labels.extend(batch_labels.tolist())

    dev_map, dev_mrr = get_map_mrr(qids, predictions, labels)
    logger.info("{} {}".format(dev_map, dev_mrr))
# Run the model on the dev set
示例14: create_batches
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def create_batches(self):
    """Populate ``self.batches`` according to the iterator's mode.

    * Not training: a list of batches from ``batch_iter``, each sorted by
      ``self.sort_key``.
    * Training with ``self.yield_raw_example``: the result of ``batch_iter``
      called with batch size 1, no ``batch_size_fn`` and multiple 1.
    * Training otherwise: the result of ``_pool`` called with the batch
      size settings, sort key, shuffler and pool factor.
    """
    if not self.train:
        self.batches = []
        eval_batches = batch_iter(
            self.data(),
            self.batch_size,
            batch_size_fn=self.batch_size_fn,
            batch_size_multiple=self.batch_size_multiple)
        for chunk in eval_batches:
            self.batches.append(sorted(chunk, key=self.sort_key))
        return

    if self.yield_raw_example:
        self.batches = batch_iter(
            self.data(), 1, batch_size_fn=None, batch_size_multiple=1)
        return

    self.batches = _pool(
        self.data(),
        self.batch_size,
        self.batch_size_fn,
        self.batch_size_multiple,
        self.sort_key,
        self.random_shuffler,
        self.pool_factor)
示例15: make_features
# 需要导入模块: import torchtext [as 别名]
# 或者: from torchtext import data [as 别名]
def make_features(batch, side, data_type='text'):
    """
    Collect the main tensor of one side together with its feature tensors.

    Args:
        batch: object whose attributes (read through ``__dict__``) hold the
            ``src``/``tgt`` data and any ``<side>_feat_*`` tensors.
        side (str): for source or for target; must be 'src' or 'tgt'.
        data_type (str): type of the source input. Options are [text|img].

    Returns:
        For ``data_type == 'text'``: the side's data and its features
        (ordered by attribute name), each unsqueezed at dim 2 and
        concatenated along dim 2. For (len x batch) inputs the result is
        (len x batch x n_levels).
        For any other ``data_type``: the side's data unchanged.
    """
    assert side in ['src', 'tgt']
    attrs = batch.__dict__

    primary = attrs[side]
    if isinstance(primary, tuple):
        # A tuple-valued side keeps the data tensor in its first slot.
        primary = primary[0]

    prefix = side + "_feat_"
    stacked = [primary]
    for name in sorted(attrs):
        if prefix in name:
            stacked.append(attrs[name])

    if data_type != 'text':
        return stacked[0]
    return torch.cat([tensor.unsqueeze(2) for tensor in stacked], 2)