This article collects typical usage examples of fairseq.data.FairseqDataset in Python. If you have been wondering how data.FairseqDataset is used in practice, or are looking for concrete examples of it, the curated code samples below may help. You can also explore further usage examples of the containing module, fairseq.data.
Below are 10 code examples of data.FairseqDataset, sorted by popularity by default.
Example 1: dataset
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def dataset(self, split):
    """
    Return a loaded dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)

    Returns:
        a :class:`~fairseq.data.FairseqDataset` corresponding to *split*
    """
    from fairseq.data import FairseqDataset

    if split not in self.datasets:
        raise KeyError("Dataset not loaded: " + split)
    if not isinstance(self.datasets[split], FairseqDataset):
        raise TypeError("Datasets are expected to be of type FairseqDataset")
    return self.datasets[split]
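For orientation, here is a minimal, hypothetical sketch of how this accessor is normally used: a task fills self.datasets in load_dataset() and callers retrieve splits through dataset(split), which enforces the FairseqDataset type check. DummyTask and DummyDataset are illustrative names, not part of fairseq.

import torch
from fairseq.data import FairseqDataset


class DummyDataset(FairseqDataset):
    """Hypothetical toy dataset: four single-token examples."""

    def __init__(self, n=4):
        self.items = [torch.tensor([i]) for i in range(n)]

    def __getitem__(self, index):
        return self.items[index]

    def __len__(self):
        return len(self.items)

    def collater(self, samples):
        return torch.stack(samples)


class DummyTask:
    """Hypothetical task exposing the dataset() accessor from Example 1."""

    def __init__(self):
        self.datasets = {}

    def load_dataset(self, split):
        self.datasets[split] = DummyDataset()

    def dataset(self, split):
        if split not in self.datasets:
            raise KeyError("Dataset not loaded: " + split)
        if not isinstance(self.datasets[split], FairseqDataset):
            raise TypeError("Datasets are expected to be of type FairseqDataset")
        return self.datasets[split]


task = DummyTask()
task.load_dataset("train")
print(len(task.dataset("train")))  # -> 4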
Example 2: dataset
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def dataset(self, split):
    """
    Return a loaded dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)

    Returns:
        a :class:`~fairseq.data.FairseqDataset` corresponding to *split*
    """
    from fairseq.data import FairseqDataset

    if split not in self.datasets:
        raise KeyError('Dataset not loaded: ' + split)
    if not isinstance(self.datasets[split], FairseqDataset):
        raise TypeError('Datasets are expected to be of type FairseqDataset')
    return self.datasets[split]
Example 3: dataset
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def dataset(self, split):
    """Return a dataset split."""
    if split not in self.datasets:
        raise KeyError('Dataset not loaded: ' + split)
    if not isinstance(self.datasets[split], FairseqDataset):
        raise TypeError('Datasets are expected to be of type FairseqDataset')
    return self.datasets[split]
Example 4: train_step
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def train_step(
    self, sample, model, criterion, optimizer, update_num, ignore_grad=False
):
    """
    Do forward and backward, and return the loss as computed by *criterion*
    for the given *model* and *sample*.

    Args:
        sample (dict): the mini-batch. The format is defined by the
            :class:`~fairseq.data.FairseqDataset`.
        model (~fairseq.models.BaseFairseqModel): the model
        criterion (~fairseq.criterions.FairseqCriterion): the criterion
        optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
        update_num (int): the current update
        ignore_grad (bool): multiply loss by 0 if this is set to True

    Returns:
        tuple:
            - the loss
            - the sample size, which is used as the denominator for the
              gradient
            - logging outputs to display while training
    """
    model.train()
    model.set_num_updates(update_num)
    with torch.autograd.profiler.record_function("forward"):
        loss, sample_size, logging_output = criterion(model, sample)
    if ignore_grad:
        loss *= 0
    with torch.autograd.profiler.record_function("backward"):
        optimizer.backward(loss)
    return loss, sample_size, logging_output
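A hedged sketch of how an outer training loop might drive train_step. In real fairseq this logic lives in the Trainer; the loop below and the batch_iterator, task, model, criterion, and optimizer variables are illustrative assumptions, not code from the examples above.

# Hypothetical outer loop around Example 4's train_step.
for update_num, sample in enumerate(batch_iterator):
    optimizer.zero_grad()
    loss, sample_size, logging_output = task.train_step(
        sample, model, criterion, optimizer, update_num
    )
    # train_step already called optimizer.backward(loss); normalize the
    # accumulated gradients by the sample size and apply the update.
    optimizer.multiply_grads(1.0 / max(sample_size, 1))
    optimizer.step()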
Example 5: dataset
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def dataset(self, split):
    """Return a dataset split."""
    from fairseq.data import FairseqDataset

    if split not in self.datasets:
        raise KeyError('Dataset not loaded: ' + split)
    if not isinstance(self.datasets[split], FairseqDataset):
        raise TypeError('Datasets are expected to be of type FairseqDataset')
    return self.datasets[split]
Example 6: train_step
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def train_step(
    self, sample, model, criterion, optimizer, update_num, ignore_grad=False
):
    """
    Do forward and backward, and return the loss as computed by *criterion*
    for the given *model* and *sample*.

    Args:
        sample (dict): the mini-batch. The format is defined by the
            :class:`~fairseq.data.FairseqDataset`.
        model (~fairseq.models.BaseFairseqModel): the model
        criterion (~fairseq.criterions.FairseqCriterion): the criterion
        optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
        update_num (int): the current update
        ignore_grad (bool): multiply loss by 0 if this is set to True

    Returns:
        tuple:
            - the loss
            - the sample size, which is used as the denominator for the
              gradient
            - logging outputs to display while training
    """
    model.train()
    model.set_num_updates(update_num)
    loss, sample_size, logging_output = criterion(model, sample)
    if ignore_grad:
        loss *= 0
    optimizer.backward(loss)
    return loss, sample_size, logging_output
Example 7: train_step
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def train_step(self, sample, model, criterion, optimizer, ignore_grad=False):
    """
    Do forward and backward, and return the loss as computed by *criterion*
    for the given *model* and *sample*.

    Args:
        sample (dict): the mini-batch. The format is defined by the
            :class:`~fairseq.data.FairseqDataset`.
        model (~fairseq.models.BaseFairseqModel): the model
        criterion (~fairseq.criterions.FairseqCriterion): the criterion
        optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
        ignore_grad (bool): multiply loss by 0 if this is set to True

    Returns:
        tuple:
            - the loss
            - the sample size, which is used as the denominator for the
              gradient
            - logging outputs to display while training
    """
    model.train()
    loss, sample_size, logging_output = criterion(model, sample)
    if ignore_grad:
        loss *= 0
    optimizer.backward(loss)
    return loss, sample_size, logging_output
Example 8: __init__
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def __init__(
    self,
    dataset: FairseqDataset,
    sizes: np.ndarray,
    vocab: Dictionary,
    pad_idx: int,
    mask_idx: int,
    classif_token_idx: int,
    sep_token_idx: int,
    seed: int = 1,
    shuffle: bool = True,
    has_pairs: bool = True,
    segment_id: int = 0,
    masking_ratio: float = 0.15,
    masking_prob: float = 0.8,
    random_token_prob: float = 0.1
):
    # Make sure the input datasets are the ones supported
    assert (
        isinstance(dataset, TokenBlockDataset) or
        isinstance(dataset, BlockPairDataset) or
        isinstance(dataset, ConcatDataset)
    ), "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or " \
       "ConcatDataset"
    self.dataset = dataset
    self.sizes = np.array(sizes)
    self.vocab = vocab
    self.pad_idx = pad_idx
    self.mask_idx = mask_idx
    self.classif_token_idx = classif_token_idx
    self.sep_token_idx = sep_token_idx
    self.shuffle = shuffle
    self.seed = seed
    self.has_pairs = has_pairs
    self.segment_id = segment_id
    self.masking_ratio = masking_ratio
    self.masking_prob = masking_prob
    self.random_token_prob = random_token_prob
    # If we have only one block then sizes needs to be updated to include
    # the classification token
    if not has_pairs:
        self.sizes = self.sizes + 1
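To make the last three constructor arguments concrete, here is a standalone, hedged illustration of BERT-style masking with the same defaults (this is not MaskedLMDataset's actual implementation): roughly masking_ratio of the positions are chosen for prediction, and each chosen token is replaced by mask_idx with probability masking_prob, by a random token with probability random_token_prob, and otherwise kept unchanged.

import numpy as np


def mask_tokens(tokens, mask_idx, vocab_size, masking_ratio=0.15,
                masking_prob=0.8, random_token_prob=0.1, seed=1):
    """Illustrative BERT-style masking; an assumption, not fairseq's exact code."""
    rng = np.random.RandomState(seed)
    tokens = np.array(tokens, dtype=np.int64)
    targets = np.full_like(tokens, -1)  # -1 marks positions that are not predicted
    n_mask = max(1, int(round(len(tokens) * masking_ratio)))
    for pos in rng.choice(len(tokens), n_mask, replace=False):
        targets[pos] = tokens[pos]
        r = rng.random_sample()
        if r < masking_prob:
            tokens[pos] = mask_idx                 # e.g. 80%: replace with the mask token
        elif r < masking_prob + random_token_prob:
            tokens[pos] = rng.randint(vocab_size)  # e.g. 10%: replace with a random token
        # remaining probability mass: keep the original token
    return tokens, targets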
Example 9: get_batch_iterator
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
):
    assert isinstance(dataset, FairseqDataset)

    # get indices ordered by example size
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    indices = data_utils.filter_by_size(
        indices,
        dataset,
        max_positions,
        raise_exception=(not ignore_invalid_inputs),
    )

    # create mini-batches with given size constraints
    batch_sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # return a reusable, sharded iterator
    return ptt_iterators.WeightedEpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        weights=self.loss_weights,
    )
Example 10: get_batch_iterator
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import FairseqDataset [as alias]
def get_batch_iterator(
    self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
    ignore_invalid_inputs=False, required_batch_size_multiple=1,
    seed=1, num_shards=1, shard_id=0, num_workers=0,
):
    """
    Get an iterator that yields batches of data from the given dataset.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): max sentence length supported by the
            model (default: None).
        ignore_invalid_inputs (bool, optional): don't raise Exception for
            sentences that are too long (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed for random number generator for
            reproducibility (default: 1).
        num_shards (int, optional): shard the data iterator into N
            shards (default: 1).
        shard_id (int, optional): which shard of the data iterator to
            return (default: 0).
        num_workers (int, optional): how many subprocesses to use for data
            loading. 0 means the data will be loaded in the main process
            (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
        given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # get indices ordered by example size
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    indices = data_utils.filter_by_size(
        indices, dataset.size, max_positions,
        raise_exception=(not ignore_invalid_inputs),
    )

    # create mini-batches with given size constraints
    batch_sampler = data_utils.batch_by_size(
        indices, dataset.num_tokens, max_tokens=max_tokens, max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # return a reusable, sharded iterator
    return iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
    )
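Finally, a rough usage sketch of consuming the returned iterator, assuming a task instance with the method above and an already loaded 'train' split; the parameter values are illustrative, not defaults from the examples.

epoch_itr = task.get_batch_iterator(
    dataset=task.dataset("train"),
    max_tokens=4096,
    max_positions=1024,
)
# EpochBatchIterator is reusable across epochs; next_epoch_itr() starts one pass.
itr = epoch_itr.next_epoch_itr(shuffle=True)
for sample in itr:
    # each `sample` is one mini-batch produced by dataset.collater()
    pass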