This page collects typical usage examples of the Python class fairseq.data.Dictionary. If you have been wondering what fairseq.data.Dictionary is for and how it is used in practice, the curated code examples below should help. You can also explore the fairseq.data module in which the class is defined.
The following shows 10 code examples of data.Dictionary, ordered by popularity.
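Before the examples, here is a minimal sketch of the Dictionary API they all rely on. The method names match fairseq; the exact indices of the special symbols may differ between fairseq versions, so treat the values in the comments as illustrative.

from fairseq.data import Dictionary

d = Dictionary()                          # starts with the special symbols (bos/pad/eos/unk)
idx = d.add_symbol('hello')               # returns the index assigned to 'hello'
print(len(d))                             # vocabulary size, special symbols included
print(d.index('hello') == idx)            # True
print(d.index('never-seen') == d.unk())   # unknown words map to the unk index
print(d.pad(), d.eos(), d.unk())          # special-symbol indices (1, 2, 3 in recent versions)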
Example 1: test_character_token_embedder
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def test_character_token_embedder(self):
    vocab = Dictionary()
    vocab.add_symbol('hello')
    vocab.add_symbol('there')

    embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2)

    test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
    max_len = max(len(s) for s in test_sents)
    input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad())
    for i in range(len(test_sents)):
        input[i][0] = vocab.eos()
        for j in range(len(test_sents[i])):
            input[i][j + 1] = vocab.index(test_sents[i][j])
        input[i][j + 2] = vocab.eos()

    embs = embedder(input)

    assert embs.size() == (len(test_sents), max_len + 2, 5)
    self.assertAlmostEqual(embs[0][0], embs[1][0])
    self.assertAlmostEqual(embs[0][0], embs[0][-1])
    self.assertAlmostEqual(embs[0][1], embs[2][1])
    self.assertAlmostEqual(embs[0][3], embs[1][1])

    embs.sum().backward()
    assert embedder.char_embeddings.weight.grad is not None
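In the fairseq test suite this method sits inside a unittest.TestCase whose assertAlmostEqual is overridden to compare tensors rather than scalars; a helper along these lines is assumed (a sketch, not necessarily the verbatim fairseq code):

def assertAlmostEqual(self, t1, t2):
    # tensor-aware variant: same shape and element-wise difference below a small tolerance
    self.assertEqual(t1.size(), t2.size(), "size mismatch")
    self.assertLess((t1 - t2).abs().max(), 1e-6)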
Example 2: dummy_dictionary
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def dummy_dictionary(vocab_size, prefix='token_'):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d
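A quick usage sketch; the length check assumes the default special symbols are counted in len(d):

d = dummy_dictionary(5)
print(len(d) == 5 + d.nspecial)         # True: the added tokens plus the special symbols
print(d.index('token_3') != d.unk())    # True: all dummy tokens are in the dictionary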
Example 3: setup_task
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    dictionary = Dictionary()
    for i in range(args.dict_size):
        dictionary.add_symbol('word{}'.format(i))
    logger.info('dictionary: {} types'.format(len(dictionary)))
    return cls(args, dictionary)
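The snippet is a classmethod cut out of its class (the @classmethod decorator is not shown); args.dict_size controls the vocabulary size and cls(args, dictionary) instantiates the surrounding task. A hedged sketch of the kind of dummy task it typically belongs to; the class name, base class, and argument registration here are illustrative, not the exact fairseq source:

import argparse
from fairseq.tasks import LegacyFairseqTask   # plain FairseqTask in older fairseq releases

class DummyTask(LegacyFairseqTask):           # illustrative name
    @staticmethod
    def add_args(parser):
        parser.add_argument('--dict-size', default=1000, type=int)

    def __init__(self, args, dictionary):
        super().__init__(args)
        self.dictionary = dictionary

    # the setup_task classmethod from example 3 goes here, decorated with @classmethod

Calling DummyTask.setup_task(argparse.Namespace(dict_size=1000)) would then build a 1000-word dictionary and return the task instance.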
Example 4: __init__
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
    super().__init__(Dictionary())
    self.embed = nn.Embedding(
        num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
    )
    self.layers_a = nn.ModuleList([
        nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 3*embed_dim),  # q, k, v input projection
            nn.Linear(3*embed_dim, embed_dim),  # skip self-attention
            nn.Linear(embed_dim, embed_dim),    # output projection
            nn.Dropout(),
        )
        for i in range(num_layers)
    ])
    self.layers_b = nn.ModuleList([
        nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 4*embed_dim),  # FFN
            nn.ReLU(),
            nn.Linear(4*embed_dim, embed_dim),  # FFN
            nn.Dropout(0.1),
        )
        for i in range(num_layers)
    ])
    self.out_proj = nn.Linear(embed_dim, num_embed)
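With the default sizes (num_embed=50000, embed_dim=1024, num_layers=24), the module defined by this constructor is deliberately large. A rough parameter count, ignoring LayerNorm weights and biases, can be sketched as:

num_embed, embed_dim, num_layers = 50000, 1024, 24
embed = num_embed * embed_dim                                                  # token embedding
per_layer_a = embed_dim*3*embed_dim + 3*embed_dim*embed_dim + embed_dim*embed_dim  # attention-like projections
per_layer_b = embed_dim*4*embed_dim + 4*embed_dim*embed_dim                        # FFN
out = embed_dim * num_embed                                                    # output projection
total = embed + num_layers * (per_layer_a + per_layer_b) + out
print(f'{total / 1e6:.0f}M parameters (weights only)')                         # roughly 480M with the defaults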
Example 5: padding_idx
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def padding_idx(self):
    return Dictionary().pad() if self.vocab is None else self.vocab.pad()
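In the fairseq source this is typically exposed as a @property of CharacterTokenEmbedder; the fallback works because a freshly constructed Dictionary already carries its special symbols. An illustrative check, assuming the default special-symbol layout (which may vary by version):

from fairseq.data import Dictionary

d = Dictionary()
print(d.pad(), d.eos(), d.unk())   # typically 1, 2, 3 (with d.bos() == 0)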
Example 6: __init__
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(
    self,
    dataset: torch.utils.data.Dataset,
    vocab: Dictionary,
    pad_idx: int,
    mask_idx: int,
    return_masked_tokens: bool = False,
    seed: int = 1,
    mask_prob: float = 0.15,
    leave_unmasked_prob: float = 0.1,
    random_token_prob: float = 0.1,
    freq_weighted_replacement: bool = False,
    mask_whole_words: torch.Tensor = None,
):
    assert 0.0 < mask_prob < 1.0
    assert 0.0 <= random_token_prob <= 1.0
    assert 0.0 <= leave_unmasked_prob <= 1.0
    assert random_token_prob + leave_unmasked_prob <= 1.0

    self.dataset = dataset
    self.vocab = vocab
    self.pad_idx = pad_idx
    self.mask_idx = mask_idx
    self.return_masked_tokens = return_masked_tokens
    self.seed = seed
    self.mask_prob = mask_prob
    self.leave_unmasked_prob = leave_unmasked_prob
    self.random_token_prob = random_token_prob
    self.mask_whole_words = mask_whole_words

    if random_token_prob > 0.0:
        if freq_weighted_replacement:
            weights = np.array(self.vocab.count)
        else:
            weights = np.ones(len(self.vocab))
        weights[:self.vocab.nspecial] = 0
        self.weights = weights / weights.sum()

    self.epoch = 0
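The final branch builds the sampling distribution used when a masked position is replaced by a random token. A standalone numpy illustration of that computation, with made-up counts standing in for vocab.count and vocab.nspecial:

import numpy as np

counts = np.array([0, 0, 0, 0, 50, 30, 20], dtype=float)  # first 4 entries stand in for special symbols
nspecial = 4
weights = counts.copy()             # the freq_weighted_replacement=True path
weights[:nspecial] = 0              # never sample special symbols as replacements
weights = weights / weights.sum()
print(weights)                      # [0. 0. 0. 0. 0.5 0.3 0.2]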
Example 7: build_shared_embeddings
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def build_shared_embeddings(
    dicts: Dict[str, Dictionary],
    langs: List[str],
    embed_dim: int,
    build_embedding: callable,
    pretrained_embed_path: Optional[str] = None,
):
    """
    Helper function to build shared embeddings for a set of languages after
    checking that all dicts corresponding to those languages are equivalent.

    Args:
        dicts: Dict of lang_id to its corresponding Dictionary
        langs: languages that we want to share embeddings for
        embed_dim: embedding dimension
        build_embedding: callable function to actually build the embedding
        pretrained_embed_path: Optional path to load pretrained embeddings
    """
    shared_dict = dicts[langs[0]]
    if any(dicts[lang] != shared_dict for lang in langs):
        raise ValueError(
            "--share-*-embeddings requires a joined dictionary: "
            "--share-encoder-embeddings requires a joined source "
            "dictionary, --share-decoder-embeddings requires a joined "
            "target dictionary, and --share-all-embeddings requires a "
            "joint source + target dictionary."
        )
    return build_embedding(shared_dict, embed_dim, pretrained_embed_path)
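A hedged usage sketch: build_embedding here is a stand-in callable (in fairseq it is usually the model's own embedding builder), and both languages are mapped to the same Dictionary object so the equality check passes:

import torch.nn as nn
from fairseq.data import Dictionary

def build_embedding(dictionary, embed_dim, path=None):   # illustrative stand-in
    return nn.Embedding(len(dictionary), embed_dim, padding_idx=dictionary.pad())

joint_dict = Dictionary()
shared_emb = build_shared_embeddings(
    dicts={'en': joint_dict, 'de': joint_dict},   # shared (joined) dictionary for both languages
    langs=['en', 'de'],
    embed_dim=512,
    build_embedding=build_embedding,
)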
Example 8: __init__
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(
    self,
    vocab: Dictionary,
    filters: List[Tuple[int, int]],
    char_embed_dim: int,
    word_embed_dim: int,
    highway_layers: int,
    max_char_len: int = 50,
    char_inputs: bool = False
):
    super(CharacterTokenEmbedder, self).__init__()

    self.onnx_trace = False
    self.embedding_dim = word_embed_dim
    self.max_char_len = max_char_len
    self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0)
    self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim))
    self.eos_idx, self.unk_idx = 0, 1
    self.char_inputs = char_inputs

    self.convolutions = nn.ModuleList()
    for width, out_c in filters:
        self.convolutions.append(
            nn.Conv1d(char_embed_dim, out_c, kernel_size=width)
        )

    last_dim = sum(f[1] for f in filters)

    self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None

    self.projection = nn.Linear(last_dim, word_embed_dim)

    assert vocab is not None or char_inputs, "vocab must be set if not using char inputs"
    self.vocab = None
    if vocab is not None:
        self.set_vocab(vocab, max_char_len)

    self.reset_parameters()
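For orientation: each (width, out_channels) filter contributes out_channels features, so the concatenated character-CNN output has sum(f[1] for f in filters) dimensions before the optional highway layers and the final projection down to word_embed_dim. A construction sketch reusing the arguments from example 1:

vocab = Dictionary()
embedder = CharacterTokenEmbedder(
    vocab,
    filters=[(2, 16), (4, 32), (8, 64), (16, 2)],  # concatenated CNN output: 16+32+64+2 = 114 dims
    char_embed_dim=64,
    word_embed_dim=5,
    highway_layers=2,
)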
Example 9: build_shared_embeddings
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def build_shared_embeddings(
    dicts: Dict[str, Dictionary],
    langs: List[str],
    embed_dim: int,
    build_embedding: callable,
    pretrained_embed_path: Optional[str] = None,
):
    """
    Helper function to build shared embeddings for a set of languages after
    checking that all dicts corresponding to those languages are equivalent.

    Args:
        dicts: Dict of lang_id to its corresponding Dictionary
        langs: languages that we want to share embeddings for
        embed_dim: embedding dimension
        build_embedding: callable function to actually build the embedding
        pretrained_embed_path: Optional path to load pretrained embeddings
    """
    shared_dict = dicts[langs[0]]
    if any(dicts[lang] != shared_dict for lang in langs):
        raise ValueError(
            '--share-*-embeddings requires a joined dictionary: '
            '--share-encoder-embeddings requires a joined source '
            'dictionary, --share-decoder-embeddings requires a joined '
            'target dictionary, and --share-all-embeddings requires a '
            'joint source + target dictionary.'
        )
    return build_embedding(
        shared_dict, embed_dim, pretrained_embed_path
    )
Example 10: __init__
# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(
    self,
    dataset: FairseqDataset,
    sizes: np.ndarray,
    vocab: Dictionary,
    pad_idx: int,
    mask_idx: int,
    classif_token_idx: int,
    sep_token_idx: int,
    seed: int = 1,
    shuffle: bool = True,
    has_pairs: bool = True,
    segment_id: int = 0,
    masking_ratio: float = 0.15,
    masking_prob: float = 0.8,
    random_token_prob: float = 0.1
):
    # Make sure the input datasets are the ones supported
    assert (
        isinstance(dataset, TokenBlockDataset) or
        isinstance(dataset, BlockPairDataset) or
        isinstance(dataset, ConcatDataset)
    ), "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or " \
       "ConcatDataset"

    self.dataset = dataset
    self.sizes = np.array(sizes)
    self.vocab = vocab
    self.pad_idx = pad_idx
    self.mask_idx = mask_idx
    self.classif_token_idx = classif_token_idx
    self.sep_token_idx = sep_token_idx
    self.shuffle = shuffle
    self.seed = seed
    self.has_pairs = has_pairs
    self.segment_id = segment_id
    self.masking_ratio = masking_ratio
    self.masking_prob = masking_prob
    self.random_token_prob = random_token_prob

    # If we have only one block then sizes needs to be updated to include
    # the classification token
    if not has_pairs:
        self.sizes = self.sizes + 1
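A back-of-the-envelope reading of the masking hyper-parameters above: roughly masking_ratio of the tokens in a block are selected, and of those, masking_prob are replaced by mask_idx, random_token_prob by a random vocabulary token, and the remainder are left unchanged. A rough illustration of the expected counts (not the exact sampling code):

seq_len = 100
n_selected = int(seq_len * 0.15)             # masking_ratio
n_mask = int(n_selected * 0.8)               # masking_prob -> replaced by mask_idx
n_random = int(n_selected * 0.1)             # random_token_prob -> replaced by a random token
n_kept = n_selected - n_mask - n_random      # selected but left as-is
print(n_selected, n_mask, n_random, n_kept)  # 15 12 1 2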