

Python data.Dictionary Code Examples

This article collects typical code examples of the Python class fairseq.data.Dictionary. If you are unsure how to use data.Dictionary, what it does, or what calling code looks like in practice, the curated examples below should help. You can also explore further usage examples from the surrounding fairseq.data module.


The sections below present 10 code examples of data.Dictionary, ordered by popularity by default.

Example 1: test_character_token_embedder

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def test_character_token_embedder(self):
        vocab = Dictionary()
        vocab.add_symbol('hello')
        vocab.add_symbol('there')

        embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2)

        test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
        max_len = max(len(s) for s in test_sents)
        input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad())
        for i in range(len(test_sents)):
            input[i][0] = vocab.eos()
            for j in range(len(test_sents[i])):
                input[i][j + 1] = vocab.index(test_sents[i][j])
            input[i][j + 2] = vocab.eos()  # note: `j` deliberately leaks from the inner loop
        embs = embedder(input)

        assert embs.size() == (len(test_sents), max_len + 2, 5)
        self.assertAlmostEqual(embs[0][0], embs[1][0])
        self.assertAlmostEqual(embs[0][0], embs[0][-1])
        self.assertAlmostEqual(embs[0][1], embs[2][1])
        self.assertAlmostEqual(embs[0][3], embs[1][1])

        embs.sum().backward()
        assert embedder.char_embeddings.weight.grad is not None 
Developer: pytorch | Project: fairseq | Lines: 27 | Source: test_character_token_embedder.py
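
To run this test outside its original file you need a few imports and a tensor-aware assertAlmostEqual. A sketch of the scaffolding, assuming fairseq's usual module layout (the helper below is a reconstruction of the idea, not verbatim from the test file):

import unittest

import torch
from fairseq.data import Dictionary
from fairseq.modules import CharacterTokenEmbedder


class TestCharacterTokenEmbedder(unittest.TestCase):
    # The original test class overrides assertAlmostEqual so tensors can be
    # compared element-wise instead of as Python floats.
    def assertAlmostEqual(self, t1, t2):
        self.assertEqual(t1.size(), t2.size(), 'size mismatch')
        self.assertLess((t1 - t2).abs().max(), 1e-6)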

Example 2: dummy_dictionary

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def dummy_dictionary(vocab_size, prefix='token_'):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d 
Developer: nusnlp | Project: crosentgec | Lines: 9 | Source: utils.py
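
A quick, hypothetical sanity check of the helper, assuming fairseq's default special symbols (<s>, <pad>, </s>, <unk>) occupy the first indices:

d = dummy_dictionary(vocab_size=10)
assert len(d) == 10 + d.nspecial       # the special symbols come first
print(d.index('token_0'))              # first synthetic token, right after the specials
print(d.pad(), d.eos(), d.unk())       # special-symbol indices (1, 2, 3 by default)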

Example 3: setup_task

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def setup_task(cls, args, **kwargs):
        """Setup the task. """
        dictionary = Dictionary()
        for i in range(args.dict_size):
            dictionary.add_symbol('word{}'.format(i))
        logger.info('dictionary: {} types'.format(len(dictionary)))
        return cls(args, dictionary) 
Developer: pytorch | Project: fairseq | Lines: 9 | Source: dummy_lm.py
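
In real tasks the dictionary is usually read from disk rather than synthesized. A minimal sketch, assuming a dictionary file in fairseq's plain "<symbol> <count>" line format (the path is hypothetical):

from fairseq.data import Dictionary

dictionary = Dictionary.load('data-bin/dict.en.txt')  # hypothetical path
print('dictionary: {} types'.format(len(dictionary)))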

Example 4: __init__

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
        super().__init__(Dictionary())
        self.embed = nn.Embedding(
            num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
        )
        self.layers_a = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(embed_dim),
                nn.Linear(embed_dim, 3*embed_dim),  # q, k, v input projection
                nn.Linear(3*embed_dim, embed_dim),  # skip self-attention
                nn.Linear(embed_dim, embed_dim),    # output projection
                nn.Dropout(),
            )
            for i in range(num_layers)
        ])
        self.layers_b = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(embed_dim),
                nn.Linear(embed_dim, 4*embed_dim),  # FFN
                nn.ReLU(),
                nn.Linear(4*embed_dim, embed_dim),  # FFN
                nn.Dropout(0.1),
            )
            for i in range(num_layers)
        ])
        self.out_proj = nn.Linear(embed_dim, num_embed) 
Developer: pytorch | Project: fairseq | Lines: 28 | Source: dummy_model.py
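
A smoke test for the module above; this assumes the enclosing class is a FairseqDecoder subclass and that it is named DummyDecoder (an assumption, since the snippet omits the class line):

model = DummyDecoder(num_embed=1000, embed_dim=64, num_layers=2)
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params:,} parameters')  # grows roughly with num_layers * embed_dim**2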

Example 5: padding_idx

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def padding_idx(self):
        return Dictionary().pad() if self.vocab is None else self.vocab.pad() 
Developer: pytorch | Project: fairseq | Lines: 4 | Source: character_token_embedder.py
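
The fallback works because a fresh Dictionary registers its special symbols in a fixed order. A quick check, assuming fairseq's defaults (<s>=0, <pad>=1, </s>=2, <unk>=3):

from fairseq.data import Dictionary

assert Dictionary().pad() == 1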

Example 6: __init__

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(
        self,
        dataset: torch.utils.data.Dataset,
        vocab: Dictionary,
        pad_idx: int,
        mask_idx: int,
        return_masked_tokens: bool = False,
        seed: int = 1,
        mask_prob: float = 0.15,
        leave_unmasked_prob: float = 0.1,
        random_token_prob: float = 0.1,
        freq_weighted_replacement: bool = False,
        mask_whole_words: torch.Tensor = None,
    ):
        assert 0.0 < mask_prob < 1.0
        assert 0.0 <= random_token_prob <= 1.0
        assert 0.0 <= leave_unmasked_prob <= 1.0
        assert random_token_prob + leave_unmasked_prob <= 1.0

        self.dataset = dataset
        self.vocab = vocab
        self.pad_idx = pad_idx
        self.mask_idx = mask_idx
        self.return_masked_tokens = return_masked_tokens
        self.seed = seed
        self.mask_prob = mask_prob
        self.leave_unmasked_prob = leave_unmasked_prob
        self.random_token_prob = random_token_prob
        self.mask_whole_words = mask_whole_words

        if random_token_prob > 0.0:
            if freq_weighted_replacement:
                weights = np.array(self.vocab.count)
            else:
                weights = np.ones(len(self.vocab))
            weights[:self.vocab.nspecial] = 0
            self.weights = weights / weights.sum()

        self.epoch = 0 
Developer: pytorch | Project: fairseq | Lines: 41 | Source: mask_tokens_dataset.py
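
The probabilities implement BERT-style masking: a token is selected with probability mask_prob; a selected token then becomes the mask symbol, stays unchanged, or is replaced by a random token. A small illustration with the defaults above (not from the source):

mask_prob, leave_unmasked_prob, random_token_prob = 0.15, 0.1, 0.1

p_mask_symbol = mask_prob * (1 - leave_unmasked_prob - random_token_prob)
p_unchanged = mask_prob * leave_unmasked_prob
p_random = mask_prob * random_token_prob
print(p_mask_symbol, p_unchanged, p_random)  # 0.12 0.015 0.015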

Example 7: build_shared_embeddings

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def build_shared_embeddings(
        dicts: Dict[str, Dictionary],
        langs: List[str],
        embed_dim: int,
        build_embedding: callable,
        pretrained_embed_path: Optional[str] = None,
    ):
        """
        Helper function to build shared embeddings for a set of languages after
        checking that all dicts corresponding to those languages are equivalent.

        Args:
            dicts: Dict of lang_id to its corresponding Dictionary
            langs: languages that we want to share embeddings for
            embed_dim: embedding dimension
            build_embedding: callable function to actually build the embedding
            pretrained_embed_path: Optional path to load pretrained embeddings
        """
        shared_dict = dicts[langs[0]]
        if any(dicts[lang] != shared_dict for lang in langs):
            raise ValueError(
                "--share-*-embeddings requires a joined dictionary: "
                "--share-encoder-embeddings requires a joined source "
                "dictionary, --share-decoder-embeddings requires a joined "
                "target dictionary, and --share-all-embeddings requires a "
                "joint source + target dictionary."
            )
        return build_embedding(shared_dict, embed_dim, pretrained_embed_path) 
Developer: pytorch | Project: fairseq | Lines: 30 | Source: fairseq_model.py
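
A minimal usage sketch, treating the helper as a free-standing function; the build_embedding stand-in below is a simplification for illustration, not fairseq's actual builder:

import torch.nn as nn
from fairseq.data import Dictionary

joined = Dictionary()
for w in ('hello', 'hallo'):
    joined.add_symbol(w)
dicts = {'en': joined, 'de': joined}  # both languages share one joined dictionary

def build_embedding(dictionary, embed_dim, path=None):
    # simplified stand-in for fairseq's embedding builder
    return nn.Embedding(len(dictionary), embed_dim, padding_idx=dictionary.pad())

shared = build_shared_embeddings(dicts, ['en', 'de'], embed_dim=8,
                                 build_embedding=build_embedding)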

Example 8: __init__

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(
            self,
            vocab: Dictionary,
            filters: List[Tuple[int, int]],
            char_embed_dim: int,
            word_embed_dim: int,
            highway_layers: int,
            max_char_len: int = 50,
            char_inputs: bool = False
    ):
        super(CharacterTokenEmbedder, self).__init__()

        self.onnx_trace = False
        self.embedding_dim = word_embed_dim
        self.max_char_len = max_char_len
        self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0)
        self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim))
        self.eos_idx, self.unk_idx = 0, 1
        self.char_inputs = char_inputs

        self.convolutions = nn.ModuleList()
        for width, out_c in filters:
            self.convolutions.append(
                nn.Conv1d(char_embed_dim, out_c, kernel_size=width)
            )

        last_dim = sum(f[1] for f in filters)

        self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None

        self.projection = nn.Linear(last_dim, word_embed_dim)

        assert vocab is not None or char_inputs, "vocab must be set if not using char inputs"
        self.vocab = None
        if vocab is not None:
            self.set_vocab(vocab, max_char_len)

        self.reset_parameters() 
Developer: kakaobrain | Project: helo_word | Lines: 40 | Source: character_token_embedder.py
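
The projection input width is just the sum of the filters' output channels; with the filter spec from Example 1 (illustration only):

filters = [(2, 16), (4, 32), (8, 64), (16, 2)]   # (kernel width, out channels)
last_dim = sum(out_c for _, out_c in filters)
print(last_dim)  # 114, projected down to word_embed_dim by nn.Linear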

Example 9: build_shared_embeddings

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def build_shared_embeddings(
        dicts: Dict[str, Dictionary],
        langs: List[str],
        embed_dim: int,
        build_embedding: callable,
        pretrained_embed_path: Optional[str] = None,
    ):
        """
        Helper function to build shared embeddings for a set of languages after
        checking that all dicts corresponding to those languages are equivalent.

        Args:
            dicts: Dict of lang_id to its corresponding Dictionary
            langs: languages that we want to share embeddings for
            embed_dim: embedding dimension
            build_embedding: callable function to actually build the embedding
            pretrained_embed_path: Optional path to load pretrained embeddings
        """
        shared_dict = dicts[langs[0]]
        if any(dicts[lang] != shared_dict for lang in langs):
            raise ValueError(
                '--share-*-embeddings requires a joined dictionary: '
                '--share-encoder-embeddings requires a joined source '
                'dictionary, --share-decoder-embeddings requires a joined '
                'target dictionary, and --share-all-embeddings requires a '
                'joint source + target dictionary.'
            )
        return build_embedding(
            shared_dict, embed_dim, pretrained_embed_path
        ) 
Developer: kakaobrain | Project: helo_word | Lines: 32 | Source: fairseq_model.py

Example 10: __init__

# Required import: from fairseq import data [as alias]
# Or: from fairseq.data import Dictionary [as alias]
def __init__(
            self,
            dataset: FairseqDataset,
            sizes: np.ndarray,
            vocab: Dictionary,
            pad_idx: int,
            mask_idx: int,
            classif_token_idx: int,
            sep_token_idx: int,
            seed: int = 1,
            shuffle: bool = True,
            has_pairs: bool = True,
            segment_id: int = 0,
            masking_ratio: float = 0.15,
            masking_prob: float = 0.8,
            random_token_prob: float = 0.1
    ):
        # Make sure the input datasets are the ones supported
        assert (
            isinstance(dataset, TokenBlockDataset) or
            isinstance(dataset, BlockPairDataset) or
            isinstance(dataset, ConcatDataset)
        ), "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or " \
           "ConcatDataset"

        self.dataset = dataset
        self.sizes = np.array(sizes)
        self.vocab = vocab
        self.pad_idx = pad_idx
        self.mask_idx = mask_idx
        self.classif_token_idx = classif_token_idx
        self.sep_token_idx = sep_token_idx
        self.shuffle = shuffle
        self.seed = seed
        self.has_pairs = has_pairs
        self.segment_id = segment_id
        self.masking_ratio = masking_ratio
        self.masking_prob = masking_prob
        self.random_token_prob = random_token_prob

        # If we have only one block then sizes needs to be updated to include
        # the classification token
        if not has_pairs:
            self.sizes = self.sizes + 1 
Developer: pytorch | Project: fairseq | Lines: 46 | Source: masked_lm_dataset.py
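
The final adjustment matters for batching: without sentence pairs, every block gains exactly one classification token, so each reported size must grow by one. A trivial illustration with made-up lengths:

import numpy as np

sizes = np.array([12, 7, 31])  # block lengths before the classification token
print(sizes + 1)               # [13  8 32], mirroring `self.sizes = self.sizes + 1`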


Note: The fairseq.data.Dictionary examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from community-contributed open-source projects, and copyright belongs to the original authors; consult the corresponding project's license before redistributing or using the code. Do not reproduce without permission.