当前位置: 首页>>代码示例>>Python>>正文


Python j_v.JVReplacer类代码示例

本文整理汇总了Python中cltk.stem.latin.j_v.JVReplacer的典型用法代码示例。如果您正苦于以下问题:Python JVReplacer类的具体用法?Python JVReplacer怎么用?Python JVReplacer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了JVReplacer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_tag_ner_str_list_latin

 def test_tag_ner_str_list_latin(self):
     """Check ner.tag_ner() for Latin with string input and list output.

     The sentence is passed through JVReplacer.replace() before tagging.
     """
     expected = [('ut',), ('Uenus', 'Entity'), (',',), ('ut',), ('Sirius', 'Entity'), (',',), ('ut',), ('Spica', 'Entity'), (',',), ('ut',), ('aliae',), ('quae',), ('primae',), ('dicuntur',), ('esse',), ('mangitudinis',), ('.',)]
     sentence = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     normalized = JVReplacer().replace(sentence)
     tagged = ner.tag_ner('latin', input_text=normalized, output_type=list)
     self.assertEqual(tagged, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:8,代码来源:test_tag.py

示例2: test_tag_ner_list_str_latin

 def test_tag_ner_list_str_latin(self):
     """Check ner.tag_ner() for Latin with list input and string output.

     Each word is passed through JVReplacer.replace() before tagging.
     """
     words = ['ut', 'Venus', 'Sirius']
     replacer = JVReplacer()
     normalized = list(map(replacer.replace, words))
     tagged = ner.tag_ner('latin', input_text=normalized, output_type=str)
     self.assertEqual(tagged, ' ut Uenus/Entity Sirius/Entity')
开发者ID:TylerKirby,项目名称:cltk,代码行数:8,代码来源:test_tag.py

示例3: test_tag_ner_list_list_latin

 def test_tag_ner_list_list_latin(self):
     """Check ner.tag_ner() for Latin with list input and list output.

     Each word is passed through JVReplacer.replace() before tagging.
     """
     replacer = JVReplacer()
     normalized = []
     for word in ['ut', 'Venus', 'Sirius']:
         normalized.append(replacer.replace(word))
     tagged = ner.tag_ner('latin', input_text=normalized, output_type=list)
     expected = [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')]
     self.assertEqual(tagged, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:8,代码来源:test_tag.py

示例4: test_tag_ner_str_str_latin

 def test_tag_ner_str_str_latin(self):
     """Check ner.tag_ner() for Latin with string input and string output.

     The sentence is passed through JVReplacer.replace() before tagging.
     """
     text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
     # A single replacer is enough; the original built a second, unused one.
     jv_replacer = JVReplacer()
     text_str_iu = jv_replacer.replace(text_str)
     text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
     target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
     self.assertEqual(text, target)
开发者ID:TylerKirby,项目名称:cltk,代码行数:9,代码来源:test_tag.py

示例5: latin_lem_replacement

def latin_lem_replacement(input_words):
    """Run JVReplacer.replace() and normalize_word() over a word or a word list.

    :param input_words: either a single word or a list of words.  A list
        (including list subclasses) is updated in place, element by element.
    :return: the same list object with every element replaced, or the
        processed value when ``input_words`` is not a list.
    """
    replacer = JVReplacer()

    if not isinstance(input_words, list):
        return normalize_word(replacer.replace(input_words))

    # Assign back by index so callers holding a reference to the list
    # still observe the processed words, as they did before.
    for index, word in enumerate(input_words):
        input_words[index] = normalize_word(replacer.replace(word))
    return input_words
开发者ID:baileymiller,项目名称:intertextualityProject,代码行数:10,代码来源:translate.py

示例6: test_roman_numeral_lemmatizer

 def test_roman_numeral_lemmatizer(self):
     """Check that RomanNumeralLemmatizer tags lower-case numerals as 'NUM'.

     The input is lower-cased, passed through JVReplacer.replace() and split
     on whitespace before being lemmatized.
     """
     lemmatizer = RomanNumeralLemmatizer()
     expected = [('i', 'NUM'), ('ii', 'NUM'), ('iii', 'NUM'), ('iu', 'NUM'), ('u', 'NUM'), ('ui', 'NUM'), ('uii', 'NUM'), ('uii', 'NUM'), ('ix', 'NUM'), ('x', 'NUM'), ('xx', 'NUM'), ('xxx', 'NUM'), ('xl', 'NUM'), ('l', 'NUM'), ('lx', 'NUM'), ('c', 'NUM'), ('cc', 'NUM')]  # pylint: disable=line-too-long
     numerals = 'i ii iii iv v vi vii vii ix x xx xxx xl l lx c cc'
     normalized = JVReplacer().replace(numerals.lower())
     result = lemmatizer.lemmatize(normalized.split())
     self.assertEqual(result, expected)
开发者ID:cltk,项目名称:cltk,代码行数:11,代码来源:test_lemmatize.py

示例7: jv_transform

def jv_transform(string_matrix: List[List[str]]) -> List[List[str]]:
    """
    Apply ``JVReplacer.replace`` to every word of every sentence.

    :param string_matrix: a data matrix: a list wrapping a list of strings, with each sublist being a sentence.
    :return: a new matrix of the same shape holding the replaced words.

    >>> jv_transform([['venio', 'jacet'], ['julius', 'caesar']])
    [['uenio', 'iacet'], ['iulius', 'caesar']]
    """
    replacer = JVReplacer()
    transformed = []
    for sentence in string_matrix:
        transformed.append(list(map(replacer.replace, sentence)))
    return transformed
开发者ID:cltk,项目名称:cltk,代码行数:11,代码来源:matrix_corpus_fun.py

示例8: test_identity_lemmatizer

 def test_identity_lemmatizer(self):
     """Check that IdentityLemmatizer returns each token as its own lemma.

     The sentence is lower-cased, passed through JVReplacer.replace() and
     tokenized with the Latin WordTokenizer first.
     """
     lemmatizer = IdentityLemmatizer()
     expected = [('ceterum', 'ceterum'), ('antequam', 'antequam'), ('destinata', 'destinata'), ('componam', 'componam')]  # pylint: disable=line-too-long
     replacer = JVReplacer()
     latin_tokenizer = WordTokenizer('latin')
     sentence = 'Ceterum antequam destinata componam'
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lemmatize(latin_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:12,代码来源:test_lemmatize.py

示例9: test_bigram_pos_lemmatizer

 def test_bigram_pos_lemmatizer(self):
     """Check BigramPOSLemmatizer trained on tagged sentences with include=['cum'].

     The sentence is lower-cased, passed through JVReplacer.replace() and
     tokenized with the Latin WordTokenizer first.  Per the expected value,
     only 'cum' receives a lemma; every other token maps to None.
     """
     # Training data: sentences of (token, lemma, POS tag) triples.
     training_sents = [[('dixissem', 'dico', 'v')], [('de', 'de', 'r'), ('te', 'tu', 'p'), ('autem', 'autem', 'c'), (',', 'punc', 'u'), ('catilina', 'catilina', 'n'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'), (',', 'punc', 'u'), ('probant', 'probo', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'), ('decernunt', 'decerno', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'), (',', 'punc', 'u'), ('clamant', 'clamo', 'v'), (',', 'punc', 'u'), ('neque', 'neque', 'c'), ('hi', 'hic', 'p'), ('solum', 'solus', 'd'), ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'), ('auctoritas', 'auctoritas', 'n'), ('est', 'sum', 'v'), ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'), (',', 'punc', 'u'), ('uita', 'uita', 'n'), ('uilissima', 'uilis', 'a'), (',', 'punc', 'u'), ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'), ('illi', 'ille', 'p'), ('equites', 'eques', 'n'), ('romani', 'romanus', 'a'), (',', 'punc', 'u'), ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'), ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'), (',', 'punc', 'u'), ('ceteri', 'ceterus', 'a'), ('-que', '-que', 'c'), ('fortissimi', 'fortis', 'a'), ('ciues', 'ciuis', 'n'), ('qui', 'qui', 'p'), ('circumstant', 'circumsto', 'v'), ('senatum', 'senatus', 'n'), (',', 'punc', 'u'), ('quorum', 'qui', 'p'), ('tu', 'tu', 'p'), ('et', 'et', 'c'), ('frequentiam', 'frequentia', 'n'), ('uidere', 'uideo', 'v'), ('et', 'et', 'c'), ('studia', 'studium', 'n'), ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'), ('uoces', 'uox', 'n'), ('paulo', 'paulus', 'd'), ('ante', 'ante', 'd'), ('exaudire', 'exaudio', 'v'), ('potuisti', 'possum', 'v'), ('.', 'punc', 'u')]]
     lemmatizer = BigramPOSLemmatizer(train=training_sents, include=['cum'])
     expected = [('quod', None), ('cum', 'cum2'), ('esset', None), ('intellectum', None), ('et', None), ('animaduersum', None), ('fecit', None), ('animo', None), ('libentissimo', None), ('populus', None), ('romanus', None)]  # pylint: disable=line-too-long
     replacer = JVReplacer()
     latin_tokenizer = WordTokenizer('latin')
     sentence = """Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus"""
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lemmatize(latin_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:12,代码来源:test_lemmatize.py

示例10: test_latin_lemmata

 def test_latin_lemmata(self):
     """Check Lemmata.lookup() on a short Latin sentence.

     The sentence is lower-cased, passed through JVReplacer.replace() and
     tokenized with the Latin WordTokenizer first.  Each expected entry pairs
     a token with a list of (lemma, weight) candidates.
     """
     lemmatizer = Lemmata(dictionary='lemmata', language='latin')
     expected = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
     replacer = JVReplacer()
     latin_tokenizer = WordTokenizer('latin')
     sentence = 'Ceterum antequam destinata componam'
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lookup(latin_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:12,代码来源:test_semantics.py

示例11: get_sims

def get_sims(word, language, lemmatized=False, threshold=0.70):
    """Get similar Word2Vec terms from vocabulary or trained model.

    TODO: Add option to install corpus if not available.

    :param word: term to look up.  For ``language == "latin"`` it is first
        passed through ``JVReplacer.replace`` and case-folded.
    :param language: which model to use; must be ``"greek"`` or ``"latin"``.
    :param lemmatized: when true, the ``_lemmed`` variant of the model file
        is loaded.
    :param threshold: a term is returned only if its score is strictly
        greater than this value.
    :return: list of similar terms (possibly empty), or ``None`` when the
        model lookup raises ``KeyError`` for ``word``.
    :raises AssertionError: if ``language`` has no configured model.
    :raises FileNotFoundError: re-raised, after printing a hint, when the
        model file cannot be loaded.
    """
    model_dirs = {
        "greek": "~/cltk_data/greek/model/greek_word2vec_cltk",
        "latin": "~/cltk_data/latin/model/latin_word2vec_cltk",
    }
    # Validate explicitly instead of with ``assert`` so the check survives
    # ``python -O``; AssertionError is kept so existing callers see the same
    # exception type.
    if language not in model_dirs:
        raise AssertionError(
            "Languages available with Word2Vec model: {}".format(model_dirs.keys())
        )

    # Normalize incoming word string
    if language == "latin":
        # Note that casefold() seemingly does not work with diacritic
        # Greek, likely because of it expects single code points, not
        # diacritics. Look into global string normalization to code points
        # for all languages, especially Greek.
        word = JVReplacer().replace(word).casefold()

    lemma_str = "_lemmed" if lemmatized else ""
    model_name = "{0}_s100_w30_min5_sg{1}.model".format(language, lemma_str)
    model_dir_abs = os.path.expanduser(model_dirs[language])
    model_path = os.path.join(model_dir_abs, model_name)
    w2v = Word2Vec()
    try:
        model = w2v.load(model_path)
    except FileNotFoundError as fnf_error:
        print(fnf_error)
        print("CLTK's Word2Vec models cannot be found. Please import '{}_word2vec_cltk'.".format(language))
        raise
    try:
        similars = model.most_similar(word)
    except KeyError as key_err:
        print(key_err)
        # Suggest vocabulary entries sharing the first three characters.
        prefix = word[:3]
        possible_matches = [term for term in model.vocab if term.startswith(prefix)]
        print("The following terms in the Word2Vec model you may be looking for: '{}'.".format(possible_matches))
        return None
    # Each entry of ``similars`` is indexed as (term, score).
    returned_sims = [similar[0] for similar in similars if similar[1] > threshold]
    if not returned_sims:
        print(
            "Matches found, but below the threshold of 'threshold={}'. Lower it to see these results.".format(threshold)
        )
    return returned_sims
开发者ID:paolomonella,项目名称:ursus,代码行数:52,代码来源:word2vec.py

示例12: test_unigram_lemmatizer

 def test_unigram_lemmatizer(self):
     """Check that UnigramLemmatizer reproduces the lemmas it was trained on.

     The sentence is lower-cased, passed through JVReplacer.replace() and
     tokenized with the Latin WordTokenizer first.
     """
     training_sents = [[('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]]  # pylint: disable=line-too-long
     lemmatizer = UnigramLemmatizer(train=training_sents)
     expected = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     replacer = JVReplacer()
     latin_tokenizer = WordTokenizer('latin')
     sentence = """Ceterum antequam destinata componam"""
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lemmatize(latin_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:13,代码来源:test_lemmatize.py

示例13: test_model_lemmatizer

 def test_model_lemmatizer(self):
     """Check that TrainLemmatizer maps tokens through the supplied model dict.

     The sentence is lower-cased, passed through JVReplacer.replace() and
     tokenized with the Latin WordTokenizer first.
     """
     lemma_by_token = {'ceterum': 'ceterus', 'antequam': 'antequam', 'destinata': 'destino', 'componam': 'compono'}  # pylint: disable=line-too-long
     lemmatizer = TrainLemmatizer(model=lemma_by_token)
     expected = [('ceterum', 'ceterus'), ('antequam', 'antequam'), ('destinata', 'destino'), ('componam', 'compono')]  # pylint: disable=line-too-long
     replacer = JVReplacer()
     latin_tokenizer = WordTokenizer('latin')
     sentence = 'Ceterum antequam destinata componam'
     normalized = replacer.replace(sentence.lower())
     result = lemmatizer.lemmatize(latin_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:13,代码来源:test_lemmatize.py

示例14: test_roman_numeral_lemmatizer_with_default

 def test_roman_numeral_lemmatizer_with_default(self):
     """Check RomanNumeralLemmatizer built with explicit patterns and default="RN".

     The input is lower-cased, passed through JVReplacer.replace() and
     tokenized with the Latin WordTokenizer first.
     """
     upper_case_pattern = (r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM')
     lower_case_pattern = (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')
     lemmatizer = RomanNumeralLemmatizer([upper_case_pattern, lower_case_pattern], default="RN")
     expected = [('i', 'RN'), ('ii', 'RN')]  # pylint: disable=line-too-long
     replacer = JVReplacer()
     latin_tokenizer = WordTokenizer('latin')
     numerals = 'i ii'
     normalized = replacer.replace(numerals.lower())
     result = lemmatizer.lemmatize(latin_tokenizer.tokenize(normalized))
     self.assertEqual(result, expected)
开发者ID:TylerKirby,项目名称:cltk,代码行数:13,代码来源:test_lemmatize.py

示例15: test_backoff_latin_lemmatizer_verbose

 def test_backoff_latin_lemmatizer_verbose(self):
     """Check BackoffLatinLemmatizer with verbose=True.

     The sentence is lower-cased, passed through JVReplacer.replace() and
     tokenized with the Latin WordTokenizer first.  Per the expected value,
     each result is a (token, lemma, source) triple whose third element is a
     string naming a lemmatizer.
     """
     # The lemmatizer is built without a ``train`` argument, so the unused
     # local training list from the original was dropped.
     lemmatizer = BackoffLatinLemmatizer(verbose=True)
     test_str = """Ceterum antequam destinata componam"""
     target = [('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'), ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>')]  # pylint: disable=line-too-long
     jv_replacer = JVReplacer()
     tokenizer = WordTokenizer('latin')
     test_str = test_str.lower()
     test_str = jv_replacer.replace(test_str)
     tokens = tokenizer.tokenize(test_str)
     lemmas = lemmatizer.lemmatize(tokens)
     self.assertEqual(lemmas, target)
开发者ID:cltk,项目名称:cltk,代码行数:13,代码来源:test_lemmatize.py


注:本文中的cltk.stem.latin.j_v.JVReplacer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。