本文整理汇总了Python中cltk.stem.latin.j_v.JVReplacer类的典型用法代码示例。如果您正苦于以下问题：Python JVReplacer类的具体用法？Python JVReplacer怎么用？Python JVReplacer使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了JVReplacer类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_tag_ner_str_list_latin
def test_tag_ner_str_list_latin(self):
    """Check ner.tag_ner() for Latin with a str input and list output."""
    raw_text = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    # The text goes through JVReplacer first; the expected output below
    # therefore contains 'Uenus' rather than 'Venus'.
    normalized_text = JVReplacer().replace(raw_text)
    expected = [
        ('ut',), ('Uenus', 'Entity'), (',',),
        ('ut',), ('Sirius', 'Entity'), (',',),
        ('ut',), ('Spica', 'Entity'), (',',),
        ('ut',), ('aliae',), ('quae',), ('primae',),
        ('dicuntur',), ('esse',), ('mangitudinis',), ('.',),
    ]
    tagged = ner.tag_ner('latin', input_text=normalized_text, output_type=list)
    self.assertEqual(tagged, expected)
示例2: test_tag_ner_list_str_latin
def test_tag_ner_list_str_latin(self):
    """Check ner.tag_ner() for Latin with a list input and str output."""
    words = ['ut', 'Venus', 'Sirius']
    replacer = JVReplacer()
    # Each token is passed through JVReplacer individually before tagging.
    normalized_words = []
    for word in words:
        normalized_words.append(replacer.replace(word))
    tagged_text = ner.tag_ner('latin', input_text=normalized_words, output_type=str)
    expected = ' ut Uenus/Entity Sirius/Entity'
    self.assertEqual(tagged_text, expected)
示例3: test_tag_ner_list_list_latin
def test_tag_ner_list_list_latin(self):
    """Check ner.tag_ner() for Latin with a list input and list output."""
    words = ['ut', 'Venus', 'Sirius']
    replacer = JVReplacer()
    # Each token is passed through JVReplacer individually before tagging.
    normalized_words = []
    for word in words:
        normalized_words.append(replacer.replace(word))
    tagged = ner.tag_ner('latin', input_text=normalized_words, output_type=list)
    expected = [('ut',), ('Uenus', 'Entity'), ('Sirius', 'Entity')]
    self.assertEqual(tagged, expected)
示例4: test_tag_ner_str_str_latin
def test_tag_ner_str_str_latin(self):
    """Check ner.tag_ner() for Latin with a str input and str output."""
    text_str = """ut Venus, ut Sirius, ut Spica, ut aliae quae primae dicuntur esse mangitudinis."""
    # A single replacer is enough; the original built a second, identical
    # instance that immediately shadowed the first.
    jv_replacer = JVReplacer()
    text_str_iu = jv_replacer.replace(text_str)
    text = ner.tag_ner('latin', input_text=text_str_iu, output_type=str)
    target = ' ut Uenus/Entity, ut Sirius/Entity, ut Spica/Entity, ut aliae quae primae dicuntur esse mangitudinis.'
    self.assertEqual(text, target)
示例5: latin_lem_replacement
def latin_lem_replacement(input_words):
    """Run JVReplacer and ``normalize_word`` over a word or a list of words.

    :param input_words: either a single value accepted by
        ``JVReplacer.replace`` or a list of such values. A list is updated
        in place: every element is overwritten with its processed form.
    :return: the same list object (after the in-place update) when a list
        was given, otherwise the processed single value.
    """
    replacer = JVReplacer()
    # isinstance() also accepts list subclasses, which the previous
    # ``type(...) == list`` comparison wrongly sent to the single-word path.
    if isinstance(input_words, list):
        for index, word in enumerate(input_words):
            input_words[index] = normalize_word(replacer.replace(word))
        return input_words
    return normalize_word(replacer.replace(input_words))
示例6: test_roman_numeral_lemmatizer
def test_roman_numeral_lemmatizer(self):
    """Check RomanNumeralLemmatizer.lemmatize() with its default settings."""
    lemmatizer = RomanNumeralLemmatizer()
    numerals = 'i ii iii iv v vi vii vii ix x xx xxx xl l lx c cc'
    # Expected tokens are the input numerals after JVReplacer (v -> u),
    # each paired with the 'NUM' tag.
    expected_tokens = 'i ii iii iu u ui uii uii ix x xx xxx xl l lx c cc'.split()
    expected = [(token, 'NUM') for token in expected_tokens]
    normalized = JVReplacer().replace(numerals.lower())
    lemmas = lemmatizer.lemmatize(normalized.split())
    self.assertEqual(lemmas, expected)
示例7: jv_transform
def jv_transform(string_matrix: List[List[str]]) -> List[List[str]]:
    """
    Apply ``JVReplacer.replace`` to every word of every sentence.

    :param string_matrix: a data matrix: a list wrapping a list of strings, with each sublist being a sentence.
    :return: a new list of lists with the same layout, holding the replaced words.

    >>> jv_transform([['venio', 'jacet'], ['julius', 'caesar']])
    [['uenio', 'iacet'], ['iulius', 'caesar']]
    """
    replace_jv = JVReplacer().replace
    transformed = []
    for sentence in string_matrix:
        transformed.append([replace_jv(word) for word in sentence])
    return transformed
示例8: test_identity_lemmatizer
def test_identity_lemmatizer(self):
    """Check that IdentityLemmatizer returns each token as its own lemma."""
    lemmatizer = IdentityLemmatizer()
    sentence = 'Ceterum antequam destinata componam'
    expected = [
        (word, word)
        for word in ('ceterum', 'antequam', 'destinata', 'componam')
    ]
    # Lowercase, pass through JVReplacer, then tokenize as Latin.
    normalized = JVReplacer().replace(sentence.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
示例9: test_bigram_pos_lemmatizer
def test_bigram_pos_lemmatizer(self):
    """Check BigramPOSLemmatizer.lemmatize() when only 'cum' is included."""
    # Training data: two sentences of (token, lemma, tag) triples.
    training_sentences = [
        [('dixissem', 'dico', 'v')],
        [
            ('de', 'de', 'r'), ('te', 'tu', 'p'), ('autem', 'autem', 'c'),
            (',', 'punc', 'u'), ('catilina', 'catilina', 'n'), (',', 'punc', 'u'),
            ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'), (',', 'punc', 'u'),
            ('probant', 'probo', 'v'), (',', 'punc', 'u'),
            ('cum', 'cum2', 'c'), ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'),
            ('decernunt', 'decerno', 'v'), (',', 'punc', 'u'),
            ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'), (',', 'punc', 'u'),
            ('clamant', 'clamo', 'v'), (',', 'punc', 'u'),
            ('neque', 'neque', 'c'), ('hi', 'hic', 'p'), ('solum', 'solus', 'd'),
            ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'),
            ('auctoritas', 'auctoritas', 'n'), ('est', 'sum', 'v'),
            ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'), (',', 'punc', 'u'),
            ('uita', 'uita', 'n'), ('uilissima', 'uilis', 'a'), (',', 'punc', 'u'),
            ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'), ('illi', 'ille', 'p'),
            ('equites', 'eques', 'n'), ('romani', 'romanus', 'a'), (',', 'punc', 'u'),
            ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'),
            ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'), (',', 'punc', 'u'),
            ('ceteri', 'ceterus', 'a'), ('-que', '-que', 'c'),
            ('fortissimi', 'fortis', 'a'), ('ciues', 'ciuis', 'n'),
            ('qui', 'qui', 'p'), ('circumstant', 'circumsto', 'v'),
            ('senatum', 'senatus', 'n'), (',', 'punc', 'u'),
            ('quorum', 'qui', 'p'), ('tu', 'tu', 'p'), ('et', 'et', 'c'),
            ('frequentiam', 'frequentia', 'n'), ('uidere', 'uideo', 'v'),
            ('et', 'et', 'c'), ('studia', 'studium', 'n'),
            ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'),
            ('uoces', 'uox', 'n'), ('paulo', 'paulus', 'd'), ('ante', 'ante', 'd'),
            ('exaudire', 'exaudio', 'v'), ('potuisti', 'possum', 'v'),
            ('.', 'punc', 'u'),
        ],
    ]
    lemmatizer = BigramPOSLemmatizer(train=training_sentences, include=['cum'])
    sentence = """Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus"""
    # Only 'cum' is expected to receive a lemma; every other token maps to None.
    expected = [
        ('quod', None), ('cum', 'cum2'), ('esset', None),
        ('intellectum', None), ('et', None), ('animaduersum', None),
        ('fecit', None), ('animo', None), ('libentissimo', None),
        ('populus', None), ('romanus', None),
    ]
    # Lowercase, pass through JVReplacer, then tokenize as Latin.
    normalized = JVReplacer().replace(sentence.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
示例10: test_latin_lemmata
def test_latin_lemmata(self):
    """Check the Lemmata.lookup() method on a short Latin sentence."""
    lemmatizer = Lemmata(dictionary='lemmata', language='latin')
    sentence = 'Ceterum antequam destinata componam'
    # Each token maps to a list of (lemma, weight) candidates.
    expected = [
        ('ceterum', [('ceterus', 1.0)]),
        ('antequam', [('antequam', 1.0)]),
        ('destinata', [
            ('destinatus', 0.25),
            ('destinatum', 0.25),
            ('destinata', 0.25),
            ('destino', 0.25),
        ]),
        ('componam', [('compono', 1.0)]),
    ]
    # Lowercase, pass through JVReplacer, then tokenize as Latin.
    normalized = JVReplacer().replace(sentence.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lookup(tokens), expected)
示例11: get_sims
def get_sims(word, language, lemmatized=False, threshold=0.70):
    """Get similar Word2Vec terms from vocabulary or trained model.

    TODO: Add option to install corpus if not available.

    :param word: the term to look up. For ``language == "latin"`` it is
        passed through JVReplacer and case-folded before the lookup.
    :param language: key selecting the model directory; ``"greek"`` or
        ``"latin"``.
    :param lemmatized: when true, the ``_lemmed`` variant of the model file
        is loaded.
    :param threshold: only terms whose similarity score is strictly greater
        than this value are returned.
    :return: list of similar terms above ``threshold`` (possibly empty), or
        ``None`` when the lookup raises ``KeyError``.
    :raises AssertionError: if ``language`` has no model directory.
    :raises FileNotFoundError: if the model file cannot be loaded.
    """
    model_dirs = {
        "greek": "~/cltk_data/greek/model/greek_word2vec_cltk",
        "latin": "~/cltk_data/latin/model/latin_word2vec_cltk",
    }
    # Raise explicitly instead of using ``assert``: assert statements are
    # removed under ``python -O``, which would turn this check into a bare
    # KeyError further down. AssertionError is kept for existing callers.
    if language not in model_dirs:
        raise AssertionError(
            "Langauges available with Word2Vec model: {}".format(model_dirs.keys())
        )

    # Normalize incoming word string
    if language == "latin":
        # Note that casefold() seemingly does not work with diacritic
        # Greek, likely because of it expects single code points, not
        # diacritics. Look into global string normalization to code points
        # for all languages, especially Greek.
        word = JVReplacer().replace(word).casefold()

    lemma_str = "_lemmed" if lemmatized else ""
    model_name = "{0}_s100_w30_min5_sg{1}.model".format(language, lemma_str)
    model_dir_abs = os.path.expanduser(model_dirs[language])
    model_path = os.path.join(model_dir_abs, model_name)

    w2v = Word2Vec()
    try:
        model = w2v.load(model_path)
    except FileNotFoundError as fnf_error:
        print(fnf_error)
        print("CLTK's Word2Vec models cannot be found. Please import '{}_word2vec_cltk'.".format(language))
        raise

    try:
        similars = model.most_similar(word)
    except KeyError as key_err:
        print(key_err)
        # Suggest vocabulary entries sharing the first three characters.
        prefix = word[:3]
        possible_matches = [term for term in model.vocab if term.startswith(prefix)]
        print("The following terms in the Word2Vec model you may be looking for: '{}'.".format(possible_matches))
        return None

    # Each entry of ``similars`` is indexed as (term, score).
    returned_sims = [similar[0] for similar in similars if similar[1] > threshold]
    if not returned_sims:
        print(
            "Matches found, but below the threshold of 'threshold={}'. Lower it to see these results.".format(threshold)
        )
    return returned_sims
示例12: test_unigram_lemmatizer
def test_unigram_lemmatizer(self):
    """Check UnigramLemmatizer.lemmatize() against a one-sentence training set."""
    training_sentences = [[
        ('ceterum', 'ceterus'),
        ('antequam', 'antequam'),
        ('destinata', 'destino'),
        ('componam', 'compono'),
    ]]
    lemmatizer = UnigramLemmatizer(train=training_sentences)
    sentence = """Ceterum antequam destinata componam"""
    expected = [
        ('ceterum', 'ceterus'),
        ('antequam', 'antequam'),
        ('destinata', 'destino'),
        ('componam', 'compono'),
    ]
    # Lowercase, pass through JVReplacer, then tokenize as Latin.
    normalized = JVReplacer().replace(sentence.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
示例13: test_model_lemmatizer
def test_model_lemmatizer(self):
    """Check TrainLemmatizer.lemmatize() with a token-to-lemma dict model."""
    lemma_by_token = {
        'ceterum': 'ceterus',
        'antequam': 'antequam',
        'destinata': 'destino',
        'componam': 'compono',
    }
    lemmatizer = TrainLemmatizer(model=lemma_by_token)
    sentence = 'Ceterum antequam destinata componam'
    expected = [
        ('ceterum', 'ceterus'),
        ('antequam', 'antequam'),
        ('destinata', 'destino'),
        ('componam', 'compono'),
    ]
    # Lowercase, pass through JVReplacer, then tokenize as Latin.
    normalized = JVReplacer().replace(sentence.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
示例14: test_roman_numeral_lemmatizer_with_default
def test_roman_numeral_lemmatizer_with_default(self):
    """Check RomanNumeralLemmatizer.lemmatize() with explicit patterns and default="RN"."""
    upper_pattern = r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)'
    lower_pattern = r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)'
    numeral_patterns = [(upper_pattern, 'NUM'), (lower_pattern, 'NUM')]
    lemmatizer = RomanNumeralLemmatizer(numeral_patterns, default="RN")
    numerals = 'i ii'
    # With default="RN" the expected tag is 'RN', not the patterns' 'NUM'.
    expected = [('i', 'RN'), ('ii', 'RN')]
    # Lowercase, pass through JVReplacer, then tokenize as Latin.
    normalized = JVReplacer().replace(numerals.lower())
    tokens = WordTokenizer('latin').tokenize(normalized)
    self.assertEqual(lemmatizer.lemmatize(tokens), expected)
示例15: test_backoff_latin_lemmatizer_verbose
def test_backoff_latin_lemmatizer_verbose(self):
    """Check BackoffLatinLemmatizer.lemmatize() with verbose=True.

    In verbose mode each expected result is a triple: token, lemma, and a
    string naming the lemmatizer that produced the lemma.
    """
    # The unused local ``train`` list was removed: the lemmatizer is built
    # without any training argument.
    lemmatizer = BackoffLatinLemmatizer(verbose=True)
    test_str = """Ceterum antequam destinata componam"""
    target = [
        ('ceterum', 'ceterum', '<UnigramLemmatizer: CLTK Sentence Training Data>'),
        ('antequam', 'antequam', '<UnigramLemmatizer: CLTK Sentence Training Data>'),
        ('destinata', 'destino', '<UnigramLemmatizer: CLTK Sentence Training Data>'),
        ('componam', 'compono', '<DictLemmatizer: Morpheus Lemmas>'),
    ]
    jv_replacer = JVReplacer()
    tokenizer = WordTokenizer('latin')
    # Lowercase, pass through JVReplacer, then tokenize as Latin.
    test_str = test_str.lower()
    test_str = jv_replacer.replace(test_str)
    tokens = tokenizer.tokenize(test_str)
    lemmas = lemmatizer.lemmatize(tokens)
    self.assertEqual(lemmas, target)