This article collects typical usage examples of the Python class cltk.stem.lemma.LemmaReplacer. If you are unsure what LemmaReplacer does or how to use it, the curated class examples below may help. Fifteen code examples of the LemmaReplacer class are shown, sorted by popularity by default.
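All of the examples assume that CLTK's model data for the target language has already been downloaded, as Example 10 does for Latin. A minimal setup sketch (the CorpusImporter import path is assumed from the older CLTK releases these snippets target, where LemmaReplacer lives in cltk.stem.lemma):

from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.lemma import LemmaReplacer

# Download the Latin models once; LemmaReplacer relies on them.
CorpusImporter('latin').import_corpus('latin_models_cltk')

lemmatizer = LemmaReplacer('latin')
print(lemmatizer.lemmatize('hominum divomque voluptas'))  # ['homo', 'divus', 'voluptas']

The same pattern with 'greek' and 'greek_models_cltk' prepares the Greek examples.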
Example 1: main
import re
import string

from cltk.stem.lemma import LemmaReplacer

def main():
    # Build a lemma-frequency dictionary for the first text.
    input = open('./Gratian1.txt', 'r').read()
    input = re.sub('[' + string.punctuation + ']', '', input)
    input = input.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmata = lemmatizer.lemmatize(input)
    dictionary_1r = {}
    for lemma in lemmata:
        if lemma in dictionary_1r:
            dictionary_1r[lemma] += 1
        else:
            dictionary_1r[lemma] = 1
    # lemmata = dictionary_1r.keys()
    # for lemma in lemmata:
    #     print("%2d\t%s" % (dictionary_1r[lemma], lemma))
    # Build the same frequency dictionary for the second text.
    input = open('./Gratian2.txt', 'r').read()
    input = re.sub('[' + string.punctuation + ']', '', input)
    input = input.lower()
    lemmata = lemmatizer.lemmatize(input)
    dictionary_2r = {}
    for lemma in lemmata:
        if lemma in dictionary_2r:
            dictionary_2r[lemma] += 1
        else:
            dictionary_2r[lemma] = 1
    # Print the lemmata that appear only in the second text.
    lemmata = dictionary_2r.keys()
    for lemma in lemmata:
        if lemma not in dictionary_1r:
            print("%2d\t%s" % (dictionary_2r[lemma], lemma))
Example 2: test_lemmatizer_instr_outstring_latin
def test_lemmatizer_instr_outstring_latin(self):
    """Test the Latin lemmatizer: string input, lemmata returned as a single string."""
    replacer = LemmaReplacer('latin')
    unlemmatized = 'hominum divomque voluptas'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
    target = 'homo divus voluptas'
    self.assertEqual(lemmatized, target)
Example 3: test_lemmatizer_instr_outlemma_latin
def test_lemmatizer_instr_outlemma_latin(self):
    """Test the Latin lemmatizer: string input, list of raw 'token/lemma' pairs out."""
    replacer = LemmaReplacer('latin')
    unlemmatized = 'hominum divomque voluptas'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
    target = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
    self.assertEqual(lemmatized, target)
Example 4: test_lemmatizer_inlist_outlemma_outstring_latin
def test_lemmatizer_inlist_outlemma_outstring_latin(self):
    """Test the Latin lemmatizer: list input, 'token/lemma' pairs joined into one string."""
    replacer = LemmaReplacer('latin')
    unlemmatized = ['hominum', 'divomque', 'voluptas']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
    target = 'hominum/homo divomque/divus voluptas/voluptas'
    self.assertEqual(lemmatized, target)
Example 5: test_lemmatizer_inlist_latin
def test_lemmatizer_inlist_latin(self):
    """Test the Latin lemmatizer: list input, list of lemmata out."""
    replacer = LemmaReplacer('latin')
    unlemmatized = ['hominum', 'divomque', 'voluptas']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
    target = ['homo', 'divus', 'voluptas']
    self.assertEqual(lemmatized, target)
Example 6: test_lemmatizer_instr_outlemma_outstring_greek
def test_lemmatizer_instr_outlemma_outstring_greek(self):
    """Test the Greek lemmatizer: string input, 'token/lemma' pairs joined into one string."""
    replacer = LemmaReplacer('greek')
    unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=True)
    target = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
    self.assertEqual(lemmatized, target)
Example 7: test_lemmatizer_instr_greek
def test_lemmatizer_instr_greek(self):
    """Test the Greek lemmatizer: string input, list of lemmata out."""
    replacer = LemmaReplacer('greek')
    unlemmatized = 'τὴν διάγνωσιν ἔρχεσθαι'
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=False)
    target = ['τὴν', 'διάγνωσις', 'ἔρχομαι']
    self.assertEqual(lemmatized, target)
Example 8: test_lemmatizer_inlist_outlemma_greek
def test_lemmatizer_inlist_outlemma_greek(self):
    """Test the Greek lemmatizer: list input, list of raw 'token/lemma' pairs out."""
    replacer = LemmaReplacer('greek')
    unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=True, return_string=False)
    target = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
    self.assertEqual(lemmatized, target)
Example 9: test_lemmatizer_inlist_outstring_greek
def test_lemmatizer_inlist_outstring_greek(self):
    """Test the Greek lemmatizer: list input, lemmata joined into one string."""
    replacer = LemmaReplacer('greek')
    unlemmatized = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
    lemmatized = replacer.lemmatize(unlemmatized, return_raw=False, return_string=True)
    target = 'τὴν διάγνωσις ἔρχομαι'
    self.assertEqual(lemmatized, target)
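Taken together, Examples 2 through 9 exercise the two keyword arguments of lemmatize: return_raw controls whether each token is reported as a raw 'token/lemma' pair, and return_string controls whether the result comes back as one joined string or as a list. A small sketch printing all four combinations for the Latin phrase used above:

from cltk.stem.lemma import LemmaReplacer

replacer = LemmaReplacer('latin')
for return_raw in (False, True):
    for return_string in (False, True):
        result = replacer.lemmatize('hominum divomque voluptas',
                                    return_raw=return_raw,
                                    return_string=return_string)
        print(return_raw, return_string, result)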
Example 10: main
from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.lemma import LemmaReplacer

def main():
    # Download the Latin models that LemmaReplacer depends on.
    corpus_importer = CorpusImporter('latin')
    corpora_list = corpus_importer.list_corpora
    print(corpora_list)
    corpus_importer.import_corpus('latin_models_cltk')
    sentence = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'
    sentence = sentence.lower()
    lemmatizer = LemmaReplacer('latin')
    lemmatized_sentence = lemmatizer.lemmatize(sentence)
    print(lemmatized_sentence)
Example 11: get_lemma
def get_lemma(input_words, language):
    # latin_lem_replacement and normalize_word are project-specific helpers
    # defined elsewhere in the source repository.
    if language == "Latin":
        lemmatizer = LemmaReplacer("latin")
        # Required for CLTK module
        input_words = latin_lem_replacement(input_words)
    if language == "Greek":
        lemmatizer = LemmaReplacer("greek")
    if type(input_words) == list:
        results = lemmatizer.lemmatize(input_words)
        return results
    else:
        input_words = normalize_word(input_words)
        results = lemmatizer.lemmatize(input_words)
        if len(results) > 0:
            return results[0]
        else:
            return input_words
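A usage sketch for Example 11, assuming the helper functions latin_lem_replacement and normalize_word are available from that project; the expected outputs are taken from the test examples above:

print(get_lemma('hominum', 'Latin'))
# 'homo'
print(get_lemma(['τὴν', 'διάγνωσιν', 'ἔρχεσθαι'], 'Greek'))
# ['τὴν', 'διάγνωσις', 'ἔρχομαι']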
Example 12: gen_docs
# Imports assumed from the older CLTK layout this snippet targets.
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths, assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup, tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer

def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus, yielding one sentence at a time.
    Each sentence is itself a list of tokenized words.
    """
    assert corpus in ['phi5', 'tlg']
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        stops = latin_stops if rm_stops else None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')
        # Note: the original reuses the Latin stop list here; a Greek list is presumably intended.
        stops = latin_stops if rm_stops else None
    if lemmatize:
        lemmatizer = LemmaReplacer(language)
    sent_tokenizer = TokenizeSentence(language)
    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # a second cleanup at sentence level, to remove all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer.tokenize(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]
            if language == 'latin':
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]
            if stops:
                sentence = [w for w in sentence if w not in stops]
            sentence = [w for w in sentence if len(w) > 1]  # rm short words
            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
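Because gen_docs is a generator, it can be consumed lazily. A usage sketch, assuming the PHI5 corpus and the Latin models have already been imported locally:

from itertools import islice

# Print the first three lemmatized, stopword-filtered sentences from PHI5.
for sentence in islice(gen_docs('phi5', lemmatize=True, rm_stops=True), 3):
    print(sentence)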
Example 13: LemmaReplacer
# - report to DC list/wiki
# Import modules
# For XML
from xml.dom.minidom import parse, parseString
import codecs
# For CLTK
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

# Initialize CLTK
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML
xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')
for w in wordElementList:
    form = w.attributes['ana'].value
    print(form)
    # Parse the inflected word
    try:
        lemmaList = lemmatizer.lemmatize(form.lower())
    except Exception:
        # The original snippet is truncated here; this minimal handler is an
        # assumption added only to keep the excerpt syntactically valid.
        lemmaList = []
Example 14: getLemma
def getLemma(self):
    lemmatizer = LemmaReplacer('latin')
    return lemmatizer.lemmatize(self.text)
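Example 14 is a method that expects a self.text attribute on its class. A minimal context sketch (the class name and constructor are hypothetical; the expected output follows Example 5):

from cltk.stem.lemma import LemmaReplacer

class Passage:
    def __init__(self, text):
        self.text = text

    def getLemma(self):
        lemmatizer = LemmaReplacer('latin')
        return lemmatizer.lemmatize(self.text)

print(Passage('hominum divomque voluptas').getLemma())  # ['homo', 'divus', 'voluptas']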
Example 15: lemmata
def lemmata(text):
    lemmatizer = LemmaReplacer('greek')
    return [word for word in set(lemmatizer.lemmatize(text.lower())) if word not in STOPS_LIST]
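A usage sketch for Example 15. STOPS_LIST is assumed to come from CLTK's Greek stopword module, and the output order varies because the function deduplicates through a set:

from cltk.stop.greek.stops import STOPS_LIST

print(lemmata('τὴν διάγνωσιν ἔρχεσθαι'))
# e.g. ['διάγνωσις', 'ἔρχομαι'] if the article is in STOPS_LIST; order is not guaranteed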