当前位置: 首页>>代码示例>>Python>>正文


Python lemma.LemmaReplacer类代码示例

本文整理汇总了Python中cltk.stem.lemma.LemmaReplacer的典型用法代码示例。如果您正苦于以下问题:Python LemmaReplacer类的具体用法?Python LemmaReplacer怎么用?Python LemmaReplacer使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了LemmaReplacer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

def main():
    """Print lemmata that occur in Gratian2.txt but not in Gratian1.txt.

    Both texts are stripped of punctuation, lower-cased, and lemmatized
    with CLTK's Latin LemmaReplacer; each lemma unique to the second text
    is printed with its frequency in that text.
    """
    lemmatizer = LemmaReplacer('latin')
    counts_first = _lemma_counts('./Gratian1.txt', lemmatizer)
    counts_second = _lemma_counts('./Gratian2.txt', lemmatizer)
    for lemma, count in counts_second.items():
        if lemma not in counts_first:
            print("%2d\t%s" % (count, lemma))


def _lemma_counts(path, lemmatizer):
    """Return a Counter mapping each lemma in the file at *path* to its frequency.

    The text is stripped of ASCII punctuation and lower-cased before
    lemmatization, matching the original preprocessing.
    """
    from collections import Counter
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked it via open(...).read()).
    with open(path, 'r') as f:
        text = f.read()
    text = re.sub('[' + string.punctuation + ']', '', text)
    return Counter(lemmatizer.lemmatize(text.lower()))
开发者ID:decretist,项目名称:Sand,代码行数:29,代码来源:merged.py

示例2: test_lemmatizer_instr_outstring_latin

 def test_lemmatizer_instr_outstring_latin(self):
     """Latin lemmatizer: string in, plain lemma string out."""
     lemmatizer = LemmaReplacer('latin')
     source = 'hominum divomque voluptas'
     expected = 'homo divus voluptas'
     actual = lemmatizer.lemmatize(source, return_raw=False, return_string=True)
     self.assertEqual(actual, expected)
开发者ID:manu-chroma,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例3: test_lemmatizer_instr_outlemma_latin

 def test_lemmatizer_instr_outlemma_latin(self):
     """Latin lemmatizer: string in, raw 'token/lemma' list out."""
     expected = ['hominum/homo', 'divomque/divus', 'voluptas/voluptas']
     lemmatizer = LemmaReplacer('latin')
     actual = lemmatizer.lemmatize('hominum divomque voluptas',
                                   return_raw=True, return_string=False)
     self.assertEqual(actual, expected)
开发者ID:manu-chroma,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例4: test_lemmatizer_inlist_outlemma_outstring_latin

 def test_lemmatizer_inlist_outlemma_outstring_latin(self):
     """Latin lemmatizer: token list in, raw 'token/lemma' string out."""
     tokens = ['hominum', 'divomque', 'voluptas']
     lemmatizer = LemmaReplacer('latin')
     result = lemmatizer.lemmatize(tokens, return_raw=True, return_string=True)
     self.assertEqual(result, 'hominum/homo divomque/divus voluptas/voluptas')
开发者ID:manu-chroma,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例5: test_lemmatizer_inlist_latin

 def test_lemmatizer_inlist_latin(self):
     """Latin lemmatizer: token list in, lemma list out."""
     tokens = ['hominum', 'divomque', 'voluptas']
     result = LemmaReplacer('latin').lemmatize(tokens,
                                               return_raw=False,
                                               return_string=False)
     self.assertEqual(result, ['homo', 'divus', 'voluptas'])
开发者ID:manu-chroma,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例6: test_lemmatizer_instr_outlemma_outstring_greek

 def test_lemmatizer_instr_outlemma_outstring_greek(self):
     """Greek lemmatizer: string in, raw 'token/lemma' string out."""
     source = 'τὴν διάγνωσιν ἔρχεσθαι'
     expected = 'τὴν/τὴν διάγνωσιν/διάγνωσις ἔρχεσθαι/ἔρχομαι'
     lemmatizer = LemmaReplacer('greek')
     actual = lemmatizer.lemmatize(source, return_raw=True, return_string=True)
     self.assertEqual(actual, expected)
开发者ID:manu-chroma,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例7: test_lemmatizer_instr_greek

 def test_lemmatizer_instr_greek(self):
     """Greek lemmatizer: string in, lemma list out."""
     lemmatizer = LemmaReplacer('greek')
     actual = lemmatizer.lemmatize('τὴν διάγνωσιν ἔρχεσθαι',
                                   return_raw=False, return_string=False)
     self.assertEqual(actual, ['τὴν', 'διάγνωσις', 'ἔρχομαι'])
开发者ID:manu-chroma,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例8: test_lemmatizer_inlist_outlemma_greek

 def test_lemmatizer_inlist_outlemma_greek(self):
     """Greek lemmatizer: token list in, raw 'token/lemma' list out."""
     tokens = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     expected = ['τὴν/τὴν', 'διάγνωσιν/διάγνωσις', 'ἔρχεσθαι/ἔρχομαι']
     lemmatizer = LemmaReplacer('greek')
     self.assertEqual(
         lemmatizer.lemmatize(tokens, return_raw=True, return_string=False),
         expected)
开发者ID:manu-chroma,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例9: test_lemmatizer_inlist_outstring_greek

 def test_lemmatizer_inlist_outstring_greek(self):
     """Greek lemmatizer: token list in, lemma string out."""
     tokens = ['τὴν', 'διάγνωσιν', 'ἔρχεσθαι']
     lemmatizer = LemmaReplacer('greek')
     # NOTE(review): this snippet passes `return_lemma=` (not `return_raw=`);
     # kept as-is since it comes from a different fork of the test suite.
     actual = lemmatizer.lemmatize(tokens, return_lemma=False, return_string=True)
     self.assertEqual(actual, 'τὴν διάγνωσις ἔρχομαι')
开发者ID:AviAsh,项目名称:cltk,代码行数:8,代码来源:test_stem.py

示例10: main

def main():
    """Import the CLTK Latin models and lemmatize a sample Lucretius sentence."""
    importer = CorpusImporter('latin')
    # Show which corpora are available before downloading the models.
    print(importer.list_corpora)
    importer.import_corpus('latin_models_cltk')
    text = 'Aeneadum genetrix, hominum divomque voluptas, alma Venus, caeli subter labentia signa quae mare navigerum, quae terras frugiferentis concelebras, per te quoniam genus omne animantum concipitur visitque exortum lumina solis.'
    result = LemmaReplacer('latin').lemmatize(text.lower())
    print(result)
开发者ID:decretist,项目名称:Sand,代码行数:10,代码来源:test.py

示例11: get_lemma

def get_lemma(input_words, language):
    """Lemmatize *input_words* in the given language ("Latin" or "Greek").

    :param input_words: either a list of words or a single word string.
    :param language: "Latin" or "Greek".
    :return: a list of lemmata for list input; for single-word input, the
        first lemma, or the (normalized) word itself if no lemma was found.
    :raises ValueError: if *language* is not supported (the original code
        crashed with UnboundLocalError on ``lemmatizer`` in this case).
    """
    if language == "Latin":
        lemmatizer = LemmaReplacer("latin")
        # Required for CLTK module
        input_words = latin_lem_replacement(input_words)
    elif language == "Greek":
        lemmatizer = LemmaReplacer("greek")
    else:
        raise ValueError("unsupported language: %r" % (language,))

    if isinstance(input_words, list):
        return lemmatizer.lemmatize(input_words)

    input_words = normalize_word(input_words)
    results = lemmatizer.lemmatize(input_words)
    # Fall back to the normalized input when the lemmatizer yields nothing.
    return results[0] if results else input_words
开发者ID:baileymiller,项目名称:intertextualityProject,代码行数:22,代码来源:translate.py

示例12: gen_docs

def gen_docs(corpus, lemmatize, rm_stops):
    """Open and process files from a corpus, yielding one sentence at a time.

    Each yielded sentence is a list of lower-cased tokens, optionally
    stopword-filtered and lemmatized.

    :param corpus: 'phi5' (Latin) or 'tlg' (Greek).
    :param lemmatize: when True, replace each token with its lemma.
    :param rm_stops: when True, remove stopwords.
    """

    assert corpus in ['phi5', 'tlg']

    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        jv_replacer = JVReplacer()
        text_cleaner = phi5_plaintext_cleanup
        word_tokenizer = WordTokenizer('latin')
        if rm_stops:
            stops = latin_stops
        else:
            stops = None
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        word_tokenizer = WordTokenizer('greek')

        if rm_stops:
            # NOTE(review): the Latin stoplist is applied to the Greek (TLG)
            # corpus here — likely a copy/paste bug; confirm whether a Greek
            # stoplist was intended. Kept as-is to preserve behavior.
            stops = latin_stops
        else:
            stops = None

    if lemmatize:
        lemmatizer = LemmaReplacer(language)

    sent_tokenizer = TokenizeSentence(language)

    for filepath in filepaths:
        with open(filepath) as f:
            text = f.read()
        # light first-pass cleanup, before sentence tokenization (which relies on punctuation)
        text = text_cleaner(text, rm_punctuation=False, rm_periods=False)
        sent_tokens = sent_tokenizer.tokenize_sentences(text)
        for sentence in sent_tokens:
            # a second cleanup at sentence-level, to rm all punctuation
            sentence = text_cleaner(sentence, rm_punctuation=True, rm_periods=True)
            sentence = word_tokenizer(sentence)
            sentence = [s.lower() for s in sentence]
            sentence = [w for w in sentence if w]  # drop empty tokens
            if language == 'latin':
                # strip the leading '-' the Latin tokenizer puts on enclitics
                sentence = [w[1:] if w.startswith('-') else w for w in sentence]

            if stops:
                sentence = [w for w in sentence if w not in stops]

            sentence = [w for w in sentence if len(w) > 1]  # rm short words

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            if sentence and language == 'latin':
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                yield sentence
开发者ID:cltk,项目名称:cltk,代码行数:63,代码来源:word2vec.py

示例13: LemmaReplacer

# - report to DC list/wiki



# Import modules

# For XML
from xml.dom.minidom import parse, parseString
import codecs
# For CLTK
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

# Initialize CLTK
lemmatizer = LemmaReplacer('latin')  # Latin lemmatizer used in the loop below
tagger = POSTag('latin')  # Latin part-of-speech tagger
j = JVReplacer()  # j/v orthography normalizer (presumably j->i, v->u — confirm)

# Parse XML

xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')

for w in wordElementList:
        form = w.attributes['ana'].value
        print(form)
        # Parse the inflected word
        try:
            lemmaList = lemmatizer.lemmatize(form.lower())
开发者ID:paolomonella,项目名称:ursus,代码行数:31,代码来源:cltkUrsus.py

示例14: getLemma

 def getLemma(self):
   """Return the CLTK Latin lemmatization of this object's text."""
   return LemmaReplacer('latin').lemmatize(self.text)
开发者ID:PonteIneptique,项目名称:Siena-2015,代码行数:3,代码来源:lemmatizer.py

示例15: lemmata

def lemmata(text):
    """Return the unique non-stopword lemmata of *text* (Greek)."""
    lemmatizer = LemmaReplacer('greek')
    unique = set(lemmatizer.lemmatize(text.lower()))
    return [lemma for lemma in unique if lemma not in STOPS_LIST]
开发者ID:TylerKirby,项目名称:JupyterNotebooks,代码行数:3,代码来源:sophocles_script.py


注:本文中的cltk.stem.lemma.LemmaReplacer类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。