本文整理汇总了 Python 中 cltk.tokenize.word.WordTokenizer.tokenize 方法的典型用法代码示例。如果您正苦于以下问题：Python WordTokenizer.tokenize 方法的具体用法？Python WordTokenizer.tokenize 怎么用？Python WordTokenizer.tokenize 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 cltk.tokenize.word.WordTokenizer 的用法示例。
在下文中一共展示了 WordTokenizer.tokenize 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: test_tokenize_arabic_words
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_tokenize_arabic_words(self):
    """Tokenize several Arabic sentences and compare against the expected token lists."""
    arabic_tokenizer = WordTokenizer('arabic')
    sentences = [
        'اللُّغَةُ الْعَرَبِيَّةُ جَمِيلَةٌ.',
        'انما الْمُؤْمِنُونَ اخوه فاصلحوا بَيْنَ اخويكم',
        'الْعَجُزُ عَنِ الْإِدْرَاكِ إِدْرَاكٌ، وَالْبَحْثَ فِي ذاتِ اللَّه اشراك.',
        'اللَّهُمُّ اُسْتُرْ عُيُوبَنَا وَأَحْسَنَ خَوَاتِيمَنَا الْكَاتِبِ: نَبِيلُ جلهوم',
        'الرَّأْي قَبْلَ شَجَاعَة الشّجعَانِ',
        'فَأَنْزَلْنَا مِنْ السَّمَاء مَاء فَأَسْقَيْنَاكُمُوهُ',
        'سُئِلَ بَعْضُ الْكُتَّابِ عَنِ الْخَطّ، مَتَى يَسْتَحِقُّ أَنْ يُوصَفَ بِالْجَوْدَةِ ؟',
    ]
    # One expected token list per sentence above, in the same order.
    expected = [
        ['اللُّغَةُ', 'الْعَرَبِيَّةُ', 'جَمِيلَةٌ', '.'],
        ['انما', 'الْمُؤْمِنُونَ', 'اخوه', 'فاصلحوا', 'بَيْنَ', 'اخويكم'],
        ['الْعَجُزُ', 'عَنِ', 'الْإِدْرَاكِ', 'إِدْرَاكٌ', '،', 'وَالْبَحْثَ', 'فِي', 'ذاتِ', 'اللَّه', 'اشراك', '.'],  # pylint: disable=line-too-long
        ['اللَّهُمُّ', 'اُسْتُرْ', 'عُيُوبَنَا', 'وَأَحْسَنَ', 'خَوَاتِيمَنَا', 'الْكَاتِبِ', ':', 'نَبِيلُ', 'جلهوم'],  # pylint: disable=line-too-long
        ['الرَّأْي', 'قَبْلَ', 'شَجَاعَة', 'الشّجعَانِ'],
        ['فَأَنْزَلْنَا', 'مِنْ', 'السَّمَاء', 'مَاء', 'فَأَسْقَيْنَاكُمُوهُ'],
        ['سُئِلَ', 'بَعْضُ', 'الْكُتَّابِ', 'عَنِ', 'الْخَطّ', '،', 'مَتَى', 'يَسْتَحِقُّ', 'أَنْ', 'يُوصَفَ', 'بِالْجَوْدَةِ', '؟'],  # pylint: disable=line-too-long
    ]
    actual = [arabic_tokenizer.tokenize(sentence) for sentence in sentences]
    self.assertEqual(actual, expected)
示例2: stemmer_middle_high_german
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def stemmer_middle_high_german(text_l, rem_umlauts=True, exceptions=exc_dict):
    """Stem a Middle High German text token by token.

    The text is normalized with ``normalize_middle_high_german`` (called with
    ``to_lower_all=False`` and ``to_lower_beginning=True``), tokenized with the
    ``middle_high_german`` ``WordTokenizer``, and each token is then mapped to
    exactly one output entry:

    * a token that is a key of ``exceptions`` is replaced by the mapped value;
    * a token starting with an upper-case letter is kept unchanged;
    * a token contained in ``MHG_STOPS`` is kept unchanged;
    * any other token is passed through ``stem_helper``.

    Args:
        text_l: text in string format.
        rem_umlauts: choose whether to remove umlauts from the string; it is
            forwarded to ``stem_helper`` as ``rem_umlaut``.
        exceptions: hard-coded dictionary for the cases the algorithm fails.

    Returns:
        A list with one entry per token, in token order.
    """
    # Normalize text
    text_l = normalize_middle_high_german(text_l, to_lower_all=False, to_lower_beginning=True)

    # Tokenize text
    word_tokenizer = WordTokenizer("middle_high_german")
    text_l = word_tokenizer.tokenize(text_l)

    text = []
    for word in text_l:
        try:
            # Hard-coded exceptions take precedence over every other rule.
            text.append(exceptions[word])
        except KeyError:
            # Only a missing key means "not an exception"; any other error
            # (or a KeyboardInterrupt) must not be silently swallowed here.
            if word[0].isupper():
                # MHG only uses upper case for locations, people, etc. So any word that
                # starts with a capital letter while not being at the start of a sentence
                # is excluded from stemming and kept as-is.
                text.append(word)
            elif word in MHG_STOPS:
                # Stop words are kept unchanged (they are not stemmed).
                text.append(word)
            else:
                text.append(stem_helper(word, rem_umlaut=rem_umlauts))
    return text
示例3: test_latin_word_tokenizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_latin_word_tokenizer(self):
    """Run the Latin word tokenizer over four classical sentences and compare the tokens."""
    latin_tokenizer = WordTokenizer('latin')

    # Test sources:
    # - V. Aen. 1.1
    # - Prop. 2.5.1-2
    # - Ov. Am. 1.8.65-66
    # - Cic. Phillip. 13.14
    sentences = [
        'Arma virumque cano, Troiae qui primus ab oris.',
        'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
        'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
        'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
    ]

    # One expected token list per sentence above, in the same order.
    expected = [
        ['Arma', 'que', 'virum', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris.'],
        ['Hoc', 'verum', 'est', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
        ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'cum', 'te', ',', 'pauper', 'amator', ',', 'avos', '!'],
        ['que', 'Ne', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'c', 'ne', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur.'],
    ]

    actual = [latin_tokenizer.tokenize(sentence) for sentence in sentences]
    self.assertEqual(actual, expected)
示例4: test_latin_word_tokenizer_base
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_latin_word_tokenizer_base(self):
    """Tokenize nine Latin sentences with ``WordTokenizer('latin')`` and compare the tokens."""
    latin_tokenizer = WordTokenizer('latin')

    # Test sources:
    # - V. Aen. 1.1
    # - Prop. 2.5.1-2
    # - Ov. Am. 1.8.65-66
    # - Cic. Phillip. 13.14
    # - Plaut. Capt. 937
    # - Lucr. DRN. 5.1351-53
    # - Plaut. Bacch. 837-38
    # - Plaut. Amph. 823
    # - Caes. Bel. 6.29.2
    sentences = [
        'Arma virumque cano, Troiae qui primus ab oris.',
        'Hoc verumst, tota te ferri, Cynthia, Roma, et non ignota vivere nequitia?',
        'Nec te decipiant veteres circum atria cerae. Tolle tuos tecum, pauper amator, avos!',
        'Neque enim, quod quisque potest, id ei licet, nec, si non obstatur, propterea etiam permittitur.',
        'Quid opust verbis? lingua nullast qua negem quidquid roges.',
        'Textile post ferrumst, quia ferro tela paratur, nec ratione alia possunt tam levia gigni insilia ac fusi, radii, scapique sonantes.',  # pylint: disable=line-too-long
        'Dic sodes mihi, bellan videtur specie mulier?',
        'Cenavin ego heri in navi in portu Persico?',
        'quae ripas Ubiorum contingebat in longitudinem pedum ducentorum rescindit',
    ]

    # One expected token list per sentence above, in the same order.
    expected = [
        ['Arma', 'virumque', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris', '.'],
        ['Hoc', 'verumst', ',', 'tota', 'te', 'ferri', ',', 'Cynthia', ',', 'Roma', ',', 'et', 'non', 'ignota', 'vivere', 'nequitia', '?'],
        ['Nec', 'te', 'decipiant', 'veteres', 'circum', 'atria', 'cerae.', 'Tolle', 'tuos', 'tecum', ',', 'pauper', 'amator', ',', 'avos', '!'],
        ['Neque', 'enim', ',', 'quod', 'quisque', 'potest', ',', 'id', 'ei', 'licet', ',', 'nec', ',', 'si', 'non', 'obstatur', ',', 'propterea', 'etiam', 'permittitur', '.'],
        ['Quid', 'opust', 'verbis', '?', 'lingua', 'nullast', 'qua', 'negem', 'quidquid', 'roges', '.'],
        ['Textile', 'post', 'ferrumst', ',', 'quia', 'ferro', 'tela', 'paratur', ',', 'nec', 'ratione', 'alia', 'possunt', 'tam', 'levia', 'gigni', 'insilia', 'ac', 'fusi', ',', 'radii', ',', 'scapique', 'sonantes', '.'],
        ['Dic', 'sodes', 'mihi', ',', 'bellan', 'videtur', 'specie', 'mulier', '?'],
        ['Cenavin', 'ego', 'heri', 'in', 'navi', 'in', 'portu', 'Persico', '?'],
        ['quae', 'ripas', 'Ubiorum', 'contingebat', 'in', 'longitudinem', 'pedum', 'ducentorum', 'rescindit'],
    ]

    actual = [latin_tokenizer.tokenize(sentence) for sentence in sentences]
    self.assertEqual(actual, expected)
示例5: test_middle_english_tokenizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_middle_english_tokenizer(self):
    """Tokenize a three-line Middle English passage and compare against the expected tokens.

    The input carries leading spaces, newlines and a trailing blank line; the
    expected list contains only words and punctuation marks, with the hyphen
    of 'þer-fore' as a token of its own.
    """
    text = " Fers am I ferd of oure fare;\n Fle we ful fast þer-fore. \n Can Y no cownsel bot care.\n\n"
    target = ['Fers', 'am', 'I', 'ferd', 'of', 'oure', 'fare', ';', 'Fle', 'we', 'ful', 'fast', 'þer', '-', 'fore', '.',
              'Can', 'Y', 'no', 'cownsel', 'bot', 'care', '.']
    tokenizer = WordTokenizer('middle_english')
    tokenized = tokenizer.tokenize(text)
    # assertEqual (rather than assertTrue(a == b)) reports the differing
    # elements when the comparison fails.
    self.assertEqual(tokenized, target)
示例6: test_latin_word_tokenizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_latin_word_tokenizer(self):
    """Check that 'abuterque' is split into 'abuter' and '-que' while 'atque' stays whole."""
    expected = ['atque', 'haec', 'abuter', '-que', 'nihil']
    actual = WordTokenizer('latin').tokenize('atque haec abuterque nihil')
    self.assertEqual(actual, expected)
示例7: test_french_lemmatizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_french_lemmatizer(self):
    """Lower-case and tokenize a French passage, lemmatize it, and compare the (token, lemma) pairs."""
    passage = "Li rois pense que par folie, Sire Tristran, vos aie amé ; Mais Dé plevis ma loiauté, Qui sor mon cors mete flaele, S'onques fors cil qui m’ot pucele Out m'amistié encor nul jor !"
    lowered = passage.lower()
    french_tokenizer = WordTokenizer('french')
    replacer = LemmaReplacer()
    # Tokenize first, then feed the token list to the lemmatizer.
    actual = replacer.lemmatize(french_tokenizer.tokenize(lowered))
    expected = [('li', 'li'), ('rois', 'rois'), ('pense', 'pense'), ('que', 'que'), ('par', 'par'), ('folie', 'folie'), (',', ['PUNK']), ('sire', 'sire'), ('tristran', 'None'), (',', ['PUNK']), ('vos', 'vos'), ('aie', ['avoir']), ('amé', 'amer'), (';', ['PUNK']), ('mais', 'mais'), ('dé', 'dé'), ('plevis', 'plevir'), ('ma', 'ma'), ('loiauté', 'loiauté'), (',', ['PUNK']), ('qui', 'qui'), ('sor', 'sor'), ('mon', 'mon'), ('cors', 'cors'), ('mete', 'mete'), ('flaele', 'flaele'), (',', ['PUNK']), ("s'", "s'"), ('onques', 'onques'), ('fors', 'fors'), ('cil', 'cil'), ('qui', 'qui'), ("m'", "m'"), ('ot', 'ot'), ('pucele', 'pucele'), ('out', ['avoir']), ("m'", "m'"), ('amistié', 'amistié'), ('encor', 'encor'), ('nul', 'nul'), ('jor', 'jor'), ('!', ['PUNK'])]
    self.assertEqual(actual, expected)
示例8: test_old_norse_word_tokenizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_old_norse_word_tokenizer(self):
    """Tokenize two Old Norse sentences and compare against the expected tokens.

    The expected list holds every word plus the commas and full stops as
    separate tokens.
    """
    text = "Gylfi konungr var maðr vitr ok fjölkunnigr. " \
           "Hann undraðist þat mjök, er ásafólk var svá kunnigt, at allir hlutir gengu at vilja þeira."
    target = ['Gylfi', 'konungr', 'var', 'maðr', 'vitr', 'ok', 'fjölkunnigr', '.', 'Hann', 'undraðist', 'þat',
              'mjök', ',', 'er', 'ásafólk', 'var', 'svá', 'kunnigt', ',', 'at', 'allir', 'hlutir', 'gengu', 'at',
              'vilja', 'þeira', '.']
    word_tokenizer = WordTokenizer('old_norse')
    result = word_tokenizer.tokenize(text)
    # assertEqual (rather than assertTrue(a == b)) reports the differing
    # elements when the comparison fails.
    self.assertEqual(result, target)
示例9: test_middle_high_german_stopwords
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_middle_high_german_stopwords(self):
    """Lower-case and tokenize a Middle High German sentence, then drop tokens found in MHG_STOPS."""
    sentence = "Swer was ze Bêârosche komn, doch hete Gâwân dâ genomn den prîs ze bêder sît al ein wan daz dervor ein ritter schein, bî rôtem wâpen unrekant, des prîs man in die hœhe bant."
    mhg_tokenizer = WordTokenizer('middle_high_german')
    # Keep only the tokens that are not listed as stop words.
    remaining = [
        token
        for token in mhg_tokenizer.tokenize(sentence.lower())
        if token not in MHG_STOPS
    ]
    expected = ['swer', 'bêârosche', 'komn', ',', 'gâwân', 'genomn', 'prîs', 'bêder', 'sît', 'dervor', 'ritter', 'schein', ',', 'rôtem', 'wâpen', 'unrekant', ',', 'prîs', 'hœhe', 'bant', '.']
    self.assertEqual(remaining, expected)
示例10: test_akkadian_word_tokenizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_akkadian_word_tokenizer(self):
    """Check that the Akkadian tokenizer returns (word, language) pairs for a transliterated line."""
    akkadian_tokenizer = WordTokenizer('akkadian')
    # Each expected item pairs a word with a language label; the
    # underscore-wrapped '_e2_-ka' is the only one labelled 'sumerian'.
    expected = [
        ('u2-wa-a-ru', 'akkadian'),
        ('at-ta', 'akkadian'),
        ('e2-kal2-la-ka', 'akkadian'),
        ('_e2_-ka', 'sumerian'),
        ('wu-e-er', 'akkadian'),
    ]
    actual = akkadian_tokenizer.tokenize('u2-wa-a-ru at-ta e2-kal2-la-ka _e2_-ka wu-e-er')
    self.assertEqual(actual, expected)
示例11: normalize_fr
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def normalize_fr(string):
    """Lower-case and tokenize ``string``, then apply every matching normalization rule.

    ``rules`` is iterated as ``(matches_rule, apply_rule)`` pairs. For each
    token, in order, every rule whose predicate accepts the token contributes
    one entry (the result of ``apply_rule(token)``) to the output. A token
    matched by no rule contributes nothing, and a token matched by several
    rules contributes one entry per matching rule.

    Returns:
        The list of normalized entries.
    """
    french_tokenizer = WordTokenizer('french')
    words = french_tokenizer.tokenize(string.lower())
    # Outer loop over tokens, inner loop over rules, so the output keeps
    # token order and, within a token, rule order.
    return [
        transform(word)
        for word in words
        for predicate, transform in rules
        if predicate(word)
    ]
示例12: test_middle_high_german_tokenize
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_middle_high_german_tokenize(self):
    """Tokenize a Middle High German verse containing blank lines and compare the tokens."""
    verse = "Mīn ougen wurden liebes alsō vol, \n\n\ndō ich die minneclīchen ērst gesach,\ndaȥ eȥ mir hiute und iemer mē tuot wol."
    # The expected list contains no tokens for the newlines in the input.
    expected = ['Mīn', 'ougen', 'wurden', 'liebes', 'alsō', 'vol', ',', 'dō', 'ich', 'die', 'minneclīchen', 'ērst', 'gesach', ',', 'daȥ', 'eȥ', 'mir', 'hiute', 'und', 'iemer', 'mē', 'tuot', 'wol', '.']
    mhg_tokenizer = WordTokenizer('middle_high_german')
    actual = mhg_tokenizer.tokenize(verse)
    self.assertEqual(actual, expected)
示例13: test_latin_lemmata
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_latin_lemmata(self):
    """Exercise ``Lemmata.lookup()`` on a lower-cased, J/V-replaced, tokenized Latin phrase."""
    lemmata = Lemmata(dictionary='lemmata', language='latin')
    phrase = 'Ceterum antequam destinata componam'
    # Each expected item pairs a token with a list of (lemma, weight) candidates.
    expected = [('ceterum', [('ceterus', 1.0)]), ('antequam', [('antequam', 1.0)]), ('destinata', [('destinatus', 0.25), ('destinatum', 0.25), ('destinata', 0.25), ('destino', 0.25)]), ('componam', [('compono', 1.0)])]  # pylint: disable=line-too-long
    replacer = JVReplacer()
    latin_tokenizer = WordTokenizer('latin')
    # Pipeline: lower-case -> J/V replacement -> tokenization -> lookup.
    prepared = replacer.replace(phrase.lower())
    actual = lemmata.lookup(latin_tokenizer.tokenize(prepared))
    self.assertEqual(actual, expected)
示例14: test_identity_lemmatizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_identity_lemmatizer(self):
    """Check that ``IdentityLemmatizer.lemmatize()`` pairs every token with itself."""
    identity = IdentityLemmatizer()
    phrase = 'Ceterum antequam destinata componam'
    expected = [('ceterum', 'ceterum'), ('antequam', 'antequam'), ('destinata', 'destinata'), ('componam', 'componam')]  # pylint: disable=line-too-long
    replacer = JVReplacer()
    latin_tokenizer = WordTokenizer('latin')
    # Pipeline: lower-case -> J/V replacement -> tokenization -> lemmatization.
    prepared = replacer.replace(phrase.lower())
    actual = identity.lemmatize(latin_tokenizer.tokenize(prepared))
    self.assertEqual(actual, expected)
示例15: test_bigram_pos_lemmatizer
# 需要导入模块: from cltk.tokenize.word import WordTokenizer [as 别名]
# 或者: from cltk.tokenize.word.WordTokenizer import tokenize [as 别名]
def test_bigram_pos_lemmatizer(self):
    """Train a ``BigramPOSLemmatizer`` restricted to 'cum' and lemmatize a Latin sentence.

    In the expected output only 'cum' receives a lemma ('cum2'); every other
    token is paired with ``None``.
    """
    # Training data: a list of sentences, each a list of 3-tuples.
    # NOTE(review): the tuples look like (token, lemma, POS tag) - confirm against BigramPOSLemmatizer.
    training_sentences = [
        [('dixissem', 'dico', 'v')],
        [('de', 'de', 'r'), ('te', 'tu', 'p'), ('autem', 'autem', 'c'), (',', 'punc', 'u'), ('catilina', 'catilina', 'n'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('quiescunt', 'quiesco', 'v'), (',', 'punc', 'u'), ('probant', 'probo', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('patiuntur', 'patior', 'v'), (',', 'punc', 'u'), ('decernunt', 'decerno', 'v'), (',', 'punc', 'u'), ('cum', 'cum2', 'c'), ('tacent', 'taceo', 'v'), (',', 'punc', 'u'), ('clamant', 'clamo', 'v'), (',', 'punc', 'u'), ('neque', 'neque', 'c'), ('hi', 'hic', 'p'), ('solum', 'solus', 'd'), ('quorum', 'qui', 'p'), ('tibi', 'tu', 'p'), ('auctoritas', 'auctoritas', 'n'), ('est', 'sum', 'v'), ('uidelicet', 'uidelicet', 'd'), ('cara', 'carus', 'a'), (',', 'punc', 'u'), ('uita', 'uita', 'n'), ('uilissima', 'uilis', 'a'), (',', 'punc', 'u'), ('sed', 'sed', 'c'), ('etiam', 'etiam', 'c'), ('illi', 'ille', 'p'), ('equites', 'eques', 'n'), ('romani', 'romanus', 'a'), (',', 'punc', 'u'), ('honestissimi', 'honestus', 'a'), ('atque', 'atque', 'c'), ('optimi', 'bonus', 'a'), ('uiri', 'uir', 'n'), (',', 'punc', 'u'), ('ceteri', 'ceterus', 'a'), ('-que', '-que', 'c'), ('fortissimi', 'fortis', 'a'), ('ciues', 'ciuis', 'n'), ('qui', 'qui', 'p'), ('circumstant', 'circumsto', 'v'), ('senatum', 'senatus', 'n'), (',', 'punc', 'u'), ('quorum', 'qui', 'p'), ('tu', 'tu', 'p'), ('et', 'et', 'c'), ('frequentiam', 'frequentia', 'n'), ('uidere', 'uideo', 'v'), ('et', 'et', 'c'), ('studia', 'studium', 'n'), ('perspicere', 'perspicio', 'v'), ('et', 'et', 'c'), ('uoces', 'uox', 'n'), ('paulo', 'paulus', 'd'), ('ante', 'ante', 'd'), ('exaudire', 'exaudio', 'v'), ('potuisti', 'possum', 'v'), ('.', 'punc', 'u')],  # pylint: disable=line-too-long
    ]
    bigram_lemmatizer = BigramPOSLemmatizer(train=training_sentences, include=['cum'])
    sentence = 'Quod cum esset intellectum et animadversum fecit animo libentissimo populus Romanus'
    expected = [('quod', None), ('cum', 'cum2'), ('esset', None), ('intellectum', None), ('et', None), ('animaduersum', None), ('fecit', None), ('animo', None), ('libentissimo', None), ('populus', None), ('romanus', None)]  # pylint: disable=line-too-long
    replacer = JVReplacer()
    latin_tokenizer = WordTokenizer('latin')
    # Pipeline: lower-case -> J/V replacement -> tokenization -> lemmatization.
    prepared = replacer.replace(sentence.lower())
    actual = bigram_lemmatizer.lemmatize(latin_tokenizer.tokenize(prepared))
    self.assertEqual(actual, expected)