This article collects typical usage examples of the Python method nltk.tokenize.punkt.PunktLanguageVars.word_tokenize. If you are wondering what exactly PunktLanguageVars.word_tokenize does and how to use it, the curated code examples below should help. You can also explore the containing class, nltk.tokenize.punkt.PunktLanguageVars, for related usage.
Fifteen code examples of the PunktLanguageVars.word_tokenize method are shown below, ordered by popularity.
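Before the examples, here is a minimal sketch of the method itself (the Latin sentence is only an illustration). PunktLanguageVars.word_tokenize splits off most punctuation but leaves a sentence-final period attached to the preceding word, which is the quirk several of the examples below work around.

from nltk.tokenize.punkt import PunktLanguageVars

punkt = PunktLanguageVars()
tokens = punkt.word_tokenize("Gallia est omnis divisa in partes tres.")
# Typically yields: ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres.']
# (commas, question marks, etc. become separate tokens, but the final '.' stays attached)
print(tokens)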
Example 1: tokenize
# Required import (applies to every example below): from nltk.tokenize.punkt import PunktLanguageVars
# The method is then called on an instance, e.g. PunktLanguageVars().word_tokenize(...)
def tokenize(self, string):
    """Tokenize incoming string."""
    punkt = PunktLanguageVars()
    generic_tokens = punkt.word_tokenize(string)
    # Rewrite as an if-else block for exceptions rather than separate list comprehensions
    generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])]  # Handle 'nec' as a special case.
    generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sodes' else ['si', 'audes'])]  # Handle 'sodes' as a special case.
    generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sultis' else ['si', 'vultis'])]  # Handle 'sultis' as a special case.
    specific_tokens = []
    for generic_token in generic_tokens:
        is_enclitic = False
        if generic_token not in self.exceptions:
            for enclitic in self.enclitics:
                if generic_token.endswith(enclitic):
                    if enclitic == 'cum':
                        if generic_token in self.inclusions:
                            specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                        else:
                            specific_tokens += [generic_token]
                    elif enclitic == 'st':
                        if generic_token.endswith('ust'):
                            specific_tokens += [generic_token[:-len(enclitic) + 1]] + ['est']
                        else:
                            # Does not handle 'similist', 'qualist', etc. correctly
                            specific_tokens += [generic_token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(generic_token)
    return specific_tokens
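This method belongs to a tokenizer class (in CLTK's Latin word tokenizer the instance carries the exceptions, enclitics and inclusions lists). The harness below is purely hypothetical: it binds the tokenize function defined above to a dummy class with made-up attribute values, just to show the enclitic splitting in action.

# Hypothetical harness; attribute values are illustrative, not CLTK's real data.
# Assumes the `def tokenize(...)` above was executed at module level and that
# PunktLanguageVars was imported as shown in the import note.
class _DemoLatinTokenizer:
    exceptions = []                  # tokens never split
    enclitics = ['que', 've', 'st']  # assumed enclitic list, for illustration only
    inclusions = []                  # '-cum' forms that should be split
    tokenize = tokenize              # reuse the function above as a method

print(_DemoLatinTokenizer().tokenize('arma virumque cano'))
# ['arma', 'que', 'virum', 'cano']  (the enclitic is split off and fronted)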
Example 2: nltk_tokenize_words
def nltk_tokenize_words(string, attached_period=False, language=None):
    """Wrap NLTK's tokenizer PunktLanguageVars(), but make the final period
    its own token.
    >>> nltk_tokenize_words("Sentence 1. Sentence 2.")
    ['Sentence', '1', '.', 'Sentence', '2', '.']
    Optionally keep NLTK's original output:
    >>> nltk_tokenize_words("Sentence 1. Sentence 2.", attached_period=True)
    ['Sentence', '1.', 'Sentence', '2.']
    TODO: Run some tests to determine whether there is a large penalty for
    re-calling PunktLanguageVars() for each use of this function. If so, this
    will need to become a class, perhaps inheriting from the PunktLanguageVars
    object. Maybe integrate with WordTokenizer.
    """
    assert isinstance(string, str), "Incoming string must be type str."
    if language == 'sanskrit':
        periods = ['.', '।', '॥']
    else:
        periods = ['.']
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(string)
    if attached_period:
        return tokens
    new_tokens = []
    for word in tokens:
        for char in periods:
            if word.endswith(char):
                new_tokens.append(word[:-1])
                new_tokens.append(char)
                break
        else:
            new_tokens.append(word)
    return new_tokens
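A quick usage sketch (the sentence is illustrative), contrasting with the raw PunktLanguageVars output shown at the top of the page: the final period now becomes its own token.

print(nltk_tokenize_words("Gallia est omnis divisa in partes tres."))
# ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres', '.']
print(nltk_tokenize_words("Gallia est omnis divisa in partes tres.", attached_period=True))
# ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres.']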
Example 3: tokenize
def tokenize(self, string):
    """Tokenize incoming string."""
    # punkt = WhitespaceTokenizer()
    punkt = PunktLanguageVars()
    generic_tokens = punkt.word_tokenize(string)
    generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])]  # Handle 'nec' as a special case.
    specific_tokens = []
    for generic_token in generic_tokens:
        is_enclitic = False
        if generic_token not in self.exceptions:
            for enclitic in self.enclitics:
                if generic_token.endswith(enclitic):
                    if enclitic == 'cum':
                        if generic_token in self.inclusions:
                            specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                        else:
                            specific_tokens += [generic_token]
                    elif enclitic == 'st':
                        if generic_token.endswith('ust'):
                            specific_tokens += [generic_token[:-len(enclitic) + 1]] + ['est']
                        else:
                            # Does not handle 'similist', 'qualist', etc. correctly
                            specific_tokens += [generic_token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(generic_token)
    # return iter(specific_tokens)  # change this one into an iterator.
    startPoint = 0  # this is to accumulate the start point.
    for item in specific_tokens:
        itemLength = len(item)
        yield item, startPoint, startPoint + itemLength
        startPoint = startPoint + itemLength + 1
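Note that the (token, start, end) offsets here are accumulated from token lengths plus one separating space rather than located in the original string, so once an enclitic has been split off and reordered the offsets drift away from the source text. A self-contained sketch of just that accumulation scheme (plain whitespace tokens, no enclitic handling):

def spans(tokens):
    """Yield (token, start, end), assuming tokens are separated by single spaces."""
    start = 0
    for token in tokens:
        yield token, start, start + len(token)
        start += len(token) + 1

print(list(spans(['arma', 'virumque', 'cano'])))
# [('arma', 0, 4), ('virumque', 5, 13), ('cano', 14, 18)]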
Example 4: tokenize
def tokenize(self, string):
    """Tokenize incoming string."""
    punkt = PunktLanguageVars()
    generic_tokens = punkt.word_tokenize(string)
    generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])]  # Handle 'nec' as a special case.
    specific_tokens = []
    for generic_token in generic_tokens:
        is_enclitic = False
        if generic_token not in self.exceptions:
            for enclitic in self.enclitics:
                if generic_token.endswith(enclitic):
                    if enclitic == 'mst':
                        specific_tokens += [generic_token[:-len(enclitic) + 1]] + ['e' + generic_token[-len(enclitic) + 1:]]
                    elif enclitic == 'cum':
                        if generic_token in self.inclusions:
                            specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                        else:
                            specific_tokens += [generic_token]
                    else:
                        specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(generic_token)
    return specific_tokens
Example 5: _tokenize
def _tokenize(self, text):
    """
    Use NLTK's standard tokenizer, rm punctuation.
    :param text: pre-processed text
    :return: tokenized text
    :rtype: list
    """
    sentence_tokenizer = TokenizeSentence('latin')
    sentences = sentence_tokenizer.tokenize_sentences(text.lower())
    sent_words = []
    punkt = PunktLanguageVars()
    for sentence in sentences:
        words = punkt.word_tokenize(sentence)
        assert isinstance(words, list)
        words_new = []
        for word in words:
            # keep the word only if it is not punctuation, an abbreviation, or a number
            if word not in self.punctuation and word not in self.abbreviations and word not in self.numbers:
                words_new.append(word)
        # rm all numbers here with: re.compile(r'[0-9]')
        sent_words.append(words_new)
    return sent_words
Example 6: tokenize
def tokenize(doc):
    '''
    INPUT: Document
    OUTPUT: Tokenized and stemmed list of words from the document
    '''
    plv = PunktLanguageVars()
    snowball = SnowballStemmer('english')
    return [snowball.stem(word) for word in plv.word_tokenize(doc.lower())]
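A minimal usage sketch (the sample sentence and expected stems are illustrative); it assumes the def above and the imports below live in the same module.

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.punkt import PunktLanguageVars

print(tokenize("Running dogs were chasing the cars"))
# roughly: ['run', 'dog', 'were', 'chase', 'the', 'car']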
Example 7: tokenize
def tokenize(desc):
    '''
    INPUT: A cleaned description
    OUTPUT: Tokenized and stemmed list of words from the description
    '''
    plv = PunktLanguageVars()
    snowball = SnowballStemmer('english')
    return [snowball.stem(word) for word in plv.word_tokenize(desc.lower())]
Example 8: tag_ner
def tag_ner(lang, input_text, output_type=list):
    """Run NER for chosen language."""
    _check_latest_data(lang)
    assert lang in NER_DICT.keys(), \
        'Invalid language. Choose from: {}'.format(', '.join(NER_DICT.keys()))
    types = [str, list]
    assert type(input_text) in types, 'Input must be: {}.'.format(', '.join(types))
    assert output_type in types, 'Output must be a {}.'.format(', '.join(types))
    if type(input_text) == str:
        punkt = PunktLanguageVars()
        tokens = punkt.word_tokenize(input_text)
        new_tokens = []
        for word in tokens:
            if word.endswith('.'):
                new_tokens.append(word[:-1])
                new_tokens.append('.')
            else:
                new_tokens.append(word)
        input_text = new_tokens
    ner_file_path = os.path.expanduser(NER_DICT[lang])
    with open(ner_file_path) as file_open:
        ner_str = file_open.read()
    ner_list = ner_str.split('\n')
    ner_tuple_list = []
    for count, word_token in enumerate(input_text):
        match = False
        for ner_word in ner_list:
            # the replacer slows things down, but is necessary
            if word_token == ner_word:
                ner_tuple = (word_token, 'Entity')
                ner_tuple_list.append(ner_tuple)
                match = True
                break
        if not match:
            ner_tuple_list.append((word_token,))
    if output_type is str:
        string = ''
        for tup in ner_tuple_list:
            start_space = ' '
            final_space = ''
            # this is some mediocre string reconstitution
            # maybe not worth the effort
            if tup[0] in [',', '.', ';', ':', '?', '!']:
                start_space = ''
            if len(tup) == 2:
                string += start_space + tup[0] + '/' + tup[1] + final_space
            else:
                string += start_space + tup[0] + final_space
        return string
    return ner_tuple_list
Example 9: test_latin_stopwords
def test_latin_stopwords(self):
    """Test filtering Latin stopwords."""
    sentence = 'Quo usque tandem abutere, Catilina, patientia nostra?'
    lowered = sentence.lower()
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(lowered)
    no_stops = [w for w in tokens if w not in LATIN_STOPS]
    target_list = ['usque', 'tandem', 'abutere', ',', 'catilina', ',',
                   'patientia', 'nostra', '?']
    self.assertEqual(no_stops, target_list)
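Examples 9 through 12 and 14 all follow the same pattern: lowercase the sentence, tokenize it with PunktLanguageVars, then filter the tokens against a frozen stop list imported from the corresponding CLTK stop-word module. The standalone sketch below shows the pattern with a small made-up stop set rather than the real LATIN_STOPS:

from nltk.tokenize.punkt import PunktLanguageVars

DEMO_STOPS = {'quo', 'nostra'}  # illustrative stop set, not the real LATIN_STOPS

def filter_stops(sentence, stops):
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(sentence.lower())
    return [w for w in tokens if w not in stops]

print(filter_stops('Quo usque tandem abutere, Catilina, patientia nostra?', DEMO_STOPS))
# ['usque', 'tandem', 'abutere', ',', 'catilina', ',', 'patientia', '?']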
Example 10: test_french_stopwords
def test_french_stopwords(self):
    """Test filtering French stopwords."""
    sentence = "En pensé ai e en talant que d ’ Yonec vus die avant dunt il fu nez, e de sun pere cum il vint primes a sa mere ."
    lowered = sentence.lower()
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(lowered)
    no_stops = [w for w in tokens if w not in FRENCH_STOPS]
    target_list = ['pensé', 'talant', 'd', '’', 'yonec', 'die', 'avant', 'dunt', 'nez', ',', 'pere', 'cum', 'primes',
                   'mere', '.']
    self.assertEqual(no_stops, target_list)
Example 11: test_old_norse_stopwords
def test_old_norse_stopwords(self):
    """
    Test filtering Old Norse stopwords.
    Sentence extracted from Eiríks saga rauða (http://www.heimskringla.no/wiki/Eir%C3%ADks_saga_rau%C3%B0a)
    """
    sentence = 'Þat var einn morgin, er þeir Karlsefni sá fyrir ofan rjóðrit flekk nökkurn, sem glitraði við þeim'
    lowered = sentence.lower()
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(lowered)
    no_stops = [w for w in tokens if w not in OLD_NORSE_STOPS]
    target_list = ['var', 'einn', 'morgin', ',', 'karlsefni', 'rjóðrit', 'flekk', 'nökkurn', ',', 'glitraði']
    self.assertEqual(no_stops, target_list)
Example 12: test_greek_stopwords
def test_greek_stopwords(self):
    """Test filtering Greek stopwords."""
    sentence = 'Ἅρπαγος δὲ καταστρεψάμενος Ἰωνίην ἐποιέετο στρατηίην \
                ἐπὶ Κᾶρας καὶ Καυνίους καὶ Λυκίους, ἅμα ἀγόμενος καὶ Ἴωνας καὶ \
                Αἰολέας.'
    lowered = sentence.lower()
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(lowered)
    no_stops = [w for w in tokens if w not in GREEK_STOPS]
    target_list = ['ἅρπαγος', 'καταστρεψάμενος', 'ἰωνίην', 'ἐποιέετο',
                   'στρατηίην', 'κᾶρας', 'καυνίους', 'λυκίους', ',',
                   'ἅμα', 'ἀγόμενος', 'ἴωνας', 'αἰολέας.']
    self.assertEqual(no_stops, target_list)
Example 13: lemmatize
def lemmatize(self, input_text, return_raw=False, return_string=False):
    """Take incoming string or list of tokens. Lookup done against a
    key-value list of lemmata-headword. If a string, tokenize with
    ``PunktLanguageVars()``. If a final period appears on a token, remove
    it, then re-add once replacement done.
    TODO: rm check for final period, change PunktLanguageVars() to nltk_tokenize_words()
    """
    assert type(input_text) in [list, str], \
        logger.error('Input must be a list or string.')
    if type(input_text) is str:
        punkt = PunktLanguageVars()
        tokens = punkt.word_tokenize(input_text)
    else:
        tokens = input_text
    lemmatized_tokens = []
    for token in tokens:
        # check for final period
        final_period = False
        if token[-1] == '.':
            final_period = True
            token = token[:-1]
        # look for token in lemma dict keys
        if token in self.lemmata.keys():
            headword = self.lemmata[token.lower()]
            # re-add final period if rm'd
            if final_period:
                headword += '.'
            # append to return list
            if not return_raw:
                lemmatized_tokens.append(headword)
            else:
                lemmatized_tokens.append(token + '/' + headword)
        # if token not found in lemma-headword list
        else:
            # re-add final period if rm'd
            if final_period:
                token += '.'
            if not return_raw:
                lemmatized_tokens.append(token)
            else:
                lemmatized_tokens.append(token + '/' + token)
    if not return_string:
        return lemmatized_tokens
    elif return_string:
        return ' '.join(lemmatized_tokens)
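As with Example 1, this is a method of a class whose instance holds the lemmata mapping. The harness below is hypothetical: a dummy class with a three-entry lemmata dict, binding the lemmatize function defined above, just to show the final-period handling.

import logging
from nltk.tokenize.punkt import PunktLanguageVars

logger = logging.getLogger(__name__)  # the method's assert message references a module-level logger

class _DemoLemmatizer:
    # Illustrative lemmata mapping, not CLTK's real lemma data.
    lemmata = {'arma': 'arma', 'virumque': 'vir', 'cano': 'cano'}
    lemmatize = lemmatize  # reuse the function defined above as a method

print(_DemoLemmatizer().lemmatize('arma virumque cano.'))
# ['arma', 'vir', 'cano.']  (the trailing period is stripped for lookup, then restored)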
Example 14: test_akkadian_stopwords
def test_akkadian_stopwords(self):
    """
    Test filtering Akkadian stopwords.
    Sentence extracted from the law code of Hammurabi, law 3 (Martha Roth 2nd Edition 1997, Law Collections from
    Mesopotamia and Asia Minor).
    """
    sentence = "šumma awīlum ina dīnim ana šībūt sarrātim ūṣiamma awat iqbû la uktīn šumma dīnum šû dīn napištim awīlum šû iddâk"
    lowered = sentence.lower()
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(lowered)
    no_stops = [w for w in tokens if w not in AKKADIAN_STOPS]
    target_list = ['awīlum', 'dīnim', 'šībūt', 'sarrātim', 'ūṣiamma', 'awat', 'iqbû', 'uktīn', 'dīnum',
                   'dīn', 'napištim', 'awīlum', 'iddâk']
    self.assertEqual(no_stops, target_list)
Example 15: _build_concordance
def _build_concordance(self, text_str):
    """
    Inherit or mimic the logic of ConcordanceIndex() at http://www.nltk.org/_modules/nltk/text.html
    and/or ConcordanceSearchView() & SearchCorpus() at https://github.com/nltk/nltk/blob/develop/nltk/app/concordance_app.py
    :param text_str: Text to be turned into a concordance
    :type text_str: str
    :return: list
    """
    p = PunktLanguageVars()
    orig_tokens = p.word_tokenize(text_str)
    c = ConcordanceIndex(orig_tokens)
    #! rm dupes after index, before loop
    tokens = set(orig_tokens)
    tokens = [x for x in tokens if x not in [',', '.', ';', ':', '"', "'", '[', ']']]  # this needs to be changed or rm'ed
    return c.return_concordance_all(tokens)