本文整理汇总了 Python 中 nltk.tokenize.treebank.TreebankWordTokenizer 类的典型用法代码示例。如果您正苦于以下问题:Python treebank.TreebankWordTokenizer 的具体用法?Python treebank.TreebankWordTokenizer 怎么用?Python treebank.TreebankWordTokenizer 的使用例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 nltk.tokenize.treebank 的用法示例。
在下文中一共展示了 treebank.TreebankWordTokenizer 类的 12 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: __init__
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def __init__(self, *args, **kwargs):
    """Initialise the encoder with NLTK's Treebank tokenizer and detokenizer.

    All positional and keyword arguments are forwarded unchanged to the
    parent initialiser. ``tokenize`` and ``detokenize`` are supplied here,
    so callers are not allowed to pass them.

    :raises TypeError: if ``tokenize`` or ``detokenize`` is passed as a
        keyword argument.
    :raises ImportError: if NLTK cannot be imported; an installation hint
        is printed before the error is re-raised.
    """
    # Reject the two keywords this class fills in itself.
    if 'tokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')

    if 'detokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')

    try:
        import nltk

        # The original comment labels these resources as "required for moses".
        # NOTE(review): confirm whether the Treebank tokenizer needs them.
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    except ImportError:
        print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
        raise

    # Bind the tokenizer first, then the detokenizer, before delegating.
    tokenize_fn = TreebankWordTokenizer().tokenize
    detokenize_fn = TreebankWordDetokenizer().detokenize
    super().__init__(*args, tokenize=tokenize_fn, detokenize=detokenize_fn, **kwargs)
示例2: word_tokenize
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text, language='english', preserve_line=False):
    """
    Split *text* into a flat list of word tokens.

    Unless *preserve_line* is set, the text is first split into sentences
    with ``sent_tokenize``; every sentence is then passed to the
    module-level ``_treebank_word_tokenizer`` and the per-sentence results
    are concatenated in order.

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus, forwarded to
        ``sent_tokenize``
    :type language: str
    :param preserve_line: if true, skip sentence splitting and tokenize
        *text* as a single unit
    :type preserve_line: bool
    :return: the word tokens, in order of appearance
    :rtype: list
    """
    if preserve_line:
        # Treat the whole input as one "sentence".
        sentences = [text]
    else:
        sentences = sent_tokenize(text, language)

    tokens = []
    for sentence in sentences:
        tokens.extend(_treebank_word_tokenizer.tokenize(sentence))
    return tokens
示例3: word_tokenize
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text, language='english'):
    """
    Split *text* into a flat list of word tokens.

    The text is split into sentences with ``sent_tokenize`` and each
    sentence is handed to ``_treebank_word_tokenize``; the per-sentence
    tokens are concatenated in order.

    :param text: text to split into sentences and then into words
    :param language: the model name in the Punkt corpus, forwarded to
        ``sent_tokenize``
    :return: the word tokens, in order of appearance
    """
    words = []
    for sentence in sent_tokenize(text, language):
        words.extend(_treebank_word_tokenize(sentence))
    return words
示例4: demo_liu_hu_lexicon
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.

    The sentence is tokenized with ``TreebankWordTokenizer`` and lower-cased.
    Tokens found in the positive word list count as positive, tokens found in
    the negative word list count as negative, and everything else is neutral.
    The label of the better represented polarity ('Positive', 'Negative', or
    'Neutral' on a tie) is printed.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if true, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # Load each word list once and keep it as a set: the original re-read the
    # lexicon and scanned it linearly for every single token.
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []
    for word in tokenized_sent:
        # The positive list is checked first, as in the original.
        if word in positive_words:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_words:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
示例5: tokenize_text
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def tokenize_text(text, language="english"):
    '''Tokenize a string into a filtered list of tokens.

    The text is split into sentences with ``sent_tokenize``, each sentence
    is tokenized with a ``TreebankWordTokenizer``, and the combined token
    list is passed through ``filter_tokens``. No lower-casing is done here.

    IN:
    - text, str
    - language, str: forwarded to ``sent_tokenize``
    OUT:
    - list of strings, as returned by ``filter_tokens``
    '''
    sentences = sent_tokenize(text, language=language)
    tokenizer = TreebankWordTokenizer()
    # Flatten the per-sentence token lists while keeping their order.
    all_tokens = [
        token
        for sentence in sentences
        for token in tokenizer.tokenize(sentence)
    ]
    return filter_tokens(all_tokens)
示例6: tokenize
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def tokenize(self, text: str):
    """
    Split *text* into sentences, then into a flat list of word tokens.

    Sentences come from ``self.sent_tokenizer``; words come from a fresh
    ``TreebankWordTokenizer`` applied to every sentence.

    :param text: text to be tokenized
    :type text: str
    :return: all word tokens, in order of appearance
    :rtype: list
    """
    sentences = self.sent_tokenizer.tokenize(text)
    word_tokenizer = TreebankWordTokenizer()
    flat_tokens = []
    for sentence_tokens in word_tokenizer.tokenize_sents(sentences):
        flat_tokens.extend(sentence_tokens)
    return flat_tokens
示例7: tokenize
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def tokenize(self, text: str):
    """
    Split *text* into sentences, then into a flat list of word tokens.

    ``self.sent_tokenizer`` produces the sentences; a newly created
    ``TreebankWordTokenizer`` tokenizes each of them and the results are
    concatenated.

    :param text: text to be tokenized
    :type text: str
    :return: all word tokens, in order of appearance
    :rtype: list
    """
    sentences = self.sent_tokenizer.tokenize(text)
    treebank_tokenizer = TreebankWordTokenizer()
    tokens = []
    for words in treebank_tokenizer.tokenize_sents(sentences):
        tokens.extend(words)
    return tokens
示例8: word_tokenize
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text):
    """
    Return a tokenized copy of *text*.

    This is a thin wrapper that delegates to ``_word_tokenize``, which the
    original docstring describes as NLTK's recommended word tokenizer
    (currently :class:`.TreebankWordTokenizer`), designed to work on one
    sentence at a time.

    :param text: the sentence to tokenize
    :return: whatever ``_word_tokenize`` returns for *text*
    """
    tokens = _word_tokenize(text)
    return tokens
示例9: word_tokenize
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def word_tokenize(text, language='english'):
    """
    Return a tokenized copy of *text* using ``_treebank_word_tokenize``.

    The whole input is tokenized in one pass; no sentence splitting is
    performed. On Python 3, input that is not already ``str`` (for example
    ``bytes``) is decoded as UTF-8 first. On Python 2 the input is passed
    through unchanged.

    :param text: text to split into words
    :param language: accepted for interface compatibility; not used by
        this implementation
    :return: list of tokens
    """
    # The original decoded unconditionally on Python 3, which raised
    # AttributeError for ordinary ``str`` input. Decode only what needs it.
    if sys.version_info[0] >= 3 and not isinstance(text, str):
        text = text.decode("UTF-8")
    return list(_treebank_word_tokenize(text))
示例10: __init__
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def __init__(self):
    """Run the parent initialiser and attach a Treebank word tokenizer.

    The tokenizer instance is stored on ``self.tokenizer``.
    """
    super().__init__()
    treebank_tokenizer = TreebankWordTokenizer()
    self.tokenizer = treebank_tokenizer
示例11: __init__
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def __init__(self, language):
    """Validate *language* and select the matching word tokenizer.

    Sets ``self.language``, ``self.available_languages`` and ``self.toker``.
    ``'french'`` is treated as an alias for ``'old_french'`` (a warning is
    logged). Languages in the available list that have no dedicated
    branch below (``'akkadian'``, ``'sanskrit'``, ``'multilingual'``) fall
    back to NLTK's ``TreebankWordTokenizer`` with a warning.

    :param language: name of the language to tokenize
    :raises AssertionError: if *language* is not in the available list
    """
    self.language = language
    self.available_languages = ['akkadian',
                                'arabic',
                                'french',  # defaults to old_french
                                'greek',
                                'latin',
                                'middle_english',
                                'middle_french',
                                'middle_high_german',
                                'old_french',
                                'old_norse',
                                'sanskrit',
                                'multilingual']
    # NOTE: kept as ``assert`` so callers that catch AssertionError keep
    # working; be aware the check is skipped when Python runs with -O.
    assert self.language in self.available_languages, \
        "Specific tokenizer not available for '{0}'. Only available for: '{1}'.".format(
            self.language,
            self.available_languages)

    # raise language-specific warnings
    if self.language == 'french':
        self.language = 'old_french'
        LOG.warning("'french' defaults to 'old_french'. 'middle_french' also available.")  # pylint: disable=line-too-long

    # 'french' was rewritten to 'old_french' above, so it needs no branch
    # of its own here (the original one could never be reached).
    if self.language == 'arabic':
        self.toker = BaseArabyWordTokenizer('arabic')
    elif self.language == 'greek':
        self.toker = BasePunktWordTokenizer('greek',
                                            GreekRegexSentenceTokenizer)
    elif self.language == 'latin':
        self.toker = LatinWordTokenizer()
    elif self.language == 'old_norse':
        self.toker = BaseRegexWordTokenizer('old_norse',
                                            OldNorseTokenizerPatterns)
    elif self.language == 'middle_english':
        self.toker = BaseRegexWordTokenizer('middle_english',
                                            MiddleEnglishTokenizerPatterns)
    elif self.language in ('old_french', 'middle_french'):
        # Middle French uses the Old French patterns, as in the original.
        self.toker = BaseRegexWordTokenizer('old_french',
                                            OldFrenchTokenizerPatterns)
    elif self.language == 'middle_high_german':
        self.toker = BaseRegexWordTokenizer('middle_high_german',
                                            MiddleHighGermanTokenizerPatterns)
    else:
        LOG.warning("Falling back to default tokenizer, the NLTK's `TreebankWordTokenizer()`.")
        self.toker = TreebankWordTokenizer()
示例12: demo_liu_hu_lexicon
# 需要导入模块: from nltk.tokenize import treebank [as 别名]
# 或者: from nltk.tokenize.treebank import TreebankWordTokenizer [as 别名]
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.

    The sentence is tokenized with ``TreebankWordTokenizer`` and lower-cased.
    Each token is looked up in the positive word list, then in the negative
    word list; tokens in neither are neutral. The label of the better
    represented polarity ('Positive', 'Negative', or 'Neutral' on a tie) is
    printed.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if true, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # Read both word lists a single time and use sets for membership tests;
    # the original reloaded and linearly scanned the lexicon per token.
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []
    for word in tokenized_sent:
        # Positive takes precedence over negative, as in the original.
        if word in positive_words:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_words:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        _show_plot(
            x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive']
        )