This article collects typical usage examples of the Python method nltk.data.load. If you are wondering how exactly to use data.load in Python, how it works, or what calling it looks like in practice, the curated code examples below may help. You can also explore further usage examples for the module the method belongs to, nltk.data.
Below, 13 code examples of the data.load method are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
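Before the examples, here is a minimal sketch of what nltk.data.load does: it resolves a resource path against the NLTK data directories and returns the loaded object (a pickle, grammar, raw text, and so on). The sketch assumes the Punkt models have already been fetched with nltk.download('punkt').

from nltk.data import load

# Resolve the path against the NLTK data directories, then unpickle the object.
sent_splitter = load('tokenizers/punkt/english.pickle')
print(sent_splitter.tokenize("The path is resolved first. Then the object is unpickled."))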
Example 1: demo_sent_subjectivity
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
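A possible way to invoke this demo, assuming it is the version shipped in nltk.sentiment.util and that the subjectivity corpus is available (nltk.download('subjectivity')), is to pass it a raw sentence; the function prints the predicted label rather than returning it, and may train and pickle a classifier on first use.

from nltk.sentiment.util import demo_sent_subjectivity

# Prints the predicted label ('subj' or 'obj'); training happens on the first call
# if sa_subjectivity.pickle cannot be found.
demo_sent_subjectivity("The plot was gripping from start to finish.")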
Example 2: _load_universal_map
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")

    # When mapping to the Universal Tagset,
    # map unknown inputs to 'X' not 'UNK'
    _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'

    for line in contents.splitlines():
        line = line.strip()
        if line == '':
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine)

        _MAPPINGS[fileid]['universal'][fine] = coarse
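This is an internal helper of nltk.tag.mapping; the mapping it fills in is normally reached through the public map_tag function, which loads the table lazily. A small sketch, assuming the 'universal_tagset' resource has been downloaded:

from nltk.tag.mapping import map_tag

# 'en-ptb' is the Penn Treebank tagset, 'universal' the Universal tagset.
print(map_tag('en-ptb', 'universal', 'NNP'))  # expected: 'NOUN'
print(map_tag('en-ptb', 'universal', 'VBZ'))  # expected: 'VERB'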
Example 3: pos_tag
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag # doctest: +SKIP
        >>> from nltk.tokenize import word_tokenize # doctest: +SKIP
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +SKIP
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
        'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
        ('.', '.')]

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = load(_POS_TAGGER)
    if tagset:
        return [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagger.tag(tokens)]
    return tagger.tag(tokens)
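_POS_TAGGER is a module-level constant defined elsewhere in nltk.tag; in the older NLTK releases this snippet comes from it most likely pointed at the pickled maxent treebank tagger, while newer releases use the averaged perceptron tagger instead. A hedged sketch using the public API of current versions:

# Assumption about the constant in the release this snippet comes from:
# _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
# Equivalent call through the public API (needs punkt, averaged_perceptron_tagger
# and universal_tagset to be downloaded):
from nltk import pos_tag, word_tokenize

print(pos_tag(word_tokenize("A small example sentence."), tagset='universal'))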
Example 4: __init__
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def __init__(self, language):
    """
    :param str language: ISO 639-1 language code. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    """
    self.language = language
    model = self.supported_models.get(language)
    if model:
        self.splitter = load(model)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                language, self.supported_models.keys()))
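The snippet does not show supported_models or the class it belongs to. A plausible reconstruction, with the class name and dictionary contents assumed for illustration (the model paths follow NLTK's tokenizers/punkt layout), might look like this:

from nltk.data import load

class SentenceSplitter(object):
    # Assumed mapping; the original project may support different languages.
    supported_models = {
        'en': 'tokenizers/punkt/english.pickle',
        'de': 'tokenizers/punkt/german.pickle',
        'es': 'tokenizers/punkt/spanish.pickle',
    }

    def __init__(self, language):
        self.language = language
        model = self.supported_models.get(language)
        if model:
            self.splitter = load(model)
        else:
            raise ValueError("Invalid or unsupported language: '%s'" % language)

# splitter = SentenceSplitter('en')   # needs nltk.download('punkt')
# print(splitter.splitter.tokenize("One sentence. Another sentence."))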
Example 5: ne_chunk
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to
    chunk the given list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse(tagged_tokens)
Example 6: ne_chunk_sents
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.
    """
    if binary:
        chunker_pickle = _BINARY_NE_CHUNKER
    else:
        chunker_pickle = _MULTICLASS_NE_CHUNKER
    chunker = load(chunker_pickle)
    return chunker.parse_sents(tagged_sentences)
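_BINARY_NE_CHUNKER and _MULTICLASS_NE_CHUNKER are module constants naming the pickled maxent NE chunkers in the NLTK data distribution. In practice examples 5 and 6 sit at the end of the usual tokenize, tag, chunk pipeline; a sketch through the public API, assuming punkt, averaged_perceptron_tagger, maxent_ne_chunker and words have been downloaded:

from nltk import word_tokenize, pos_tag, ne_chunk

# Tokenize, POS-tag, then chunk named entities.
tree = ne_chunk(pos_tag(word_tokenize("Mark works at Signal Media in London.")))
print(tree)  # a Tree with subtrees labelled PERSON, ORGANIZATION, GPE, ...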
Example 7: _format_tagset
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def _format_tagset(tagset, tagpattern=None):
    tagdict = load("help/tagsets/" + tagset + ".pickle")
    if not tagpattern:
        _print_entries(sorted(tagdict), tagdict)
    elif tagpattern in tagdict:
        _print_entries([tagpattern], tagdict)
    else:
        tagpattern = re.compile(tagpattern)
        tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)]
        if tags:
            _print_entries(tags, tagdict)
        else:
            print("No matching tags found.")
Example 8: read_rule
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def read_rule(self, filename):
    rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
    lines = rules.split("\n")

    lines = [line for line in lines if line != ""]      # remove blank lines
    lines = [line for line in lines if line[0] != "#"]  # remove comments

    # NOTE: a simple but ugly hack to make this parser happy with double '\t's
    lines = [line.replace("\t\t", "\t") for line in lines]

    # parse rules
    rules = []
    for line in lines:
        rule = []
        tokens = line.split("\t")

        # text to be searched for at the end of the string
        rule.append(tokens[0][1:-1])  # remove quotes
        # minimum stem size to perform the replacement
        rule.append(int(tokens[1]))
        # text to be replaced into
        rule.append(tokens[2][1:-1])  # remove quotes
        # exceptions to this rule
        rule.append([token[1:-1] for token in tokens[3].split(",")])

        # append to the results
        rules.append(rule)

    return rules
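To make the parsing concrete, here is a small trace of the same logic on a made-up rule line. The RSLP rule files are tab-separated with quoted suffix, replacement and exceptions; the sample line below is invented for illustration and is not taken from the actual stemmers/rslp data.

# Invented sample line, tracing only the field parsing done in read_rule above.
sample = '"inho"\t3\t""\t"caminho","carinho"'

tokens = sample.split("\t")
rule = [
    tokens[0][1:-1],                              # suffix to match: 'inho'
    int(tokens[1]),                               # minimum stem size: 3
    tokens[2][1:-1],                              # replacement text: ''
    [tok[1:-1] for tok in tokens[3].split(",")],  # exceptions: ['caminho', 'carinho']
]
print(rule)  # ['inho', 3, '', ['caminho', 'carinho']]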
Example 9: load
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def load(self, path):
    '''Load the pickled model weights.'''
    self.weights = load(path)
Example 10: __init__
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def __init__(self, load=True):
    '''
    :param load: Load the pickled model upon instantiation.
    '''
    self.model = AveragedPerceptron()
    self.tagdict = {}
    self.classes = set()
    if load:
        AP_MODEL_LOC = str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        self.load(AP_MODEL_LOC)
Example 11: __init__
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def __init__(self, load=True):
    '''
    :param load: Load the pickled model upon instantiation.
    '''
    self.model = AveragedPerceptron()
    self.tagdict = {}
    self.classes = set()
    if load:
        AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        self.load(AP_MODEL_LOC)
Developer: SignalMedia; project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda; lines of code: 12; source file: perceptron.py
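Examples 10 and 11 differ only in whether the located path is prefixed with 'file:'; both resolve the pickled averaged perceptron model via nltk.data.find and then unpickle it through the load method from example 9. The class they belong to is exposed as nltk.tag.perceptron.PerceptronTagger and can be used directly, assuming nltk.download('averaged_perceptron_tagger') has been run:

from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()  # load=True (the default) fetches and unpickles the model
print(tagger.tag(['The', 'cat', 'sat', 'on', 'the', 'mat', '.']))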
Example 12: sent_tokenize
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def sent_tokenize(text, language='english'):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
    return tokenizer.tokenize(text)

# Standard word tokenizer.
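Usage is straightforward once the Punkt models are available (nltk.download('punkt')); the language argument simply selects which pickle is loaded:

from nltk import sent_tokenize

text = "NLTK loads the English Punkt model by default. Other languages are selected by name."
print(sent_tokenize(text))                      # uses 'english'
print(sent_tokenize(text, language='english'))  # equivalent explicit call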
Example 13: pos_tag_sents
# Required import: from nltk import data [as alias]
# Or: from nltk.data import load [as alias]
def pos_tag_sents(sentences):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of sentences, each consisting of a list of tokens.
    """
    tagger = load(_POS_TAGGER)
    return tagger.tag_sents(sentences)
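A short usage sketch of the sentence-batch variant through the public API; each inner list is one pre-tokenized sentence (requires the punkt and tagger resources in current NLTK versions):

from nltk import pos_tag_sents, word_tokenize

sentences = [word_tokenize("The first sentence."),
             word_tokenize("And a second one.")]
print(pos_tag_sents(sentences))  # a list of tagged-token lists, one per sentence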