This article collects typical usage examples of the nltk.data.load function in Python. If you are wondering what load does and how to use it, the curated code examples below should help.
The following 15 code examples of load are shown, ordered by popularity by default.
Example 1: parse_tweets_set
def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
                     skip_header=True):
    """
    Parse a csv file containing tweets and return a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
        sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).
    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load('tokenizers/punkt/english.pickle')

    # If we use Python 3.x we can proceed using the 'rt' flag
    if sys.version_info[0] == 3:
        with codecs.open(filename, 'rt') as csvfile:
            reader = csv.reader(csvfile)
            if skip_header:
                next(reader, None)  # skip the header
            i = 0
            for tweet_id, text in reader:
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizers to the text
                if word_tokenizer:
                    tweet = [w for sent in sent_tokenizer.tokenize(text)
                             for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))
    # If we use Python 2.x we need to handle encoding problems
    elif sys.version_info[0] < 3:
        with codecs.open(filename) as csvfile:
            reader = csv.reader(csvfile)
            if skip_header:
                next(reader, None)  # skip the header
            i = 0
            for row in reader:
                unicode_row = [x.decode('utf8') for x in row]
                text = unicode_row[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizers to the text
                if word_tokenizer:
                    tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text)
                             for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))

    print("Loaded {0} tweets".format(i))
    return tweets
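A minimal usage sketch. The filename and CSV layout are assumptions inferred from the reader loop above (a two-column tweet_id, text file), not part of the original source:

from nltk.tokenize import WordPunctTokenizer

# Hypothetical input file with (tweet_id, text) rows
pos_tweets = parse_tweets_set('positive_tweets.csv', label='pos',
                              word_tokenizer=WordPunctTokenizer())
# Each entry is a (token_list, 'pos') tuple.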
Example 2: tag
def tag(text):
    """Tags the input text.

    Arguments:
        text (str): The text to tag.

    Returns:
        ([[(str, str)]]): List of sentences containing lists of word/tag pairs.
    """
    # Separate the input text into sentences
    sentences = nltk.sent_tokenize(str(text))
    # Separate each sentence into words
    nested = []
    for sentence in sentences:
        nested.append(nltk.word_tokenize(sentence))
    # Prepare the default tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)  # Same tagger as used by nltk.pos_tag
    # Prepare a regex tagger for custom tags
    regexp_tagger = nltk.tag.RegexpTagger([(r'\(', '('),
                                           (r'\)', ')'),
                                           (r'\[', '['),
                                           (r'\]', ']'),
                                           (r'_+', 'None')],
                                          backoff=tagger)
    # Add a part-of-speech tag to each word
    nested_tagged = []
    for sentence in nested:
        nested_tagged.append([TaggedToken(*x) for x in regexp_tagger.tag(sentence)])
    return nested_tagged
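A minimal call sketch. TaggedToken is assumed to be a project-specific wrapper around a (word, tag) pair, so only the call shape is shown:

tagged_sentences = tag("The quick brown fox jumps over the lazy dog.")
for sentence in tagged_sentences:
    print(sentence)  # one list of TaggedToken objects per sentence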
Example 3: __init__
def __init__(self):
    # Initializing TreeBank tokenizer from NLTK
    from nltk.tokenize import TreebankWordTokenizer
    self._tb_tokenizer = TreebankWordTokenizer().tokenize
    # Initializing Punkt Sentence Tokenizer from NLTK
    from nltk import data
    self._sent_detector = data.load('tokenizers/punkt/english.pickle')
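The two tokenizers are typically combined like this: split into sentences first, then into tokens. A sketch, assuming the Punkt model has been installed with nltk.download('punkt'):

from nltk import data
from nltk.tokenize import TreebankWordTokenizer

sent_detector = data.load('tokenizers/punkt/english.pickle')
word_tokenizer = TreebankWordTokenizer()

text = "NLTK loads the Punkt model from disk. Then it splits sentences."
tokens = [word_tokenizer.tokenize(s) for s in sent_detector.tokenize(text)]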
Example 4: batch_pos_tag
def batch_pos_tag(sentences):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of sentences, each consisting of a list of tokens.
    """
    tagger = load(_POS_TAGGER)
    return tagger.batch_tag(sentences)
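Note that _POS_TAGGER and batch_tag come from older NLTK releases; in NLTK 3 the method was renamed tag_sents. A roughly equivalent call on a current install (an assumption, not part of the original example) is the built-in helper:

import nltk

# Requires the default tagger data, e.g. nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag_sents([['A', 'small', 'test', '.']])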
Example 5: read_rule
def read_rule(self, filename):
    rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
    lines = rules.split("\n")
    lines = [line for line in lines if line != ""]      # remove blank lines
    lines = [line for line in lines if line[0] != "#"]  # remove comments

    # NOTE: a simple but ugly hack to make this parser happy with double '\t's
    lines = [line.replace("\t\t", "\t") for line in lines]

    # parse rules
    rules = []
    for line in lines:
        rule = []
        tokens = line.split("\t")

        # text to be searched for at the end of the string
        rule.append(tokens[0][1:-1])  # remove quotes

        # minimum stem size to perform the replacement
        rule.append(int(tokens[1]))

        # text to be replaced into
        rule.append(tokens[2][1:-1])  # remove quotes

        # exceptions to this rule
        rule.append([token[1:-1] for token in tokens[3].split(",")])

        # append to the results
        rules.append(rule)

    return rules
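For illustration only, here is a rule line in the tab-separated layout the parser above expects; the concrete values are made up rather than taken from an actual RSLP step file:

line = '"inho"\t3\t""\t"caminho","carinho"'
tokens = line.split("\t")
rule = [tokens[0][1:-1],                          # suffix: 'inho'
        int(tokens[1]),                           # minimum stem size: 3
        tokens[2][1:-1],                          # replacement: ''
        [t[1:-1] for t in tokens[3].split(",")]]  # exceptions: ['caminho', 'carinho']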
Example 6: run
def run(train, test, language, answer):
    results = {}
    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger = ut(cess_cat.tagged_sents())

    for lexelt in train:
        train_features, y_train = extract_features(train[lexelt], language, tagger)
        test_features, _ = extract_features(test[lexelt], language, tagger)
        X_train, X_test = vectorize(train_features, test_features)
        X_train_new, X_test_new = feature_selection(X_train, X_test, y_train)
        results[lexelt] = classify(X_train_new, X_test_new, y_train)

    """
    B1.c
    for lexelt in train:
        features = getBestWords(train[lexelt], 30)
        train_features = countFeature(features, train[lexelt])
        _, y_train = extract_features(train[lexelt], language)
        test_features = countFeature(features, test[lexelt])
        X_train, X_test = vectorize(train_features, test_features)
        results[lexelt] = classify(X_train, X_test, y_train)
    B1.c
    """

    A.print_results(results, answer)
Example 7: generate_instances
def generate_instances(self, sentences, child_conn):
    # Each process has its own NLTK PoS-tagger
    tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    instances = list()
    while True:
        try:
            s = sentences.get_nowait()
            if sentences.qsize() % 500 == 0:
                print(multiprocessing.current_process(),
                      "Instances to process", sentences.qsize())

            sentence = Sentence(s,
                                self.config.e1_type,
                                self.config.e2_type,
                                self.config.max_tokens_away,
                                self.config.min_tokens_away,
                                self.config.context_window_size,
                                tagger,
                                self.config)

            for rel in sentence.relationships:
                t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                          rel.between, rel.after, self.config)
                instances.append(t)

        except queue.Empty:
            print(multiprocessing.current_process(), "Queue is Empty")
            pid = multiprocessing.current_process().pid
            child_conn.send((pid, instances))
            break
Example 8: get_tagger
def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        else:
            _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
            eng_tagger = load(_POS_TAGGER)
            return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        else:
            print 111
            training = cess_esp.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            print 555
            return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        else:
            training = cess_cat.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return cat_tagger
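A hypothetical call, assuming the module initialises eng_tagger, spa_tagger and cat_tagger to None elsewhere. The first Spanish or Catalan call is slow because the whole CESS treebank is read and three backoff taggers are trained, which is why the result is cached in a global:

tagger = get_tagger("Spanish")
tagged = tagger.tag("Esta es una prueba".split())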
Example 9: test_austen
def test_austen():
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    stok = load('tokenizers/punkt/english.pickle')
    train = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]

    model1 = AdditiveSmoothing(n=2)
    model1.generate_model(train)
    print 'cross entropy additive smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model1, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model1, test2)
    model2 = KnesserNey(n=2)
    model2.generate_model(train)
    print 'cross entropy knesser-ney smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model2, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model2, test2)
    model3 = SimpleGoodTuring(n=2)
    model3.generate_model(train)
    print 'cross entropy simple good-turing smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model3, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model3, test2)
    model4 = KatzSmoothing(n=2)
    model4.generate_model(train)
    print 'cross entropy katz smoothing:'
    print 'emma to sense&sensibility: %0.8f' % cross_entropy(model4, test1)
    print 'emma to persuasion: %0.8f' % cross_entropy(model4, test2)
Example 10: meaning_words
def meaning_words(self, text):
    # "Meaning" tags: nouns and adjectives only
    meaning_tags = ['NN', 'NNP', 'NNPS', 'JJ']
    default_tagger = data.load(tag._POS_TAGGER)
    # Sometimes the NLTK tagger misclassifies a part of speech (e.g. "The",
    # which should be a determiner). The duty tagger also helps to eliminate
    # common words that are not so important.
    duty = {w: 'x' for w in self.common_words}
    enhanced_tagger = tag.UnigramTagger(model=duty, backoff=default_tagger)
    meaning_words = ' '.join([w for w, c in enhanced_tagger.tag(word_tokenize(text))
                              if c in meaning_tags and len(w) > 2])
    # If no meaning words are found with this approach, return None
    if not meaning_words:
        return None
    else:
        return meaning_words
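tag._POS_TAGGER no longer exists in current NLTK; a rough modern equivalent of the core idea (keep only nouns and adjectives longer than two characters), offered as an assumption rather than a drop-in replacement, would be:

import nltk
from nltk.tokenize import word_tokenize

def keep_meaning_words(text, tags=('NN', 'NNP', 'NNPS', 'JJ')):
    # Tag with NLTK's current default tagger instead of the removed constant
    tokens = word_tokenize(text)
    return ' '.join(w for w, t in nltk.pos_tag(tokens)
                    if t in tags and len(w) > 2) or None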
Example 11: digest
def digest(self):
    if self.sentences is not None:
        return

    # Digest the problem into sentences
    tokenizer = data.load("tokenizers/punkt/english.pickle")
    self.sentences = tokenizer.tokenize(self.text.strip())

    # Digest each sentence into words and part-of-speech tags
    if self.sentence_tags is None:
        sentence_tags = []
        all_tags = []
        all_words = []
        for s in self.sentences:
            all_words.append(s)
            tags = pos_tag(word_tokenize(s))
            sentence_tags.append(tags)
            for t in tags:
                l = len(t[0])
                if not self.longest_word or self.longest_word < l:
                    self.longest_word = l
                all_tags.append(t[1])
        self.sentence_tags = sentence_tags
        self.all_tags = uniq(all_tags)
        self.all_words = uniq(all_words)
Example 12: _split_sentence
def _split_sentence(self, s):
    '''
    sentence splitter
    '''
    # use the French sentence tokenizer from NLTK
    pst = data.load("tokenizers/punkt/french.pickle")
    return pst.tokenize(s)
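Loading the pickle on every call is relatively expensive; a common variation (a sketch, not part of the original class) loads the French Punkt model once at module level and reuses it:

from nltk import data

_FRENCH_SENT_TOKENIZER = data.load("tokenizers/punkt/french.pickle")

def split_sentences(text):
    return _FRENCH_SENT_TOKENIZER.tokenize(text)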
Example 13: __init__
def __init__(self):
    """
    :param train_percent_size: 0-1
    :return:
    """
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    self._tagger = load(_POS_TAGGER)
Example 14: sent_tokenize
def sent_tokenize(text):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`).
    """
    tokenizer = load("tokenizers/punkt/english.pickle")
    return tokenizer.tokenize(text)
Example 15: load
def load(self, loc):
    '''
    :param loc: Load a pickled model at location.
    :type loc: str
    '''
    self.model.weights, self.tagdict, self.classes = load(loc)
    self.model.classes = self.classes
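For context, a matching save-side sketch (an assumption, since the surrounding class is not shown): the pickle at loc is expected to hold the (weights, tagdict, classes) triple that load() unpacks above:

import pickle

def save(self, loc):
    # Counterpart to load(): write the same triple that load() reads back
    with open(loc, 'wb') as f:
        pickle.dump((self.model.weights, self.tagdict, self.classes), f)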