本文整理汇总了 Python 中 nltk.tokenize 模块的典型用法代码示例。如果您正苦于以下问题:Python nltk.tokenize 的具体用法是什么?Python nltk.tokenize 怎么用?有哪些 Python nltk.tokenize 的使用示例?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该模块所在的包 nltk 的用法示例。
在下文中一共展示了 nltk.tokenize 的 15 个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: demo_sent_subjectivity
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def demo_sent_subjectivity(text):
    """
    Print the subjectivity label predicted for one sentence.

    The stored analyzer 'sa_subjectivity.pickle' is loaded first; when that
    lookup raises ``LookupError`` a fresh analyzer is obtained from
    ``demo_subjectivity`` using ``NaiveBayesClassifier.train`` instead.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    whitespace_splitter = regexp.WhitespaceTokenizer()

    try:
        analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # The analyzer receives whitespace-separated, lower-cased tokens.
    lowered_tokens = []
    for token in whitespace_splitter.tokenize(text):
        lowered_tokens.append(token.lower())

    print(analyzer.classify(lowered_tokens))
示例2: process_col
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def process_col(col, **kwargs):
    """
    Convert one column into a list suitable for a bag-of-words representation.

    :param col: column object exposing ``dtype``, ``astype`` and ``len()``
        (presumably a pandas Series -- confirm against callers).
    :param kwargs: must contain ``num``, which selects how int64/float64
        columns are rendered: ``'directstr'`` stringifies every value,
        ``'placeholder'`` repeats the dtype name once per row. The complete
        ``kwargs`` are also forwarded to ``tokenize`` for object columns.
    :return: for object columns, whatever ``tokenize`` returns; otherwise a
        list of strings with one entry per row.
    :raises KeyError: if ``num`` is missing from ``kwargs``.
    """
    numeric = kwargs['num']

    if col.dtype == 'int64' or col.dtype == 'float64':
        if numeric == 'directstr':
            return list(col.astype(str))
        if numeric == 'placeholder':
            return [str(col.dtype)] * len(col)
        # Any other ``num`` value falls through to the generic handling below.

    if col.dtype == 'object':
        return tokenize(list(col.astype(str)), **kwargs)

    # Every branch above returns, so the former trailing ``return col`` was
    # unreachable and has been dropped.
    return list(col.astype(str))
示例3: __init__
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def __init__(self):
    """
    Load the tagger, word tokenizer and sentence detector from the pickles
    located in the ``data/`` directory next to the imported ``scattertext``
    package.
    """
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    # The package directory is taken from sys.modules; earlier variants
    # (left commented out in the original) resolved the same pickles with
    # resource_filename('scattertext.data', ...).
    package_dir = os.path.dirname(sys.modules['scattertext'].__file__)
    data_dir = package_dir + '/data/'
    punkt_pickle = data_dir + 'punkt.english.pickle'
    tagger_pickle = data_dir + 'averaged_perceptron_tagger.pickle'

    # Create the tagger without its default model, then load the local one.
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_pickle)

    # nltk.word_tokenize ends up in TreebankWordTokenizer as well, but it goes
    # through the downloader first. Instantiating the class directly avoids
    # that; the Treebank tokenizer appears to be regex-based PTB tokenization
    # that needs no downloaded data.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    treebank_tokenizer = TreebankWordTokenizer()
    self.tokenize = treebank_tokenizer.tokenize

    self.sent_detector = nltk.data.load(punkt_pickle)
    # http://www.nltk.org/book/ch05.html
示例4: handle_multiple_sentences
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def handle_multiple_sentences(infile, outfile):
    """
    Title-case sentence openings and mark sentence boundaries with " ::::".

    Every line of ``infile`` (read as UTF-8) is stripped and split into
    sentences by the module-level ``sent_detector`` (presumably an NLTK
    sentence tokenizer -- confirm where it is defined). For each sentence the
    first word is title-cased; for every sentence that is followed by another
    one, its final character is replaced by " ::::". The sentences of a line
    are then joined with single spaces.

    The distinct resulting lines are written to ``outfile`` (UTF-8), one per
    line. They pass through a ``set``, so their order is not defined.

    Changes compared with the previous implementation:

    * both files are closed deterministically via ``with``;
    * only the *first* occurrence of the first word and only the *final*
      character are rewritten (``str.replace`` used to rewrite every
      occurrence in the sentence);
    * reading/writing no longer relies on Python-2-only ``str.decode`` and
      ``print >>``.

    :param infile: path of the UTF-8 text file to read.
    :param outfile: path of the file to (over)write.
    """
    import io

    titles = set()
    with io.open(infile, "r", encoding="utf-8") as src:
        for line in src:
            sentences = sent_detector.tokenize(line.strip())
            for i, sentence in enumerate(sentences):
                words = sentence.split()
                if not words:
                    # Nothing to capitalise in an empty/whitespace sentence.
                    continue
                first_word = words[0]
                # count=1: touch the sentence opening only.
                sentences[i] = sentence.replace(first_word, first_word.title(), 1)
                if i > 0:
                    # Swap the terminal character of the preceding sentence
                    # for the boundary marker.
                    previous = sentences[i - 1].rstrip()
                    sentences[i - 1] = previous[:-1] + " ::::"
            titles.add(" ".join(sentences))

    # The output is opened only after the input was read successfully, so a
    # failing read no longer truncates an existing output file.
    with codecs.open(outfile, "w+", "utf-8") as dst:
        for title in titles:
            dst.write(title + "\n")
示例5: describe
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def describe(self, fileids=None, categories=None):
    """
    Perform a single pass over the corpus and return basic size metrics.

    :param fileids: passed through unchanged to ``self.paras``.
    :param categories: passed through unchanged to ``self.paras``.
    :return: a dict with
        ``'words'``  -- total number of (word, tag) pairs seen,
        ``'vocab'``  -- number of distinct words,
        ``'lexdiv'`` -- ``words / vocab`` as a float, or ``0.0`` when the
        selection contains no words (previously a ZeroDivisionError).
    """
    word_count = 0
    vocabulary = set()

    # self.paras yields paragraphs -> sentences -> (word, tag) pairs.
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, _tag in sent:
                word_count += 1
                vocabulary.add(word)

    vocab_size = len(vocabulary)
    # Guard the ratio so an empty corpus/selection does not crash.
    lexical_diversity = float(word_count) / float(vocab_size) if vocab_size else 0.0

    return {
        'words': word_count,
        'vocab': vocab_size,
        'lexdiv': lexical_diversity,
    }
示例6: __init__
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def __init__(self):
    """
    Load the tagger, word tokenizer and sentence detector from the pickles
    that ``resource_filename`` resolves inside ``phrasemachine.data``.
    """
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    def _data_file(name):
        # Absolute path of a file in the phrasemachine.data package.
        return os.path.abspath(resource_filename('phrasemachine.data', name))

    punkt_pickle = _data_file('punkt.english.pickle')
    tagger_pickle = _data_file('averaged_perceptron_tagger.pickle')

    # Create the tagger without its default model, then load the local one.
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_pickle)

    # nltk.word_tokenize ends up in TreebankWordTokenizer as well, but it goes
    # through the downloader first. Instantiating the class directly avoids
    # that; the Treebank tokenizer appears to be regex-based PTB tokenization
    # that needs no downloaded data.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    treebank_tokenizer = TreebankWordTokenizer()
    self.tokenize = treebank_tokenizer.tokenize

    self.sent_detector = nltk.data.load(punkt_pickle)
    # http://www.nltk.org/book/ch05.html
示例7: tokenize
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def tokenize(self, example: str) -> List[str]:
    """Tokenize an input example into n-gram tokens.

    When ``self.exclude_stopwords`` is set and ``self.stop_words`` is
    non-empty, the example is first word-tokenized, stop words are dropped
    and the remaining words are re-joined with single spaces.

    ``self.ngrams`` may be a single value or a list of values; for a list the
    tokens produced for each value are concatenated in list order. Both cases
    go through ``self._tokenize`` so that an overridden ``_tokenize`` is
    honoured consistently (the scalar case used to call
    ``NGramsTokenizer._tokenize`` directly).

    Parameters
    ----------
    example : str
        The input example, as a string.

    Returns
    -------
    List[str]
        The output word tokens, as a list of strings

    """
    if self.exclude_stopwords and self.stop_words:
        kept_words = (word for word in word_tokenize(example)
                      if word not in self.stop_words)
        example = ' '.join(kept_words)

    # Plain ``list`` instead of ``typing.List``: typing aliases are not meant
    # for isinstance checks.
    if isinstance(self.ngrams, list):
        return [token
                for n in self.ngrams
                for token in self._tokenize(example, n)]

    return self._tokenize(example, self.ngrams)
示例8: tokenize_mcb
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
# Single-pass character table for tokenize_mcb: '-' and '/' become spaces,
# the listed punctuation characters are deleted.
_MCB_CHAR_TABLE = str.maketrans("-/", "  ", "?!'\"$:@(),.;")


def tokenize_mcb(sentence):
    """
    MCB tokenize implementation.

    Deletes the characters ``? ! ' " $ : @ ( ) , . ;``, turns ``-`` and ``/``
    into spaces, lower-cases the result and splits it on whitespace.

    The previous body read an undefined local ``sen`` instead of the
    ``sentence`` parameter and therefore raised on every call.

    --------------------
    Arguments:
        sentence (str): a sentence that will be tokenized.

    Return:
        A list of lower-cased tokens from the sentence (empty for an empty
        or punctuation-only sentence).
    """
    # str.split() without arguments never yields empty strings, so no
    # additional length filtering is required.
    return sentence.translate(_MCB_CHAR_TABLE).lower().split()
示例9: tokenize
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def tokenize(text):
    """
    Turn raw text into a flat list of lower-cased word tokens.

    Steps:
      1. punctuation, digits, CR, TAB and LF are replaced by spaces;
      2. ``sent_tokenize()`` segments the text into sentences and
         ``word_tokenize()`` breaks each sentence into words;
      3. tokens whose lower-cased form is in the module-level ``stop``
         collection, tokens without an ASCII letter and tokens shorter than
         three characters are dropped.

    :param text: the text to tokenize.
    :return: list of lower-cased tokens. If a ``TypeError`` occurs (e.g.
        ``text`` is not a string) the input and error are printed and an
        empty list is returned, so callers always receive a list (the
        function used to fall off the end and return ``None``).
    """
    try:
        noise = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        cleaned = noise.sub(" ", text)  # remove punctuation

        words = [word
                 for sentence in sent_tokenize(cleaned)
                 for word in word_tokenize(sentence)]

        return [word.lower()
                for word in words
                if word.lower() not in stop
                and re.search("[a-zA-Z]", word)
                and len(word) >= 3]
    except TypeError as e:
        # Best effort: report the offending input and carry on.
        print(text, e)
        return []
示例10: _tokenize
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def _tokenize(self, item):
    """Split ``item`` into sentences and return all their word tokens as one flat list."""
    return [
        word
        for sentence in self._sentence_detector.tokenize(item)
        for word in nltk.tokenize.word_tokenize(sentence)
    ]
示例11: _use_stopwords
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def _use_stopwords(self, x):
    """
    Tokenize ``x`` with the module-level ``tokenizer``, drop every token found
    in ``eng_stopwords`` and return the rest joined by single spaces.
    """
    return " ".join(
        token for token in tokenizer.tokenize(x) if token not in eng_stopwords
    )
示例12: _apostrophes
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def _apostrophes(self, x):
    """
    Normalise the tokens of ``x`` and return them joined by single spaces.

    Each token produced by the module-level ``tokenizer`` is replaced by its
    ``APOSTROPHES_WORDS`` entry when one exists, lemmatized as a verb with
    ``lem`` and finally dropped if the result is in ``eng_stopwords``.
    """
    kept = []
    for token in tokenizer.tokenize(x):
        if token in APOSTROPHES_WORDS:
            token = APOSTROPHES_WORDS[token]
        lemma = lem.lemmatize(token, "v")
        if lemma not in eng_stopwords:
            kept.append(lemma)
    return " ".join(kept)
示例13: demo_liu_hu_lexicon
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    The result ('Positive', 'Negative' or 'Neutral') is printed, not returned.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()

    # Read each word list once and keep it as a set; calling
    # opinion_lexicon.positive()/negative() inside the loop re-read the
    # lexicon and scanned it linearly for every single token.
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    y = []  # per-token polarity: 1 positive, -1 negative, 0 neutral
    for word in tokenized_sent:
        if word in positive_words:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_words:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        x = list(range(len(tokenized_sent)))  # x axis for the plot
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
示例14: read_block
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def read_block(self, stream):
    """
    Read paragraphs from ``stream`` and return their parsed sentences.

    Each paragraph string from ``self._para_block_reader`` is split with
    ``self._sent_tokenizer`` and every sentence string is converted by
    ``self._str2chunktree``. Depending on the instance flags the tags are
    removed (``_tagged`` false) and/or the chunk structure is flattened to
    its leaves (``_chunked`` false). ``_group_by_sent`` and
    ``_group_by_para`` decide whether sentences / paragraphs are kept as
    nested lists or spliced into their parent list.

    :param stream: the stream handed to ``self._para_block_reader``.
    :return: the list of paragraphs, sentences or tokens built as above.
    """
    result = []
    for paragraph_text in self._para_block_reader(stream):
        paragraph = []
        for sentence_text in self._sent_tokenizer.tokenize(paragraph_text):
            parsed = self._str2chunktree(
                sentence_text,
                source_tagset=self._source_tagset,
                target_tagset=self._target_tagset,
            )

            # Strip tags first, then chunks, when the caller did not ask
            # for them.
            if not self._tagged:
                parsed = self._untag(parsed)
            if not self._chunked:
                parsed = parsed.leaves()

            # Nest the sentence, or splice its items into the paragraph.
            add_sentence = paragraph.append if self._group_by_sent else paragraph.extend
            add_sentence(parsed)

        # Nest the paragraph, or splice its items into the result.
        add_paragraph = result.append if self._group_by_para else result.extend
        add_paragraph(paragraph)

    return result
示例15: __init__
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import tokenize [as 别名]
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    Tokenize the text and hypothesis of ``rtepair`` and precompute the word
    sets used as features.

    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    :param lemmatize: if ``True``, every token is passed through the
        module-level ``lemmatize`` function before the word sets are built.
    :type lemmatize: bool
    """
    from nltk.tokenize import RegexpTokenizer

    self.stop = stop
    self.stopwords = {'a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                      'have', 'are', 'were', 'and', 'very', '.', ','}
    self.negwords = {'no', 'not', 'never', 'failed', 'rejected', 'denied'}

    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    # The abbreviation group is non-capturing: RegexpTokenizer relies on
    # re.findall, which returns group contents instead of whole matches as
    # soon as the pattern contains a capturing group. The raw string avoids
    # invalid-escape warnings.
    tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        # Inside this method the name ``lemmatize`` is the boolean argument,
        # so calling it raised "TypeError: 'bool' object is not callable".
        # Fetch the function of the same name from the module namespace
        # instead (presumably defined at module level -- confirm).
        lemmatize_word = globals()['lemmatize']
        self.text_words = {lemmatize_word(token) for token in self.text_tokens}
        self.hyp_words = {lemmatize_word(token) for token in self.hyp_tokens}

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words