This article collects typical usage examples of the Python class nltk.tokenize.PunktSentenceTokenizer. If you are wondering what the PunktSentenceTokenizer class does, how to use it, or want to see it in real code, the curated class examples below should help.
The following presents 15 code examples of the PunktSentenceTokenizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
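As a quick orientation before the examples (a minimal sketch, not taken from any of the examples below; the text strings are placeholders): the class can be used either with its default parameters or trained on a sample text passed to its constructor, and the examples that follow mix both styles.

from nltk.tokenize import PunktSentenceTokenizer

# Default parameters: no training text supplied.
tokenizer = PunktSentenceTokenizer()
print(tokenizer.tokenize("This is one sentence. This is another."))

# Unsupervised training on domain text (a placeholder string here),
# then tokenizing new text with the trained model.
train_text = "Plain text used to learn abbreviations and sentence starters."
custom_tokenizer = PunktSentenceTokenizer(train_text)
print(custom_tokenizer.tokenize("New text to split. It has two sentences."))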
Example 1: __init__
def __init__(self, sentence):
    # Train the sentence tokenizer on a local corpus, then split the input text.
    f = open('data/training_data', 'r')
    train_text = f.read()
    #data = open('data2', 'r')
    #test_data = data.read()
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    self.tokenized = custom_sent_tokenizer.tokenize(sentence)
Example 2: POS_tagging
def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    #print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
    # textfile = open("POS_tagged", 'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)
    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []

    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            # Swallow tagging errors silently.
            # print(str(e))
            pass

    process_content()
    return tuples_list
Example 3: extractNounPhrases
def extractNounPhrases(sentence):
    nounPhrases = []
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        firstNN = False
        for tag in tagged:
            pos = tag[1]
            if "NN" in pos:
                if firstNN:
                    # Two nouns in a row: join them into a noun phrase.
                    nounPhrase = firstNoun + " " + tag[0]
                    nounPhrases.append(nounPhrase)
                    firstNN = False
                    continue
                else:
                    firstNoun = tag[0]
                    firstNN = True
                    continue
            firstNN = False
    except Exception as e:
        print(str(e))
    return nounPhrases
Example 4: get_sentences
def get_sentences(self, remove_url=True):
    '''
    Generator.
    :param remove_url: replace URLs in sentences with one space char
    :return: tuple of sentences for each MIME part
    '''
    tokenizer = PunktSentenceTokenizer()
    for raw_line, mime_type, lang in tuple(self.get_text_mime_part()):

        if 'html' in mime_type:
            soup = BeautifulSoup(raw_line)
            if not soup.body:
                continue
            # We need whole sentences; soup.body.strings yields lines terminated with CRLF.
            lines = tuple(soup.body.strings)
            raw_line = ''.join(lines)

        try:
            sents = tuple(tokenizer.tokenize(raw_line))
        except Exception as err:
            # Fall back to treating the whole text as a single sentence.
            sents = (raw_line,)

        if remove_url:
            sents = tuple(map(lambda sent: self.__URLINTEXT_PAT.sub(' ', sent.lower()), sents))

        sents = (s.strip().lower() for s in sents)
        sents = tuple(s for s in sents if s)
        if len(sents) == 0:
            continue

        yield sents
Example 5: normalize
def normalize(text):
    p = PunktSentenceTokenizer()
    # Bullet characters to filter out: u'\u2022' is the bullet, u'\u00b7' the middle dot.
    bullet1 = u'\u2022'
    bullet2 = u'\u00b7'
    usable = ''
    for sentence in p.tokenize(text):
        # Keep only reasonably short sentences that do not look like bullet-list items.
        if len(sentence) < 500:
            if bullet1 not in sentence and bullet2 not in sentence:
                usable += '%s ' % sentence
    return usable
Example 6: tokenize_english_document
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # Using the 38-characters-per-line rule from the ITV subtitle guidelines.
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line would overrun the boundary; push the word back.
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words get a line of their own.
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False
                    if len(lines) >= lines_per_subtitle:
                        end_list.append(lines)
                        lines = []

            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
Example 7: aristo_get_named_entities
def aristo_get_named_entities(self, text):
    """
    Parses the text to obtain named entities.
    :param text: The text to parse
    :return: a named entity tree
    """
    custom_sent_tokenizer = PunktSentenceTokenizer(text)
    tokenized = custom_sent_tokenizer.tokenize(text)

    for i in tokenized[5:]:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged, binary=False)
        return namedEnt
Example 8: tag
def tag(sentence):
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)
        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)
        return tagged
    except Exception as e:
        print(str(e))
Example 9: name_ent_recog
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
Example 10: sentenceTagging
def sentenceTagging(text, trainingText):
    csTokenizer = PunktSentenceTokenizer(trainingText)
    tokenized = csTokenizer.tokenize(text)
    taggedSentence = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            taggedSentence.append(tagged)
            #chinkingWords(tagged).draw()
            namedEntityRecog(tagged)
    except Exception as e:
        print(str(e))
    return taggedSentence
Example 11: pos
def pos(self, paragraph):
    wordsdict = collections.OrderedDict()
    sent_tokenizer = PunktSentenceTokenizer()
    for sentence in self.sent_detector.tokenize(paragraph):
        tokens = sent_tokenizer.tokenize(sentence)
        for token in tokens:
            words = nltk.word_tokenize(token)
            tagged = nltk.pos_tag(words)
            for word in tagged:
                if word[1] in self.tagdict:
                    wordsdict[word[0]] = self.tagdict[word[1]][0]
    return wordsdict
Example 12: Tokenizer
class Tokenizer(object):

    def __init__(self, language, normalize=False, train_text_gen=None):
        """
        A tokenizer using the NLTK Penn Treebank word tokenizer and the Punkt sentence tokenizer.
        Params:
            language: Language to tokenize (currently doesn't do anything)
            train_text_gen: A generator of training text for the sentence tokenizer.
        """
        self.language = language
        self.train_text_gen = train_text_gen
        self.normalize = normalize

        if train_text_gen:
            self.sent_tokenizer = self._train_sentence_tokenizer()
        else:
            self.sent_tokenizer = PunktSentenceTokenizer()

    def _train_sentence_tokenizer(self):
        return PunktSentenceTokenizer(train_text="\n".join(self.train_text_gen))

    def tokenize(self, text):
        tokenized = []
        for sentence in self.sent_tokenizer.tokenize(text):
            tokenized_sentence = []
            for word in word_tokenize(sentence):
                if self.normalize:
                    word = word.lower()
                tokenized_sentence.append(word)
            tokenized.append(tokenized_sentence)
        return tokenized
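A possible way to exercise the Tokenizer class above (a hypothetical usage sketch, not part of the original example; it assumes the source module imports word_tokenize and PunktSentenceTokenizer from nltk.tokenize):

# Hypothetical usage of the Tokenizer class from Example 12.
docs = ["First document. It has two sentences.", "Second document."]
tok = Tokenizer("english", normalize=True, train_text_gen=docs)
print(tok.tokenize("Some new text. Another sentence follows here."))
# e.g. [['some', 'new', 'text', '.'], ['another', 'sentence', 'follows', 'here', '.']]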
Example 13: main
def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    choice = 0
    while choice < 5:
        choice = int(input("1 for named_chunks (some information about proper nouns),\n"
                           "2 for process_chunks (tells you if a noun phrase followed by an adverb occurs),\n"
                           "3 for process_content (just prints stuff), 4 for..."))
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print("Try again!")
Example 14: extract_features
def extract_features(self):
    """
    All approaches to extracting features from the raw data are implemented here.
    """
    custom_tokenizer = PunktSentenceTokenizer()
    regex_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    ps = PorterStemmer()
    tokenized = []

    with open(self.file_path, 'r') as current_document:
        for each_line in current_document:
            tokenized.extend(custom_tokenizer.tokenize(each_line))  # tokenize into sentences line by line

    feature_list = []
    try:
        for each_sentence in tokenized:
            # words = nltk.word_tokenize(each_sentence)
            words = regex_tokenizer.tokenize(each_sentence)
            tagged = nltk.pos_tag(words)
            feature_list.extend([ps.stem(pos[0].lower()) for pos in tagged if pos[1] == 'NN'])  # collect the stemmed nouns
    except Exception as E:
        print(str(E))

    feature_dictionary = Counter(feature_list)  # count how often each noun stem occurs
    return feature_dictionary
Example 15: __init__
def __init__(self, language, normalize=False, train_text_gen=None):
    """
    A tokenizer using the NLTK Penn Treebank word tokenizer and the Punkt sentence tokenizer.
    Params:
        language: Language to tokenize (currently doesn't do anything)
        train_text_gen: A generator of training text for the sentence tokenizer.
    """
    self.language = language
    self.train_text_gen = train_text_gen
    self.normalize = normalize

    if train_text_gen:
        self.sent_tokenizer = self._train_sentence_tokenizer()
    else:
        self.sent_tokenizer = PunktSentenceTokenizer()