This article collects typical usage examples of the Python method nltk.tokenize.PunktSentenceTokenizer.tokenize. If you are wondering what PunktSentenceTokenizer.tokenize does, how to call it, or what it looks like in real code, the hand-picked examples below should help. You can also explore further usage examples of the class it belongs to, nltk.tokenize.PunktSentenceTokenizer.
The following shows 15 code examples of the PunktSentenceTokenizer.tokenize method, sorted by popularity by default.
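Before the examples, here is a minimal sketch of how the method is usually called; the training and sample strings below are invented purely for illustration.

from nltk.tokenize import PunktSentenceTokenizer

# Invented texts, used only to illustrate the call pattern.
train_text = "Dr. Smith went to Washington. He arrived on Jan. 5 and stayed a week."
sample_text = "Mr. Jones met Dr. Smith. They discussed the budget. It went well."

# Passing text to the constructor trains the unsupervised Punkt model on it
# (for instance to learn abbreviations); PunktSentenceTokenizer() with no
# arguments also works, using the default parameters.
tokenizer = PunktSentenceTokenizer(train_text)
for sent in tokenizer.tokenize(sample_text):
    print(sent)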
Example 1: __init__
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def __init__(self, sentence):
    f = open('data/training_data', 'r')
    train_text = f.read()
    #data = open('data2', 'r')
    #test_data = data.read()
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    self.tokenized = custom_sent_tokenizer.tokenize(sentence)
Example 2: POS_tagging
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    #print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
    # textfile = open("POS_tagged", 'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)
    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []

    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            c = 0
            # print(str(e))

    process_content()
    return tuples_list
Example 3: extractNounPhrases
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def extractNounPhrases(sentence):
    nounPhrases = []
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        firstNN = False
        for tag in tagged:
            pos = tag[1]
            if "NN" in pos:
                if firstNN:
                    nounPhrase = firstNoun + " " + tag[0]
                    nounPhrases.append(nounPhrase)
                    firstNN = False
                    continue
                else:
                    firstNoun = tag[0]
                    firstNN = True
                    continue
            firstNN = False
    except Exception as e:
        print(str(e))
    return nounPhrases
Example 4: Tokenizer
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
class Tokenizer(object):
    def __init__(self, language, normalize=False, train_text_gen=None):
        """
        A tokenizer using the NLTK Penn Treebank word tokenizer and the Punkt sentence tokenizer.

        Params:
            language: Language to tokenize (currently doesn't do anything)
            train_text_gen: A generator of training text for the sentence tokenizer.
        """
        self.language = language
        self.train_text_gen = train_text_gen
        self.normalize = normalize

        if train_text_gen:
            self.sent_tokenizer = self._train_sentence_tokenizer()
        else:
            self.sent_tokenizer = PunktSentenceTokenizer()

    def _train_sentence_tokenizer(self):
        return PunktSentenceTokenizer(train_text="\n".join(self.train_text_gen))

    def tokenize(self, text):
        tokenized = []
        for sentence in self.sent_tokenizer.tokenize(text):
            tokenized_sentence = []
            for word in word_tokenize(sentence):
                if self.normalize:
                    word = word.lower()
                tokenized_sentence.append(word)
            tokenized.append(tokenized_sentence)
        return tokenized
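A possible way to exercise the Tokenizer class above, assuming word_tokenize comes from nltk.tokenize and using a made-up two-document training list:

# Hypothetical usage of the Tokenizer class defined above.
docs = ["First document. It has two sentences.", "Second document here."]
tok = Tokenizer("english", normalize=True, train_text_gen=docs)
# Expected to yield one lowercased word list per sentence, roughly:
# [['it', 'rained', 'today', '.'], ['the', 'game', 'was', 'cancelled', '.']]
print(tok.tokenize("It rained today. The game was cancelled."))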
Example 5: get_sentences
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def get_sentences(self, remove_url=True):
    '''
    Generator.
    :param remove_url: replace URLs in sentences with a single space character
    :return: tuple of sentences for each MIME part
    '''
    tokenizer = PunktSentenceTokenizer()
    for raw_line, mime_type, lang in tuple(self.get_text_mime_part()):
        if 'html' in mime_type:
            soup = BeautifulSoup(raw_line)
            if not soup.body:
                continue
            # whole sentences are needed; soup.body.strings returns lines plus CRLF
            lines = tuple(soup.body.strings)
            raw_line = ''.join(lines)

        try:
            sents = tuple(tokenizer.tokenize(raw_line))
        except Exception as err:
            sents = tuple(raw_line)

        if remove_url:
            sents = tuple(map(lambda sent: self.__URLINTEXT_PAT.sub(' ', sent.lower()), sents))

        sents = (s.strip().lower() for s in sents)
        sents = tuple(s for s in tuple(sents) if s)
        if len(sents) == 0:
            continue

        yield sents
Example 6: normalize
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def normalize(text):
    p = PunktSentenceTokenizer()
    # bullet characters (Python 2 byte strings decoded to unicode)
    bullet1 = '\xe2\x80\xa2'.decode('utf-8')
    bullet2 = '\xc2\xb7'.decode('utf-8')
    usable = ''
    for sentence in p.tokenize(text):
        if len(sentence) < 500:
            if bullet1 not in sentence and bullet2 not in sentence:
                usable += '%s ' % sentence
    return usable
Example 7: tokenize_english_document
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38-characters-per-line rule from the ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False
                    if len(lines) >= lines_per_subtitle:
                        end_list.append(lines)
                        lines = []

            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
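To see what the function above produces, one could feed it two blank-line-separated speaker blocks (the dialogue is invented, and the tokenizer classes are assumed to be imported from nltk.tokenize as in the snippet above):

# Hypothetical input: two speaker blocks separated by a blank line.
dialogue = (
    "Hello there, how are you doing today? I hope the weather is treating you well.\n"
    "\n"
    "I am fine, thank you very much."
)
for subtitle in tokenize_english_document(dialogue):
    # Each item is a list of at most two lines, each kept near the
    # 38-character limit unless a single word is longer than that.
    print(subtitle)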
Example 8: aristo_get_named_entities
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def aristo_get_named_entities(self, text):
    """
    Parses the text to obtain named entities.
    :param text: The text to parse
    :return: returns a named entity tree
    """
    custom_sent_tokenizer = PunktSentenceTokenizer(text)
    tokenized = custom_sent_tokenizer.tokenize(text)

    for i in tokenized[5:]:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged, binary=False)

    return namedEnt
Example 9: tag
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def tag(sentence):
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)
        return tagged
    except Exception as e:
        print(str(e))
Example 10: name_ent_recog
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
Example 11: sentenceTagging
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def sentenceTagging(text, trainingText):
    csTokenizer = PunktSentenceTokenizer(trainingText)
    tokenized = csTokenizer.tokenize(text)
    taggedSentence = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            taggedSentence.append(tagged)
            #chinkingWords(tagged).draw()
            namedEntityRecog(tagged)
    except Exception as e:
        print(str(e))
    return taggedSentence
Example 12: pos
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def pos(self, paragraph):
    wordsdict = collections.OrderedDict()
    sent_tokenizer = PunktSentenceTokenizer()

    for sentence in self.sent_detector.tokenize(paragraph):
        tokens = sent_tokenizer.tokenize(sentence)
        for token in tokens:
            words = nltk.word_tokenize(token)
            tagged = nltk.pos_tag(words)
            for word in tagged:
                if word[1] in self.tagdict:
                    wordsdict[word[0]] = self.tagdict[word[1]][0]

    return wordsdict
Example 13: main
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def main():
    # Python 2 code: input() evaluates the entry and print is a statement.
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    choice = 0
    while choice < 5:
        choice = input("1 for named_chunks. This provides some information about proper nouns.\n"
                       "2 for process_chunks. This tells you if a noun phrase followed by an adverb occurs.\n"
                       "3 for process_content, this just prints stuff, 4 for...")
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print "try again, bitch!"
Example 14: get_sentence_occurrences
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
def get_sentence_occurrences(document, terms, doc_term=None):
    terms_present = get_document_occurrences(document, terms)

    # Use a Tokenizer from NLTK to build a sentence list
    tokenizer = Tokenizer(document)
    sentences = tokenizer.tokenize(document)

    # Create a list of lists containing the collection of terms which co-occur
    # in a sentence
    occurrences = []
    for sentence in sentences:
        sentence_occurrences = set()

        for term in terms_present:
            if term != doc_term:
                if re.search(' %s ' % term.label, sentence):
                    sentence_occurrences.add(term)

        if len(sentence_occurrences) > 0:
            sentence_occurrences = list(sentence_occurrences)
            to_remove = set()
            for inside in sentence_occurrences:
                for term in sentence_occurrences:
                    if term != inside and term.label.find(inside.label) != -1:
                        to_remove.add(inside)

            if to_remove:
                print "removing", to_remove
            for term in to_remove:
                sentence_occurrences.remove(term)

            if doc_term:
                sentence_occurrences.append(doc_term)

            occurrences.append(sentence_occurrences)

    return occurrences
Example 15: __init__
# Required import: from nltk.tokenize import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.PunktSentenceTokenizer import tokenize [as alias]
class NER:
    """docstring for ClassName"""

    def __init__(self, query):
        self.original_query = query
        conf = shelve.open('conf')
        self.train_text = conf['train_text']
        self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
        self.tokenized = self.custom_sent_tokenizer.tokenize(self.original_query)

    def processContent(self):
        try:
            for i in self.tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                #print(namedEnt)
                #namedEnt.draw()
            return namedEnt
        except Exception as e:
            print(str(e))

    # Parse named entities from the tree
    def structureNamedEntities(self):
        ne = []
        for subtree in self.named_entity_tree:
            if type(subtree) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne.append((ne_string, ne_label))
        return ne

    def performNER(self):
        self.named_entity_tree = self.processContent()
        #print(type(self.named_entity_tree))
        self.named_entity_tuple = self.structureNamedEntities()
        #print(ne)
        names = [element[0] for element in self.named_entity_tuple]
        return names
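A rough usage sketch for the NER class above. It assumes that nltk, shelve and nltk.tree.Tree are imported, and that a shelve database named 'conf' containing a 'train_text' entry already exists on disk, as the constructor requires:

# Hypothetical call; the 'conf' shelve with a 'train_text' key must already exist.
ner = NER("Barack Obama visited Berlin in 2008.")
print(ner.performNER())  # e.g. ['Barack Obama', 'Berlin'], depending on the NLTK models installed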