This article collects typical code examples of the Python method nltk.tokenize.punkt.PunktSentenceTokenizer.span_tokenize. If you are wondering how PunktSentenceTokenizer.span_tokenize is used in practice, the curated examples below may help. You can also read more about the class the method belongs to, nltk.tokenize.punkt.PunktSentenceTokenizer.
The following shows 7 code examples of PunktSentenceTokenizer.span_tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
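Before the examples, here is a minimal sketch (not taken from any of the projects below) of what span_tokenize itself does: it yields (start, end) character offsets, so each sentence can be recovered by slicing the original string.

from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "This is the first sentence. Here is a second one."
tokenizer = PunktSentenceTokenizer()
for start, end in tokenizer.span_tokenize(text):
    # Each span is a pair of character offsets into the original string.
    print((start, end), text[start:end])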
Example 1: analyse_entry
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def analyse_entry(self, entry, activity):
    yield entry
    chunker_type = activity.params["delimiter"]
    original_text = entry['nif:isString']
    if chunker_type == "sentence":
        tokenizer = PunktSentenceTokenizer()
    if chunker_type == "paragraph":
        tokenizer = LineTokenizer()
    chars = list(tokenizer.span_tokenize(original_text))
    if len(chars) == 1:
        # This sentence was already split
        return
    for i, chunk in enumerate(chars):
        start, end = chunk
        e = Entry()
        e['nif:isString'] = original_text[start:end]
        if entry.id:
            e.id = entry.id + "#char={},{}".format(start, end)
        yield e
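The plugin above depends on the senpy framework (Entry, activity), so it does not run on its own. Below is a framework-free sketch of the same chunking idea, using plain dicts instead of senpy entries; the dict keys simply mirror the plugin and are not part of any API.

from nltk.tokenize.punkt import PunktSentenceTokenizer

original_text = "First sentence here. Second sentence there."
tokenizer = PunktSentenceTokenizer()
chunks = []
for start, end in tokenizer.span_tokenize(original_text):
    # One record per sentence, addressed by its character range, as in the plugin.
    chunks.append({
        "nif:isString": original_text[start:end],
        "id": "#char={},{}".format(start, end),
    })
print(chunks)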
Example 2: sent_tokenize
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def sent_tokenize(data, filter_threshold=None):
    '''
    Tokenizes a string into sentences and corresponding offsets

    Args:
        data(str): The document itself
        filter_threshold(int): if a sentence is shorter than this, it is ignored

    Returns:
        tuple(list(str), list(tuple)): tokenized sentences and corresponding offsets
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    # span_tokenize returns (start, end) pairs; materialise them so they can be
    # filtered and returned as a list.
    offsets = list(sent_detector.span_tokenize(data))
    if filter_threshold:
        # Drop sentences shorter than filter_threshold, as documented above.
        kept = [i for i, s in enumerate(sentences) if len(s) >= filter_threshold]
        sentences = [sentences[i] for i in kept]
        offsets = [offsets[i] for i in kept]
    return (sentences, offsets)
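A minimal usage sketch for the helper above, assuming the imports noted at the top of the example; the input string is made up.

text = "Dr. Smith reviewed the results. They looked promising."
sentences, offsets = sent_tokenize(text)
for (start, end), sentence in zip(offsets, sentences):
    # Each offset pair points back into the original string.
    print((start, end), sentence)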
Example 3: resolve_phrases
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def resolve_phrases(section, tokens, book, id):
    phrases = []
    sentences = []

    # find and resolve parentheses
    if book == "almizan_fa":
        if int(id.split("_")[0]) <= 2:
            html = section.html()
            replace = lambda start, end, oldtext, newtext: oldtext[:start] + newtext + oldtext[end:]

            # in chapter 1, remove parentheses around ayas
            iter = re.finditer(r"(<span[^\n]*>)[ ]*\(([^\)s]*)\)[^\)]*(</span[^\n]*>)", html)
            for m in reversed(list(iter)):
                html = replace(m.start(), m.end(), html, m.group().replace("(", "").replace(")", ""))

            iter = re.finditer(r"\([^\)]{3,15}\)", html)
            for match in reversed(list(iter)):
                m = match.group()[1:-1]
                resolved = resolve_phrase(m, tokens, book[-2:])
                if resolved:
                    html = replace(match.start(), match.end(), html,
                                   '<em rel="{0}">{1}</em>'.format(resolved[0], m))

            section.html(html)

    pst = PunktSentenceTokenizer()

    # resolve em elements
    for em in section.find("em").items():
        resolved = resolve_phrase(em.text(), tokens, book[-2:])
        if resolved:
            em.attr("rel", resolved[0])
            phrases.append((em.text(), resolved[1], resolved[0]))

            paragraph = em.parent().html(method="html")
            for start, end in pst.span_tokenize(paragraph):
                if paragraph[start:end].find(em.outerHtml()) != -1:
                    this_sentence = paragraph[start:end].lstrip()
                    this_sentence = refine_sentence(this_sentence)
                    while this_sentence.startswith("<code"):
                        if this_sentence.find("</code>") != -1:
                            new_start = this_sentence.find("</code>") + 7
                            this_sentence = this_sentence[new_start:].lstrip()
                            this_sentence = refine_sentence(this_sentence)
                    while this_sentence.startswith("<span"):
                        new_start = this_sentence.find("</span>") + 7
                        this_sentence = this_sentence[new_start:].lstrip()
                        this_sentence = refine_sentence(this_sentence)
                    before = this_sentence.index(em.outerHtml())
                    after = len(this_sentence) - len(em.outerHtml()) - before
                    em.attr("data-sentence", "{0}:{1}".format(before, after))
                    break
            sentences.append((em.text(), resolved[0], (before, after), [this_sentence]))
        else:
            phrases.append((em.text(),))

    new_section = section.html(method="html")
    p = re.compile(r'<em rel="([^"]+)" data-sentence="([^"]+)">([^<]+)<\/em>')
    matched_list = [(m.start(0), m.end(0), m.group(1), m.group(2)) for m in re.finditer(p, new_section)]
    last_start = -1
    for matched in reversed(matched_list):
        start_span = matched[0] - int(matched[3].split(":")[0])
        end_span = matched[1] + int(matched[3].split(":")[1])
        if start_span != last_start:
            new_section = (
                new_section[:start_span]
                + '<span class="phrase">'
                + new_section[start_span:end_span]
                + "</span>"
                + new_section[end_span:]
            )
        last_start = start_span
    section.html(new_section)

    return phrases, sentences
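The function above is tied to pyquery sections and project-specific helpers (resolve_phrase, refine_sentence), so it cannot be run in isolation. Its core use of span_tokenize — finding which sentence of a paragraph contains a marked-up fragment and recording the character counts before and after it — can be sketched on its own (hypothetical input, no pyquery involved):

from nltk.tokenize.punkt import PunktSentenceTokenizer

paragraph = 'An opening sentence. The phrase <em rel="x">foo</em> sits in this one. A closing sentence.'
fragment = '<em rel="x">foo</em>'

pst = PunktSentenceTokenizer()
for start, end in pst.span_tokenize(paragraph):
    sentence = paragraph[start:end]
    if fragment in sentence:
        before = sentence.index(fragment)
        after = len(sentence) - len(fragment) - before
        # before:after is what the function stores in the em tag's data-sentence attribute.
        print("{0}:{1}".format(before, after))
        break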
Example 4: sent_tokenize
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def sent_tokenize(data, filter_short=False, filter_verbless=False):
    """
    Tokenize sentences

    Tokenize `data` into two arrays: sentences and offsets.
    Returns a dict with keys `sentences` and `offsets`.
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    # span_tokenize returns a generator; materialise it so it can be indexed below.
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = []
    new_offsets = []
    to_del = []
    if filter_verbless:
        pos = pos_tagger.extract_nlp_batch()
        for i in range(len(sentences)):
            okay = False
            for word in pos['sentences'][i]['words']:
                if word[1]['PartOfSpeech'] in verbs:
                    okay = True
                    break
            if not okay:          # the sentence doesn't have a verb,
                to_del.append(i)  # mark it for deletion
        sentences = multi_delete(sentences, to_del)
        offsets = multi_delete(offsets, to_del)
    if filter_short and not filter_verbless:
        for i in range(len(sentences)):
            if len(sentences[i]) >= filter_short:
                new_sentences.append(sentences[i])
                new_offsets.append(offsets[i])
    new_sentences = [s for s in sentences if sentences]
    # new_sentences = deepcopy(sentences)
    # new_offsets = deepcopy(offsets)
    # for i, off in enumerate(offsets):
    #     if i < len(offsets) - 1:
    #         if (offsets[i + 1][0] - offsets[i][1]) < 5:
    #             new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
    #             new_offsets.append((offsets[i][0], offsets[i + 1][1]))
    #     if i < len(offsets) - 2:
    #         if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and \
    #            ((offsets[i + 1][0] - offsets[i][0]) < 5):
    #             new_sentences.append(
    #                 sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2])
    #             new_offsets.append((offsets[i][0], offsets[i + 2][1]))
    #     ... (further commented-out variants merging up to five consecutive
    #     nearby sentences omitted; they repeat the same pattern)
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
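The function above relies on a multi_delete helper (and an external pos_tagger) that are not shown in this snippet. A minimal sketch of what such a multi_delete might look like, purely as an assumption and not taken from the original source:

def multi_delete(items, indices):
    """Return a copy of `items` with the elements at the given indices removed."""
    to_remove = set(indices)
    return [item for i, item in enumerate(items) if i not in to_remove]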
Example 5: nltk_tokenizer
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
class nltk_tokenizer(IncrementalTransform):
    """
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    """
    tagger_id = "nltk_tokenizer"

    def __init__(self, config):
        self.config = config
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        "generate strings identified as sentences"
        previous_end = 0
        clean_visible = clean_visible.decode("utf8")
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []
        self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        "assemble Sentence and Token objects"
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode("utf8")
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True)
                    sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str))
                tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos)
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, first=sent_start + start, length=end - start
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info("overlapping label: %r" % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels
                        logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id)
                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id
                        tok.mention_id = mention_id
                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
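make_sentences above nests WhitespaceTokenizer.span_tokenize inside PunktSentenceTokenizer.span_tokenize; the word spans are relative to the sentence, so the sentence's own start offset has to be added back to obtain positions in the whole document. A framework-free sketch of that offset arithmetic (no streamcorpus objects involved):

from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "Punkt finds sentence spans. A whitespace tokenizer finds token spans."
sent_tok = PunktSentenceTokenizer()
word_tok = WhitespaceTokenizer()

for sent_start, sent_end in sent_tok.span_tokenize(text):
    sentence = text[sent_start:sent_end]
    for start, end in word_tok.span_tokenize(sentence):
        # Word spans are relative to the sentence; shift by sent_start for document offsets.
        print((sent_start + start, sent_start + end), text[sent_start + start:sent_start + end])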
Example 6: sent_tokenize
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    # span_tokenize returns a generator; materialise it so it can be copied and indexed.
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
    # `tokenizer` below is a word-level tokenizer assumed to be defined elsewhere
    # in the original module.
    for i, off in enumerate(offsets):
        if len(tokenizer.tokenize(sentences[i])) < 7:  # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if (offsets[i + 1][0] - offsets[i][1]) < 5:
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and \
                   ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(
                        sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
            # ... (further commented-out variants merging up to five consecutive
            # nearby sentences omitted; they repeat the same pattern)
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
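A self-contained illustration of the merging criterion used above: two adjacent sentences are joined when the gap between the end of one span and the start of the next is under 5 characters. The sentences and offsets below are made up for the illustration:

sentences = ["First sentence here.", "Second, very close by.", "A distant third one."]
offsets = [(0, 20), (22, 44), (60, 80)]

merged = []
for i in range(len(offsets) - 1):
    gap = offsets[i + 1][0] - offsets[i][1]
    if gap < 5:  # same threshold used in sent_tokenize above
        merged.append((sentences[i] + ' ' + sentences[i + 1],
                       (offsets[i][0], offsets[i + 1][1])))
print(merged)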
Example 7: nltk_tokenizer
# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []
        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                token_str = sent_str[start:end].encode('utf8')
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels
                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)
                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id
                        tok.mention_id = mention_id
                token_num += 1
                sentence_pos += 1
#......... part of the code omitted here .........