

Python PunktSentenceTokenizer.span_tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.punkt.PunktSentenceTokenizer.span_tokenize. If you are wondering what PunktSentenceTokenizer.span_tokenize does, how to call it, or what real-world usage looks like, the selected code examples below should help. You can also explore further usage examples of the containing class, nltk.tokenize.punkt.PunktSentenceTokenizer.


Seven code examples of the PunktSentenceTokenizer.span_tokenize method are shown below, sorted by popularity by default.
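
Before the examples, a minimal sketch (not drawn from any of the projects below) of what span_tokenize returns: a sequence of (start, end) character offsets into the input, so slicing the original string with each pair recovers the corresponding sentence.

from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "This is the first sentence. Here is a second one."
tokenizer = PunktSentenceTokenizer()
# span_tokenize yields (start, end) offsets; text[start:end] is the sentence itself
for start, end in tokenizer.span_tokenize(text):
    print((start, end), repr(text[start:end]))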

Example 1: analyse_entry

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def analyse_entry(self, entry, activity):
    # always yield the original entry first, then one new entry per chunk
    yield entry
    chunker_type = activity.params["delimiter"]
    original_text = entry['nif:isString']
    if chunker_type == "sentence":
        tokenizer = PunktSentenceTokenizer()
    if chunker_type == "paragraph":
        tokenizer = LineTokenizer()
    chars = list(tokenizer.span_tokenize(original_text))
    if len(chars) == 1:
        # the text is a single chunk already, nothing to split
        return
    for i, chunk in enumerate(chars):
        start, end = chunk
        e = Entry()
        e['nif:isString'] = original_text[start:end]
        if entry.id:
            e.id = entry.id + "#char={},{}".format(start, end)
        yield e
Developer: gsi-upm, Project: senpy, Lines of code: 21, Source: split_plugin.py
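
A standalone sketch of the same chunking idea with the senpy-specific Entry/activity objects stripped out (the chunk_spans helper below is illustrative, not part of the plugin):

from nltk.tokenize import LineTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer

def chunk_spans(text, delimiter="sentence"):
    # choose the tokenizer the way the plugin does: Punkt for sentences,
    # LineTokenizer for line/paragraph chunks
    tokenizer = PunktSentenceTokenizer() if delimiter == "sentence" else LineTokenizer()
    return [(start, end, text[start:end]) for start, end in tokenizer.span_tokenize(text)]

text = "First sentence. Second sentence.\nA new line starts a new paragraph chunk."
print(chunk_spans(text, "sentence"))
print(chunk_spans(text, "paragraph"))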

Example 2: sent_tokenize

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def sent_tokenize(data, filter_threshold=None):
    '''
    Tokenizes a string into sentences and corresponding offsets

    Args:
        data(str): The document itself
        filter_threshold(int): if sentence length is
            less than this, it will be ignored

    Returns:
        tuple(list(str), list(tuple)): tokenized
            sentences and corresponding offsets
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    return (sentences, offsets)
Developer: pdsujnow, Project: BioSum, Lines of code: 22, Source: preprocess.py
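
A usage sketch with made-up input (not from the BioSum repository), showing the effect of the abbreviation set; abbreviations are listed without the trailing period:

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'fig'])
sent_detector = PunktSentenceTokenizer(punkt_param)

data = "Results are shown in Fig. 2 of the paper. The effect was significant."
sentences = sent_detector.tokenize(data)
offsets = list(sent_detector.span_tokenize(data))  # list() in case a generator is returned
print(sentences)   # with 'fig' registered, Punkt should not break after "Fig."
print(offsets)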

Example 3: resolve_phrases

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def resolve_phrases(section, tokens, book, id):
    phrases = []
    sentences = []
    # find and resolve parentheses
    if book == "almizan_fa":
        if int(id.split("_")[0]) <= 2:
            html = section.html()
            replace = lambda start, end, oldtext, newtext: oldtext[:start] + newtext + oldtext[end:]

            # in chapter 1, remove parentheses around ayas
            iter = re.finditer(r"(<span[^\n]*>)[ ]*\(([^\)s]*)\)[^\)]*(</span[^\n]*>)", html)
            for m in reversed(list(iter)):
                html = replace(m.start(), m.end(), html, m.group().replace("(", "").replace(")", ""))

            iter = re.finditer(r"\([^\)]{3,15}\)", html)
            for match in reversed(list(iter)):
                m = match.group()[1:-1]
                resolved = resolve_phrase(m, tokens, book[-2:])
                if resolved:
                    html = replace(match.start(), match.end(), html, '<em rel="{0}">{1}</em>'.format(resolved[0], m))

            section.html(html)

    pst = PunktSentenceTokenizer()
    # resolve em elements
    for em in section.find("em").items():
        resolved = resolve_phrase(em.text(), tokens, book[-2:])
        if resolved:
            em.attr("rel", resolved[0])
            phrases.append((em.text(), resolved[1], resolved[0]))
            paragraph = em.parent().html(method="html")
            for start, end in pst.span_tokenize(paragraph):
                if paragraph[start:end].find(em.outerHtml()) != -1:
                    this_sentence = paragraph[start:end].lstrip()
                    this_sentence = refine_sentence(this_sentence)

                    while this_sentence.startswith("<code"):
                        if this_sentence.find("</code>") != -1:
                            new_start = this_sentence.find("</code>") + 7
                        this_sentence = this_sentence[new_start:].lstrip()

                    this_sentence = refine_sentence(this_sentence)

                    while this_sentence.startswith("<span"):
                        new_start = this_sentence.find("</span>") + 7
                        this_sentence = this_sentence[new_start:].lstrip()

                    this_sentence = refine_sentence(this_sentence)

                    before = this_sentence.index(em.outerHtml())
                    after = len(this_sentence) - len(em.outerHtml()) - before
                    em.attr("data-sentence", "{0}:{1}".format(before, after))
                    break
            sentences.append((em.text(), resolved[0], (before, after), [this_sentence]))
        else:
            phrases.append((em.text(),))

    new_section = section.html(method="html")
    p = re.compile(r'<em rel="([^"]+)" data-sentence="([^"]+)">([^<]+)<\/em>')
    matched_list = [(m.start(0), m.end(0), m.group(1), m.group(2)) for m in re.finditer(p, new_section)]
    last_start = -1
    for matched in reversed(matched_list):
        start_span = matched[0] - int(matched[3].split(":")[0])
        end_span = matched[1] + int(matched[3].split(":")[1])
        if start_span != last_start:
            new_section = (
                new_section[:start_span]
                + '<span class="phrase">'
                + new_section[start_span:end_span]
                + "</span>"
                + new_section[end_span:]
            )
            last_start = start_span

    section.html(new_section)
    return phrases, sentences
Developer: khajavi, Project: zolal, Lines of code: 78, Source: almizan.py
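
A reduced sketch of just the sentence-location step above (the sample strings are made up): span_tokenize provides the sentence boundaries, and the span whose slice contains the phrase of interest is kept, along with the phrase's position inside that sentence.

from nltk.tokenize.punkt import PunktSentenceTokenizer

pst = PunktSentenceTokenizer()
paragraph = "Some introductory words. The target phrase appears in this sentence. A closing remark."
needle = "target phrase"

for start, end in pst.span_tokenize(paragraph):
    sentence = paragraph[start:end]
    if needle in sentence:
        before = sentence.index(needle)               # characters before the phrase
        after = len(sentence) - len(needle) - before  # characters after the phrase
        print(repr(sentence), (before, after))
        break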

Example 4: sent_tokenize

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def sent_tokenize(data, filter_short=False, filter_verbless=False):
    """
    Tokenize sentences 

    Tokenize `data` into two arrays: sentences and offsets
    Returns a tuple (`sentences`,`offsets`)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = list(sent_detector.span_tokenize(data))  # materialize; span_tokenize may return a generator
    new_sentences = []
    new_offsets = []
    to_del = []
    if filter_verbless:
        pos = pos_tagger.extract_nlp_batch()
        for i in range(len(sentences)):
            okay = False
            for word in pos['sentences'][i]['words']:
                if word[1]['PartOfSpeech'] in verbs:
                    okay = True
                    break
            if not okay:  # the sentence doesn't have verb,
                to_del.append(i)  # mark for deletion
        sentences = multi_delete(sentences, to_del)
        offsets = multi_delete(offsets, to_del)
    if filter_short and not filter_verbless:
        for i in range(len(sentences)):
            if len(sentences[i]) >= filter_short:
                new_sentences.append(sentences[i])
                new_offsets.append(offsets[i])


#     (commented-out sentence-merging logic omitted: it appended merged candidates
#      built from 2-6 adjacent sentences whose spans lie within 5 characters of
#      each other, together with the combined span)
    print new_offsets
    return {'sentences': new_sentences, 'offsets': new_offsets}
Developer: pdsujnow, Project: BioSum, Lines of code: 100, Source: tokenizer.py
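
A much smaller sketch of the filtering idea (the function name and threshold below are illustrative): pairing each sentence with its span before filtering keeps the two lists aligned.

from nltk.tokenize.punkt import PunktSentenceTokenizer

def sent_tokenize_filtered(data, min_len=40):
    detector = PunktSentenceTokenizer()
    spans = list(detector.span_tokenize(data))
    # keep (sentence, span) pairs together so both lists stay in sync
    kept = [(data[s:e], (s, e)) for s, e in spans if e - s >= min_len]
    return {'sentences': [sent for sent, _ in kept],
            'offsets': [span for _, span in kept]}

print(sent_tokenize_filtered("Too short. This sentence is comfortably long enough to pass the length filter."))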

Example 5: nltk_tokenizer

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
class nltk_tokenizer(IncrementalTransform):
    """
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    """

    tagger_id = "nltk_tokenizer"

    def __init__(self, config):
        self.config = config
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        "generate strings identified as sentences"
        previous_end = 0
        clean_visible = clean_visible.decode("utf8")
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []

        self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        "assemble Sentence and Token objects"
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode("utf8")
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True)
                    sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str))
                tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos)
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, first=sent_start + start, length=end - start
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info("overlapping label: %r" % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
Developer: nithintumma, Project: streamcorpus-pipeline, Lines of code: 101, Source: _tokenizer.py
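
A minimal sketch of the nested-span pattern used in _sentences/make_sentences, without the streamcorpus Token/Offset types: word spans from WhitespaceTokenizer are relative to their sentence, so the absolute position in the document is sent_start + start.

from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer

sentence_tokenizer = PunktSentenceTokenizer()
word_tokenizer = WhitespaceTokenizer()

text = "First sentence here. A second one follows it."
for sent_start, sent_end in sentence_tokenizer.span_tokenize(text):
    sent_str = text[sent_start:sent_end]
    for start, end in word_tokenizer.span_tokenize(sent_str):
        # offsets from the word tokenizer are sentence-relative; shift by sent_start
        print(sent_str[start:end], (sent_start + start, sent_start + end))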

Example 6: sent_tokenize

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = list(sent_detector.span_tokenize(data))  # materialize; span_tokenize may return a generator
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
    for i, off in enumerate(offsets):
        # `tokenizer` here is a word tokenizer presumably defined elsewhere in the original module
        if len(tokenizer.tokenize(sentences[i])) < 7:  # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if ((offsets[i + 1][0] - offsets[i][1]) < 5):
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and\
                        ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(
                        sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
    #         (further commented-out merging logic omitted: it extended the same
    #          pattern to runs of 4-6 adjacent sentences whose spans lie within
    #          5 characters of each other)
    print new_offsets
    return {'sentences': new_sentences, 'offsets': new_offsets}
Developer: acohan, Project: scientific-summ, Lines of code: 74, Source: tokenizer.py
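
A compact sketch of the pairwise merge step (the 5-character threshold is taken from the code above; the sample text is made up): adjacent sentences whose spans are separated by fewer than 5 characters are also appended as one merged candidate with a combined span.

from nltk.tokenize.punkt import PunktSentenceTokenizer

detector = PunktSentenceTokenizer()
data = "Alpha beta gamma delta. Epsilon zeta eta theta. Iota kappa lambda."
sentences = detector.tokenize(data)
offsets = list(detector.span_tokenize(data))

merged_sentences = list(sentences)
merged_offsets = list(offsets)
for i in range(len(offsets) - 1):
    # gap between the end of sentence i and the start of sentence i+1
    if offsets[i + 1][0] - offsets[i][1] < 5:
        merged_sentences.append(sentences[i] + ' ' + sentences[i + 1])
        merged_offsets.append((offsets[i][0], offsets[i + 1][1]))
print(merged_offsets)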

Example 7: nltk_tokenizer

# Required import: from nltk.tokenize.punkt import PunktSentenceTokenizer [as alias]
# Or: from nltk.tokenize.punkt.PunktSentenceTokenizer import span_tokenize [as alias]
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'
    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer() #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                token_str = sent_str[start:end].encode('utf8')
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, 
                    first=sent_start + start,
                    length = end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                     tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
#......... the rest of this code is omitted .........
Developer: zhangjing6006, Project: streamcorpus-pipeline, Lines of code: 103, Source: _tokenizer.py
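
A small offset-bookkeeping sketch (not part of the pipeline code): span_tokenize reports character offsets on the decoded string, so if byte offsets into the UTF-8 source are needed, the prefix has to be re-encoded.

from nltk.tokenize.punkt import PunktSentenceTokenizer

raw = "Über-short intro sentence. A plain ASCII sentence follows.".encode("utf8")
clean_visible = raw.decode("utf8")

tokenizer = PunktSentenceTokenizer()
for start, end in tokenizer.span_tokenize(clean_visible):
    # character offsets differ from byte offsets once non-ASCII characters appear
    byte_start = len(clean_visible[:start].encode("utf8"))
    byte_end = len(clean_visible[:end].encode("utf8"))
    print(repr(clean_visible[start:end]), "chars:", (start, end), "bytes:", (byte_start, byte_end))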


Note: The nltk.tokenize.punkt.PunktSentenceTokenizer.span_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets are selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please consult the corresponding project's license before redistributing or using it; do not reproduce without permission.