

Python WordPunctTokenizer.span_tokenize Method Code Examples

This article collects typical code examples of the Python method nltk.tokenize.WordPunctTokenizer.span_tokenize. If you are wondering what WordPunctTokenizer.span_tokenize does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.tokenize.WordPunctTokenizer.


Four code examples of WordPunctTokenizer.span_tokenize are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
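
Before diving into the project examples, here is a minimal, self-contained sketch of what span_tokenize returns; the sample sentence and variable names are illustrative only.

from nltk.tokenize import WordPunctTokenizer

text = "Good muffins cost $3.88 in New York."
tokenizer = WordPunctTokenizer()

# span_tokenize yields (start, end) character offsets into the original string,
# so text[start:end] recovers each token.
for start, end in tokenizer.span_tokenize(text):
    print((start, end), text[start:end])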

Example 1: tokens

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or the method itself: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
# This example also uses nltk.pos_tag and a project-defined Token class.
    def tokens(self):
        """Tokenize the text.
        """
        tokenizer = WordPunctTokenizer()

        # Get token character spans.
        spans = list(tokenizer.span_tokenize(self.text))

        # Materialize the token stream.
        tokens = [self.text[c1:c2] for c1, c2 in spans]

        # Tag parts-of-speech.
        tags = pos_tag(tokens)

        return [

            Token(
                token=token.lower(),
                char1=c1,
                char2=c2,
                pos=pos,
            )

            for (c1, c2), token, (_, pos) in
            zip(spans, tokens, tags)

        ]
Author: davidmcclure, Project: literary-interior, Lines: 28, Source: text.py

Example 2: tokenize

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or the method itself: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
# This example also uses nltk.pos_tag.
def tokenize(text):
    """Tokenize a raw text.

    Args:
        text (str)

    Returns: list of {token, char1, char2, pos}
    """
    tokenizer = WordPunctTokenizer()

    # Get token character spans.
    spans = list(tokenizer.span_tokenize(text))

    # Materialize the token stream.
    tokens = [text[c1:c2] for c1, c2 in spans]

    # Tag parts-of-speech.
    tags = pos_tag(tokens)

    return [

        dict(
            token=token.lower(),
            char1=c1,
            char2=c2,
            pos=pos,
        )

        for (c1, c2), token, (_, pos) in
        zip(spans, tokens, tags)

    ]
Author: davidmcclure, Project: stacks, Lines: 34, Source: utils.py
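
As a quick check of Example 2, a hypothetical call is shown below; it assumes the tokenize function can be imported from a module named utils (matching the project's utils.py), and the exact POS tags depend on the tagger model that pos_tag loads.

from utils import tokenize  # hypothetical import path for the function above

for t in tokenize("Hello, world!"):
    print(t)
# Expected shape of each dict (values are illustrative):
# {'token': 'hello', 'char1': 0, 'char2': 5, 'pos': 'NNP'}
# {'token': ',',     'char1': 5, 'char2': 6, 'pos': ','}
# ...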

Example 3: change_db2

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or the method itself: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
def change_db2(text, origin_dict, id):
    print(origin_dict)
    # Collect (start, end) character spans for every token in origin_dict,
    # which must be a string despite its name.
    tokens_ar = []
    word_punct_tokenizer = WordPunctTokenizer()
    for token in word_punct_tokenizer.span_tokenize(origin_dict):
        tokens_ar.append(token)
    # Each line of the markup text is a ';'-separated record;
    # convert_coord_2dbformat is defined elsewhere in the project.
    for line in text.split("\n"):
        markup_error_line = line.split(';')
        print("MARKUP", markup_error_line)
        convert_coord_2dbformat(markup_error_line, tokens_ar, id)
Author: Alenush, Project: zhivoeslovo, Lines: 12, Source: download_data2db.py
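
The spans collected into tokens_ar in Example 3 are plain (start, end) tuples. A tiny illustration with a made-up input string:

from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()
tokens_ar = list(word_punct_tokenizer.span_tokenize("Hello, world"))
print(tokens_ar)  # [(0, 5), (5, 6), (7, 12)]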

Example 4: convert

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or the method itself: from nltk.tokenize.WordPunctTokenizer import span_tokenize [as alias]
# This example also uses os, lxml.etree (as etree), nltk.pos_tag, and the
# project-defined helpers get_init_offset and token.
def convert(sgm_path, apf_path, bio_path=None):
    xml_parser = etree.XMLParser(recover=True)
    try:
        sgm_tree = etree.parse(sgm_path, xml_parser)
        apf_tree = etree.parse(apf_path, xml_parser)
        if not bio_path:
            bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
        output = open(bio_path, 'w')
    except Exception:
        print('Something went wrong when opening/parsing an XML file, or when opening the output file')
        return
    
    init_offset = get_init_offset(sgm_path)
    text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')
    
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    spans = list(tokenizer.span_tokenize(text))
    pos = pos_tag(tokens)
    
    # Build token objects (the token class is defined elsewhere in the project).
    ts = []
    for i in range(len(tokens)):
        t = token()
        t.text = tokens[i]
        t.pos = pos[i][1]
        # ACE character offsets are inclusive on the END side, hence the -1.
        t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
        t.bio = 'O'
        ts.append(t)
        
    entits = apf_tree.xpath('/source_file/document/entity')
    for enty in entits:
        enty_type = enty.get('TYPE')
        mentions = enty.xpath('entity_mention')
        for m in mentions:
            head = m.xpath('head')[0]
            span = (int(head[0].get('START')), int(head[0].get('END')))
            found = False
            for t in ts:
                if t.span[0] == span[0]:
                    t.bio = 'B-' + enty_type
                    found = True
                if t.span[0] > span[0] and t.span[1] <= span[1]:
                    t.bio = 'I-' + enty_type
                    found = True
            if not found:
                print('entity mention head span not found', span, apf_path)
    
    for t in ts:
        # print(t.text, t.span)
        output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
    output.close()
Author: yaocheng-cs, Project: misc, Lines: 53, Source: ace2bio.py
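
For reference, here is a small, hypothetical sketch of reading back the tab-separated .bio file written by convert; the file name and the tags in the comment are illustrative only.

# Hypothetical reader for the tab-separated output of convert().
with open('example.bio') as f:  # file name is illustrative
    for line in f:
        text, pos, bio = line.rstrip('\n').split('\t')
        print(text, pos, bio)  # e.g. "York NNP I-GPE" (tags come from the ACE annotation)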


Note: the nltk.tokenize.WordPunctTokenizer.span_tokenize method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors, and the source code copyright belongs to those authors; please consult each project's license before distributing or reusing the code, and do not reproduce this article without permission.