

Python jieba.tokenize Method Code Examples

This article collects typical usage examples of the jieba.tokenize method in Python. If you are wondering how exactly to use jieba.tokenize, what it does, or what a working example looks like, the selected code samples below may help. You can also explore further usage examples from the jieba module, where this method is defined.


The following shows 15 code examples of the jieba.tokenize method, ordered roughly by popularity.
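As background for the examples: jieba.tokenize takes a unicode string and yields (word, start, end) tuples, and accepts mode ("default" or "search") and HMM keyword arguments. Below is a minimal sketch of both modes, assuming jieba is installed; the sample sentence is arbitrary.

import jieba

sentence = u"我爱北京天安门"  # arbitrary sample sentence

# Default mode: segments the sentence exactly once, yielding (word, start, end) tuples.
for word, start, end in jieba.tokenize(sentence):
    print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))

# Search mode: additionally yields shorter sub-words of long tokens, useful for search indexing.
for word, start, end in jieba.tokenize(sentence, mode="search"):
    print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))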

Example 1: create_data

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                              dt.Frame, List[dt.Frame],
                                              np.ndarray, List[np.ndarray],
                                              pd.DataFrame, List[pd.DataFrame]]:
    # exit gracefully if the method is called as a data upload rather than a data modify
    if X is None:
        return []
    # Tokenize the Chinese text
    import jieba
    X = dt.Frame(X).to_pandas()
    # cols_to_tokenize is a list of column names defined elsewhere in the original recipe;
    # if it is empty, fall back to the first column
    if len(cols_to_tokenize) == 0:
        cols_to_tokenize.append(X.columns[0])
    for col in cols_to_tokenize:
        X[col] = X[col].astype('unicode').fillna(u'NA')
        X[col] = X[col].apply(lambda x: " ".join([r[0] for r in jieba.tokenize(x)]))
    return dt.Frame(X)
Developer: h2oai, Project: driverlessai-recipes, Lines of code: 19, Source file: tokenize_chinese.py

Example 2: test_tokenizer

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def test_tokenizer():
    txts = ["我不要你花钱,这些路曲近通幽",
            "这个消息不胫儿走",
            "这个消息不径而走",
            "这个消息不胫而走",
            "复方甘草口服溶液限田基",
            "张老师经常背课到深夜,我们要体晾老师的心苦。",
            '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
            ",我遇到了问题怎么办",
            ",我遇到了问题",
            "问题",
            "北川景子参演了林诣彬导演的《速度与激情3》",
            "林志玲亮相网友:确定不是波多野结衣?",
            "龟山千广和近藤公园在龟山公园里喝酒赏花",
            "小牛曲清去蛋白提取物乙"]
    # Tokenizer is pycorrector's jieba-based tokenizer, imported elsewhere in the original test file
    t = Tokenizer()
    for text in txts:
        print(text)
        print('default', t.tokenize(text, 'default'))
        print('search', t.tokenize(text, 'search'))
        print('ngram', t.tokenize(text, 'ngram'))
Developer: shibing624, Project: pycorrector, Lines of code: 23, Source file: tokenizer_test.py

Example 3: test_detector_tokenizer

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def test_detector_tokenizer():
    sents = ["我不要你花钱,这些路曲近通幽",
             "这个消息不胫儿走",
             "这个消息不径而走",
             "这个消息不胫而走",
             "复方甘草口服溶液限田基",
             "张老师经常背课到深夜,我们要体晾老师的心苦。",
             '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
             "北川景子参演了林诣彬导演的《速度与激情3》",
             "林志玲亮相网友:确定不是波多野结衣?",
             "龟山千广和近藤公园在龟山公园里喝酒赏花",
             "问题"
             ]
    # Detector comes from pycorrector and is imported elsewhere in the original test file
    d = Detector()
    d.check_detector_initialized()
    detector_tokenizer = d.tokenizer
    for text in sents:
        print(text)
        print('default', detector_tokenizer.tokenize(text, 'default'))
        print('search', detector_tokenizer.tokenize(text, 'search'))
Developer: shibing624, Project: pycorrector, Lines of code: 22, Source file: tokenizer_test.py

Example 4: __call__

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def __call__(self, text, **kargs):
        # Method of a Whoosh tokenizer class; Token and accepted_chars are defined
        # in the surrounding analyzer module of the original project.
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # skip single characters that are not accepted word characters
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
Developer: deepcs233, Project: jieba_fast, Lines of code: 13, Source file: analyzer.py

Example 5: testTokenize

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def testTokenize(self):
        # unittest method; test_contents is a list of test strings defined elsewhere in jieba_test.py
        for content in test_contents:
            result = jieba.tokenize(content)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end: %d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
        print("testTokenize", file=sys.stderr)
Developer: deepcs233, Project: jieba_fast, Lines of code: 11, Source file: jieba_test.py

Example 6: testTokenize_NOHMM

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def testTokenize_NOHMM(self):
        # same as testTokenize, but with HMM-based new-word discovery disabled
        for content in test_contents:
            result = jieba.tokenize(content, HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end: %d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
        print("testTokenize_NOHMM", file=sys.stderr)
Developer: deepcs233, Project: jieba_fast, Lines of code: 11, Source file: jieba_test.py

Example 7: cuttest

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def cuttest(test_sent):
    # g_mode ('default' or 'search') is set elsewhere in the original test script
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end: %d" % (tk[0], tk[1], tk[2]))
Developer: deepcs233, Project: jieba_fast, Lines of code: 7, Source file: test_tokenize_no_hmm.py

Example 8: cuttest

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def cuttest(test_sent):
    # g_mode ('default' or 'search') is set elsewhere in the original test script
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end: %d" % (tk[0], tk[1], tk[2]))
Developer: deepcs233, Project: jieba_fast, Lines of code: 7, Source file: test_tokenize.py

Example 9: test_segment

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def test_segment():
    """Test correction of a misspelled disease/drug name."""
    # segment comes from pycorrector and is imported elsewhere in the original test file
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # correct spelling: 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
    # jieba.enable_paddle()  # enable paddle mode; supported since jieba 0.40, not in earlier versions
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:', '%s %s' % (word, flag))
Developer: shibing624, Project: pycorrector, Lines of code: 16, Source file: tokenizer_test.py

Example 10: posseg

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def posseg(text):
        # type: (Text) -> List[Token]
        # jieba and pseg (jieba.posseg) are imported elsewhere in the original extractor module
        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result
Developer: GaoQ1, Project: rasa_nlu_gq, Lines of code: 10, Source file: jieba_pseg_extractor.py

Example 11: train

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Developer: weizhenzhao, Project: rasa_nlu, Lines of code: 8, Source file: jieba_tokenizer.py

Example 12: process

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def process(self, message: Message, **kwargs: Any) -> None:
        message.set("tokens", self.tokenize(message.text)) 
Developer: weizhenzhao, Project: rasa_nlu, Lines of code: 4, Source file: jieba_tokenizer.py

Example 13: tokenize

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def tokenize(text: Text) -> List[Token]:
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens 
Developer: weizhenzhao, Project: rasa_nlu, Lines of code: 8, Source file: jieba_tokenizer.py

Example 14: posseg

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def posseg(text):
        # type: (Text) -> List[Token]

        import jieba
        import jieba.posseg as pseg

        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result 
Developer: weizhenzhao, Project: rasa_nlu, Lines of code: 14, Source file: jieba_pseg_extractor.py

Example 15: train

# Required module: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Developer: Ma-Dan, Project: rasa_bot, Lines of code: 6, Source file: jieba_tokenizer.py


Note: The jieba.tokenize method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. Please consult each project's license before using or redistributing the code; do not reproduce this article without permission.