This article collects typical usage examples of the jieba.tokenize method in Python. If you are wondering how to call jieba.tokenize, how it behaves, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the jieba module, in which this method is defined.
A total of 15 code examples of jieba.tokenize are shown below, sorted by popularity by default.
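As a quick orientation before the examples: jieba.tokenize returns a generator of (word, start, end) tuples, where start and end are character offsets into the original string. A minimal sketch (the sentence is only illustrative):

import jieba

for word, start, end in jieba.tokenize(u"我爱北京天安门"):
    print("word %s\t start: %d\t end: %d" % (word, start, end))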
Example 1: create_data
# Needed import: import jieba
# Or: from jieba import tokenize
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                              dt.Frame, List[dt.Frame],
                                              np.ndarray, List[np.ndarray],
                                              pd.DataFrame, List[pd.DataFrame]]:
    # Exit gracefully if the method is called as a data upload rather than a data modify
    if X is None:
        return []
    # Tokenize the Chinese text
    import jieba
    X = dt.Frame(X).to_pandas()
    # If no columns to tokenize were configured, use the first column
    if len(cols_to_tokenize) == 0:
        cols_to_tokenize.append(X.columns[0])
    for col in cols_to_tokenize:
        X[col] = X[col].astype('unicode').fillna(u'NA')
        X[col] = X[col].apply(lambda x: " ".join([r[0] for r in jieba.tokenize(x)]))
    return dt.Frame(X)
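Note that cols_to_tokenize is not defined inside the snippet; in the original recipe it is a module-level list naming the text columns to segment. A hedged usage sketch, assuming the datatable package and an empty cols_to_tokenize (so the first column is used), with a hypothetical frame:

import datatable as dt

cols_to_tokenize = []  # assumed module-level setting; empty means "fall back to the first column"
frame = dt.Frame({"text": ["我爱北京天安门", "今天天气不错"]})
print(create_data(frame).to_pandas())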
Example 2: test_tokenizer
# Needed import: import jieba
# Or: from jieba import tokenize
def test_tokenizer():
    # Tokenizer is imported from the project under test (not shown in this excerpt)
    txts = ["我不要你花钱,这些路曲近通幽",
            "这个消息不胫儿走",
            "这个消息不径而走",
            "这个消息不胫而走",
            "复方甘草口服溶液限田基",
            "张老师经常背课到深夜,我们要体晾老师的心苦。",
            '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
            ",我遇到了问题怎么办",
            ",我遇到了问题",
            "问题",
            "北川景子参演了林诣彬导演的《速度与激情3》",
            "林志玲亮相网友:确定不是波多野结衣?",
            "龟山千广和近藤公园在龟山公园里喝酒赏花",
            "小牛曲清去蛋白提取物乙"]
    t = Tokenizer()
    for text in txts:
        print(text)
        print('default', t.tokenize(text, 'default'))
        print('search', t.tokenize(text, 'search'))
        print('ngram', t.tokenize(text, 'ngram'))
Example 3: test_detector_tokenizer
# Needed import: import jieba
# Or: from jieba import tokenize
def test_detector_tokenizer():
    # Detector is imported from the project under test (not shown in this excerpt)
    sents = ["我不要你花钱,这些路曲近通幽",
             "这个消息不胫儿走",
             "这个消息不径而走",
             "这个消息不胫而走",
             "复方甘草口服溶液限田基",
             "张老师经常背课到深夜,我们要体晾老师的心苦。",
             '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
             "北川景子参演了林诣彬导演的《速度与激情3》",
             "林志玲亮相网友:确定不是波多野结衣?",
             "龟山千广和近藤公园在龟山公园里喝酒赏花",
             "问题"]
    d = Detector()
    d.check_detector_initialized()
    detector_tokenizer = d.tokenizer
    for text in sents:
        print(text)
        print('default', detector_tokenizer.tokenize(text, 'default'))
        print('search', detector_tokenizer.tokenize(text, 'search'))
Example 4: __call__
# Needed import: import jieba
# Or: from jieba import tokenize
def __call__(self, text, **kargs):
    # Token and accepted_chars are defined in the enclosing module; in jieba's own
    # Whoosh analyzer they are whoosh.analysis.Token and a compiled regex of accepted characters.
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
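This generator-style __call__ follows the tokenizer pattern that jieba bundles for Whoosh full-text search. A hedged usage sketch of the bundled analyzer (assumes the whoosh package is installed; ChineseAnalyzer chains a tokenizer like the one above with lowercase/stop/stem filters):

from jieba.analyse import ChineseAnalyzer  # requires whoosh

analyzer = ChineseAnalyzer()
for token in analyzer(u"我爱北京天安门"):
    print(token.text)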
Example 5: testTokenize
# Needed import: import jieba
# Or: from jieba import tokenize
def testTokenize(self):
    for content in test_contents:
        result = jieba.tokenize(content)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize", file=sys.stderr)
Example 6: testTokenize_NOHMM
# Needed import: import jieba
# Or: from jieba import tokenize
def testTokenize_NOHMM(self):
    for content in test_contents:
        result = jieba.tokenize(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize_NOHMM", file=sys.stderr)
Example 7: cuttest
# Needed import: import jieba
# Or: from jieba import tokenize
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example 8: cuttest
# Needed import: import jieba
# Or: from jieba import tokenize
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example 9: test_segment
# Needed import: import jieba
# Or: from jieba import tokenize
def test_segment():
    """Test disease-name error correction."""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # correct spelling: 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
    # jieba.enable_paddle()  # enable paddle mode; supported since jieba 0.40, not in earlier versions
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:', '%s %s' % (word, flag))
Example 10: posseg
# Needed import: import jieba
# Or: from jieba import tokenize
def posseg(text):
    # type: (Text) -> List[Token]
    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
Example 11: train
# Needed import: import jieba
# Or: from jieba import tokenize
def train(self,
          training_data: TrainingData,
          config: RasaNLUModelConfig,
          **kwargs: Any) -> None:
    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example 12: process
# Needed import: import jieba
# Or: from jieba import tokenize
def process(self, message: Message, **kwargs: Any) -> None:
    message.set("tokens", self.tokenize(message.text))
Example 13: tokenize
# Needed import: import jieba
# Or: from jieba import tokenize
def tokenize(text: Text) -> List[Token]:
    import jieba

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
Example 14: posseg
# Needed import: import jieba
# Or: from jieba import tokenize
def posseg(text):
    # type: (Text) -> List[Token]
    import jieba
    import jieba.posseg as pseg

    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
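Given the loop above, posseg returns a list of (pseg_data, start, end) tuples, where pseg_data is itself a list of (word, POS-flag) pairs for each jieba token. A small usage sketch (illustrative input only):

for pseg_data, start, end in posseg("我爱北京天安门"):
    # pseg_data looks like [(word, flag), ...]; start/end are character offsets
    print(pseg_data, start, end)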
Example 15: train
# Needed import: import jieba
# Or: from jieba import tokenize
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))