This article collects typical usage examples of Python's jieba.tokenize method. If you are wondering what exactly jieba.tokenize does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the jieba module, where this method is defined.
The following 15 code examples of jieba.tokenize are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
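Before diving into the examples, a quick orientation may help: jieba.tokenize walks a unicode string and yields (word, start, end) tuples, and it accepts mode and HMM keyword arguments. A minimal sketch (the sample sentence is only illustrative):

import jieba

sentence = u"我愛北京天安門"

# Default mode: each word appears once, together with its character offsets.
for word, start, end in jieba.tokenize(sentence):
    print("%s\t%d\t%d" % (word, start, end))

# Search mode: longer words are additionally split into finer-grained pieces.
for word, start, end in jieba.tokenize(sentence, mode="search"):
    print("%s\t%d\t%d" % (word, start, end))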
Example 1: create_data
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                              dt.Frame, List[dt.Frame],
                                              np.ndarray, List[np.ndarray],
                                              pd.DataFrame, List[pd.DataFrame]]:
    # exit gracefully if method is called as a data upload rather than data modify
    if X is None:
        return []
    # Tokenize the Chinese text
    import jieba
    X = dt.Frame(X).to_pandas()
    # If no columns to tokenize, use the first column
    if len(cols_to_tokenize) == 0:
        cols_to_tokenize.append(X.columns[0])
    for col in cols_to_tokenize:
        X[col] = X[col].astype('unicode').fillna(u'NA')
        X[col] = X[col].apply(lambda x: " ".join([r[0] for r in jieba.tokenize(x)]))
    return dt.Frame(X)
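The recipe above joins the words returned by jieba.tokenize with spaces so that downstream components can treat Chinese like whitespace-delimited text. A minimal sketch of the same column transformation on a plain pandas DataFrame, outside the datatable recipe (the column name "text" is hypothetical):

import jieba
import pandas as pd

df = pd.DataFrame({"text": [u"我愛北京天安門", None]})
# Fill missing values first, then make sure the column is string-typed.
df["text"] = df["text"].fillna(u"NA").astype(str)
# Replace each cell with its space-joined jieba tokens.
df["text"] = df["text"].apply(lambda x: " ".join(w for w, _, _ in jieba.tokenize(x)))
print(df["text"].tolist())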
Example 2: test_tokenizer
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def test_tokenizer():
    txts = ["我不要你花錢,這些路曲近通幽",
            "這個消息不脛兒走",
            "這個消息不徑而走",
            "這個消息不脛而走",
            "複方甘草口服溶液限田基",
            "張老師經常背課到深夜,我們要體晾老師的心苦。",
            '新進人員時,知識當然還不過,可是人有很有精神,麵對工作很認真的話,很快就學會、體會。',
            ",我遇到了問題怎麽辦",
            ",我遇到了問題",
            "問題",
            "北川景子參演了林詣彬導演的《速度與激情3》",
            "林誌玲亮相網友:確定不是波多野結衣?",
            "龜山千廣和近藤公園在龜山公園裏喝酒賞花",
            "小牛曲清去蛋白提取物乙"]
    t = Tokenizer()
    for text in txts:
        print(text)
        print('default', t.tokenize(text, 'default'))
        print('search', t.tokenize(text, 'search'))
        print('ngram', t.tokenize(text, 'ngram'))
Example 3: test_detector_tokenizer
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def test_detector_tokenizer():
    sents = ["我不要你花錢,這些路曲近通幽",
             "這個消息不脛兒走",
             "這個消息不徑而走",
             "這個消息不脛而走",
             "複方甘草口服溶液限田基",
             "張老師經常背課到深夜,我們要體晾老師的心苦。",
             '新進人員時,知識當然還不過,可是人有很有精神,麵對工作很認真的話,很快就學會、體會。',
             "北川景子參演了林詣彬導演的《速度與激情3》",
             "林誌玲亮相網友:確定不是波多野結衣?",
             "龜山千廣和近藤公園在龜山公園裏喝酒賞花",
             "問題"]
    d = Detector()
    d.check_detector_initialized()
    detector_tokenizer = d.tokenizer
    for text in sents:
        print(text)
        print('default', detector_tokenizer.tokenize(text, 'default'))
        print('search', detector_tokenizer.tokenize(text, 'search'))
Example 4: __call__
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
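This __call__ implements a Whoosh-style tokenizer: it skips one-character noise tokens and fills each Token's text and character offsets from jieba.tokenize. For full-text search, jieba also ships a ready-made Whoosh analyzer built on the same idea; a minimal sketch, assuming the whoosh package is installed:

from whoosh.fields import Schema, TEXT
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
# Attach the analyzer to a text field so indexed documents are segmented by jieba.
schema = Schema(content=TEXT(stored=True, analyzer=analyzer))
# The analyzer can also be called directly to inspect the tokens it produces.
for t in analyzer(u"我愛北京天安門"):
    print(t.text)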
Example 5: testTokenize
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def testTokenize(self):
    for content in test_contents:
        result = jieba.tokenize(content)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize", file=sys.stderr)
Example 6: testTokenize_NOHMM
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def testTokenize_NOHMM(self):
    for content in test_contents:
        result = jieba.tokenize(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize_NOHMM", file=sys.stderr)
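The only difference from Example 5 is HMM=False, which disables the HMM-based discovery of words that are not in the dictionary. A minimal sketch contrasting the two settings (the exact segmentation depends on the dictionary in use):

import jieba

sentence = u"他来到了网易杭研大厦"
print([w for w, _, _ in jieba.tokenize(sentence)])             # HMM on (default): may group unseen words
print([w for w, _, _ in jieba.tokenize(sentence, HMM=False)])  # dictionary words and single characters only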
Example 7: cuttest
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example 8: cuttest
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
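Examples 7 and 8 both rely on a module-level g_mode flag, which the surrounding test script sets before calling cuttest (in jieba's own test scripts it is typically taken from the command line). A minimal, self-contained sketch of how that wiring might look:

import jieba

g_mode = "search"  # or "default"; assumed here instead of parsing command-line options

def cuttest(test_sent):
    # Tokenize in whichever mode the script-level flag selects.
    for tk in jieba.tokenize(test_sent, mode=g_mode):
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))

cuttest(u"我愛北京天安門")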
Example 9: test_segment
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def test_segment():
    """Test disease-name correction"""
    error_sentence_1 = '這個新藥奧美砂坦脂片能治療心絞痛,效果還可以'  # correct form: 奧美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我愛北京天安門")  # jieba default mode
    print('old:', words)
    # jieba.enable_paddle()  # enable paddle mode; supported since version 0.40, not available in earlier versions
    # words = pseg.cut("我愛北京天安門", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:', '%s %s' % (word, flag))
Example 10: posseg
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def posseg(text):
    # type: (Text) -> List[Token]
    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
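This helper re-segments each span from jieba.tokenize with jieba.posseg so that part-of-speech tags are attached while the character offsets are preserved. A standalone sketch of the same idea:

import jieba
import jieba.posseg as pseg

text = u"我愛北京天安門"
for word, start, end in jieba.tokenize(text):
    # POS-tag the word (or multi-character span) while keeping its offsets.
    tags = [(w, f) for w, f in pseg.cut(word)]
    print(start, end, tags)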
Example 11: train
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def train(self,
          training_data: TrainingData,
          config: RasaNLUModelConfig,
          **kwargs: Any) -> None:
    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example 12: process
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def process(self, message: Message, **kwargs: Any) -> None:
    message.set("tokens", self.tokenize(message.text))
Example 13: tokenize
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def tokenize(text: Text) -> List[Token]:
    import jieba
    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
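Here each (word, start, end) triple is wrapped into the NLU framework's Token(word, start) object, keeping only the word and its start offset. A self-contained sketch, where this Token class is a minimal stand-in for the framework's own:

import jieba

class Token(object):
    # Hypothetical stand-in for the framework Token(word, start) used above.
    def __init__(self, text, offset):
        self.text = text
        self.offset = offset

def tokenize(text):
    return [Token(word, start) for word, start, _ in jieba.tokenize(text)]

print([(t.text, t.offset) for t in tokenize(u"我愛北京天安門")])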
Example 14: posseg
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def posseg(text):
    # type: (Text) -> List[Token]
    import jieba
    import jieba.posseg as pseg
    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
Example 15: train
# Required imports: import jieba [as alias]
# Or: from jieba import tokenize [as alias]
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))