本文整理匯總了 Python 中 janome.tokenizer.Tokenizer 類的典型用法代碼示例。如果您正苦於以下問題：Python Tokenizer 類的具體用法？Python Tokenizer 怎麼用？Python Tokenizer 使用的例子？那麼，這裡精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了 Tokenizer 類的 15 個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: tokenize
def tokenize(text):
    """Turn a ruby-annotated Japanese text into lines of space-separated words.

    The text is presumably an Aozora Bunko style file: a header delimited by
    runs of five or more hyphens, a footer starting at ``底本：``, ruby written
    as ``｜base《reading》`` and editorial notes written as ``［＃...］``.
    Both the full-width markers and their ASCII look-alikes are accepted.

    Args:
        text: The whole text; lines are expected to be separated by CRLF.

    Returns:
        A list with one string per line. Each string contains the base forms
        (or the surface form when no base form is known) of that line's
        nouns, verbs, adjectives and symbols, joined by single spaces.
    """
    t = Tokenizer()
    # Drop the header: keep the piece after the second hyphen separator.
    # When the separators are missing the text is used as-is instead of
    # failing with IndexError.
    parts = re.split(r'\-{5,}', text)
    if len(parts) > 2:
        text = parts[2]
    # Drop the footer (bibliographic note), with either colon width.
    text = re.split(r'底本[:：]', text)[0]
    text = text.strip()
    # Remove ruby: the start marker and the 《reading》 part.
    text = text.replace('｜', '').replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # Remove in-text annotations. The brackets are matched as literal
    # delimiters; an unescaped "[#.+?]" would be a character class that
    # deletes the characters "#", ".", "+" and "?" instead.
    text = re.sub(r'(?:［＃|\[#).+?(?:］|\])', '', text)
    # Top-level part-of-speech tags to keep, spelled as Janome reports them.
    target_pos = ('名詞', '動詞', '形容詞', '記号')
    results = []
    # Process the text line by line.
    for line in text.split("\r\n"):
        words = []
        for tok in t.tokenize(line):
            base = tok.base_form
            if base == "*":
                # No base form available: fall back to the surface form.
                base = tok.surface
            if tok.part_of_speech.split(',')[0] in target_pos:
                words.append(base)
        results.append(" ".join(words))
    return results
示例2: separatewords
def separatewords(text):
    """Collect lower-cased content nouns from *text*.

    A noun is kept unless its sub-category is suffix, pronoun, dependent,
    numeral or adjectival-noun stem. Nouns without a reading (``*``) are kept
    only when they are not verbal nouns and are longer than three characters.
    Every kept word is also printed.

    Args:
        text: UTF-8 encoded bytes, or already-decoded text.

    Returns:
        List of the kept surface forms, lower-cased, in order of appearance.
    """
    # Decode only when needed so the function works on both bytes and text
    # (the unconditional ``unicode(text, "utf-8")`` fails on decoded input).
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # Noun sub-categories to skip, spelled as Janome reports them.
    excluded = (u"接尾", u"代名詞", u"非自立", u"数", u"形容動詞語幹")
    separatedWord = []
    t = Tokenizer()
    for token in t.tokenize(text):
        pos = token.part_of_speech.split(",")
        if pos[0] != u"名詞" or pos[1] in excluded:
            continue
        has_reading = token.reading != u"*"
        # Words without a reading are treated as English words; only those
        # of four or more characters are used.
        long_unknown = pos[1] != u"サ変接続" and len(token.surface) > 3
        if has_reading or long_unknown:
            word = token.surface.lower()
            separatedWord.append(word)
            print(word)
    return separatedWord
示例3: chunk_with_kanji
def chunk_with_kanji(istr):
    """Insert spaces between chunks of a Japanese sentence.

    Each token gets a flag from ``judge_jifu``; ``insert_chunkflg`` then
    produces a flag sequence that contains ``KUGIRI`` delimiters. The token
    surfaces are concatenated, with a space emitted at each delimiter unless
    the token at that position is flagged ``KUTOU``.

    Args:
        istr: Text to split.

    Returns:
        The text with chunk boundaries marked by single spaces.
    """
    t = Tokenizer()
    # Materialize the tokens: they are traversed twice below, and
    # tokenize() may hand back a one-shot generator, which would leave
    # ``surface`` empty.
    tokens = list(t.tokenize(istr))
    # give each element flags (jiritsu or fuzoku)
    flags = [judge_jifu(x.part_of_speech) for x in tokens]
    surface = [x.surface for x in tokens]
    # split to chunks, delimited by KUGIRI flag
    # very ugly. should be rewritten using tree structure etc.
    # NOTE(review): the nesting of the ``i +=`` lines was reconstructed from
    # a listing without indentation -- confirm against the upstream source.
    cflags = insert_chunkflg(flags)
    rstr = u""
    i = 0  # position in cflags; advances past delimiters as they are used
    for j, f in enumerate(flags):
        if i >= len(cflags):
            break
        if cflags[i] == KUGIRI:
            if f == KUTOU:
                rstr += surface[j]
                i += 1
            else:
                rstr += u" "
                rstr += surface[j]
                i += 2
        else:
            rstr += surface[j]
            i += 1
    # don't know why this is necessary
    # (when the loop stopped at the first token of a multi-token input,
    # append every surface unchanged)
    if flags != [] and j == 0 and len(surface) != 1:
        while j < len(surface):
            rstr += surface[j]
            j += 1
    return rstr
示例4: text_to_array_ja
def text_to_array_ja(textdata, wordtypes):
    """Return the sorted surface forms of the tokens of the wanted word types.

    Args:
        textdata: Raw text; it is passed through ``filter`` before tokenizing.
        wordtypes: Container of top-level part-of-speech tags to keep.

    Returns:
        Sorted list of surface strings (duplicates are kept).
    """
    # ``filter`` is called with a single argument, so it is presumably a
    # project-level cleanup function rather than the builtin -- TODO confirm.
    cleaned = filter(textdata)
    tokenizer = Tokenizer()
    surfaces = []
    for tok in tokenizer.tokenize(cleaned):
        major_pos = tok.part_of_speech.split(',')[0]
        if major_pos in wordtypes:
            surfaces.append(tok.surface)
    surfaces.sort()
    return surfaces
示例5: _tokenize
def _tokenize(text):
    """Yield a ``Token(t, surface, pos)`` tuple for each Janome token of *text*.

    ``t`` is the original Janome token, ``surface`` its surface string and
    ``pos`` its part-of-speech string split on commas into a list.
    """
    from collections import namedtuple

    Token = namedtuple("Token", ["t", "surface", "pos"])
    tokenizer = Tokenizer()
    for raw in tokenizer.tokenize(text):
        yield Token(t=raw, surface=raw.surface, pos=raw.part_of_speech.split(","))
示例6: output_ja_text
def output_ja_text(data, wordtypes):
    """Count the words of the wanted types in *data* and dump the counts as YAML.

    Args:
        data: Raw text; it is passed through ``filter`` before tokenizing.
        wordtypes: Container of top-level part-of-speech tags to keep.

    Returns:
        Whatever ``pyaml.dump`` returns after writing to ``sys.stdout``.
    """
    # ``filter`` takes one argument here, so it is presumably a project-level
    # cleanup function rather than the builtin -- TODO confirm.
    cleaned = filter(data)
    tokenizer = Tokenizer()
    surfaces = []
    for tok in tokenizer.tokenize(cleaned):
        major_pos = tok.part_of_speech.split(',')[0]
        if major_pos in wordtypes:
            surfaces.append(tok.surface)
    surfaces.sort()
    word_counts = count_words(surfaces)
    return pyaml.dump(word_counts, sys.stdout, vspacing=[0, 1])
示例7: makekeywords
def makekeywords(text):
    """Return the surface forms of the content nouns found in *text*.

    A token is kept when its part-of-speech string mentions noun (名詞) and
    mentions none of numeral (数), dependent (非自立) or suffix (接尾).

    Args:
        text: Japanese text to analyse.

    Returns:
        List of surface strings in order of appearance.
    """
    from janome.tokenizer import Tokenizer

    # Tags are spelled exactly as Janome reports them; a variant character
    # such as 數 would never match and numerals would slip through.
    excluded = ("数", "非自立", "接尾")
    tokenizer = Tokenizer()
    keywords = []
    for token in tokenizer.tokenize(text):
        pos = token.part_of_speech
        if "名詞" in pos and not any(tag in pos for tag in excluded):
            keywords.append(token.surface)
    return keywords
示例8: split
def split(self, text):
    """Tokenize *text* and return one word per token, in order.

    Each word is the token's base form; when the base form is empty or
    ``"*"`` the surface form (the word exactly as written) is used instead.
    """
    tokenizer = Tokenizer()
    return [
        tok.surface if tok.base_form in ('', '*') else tok.base_form
        for tok in tokenizer.tokenize(text)
    ]
示例9: test_func
def test_func():
    """Print a sample sentence split into chunks.

    Token surfaces are accumulated; a chunk is printed and reset each time a
    token whose part of speech starts with particle (助詞), auxiliary verb
    (助動詞) or symbol (記号) has been appended.
    """
    t = Tokenizer()
    # Tags are spelled as Janome reports them (記号, not 記號), otherwise
    # punctuation never terminates a chunk.
    boundary = re.compile('^(助詞|助動詞|記号)')
    chunk = ""
    for token in t.tokenize(u'この腫瘍は間葉系組織から生ずると考えられ、ビメンチンを発現する。'):
        chunk += token.surface
        if boundary.search(token.part_of_speech):
            print(chunk)
            chunk = ""
示例10: run
def run(self, force=None):
    """Download every site's subject list and print the titles that pass the score filter.

    Each title is scored by summing, over its tokens that ``token_filter``
    does not reject, how often that token's surface occurs across all titles.
    A title is printed together with ``count_res`` when ``filter_title``
    returns a false value for it.

    Args:
        force: Not used by this method.

    Raises:
        AssertionError: If a subject list request does not return HTTP 200.
    """
    print('start')
    # Fetch all sites and de-duplicate them by URL (the last one wins).
    sites = {}
    for site in Site.get_all():
        sites[site.url] = site
    # Build the subject list. Lines that Subject() rejects are skipped --
    # presumably that is where ignore/keyword rules apply; TODO confirm.
    sure = []
    for site in sites.values():
        response = requests.get(site.subjects_url)
        assert (response.status_code == 200), response.text
        # parse
        for line in response.text.split('\n'):
            try:
                sure.append(Subject(site, line))
            except Exception:
                # Best effort: an unparsable line must not abort the run,
                # but KeyboardInterrupt/SystemExit still propagate.
                continue
    # NOTE(review): printed once after all sites are read; the placement was
    # reconstructed from a listing without indentation -- confirm.
    print(sure)
    # Count how often each accepted token surface occurs over all titles.
    t = Tokenizer()
    r = defaultdict(int)
    for _sure in sure:
        try:
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    r[token.surface] += 1
        except Exception:
            # A failure stops counting for this title only.
            continue
    # Output in title order.
    for _sure in sorted(sure, key=lambda x: x.title):
        try:
            point = 0
            for token in t.tokenize(_sure.title):
                if not token_filter(token):
                    point += r[token.surface]
            if not filter_title(point, _sure):
                print(_sure.title, _sure.count_res)
        except Exception:
            continue
示例11: understand_move
def understand_move(self, text):
    """Tokenize *text* and derive a movement command from it.

    Returns:
        A ``(direction, distance)`` tuple as produced by
        ``_understand_direction`` and ``_understand_distance`` from the same
        token list.
    """
    tokenizer = Tokenizer()
    # Both helpers receive the same fully materialized token list.
    tokens = list(tokenizer.tokenize(text))
    return self._understand_direction(tokens), self._understand_distance(tokens)
示例12: get_morphs
def get_morphs(string):
    """Return the morphemes of *string* as a list of dictionaries.

    Args:
        string: UTF-8 encoded bytes, or already-decoded text.

    Returns:
        One dict per token with the keys ``surface``, ``base`` (base form),
        ``pos`` (top-level part of speech) and ``pos1`` (first sub-category).
    """
    # Decode only when needed so both bytes and text input are accepted.
    if isinstance(string, bytes):
        string = string.decode('utf-8')
    t = Tokenizer()
    dicts = []
    for token in t.tokenize(string):
        # Read the token attributes directly. Splitting ``str(token)`` on
        # commas misaligns the fields whenever the surface itself contains a
        # comma or the surface/feature separator is not a plain space.
        pos = token.part_of_speech.split(",")
        dicts.append({
            "surface": token.surface,
            "base": token.base_form,
            "pos": pos[0],
            "pos1": pos[1],
        })
    return dicts
示例13: main
def main():
    """Print the Janome analysis of a sample sentence, one token per line.

    Whitespace is normalized in the example below because the separator
    between surface and features may be a tab in the real output.

    >>> main()  # doctest: +NORMALIZE_WHITESPACE
    すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
    も 助詞,係助詞,*,*,*,*,も,モ,モ
    もも 名詞,一般,*,*,*,*,もも,モモ,モモ
    も 助詞,係助詞,*,*,*,*,も,モ,モ
    もも 名詞,一般,*,*,*,*,もも,モモ,モモ
    の 助詞,連体化,*,*,*,*,の,ノ,ノ
    うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
    """
    t = Tokenizer()
    for token in t.tokenize(u'すもももももももものうち'):
        print(token)
示例14: MainTranslator
class MainTranslator(object):
    """Rewrites Japanese text using a word table and sentence-ending substitutions."""

    def __init__(self):
        # Janome tokenizer reused by every Translator() call.
        self.janome = Tokenizer()

    def get_gobi(self, n):
        """Return the replacement ending for token *n*, or None if none applies.

        The decision uses the token's surface form and the first field of its
        part-of-speech string.
        """
        major_pos = n.part_of_speech.split(',')[0]
        word = n.surface
        if word in ('だ', 'です', 'た', 'だろ', 'ある') and major_pos == '助動詞':
            return 'ハゲ'
        if word in ('無い', 'ない', 'ぬ'):
            if major_pos == '助動詞':
                return 'ぬハゲ'
            if major_pos == '形容詞':
                return 'なしハゲ'
        return None

    def Translator(self, text):
        """Return *text* with each token replaced where a rule applies.

        A token whose surface is a key of ``converter`` is replaced by the
        mapped value. Otherwise, when its part-of-speech string has more than
        three fields, the ending from ``get_gobi`` is used if there is one.
        In every other case the surface is kept unchanged.
        """
        pieces = []
        for n in self.janome.tokenize(text):
            features = n.part_of_speech.split(',')
            if n.surface in converter:
                pieces.append(converter[n.surface])
                continue
            gobi = self.get_gobi(n) if len(features) > 3 else None
            pieces.append(n.surface if gobi is None else gobi)
        return ''.join(pieces)
示例15: JapaneseTokenizer
class JapaneseTokenizer(object):
    """Callable that segments Japanese text with Janome and wraps the words in a ``Doc``."""

    def __init__(self, cls, nlp=None):
        """Set up the vocabulary and the Janome tokenizer.

        Args:
            cls: Object providing ``create_vocab``; used when *nlp* is None.
            nlp: Optional object whose ``vocab`` attribute is reused.

        Raises:
            ImportError: If the Janome library cannot be imported.
        """
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        # Janome is imported lazily so that it is only required when this
        # tokenizer is actually constructed.
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome "
                              "library: https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Return a ``Doc`` holding the surface form of every token of *text*."""
        surfaces = []
        for token in self.tokenizer.tokenize(text):
            surfaces.append(token.surface)
        # Every word is marked as not followed by a space.
        no_spaces = [False for _ in surfaces]
        return Doc(self.vocab, words=surfaces, spaces=no_spaces)

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **exclude):
        """Return an empty byte string; nothing is serialized."""
        return b''

    def from_bytes(self, bytes_data, **exclude):
        """Ignore *bytes_data* and return this instance."""
        return self

    def to_disk(self, path, **exclude):
        """Write nothing and return None."""
        return None

    def from_disk(self, path, **exclude):
        """Read nothing and return this instance."""
        return self