This article collects typical usage examples of konlpy.tag.Mecab in Python. If you have been wondering what tag.Mecab does and how to use it, the curated code examples below may help. You can also explore further usage examples from the enclosing module, konlpy.tag.
Eight code examples of tag.Mecab are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
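Before the examples, a minimal sketch of direct Mecab usage for orientation. It assumes a working native MeCab install with the mecab-ko-dic dictionary; the sample sentences are illustrative:

from konlpy.tag import Mecab

mecab = Mecab()                    # requires the native MeCab binary and mecab-ko-dic
print(mecab.morphs('안녕하세요'))    # surface morphemes
print(mecab.pos('안녕하세요'))       # (morpheme, POS tag) pairs
print(mecab.nouns('자연어 처리'))    # nouns only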
Example 1: get_tokenizer

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def get_tokenizer(tokenizer_name):
    # Assumes Komoran, Okt, Mecab, Hannanum, Kkma come from konlpy.tag,
    # and KhaiiiApi from the khaiii package.
    if tokenizer_name == "komoran":
        tokenizer = Komoran()
    elif tokenizer_name == "okt":
        tokenizer = Okt()
    elif tokenizer_name == "mecab":
        tokenizer = Mecab()
    elif tokenizer_name == "hannanum":
        tokenizer = Hannanum()
    elif tokenizer_name == "kkma":
        tokenizer = Kkma()
    elif tokenizer_name == "khaiii":
        tokenizer = KhaiiiApi()
    else:
        tokenizer = Mecab()  # fall back to Mecab for unknown names
    return tokenizer
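A hedged usage sketch, assuming the konlpy taggers named in the branches (and khaiii's KhaiiiApi) are importable; the sentence is illustrative:

tokenizer = get_tokenizer("mecab")
print(tokenizer.morphs("아버지가 방에 들어가신다"))  # morpheme list; exact split depends on the dictionary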
Example 2: morphTag

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def morphTag(in_fname, out_fname):
    # readfileUTF8 and writefile are helpers defined elsewhere in the source project.
    mec = Mecab()
    corpus = readfileUTF8(in_fname)
    concat_sent = []
    for n in range(0, len(corpus)):
        tagged = mec.pos(corpus[n])
        concat = ''
        for m in range(0, len(tagged)):
            if m < len(tagged) - 1:
                concat = concat + tagged[m][0] + '/' + tagged[m][1] + ' '
            else:  # final item: no trailing space
                concat = concat + tagged[m][0] + '/' + tagged[m][1]
        concat_sent.append(concat)
    writefile(concat_sent, out_fname)
    return concat_sent
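The final-item bookkeeping in the inner loop can be avoided entirely with str.join; a minimal equivalent sketch for one tagged sentence:

# Equivalent, idiomatic construction of one 'token/TAG token/TAG ...' line.
concat = ' '.join(tok + '/' + tag for tok, tag in tagged)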
Example 3: __init__

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def __init__(self):
    try:
        from konlpy.tag import Mecab
    except ImportError:
        raise ImportError(
            'Mecab is not installed. '
            'You can install Mecab with "sh scripts/install_mecab.sh". '
            'See the installation guides at '
            'https://github.com/lyeoni/prenlp/blob/master/scripts/install_mecab.sh '
            'or https://bitbucket.org/eunjeon/mecab-ko-dic/src')
    self.tokenizer = Mecab()
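Since the excerpt is an __init__ torn from its class, here is a minimal self-contained rendering of this fail-fast guarded-import pattern; the class name is hypothetical:

class MecabTokenizer:  # hypothetical name for the enclosing class
    def __init__(self):
        try:
            from konlpy.tag import Mecab
        except ImportError:
            raise ImportError('Mecab is not installed; see the install scripts linked above.')
        self.tokenizer = Mecab()

    def tokenize(self, text):
        return self.tokenizer.morphs(text)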
Example 4: get_mecab

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def get_mecab(self, dict_path):
    try:
        if dict_path:
            return Mecab(dict_path)  # custom dictionary, e.g. for annotation
        else:
            return Mecab()
    except Exception:
        raise Exception(
            'To install MeCab, run: '
            'bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)'
        )
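In konlpy, the first positional argument of Mecab is dicpath, the dictionary directory, which is what the dict_path argument above feeds. A hedged sketch; the path below is the common mecab-ko-dic default and is machine-specific:

# Explicit dictionary path; adjust to where mecab-ko-dic is installed on your machine.
tagger = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
print(tagger.pos('사전 경로를 지정합니다'))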
Example 5: _mecab_ko

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def _mecab_ko(self, text):
    if self.word_tokenizer is None:
        from konlpy.tag import Mecab
        self.word_tokenizer = Mecab()
    return self.word_tokenizer.morphs(text)
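Unlike Example 3's eager check at construction time, Mecab is built here only on first use. A minimal self-contained sketch of the lazy-initialization pattern; the class name is hypothetical:

class KoreanTokenizer:  # hypothetical wrapper illustrating the pattern
    def __init__(self):
        self.word_tokenizer = None  # Mecab is created on first call, not at construction

    def _mecab_ko(self, text):
        if self.word_tokenizer is None:
            from konlpy.tag import Mecab
            self.word_tokenizer = Mecab()
        return self.word_tokenizer.morphs(text)

Deferring construction keeps the import and dictionary loading off the critical path until Korean text is actually seen.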
Example 6: main

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def main(config):
    # Assumes `pickle` is imported and `Tokenizer`/`Vocab` are defined in the source project.
    print(config)
    list_of_tokens = []
    if config.is_tokenized:
        # Read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += line.strip().split()
    else:
        # Select tokenizer (only mecab is handled in this excerpt)
        if config.tokenizer == 'mecab':
            from konlpy.tag import Mecab
            tokenizer = Tokenizer(tokenization_fn=Mecab().morphs)
        # Tokenization & read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += tokenizer.tokenize(line.strip())
    # Build vocabulary
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()
    print('Vocabulary size: ', len(vocab))
    # Save vocabulary
    with open(config.vocab, 'wb') as writer:
        pickle.dump(vocab, writer)
    print('Vocabulary saved to', config.vocab)
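A hedged sketch of the argparse wiring such a main typically expects; the flag names mirror the config attributes used above, while the defaults are illustrative assumptions, not part of the excerpt:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--corpus', required=True, help='input text file')
parser.add_argument('--vocab', required=True, help='output pickle path')
parser.add_argument('--tokenizer', default='mecab')
parser.add_argument('--is_tokenized', action='store_true')
parser.add_argument('--unk_token', default='[UNK]')  # illustrative special tokens
parser.add_argument('--pad_token', default='[PAD]')
parser.add_argument('--bos_token', default='[BOS]')
parser.add_argument('--eos_token', default='[EOS]')
parser.add_argument('--min_freq', type=int, default=1)
parser.add_argument('--lower', action='store_true')
main(parser.parse_args())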
Example 7: __init__

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def __init__(self, tagger, useful_tags, delimiters, min_token_length, stopwords, **kwargs):
    if tagger == 'twitter':  # Twitter() was renamed Okt() in konlpy >= 0.4.5
        self.tagger = taggers.Twitter()
        self.tagger_options = {
            'norm': bool(kwargs.get('norm', True)),
            'stem': bool(kwargs.get('stem', True)),
        }
    elif tagger == 'komoran':
        self.tagger = taggers.Komoran()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'hannanum':
        self.tagger = taggers.Hannanum()
        self.tagger_options = {
            'ntags': int(kwargs.get('ntags', 9)),
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'kkma':
        self.tagger = taggers.Kkma()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    elif tagger == 'mecab':
        self.tagger = taggers.Mecab()
        self.tagger_options = {
            'flatten': bool(kwargs.get('flatten', True)),
        }
    else:
        raise LexRankError("available taggers are: twitter, komoran, hannanum, kkma, mecab")
    self.useful_tags = useful_tags
    self.delimiters = delimiters
    self.stopwords = stopwords
    self.min_token_length = min_token_length
    self.splitter = self.splitterer()
    self.pos = lambda text: self.tagger.pos(text, **self.tagger_options)
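A usage sketch; the enclosing class name is not shown in the excerpt, so SentenceParser below is hypothetical, and the POS tags, delimiters, and stopwords are illustrative:

parser = SentenceParser(
    tagger='mecab',
    useful_tags=['NNG', 'NNP', 'VV', 'VA'],  # keep common nouns, verbs, adjectives
    delimiters=['. ', '! ', '? '],
    min_token_length=2,
    stopwords=set(),
)
print(parser.pos('문장을 형태소로 분석합니다.'))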
Example 8: load_tokenizer

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def load_tokenizer(lang):
    if lang == "ko":
        from konlpy.tag import Mecab
        tokenizer = Mecab()
    elif lang == "ja":
        import Mykytea
        opt = "-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_tw":
        import jieba
        tokenizer = jieba
    elif lang == "vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang == "th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize
    elif lang == "ar":
        import pyarabic.araby as araby
        tokenizer = araby
    # elif lang == "en":
    #     from nltk import word_tokenize
    #     tokenizer = word_tokenize
    else:
        from nltk.tokenize import ToktokTokenizer
        tokenizer = ToktokTokenizer()
    return tokenizer
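Note that the branches return different kinds of objects (an instance, a module, or a bare function), so the caller must dispatch accordingly. A hedged sketch with illustrative inputs:

ko = load_tokenizer('ko')
print(ko.morphs('한국어 문장'))            # konlpy Mecab: method call
zh = load_tokenizer('zh_tw')
print(list(zh.cut('繁體中文分詞')))         # jieba: module exposing a cut() generator
en = load_tokenizer('en')                  # falls through to the else branch
print(en.tokenize('plain English text'))   # NLTK ToktokTokenizer: tokenize()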