This article collects typical usage examples of the MeCab.Tagger method in Python. If you have been wondering what MeCab.Tagger does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples of the module it belongs to, MeCab.
Below are 15 code examples of MeCab.Tagger, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
Example 1: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
    """Constructs a MecabTokenizer.

    Args:
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lowercase the input.
        **never_split**: (`optional`) list of str
            Kept for backward compatibility purposes.
            Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`).
            List of tokens not to split.
        **normalize_text**: (`optional`) boolean (default True)
            Whether to apply Unicode normalization to the text before tokenization.
    """
    self.do_lower_case = do_lower_case
    self.never_split = never_split if never_split is not None else []
    self.normalize_text = normalize_text

    import MeCab
    self.mecab = MeCab.Tagger()
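For context, a bare MeCab.Tagger() like the one constructed above emits, via parse(), one tab-separated "surface<TAB>features" line per morpheme, terminated by an EOS line. A minimal sketch; the exact feature columns depend on the installed dictionary, so the output shown is only illustrative:

import MeCab

tagger = MeCab.Tagger()
print(tagger.parse('今日はいい天気です'))
# 今日	名詞,副詞可能,*,*,*,*,今日,キョウ,キョー
# は	助詞,係助詞,*,*,*,*,は,ハ,ワ
# ...
# EOS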
Example 2: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, do_lower_case=False, never_split=None,
             mecab_dict_path=None, preserve_spaces=False):
    """Constructs a MecabBasicTokenizer.

    Args:
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lowercase the input.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
        **preserve_spaces**: (`optional`) boolean (default False)
            Whether to preserve whitespace in the output tokens.
    """
    if never_split is None:
        never_split = []
    self.do_lower_case = do_lower_case
    self.never_split = never_split

    import MeCab
    if mecab_dict_path is not None:
        self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
    else:
        self.mecab = MeCab.Tagger()

    self.preserve_spaces = preserve_spaces
Example 3: response
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def response(self):
    if random.randint(0, 200) > self.PROBABLY:
        return None
    # Parse with MeCab's default output format: one "surface\tfeatures"
    # line per morpheme, terminated by an "EOS" line.
    parsed = MeCab.Tagger().parse(self.text)
    nouns = []
    for line in parsed.split("\n"):
        if line == 'EOS' or line == '':
            break
        word, wordclass = line.split("\t")
        wordclass = wordclass.split(",")
        if wordclass[0] == "名詞":  # keep nouns only
            nouns.append(word)
    random.shuffle(nouns)
    if len(nouns) > 0:
        return nouns.pop()
    return None
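This works because MeCab's default output puts the surface form before the tab and a comma-separated feature list after it, whose first field is the coarse part of speech; comparing wordclass[0] against 名詞 (noun) therefore keeps only nouns, and the shuffle-then-pop returns one of them at random.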
Example 4: _set_tokenizer
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def _set_tokenizer(self):
    try:
        import MeCab

        class Tokenizer():

            def __init__(self):
                self.tagger = MeCab.Tagger("-Ochasen")

            def tokenize(self, text):
                # Known workaround for an old mecab-python3 bug where
                # node.surface could come back garbled without a prior parse().
                self.tagger.parse("")
                node = self.tagger.parseToNode(text)
                tokens = []
                while node:
                    if node.surface:
                        tokens.append(node)
                    node = node.next
                return tokens

        self.tokenizer = Tokenizer()
    except Exception:
        # Fall back to the pure-Python janome tokenizer when MeCab is unavailable.
        from janome.tokenizer import Tokenizer
        self.tokenizer = Tokenizer()
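janome is a pure-Python morphological analyzer, so the fallback needs no native MeCab installation; its Tokenizer.tokenize() plays a role similar to the inner class above, though its tokens expose a slightly different attribute set, so calling code has to tolerate both shapes.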
Example 5: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, dic_path):
    try:
        import MeCab
    except ImportError:
        raise ImportError("could not import `MeCab`; make sure that "
                          "`mecab-python` is installed by running "
                          "`install_mceab_ko.sh` in the repository. ")
    self._dic_path = dic_path
    self._tagger = MeCab.Tagger("-d {}".format(dic_path))
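The -d flag points MeCab at an alternative system dictionary directory; given the installer script's name, this is presumably a Korean dictionary such as mecab-ko-dic.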
Example 6: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self):
    self.tagger = MeCab.Tagger("-Owakati")

    # make sure the dictionary is IPA
    # sacreBLEU is only compatible with 0.996.5 for now
    # Please see: https://github.com/mjpost/sacrebleu/issues/94
    d = self.tagger.dictionary_info()
    assert d.size == 392126, \
        "Please make sure to use the IPA dictionary for MeCab"
    assert d.next is None
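dictionary_info() returns a linked list of dictionary descriptors, so `d.next is None` asserts that exactly one (the system) dictionary is loaded. A hedged sketch of inspecting it, with attribute names as exposed by the mecab-python3 binding and the printed values merely illustrative:

import MeCab

d = MeCab.Tagger("-Owakati").dictionary_info()
print(d.filename)  # e.g. /usr/local/lib/mecab/dic/ipadic/sys.dic
print(d.size)      # number of lexicon entries; 392126 for IPAdic
print(d.charset)   # e.g. utf8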
Example 7: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self,
             lang: str = 'en',
             lower_case: bool = True,
             romanize: Optional[bool] = None,
             descape: bool = False):
    assert lower_case, 'lower case is needed by all the models'

    if lang in ('cmn', 'wuu', 'yue'):
        lang = 'zh'
    if lang == 'jpn':
        lang = 'ja'

    if lang == 'zh' and jieba is None:
        raise ModuleNotFoundError(
            '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
        )
    if lang == 'ja' and MeCab is None:
        raise ModuleNotFoundError(
            '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
        )

    self.lang = lang
    self.lower_case = lower_case
    self.romanize = romanize if romanize is not None else lang == 'el'
    self.descape = descape

    self.normalizer = MosesPunctNormalizer(lang=lang)
    self.tokenizer = MosesTokenizer(lang=lang)
    self.mecab_tokenizer = MeCab.Tagger(
        "-O wakati -b 50000") if lang == 'ja' else None
Example 8: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self):
    try:
        import MeCab
        self.tagger = MeCab.Tagger('-Ochasen')
    except ImportError:
        logging.error(
            ('mecab-python3 is not installed. Install the module by running '
             '`$ pip install mecab-python3`. If MeCab is not installed in your '
             'system yet, run `$ make install-mecab` instead.'))
        sys.exit(1)
Example 9: analyze_morph
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def analyze_morph(sent):
    surfaces = []
    features = []
    t = MeCab.Tagger()
    t.parse('')  # workaround for an old mecab-python3 bug that garbles node surfaces
    m = t.parseToNode(sent)
    while m:
        if m.feature.startswith('BOS/EOS'):  # skip sentence boundary nodes
            m = m.next
            continue
        surfaces.append(m.surface)
        features.append(m.feature)
        m = m.next
    return surfaces, features
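A quick usage sketch of analyze_morph; the segmentation and feature strings depend on the installed dictionary, so the output is illustrative (IPAdic shown):

surfaces, features = analyze_morph('すもももももももものうち')
print(surfaces)     # e.g. ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
print(features[0])  # e.g. 名詞,一般,*,*,*,*,すもも,スモモ,スモモ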
Example 10: tokenize
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def tokenize(self, sentence_str):
    '''
    Tokenize a string.

    Args:
        sentence_str:   the string to tokenize.

    Returns:
        [token, token, token, ...]
    '''
    mt = MeCab.Tagger("-Owakati")
    wordlist = mt.parse(sentence_str)
    token_list = wordlist.rstrip(" \n").split(" ")
    return token_list
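Because -Owakati makes parse() return the input with tokens separated by single spaces, stripping the trailing newline and splitting on spaces yields the token list. A hedged usage sketch; the class name is a hypothetical stand-in for whatever class holds this method:

tok = SomeTokenizer()                # hypothetical class name
print(tok.tokenize('私は学生です'))  # e.g. ['私', 'は', '学生', 'です'] with IPAdic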
Example 11: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, user_dic_path='', sys_dic_path=''):
    option = ''
    if user_dic_path:
        option += ' -u {0}'.format(user_dic_path)
    if sys_dic_path:
        option += ' -d {0}'.format(sys_dic_path)
    self._t = MeCab.Tagger(option)
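A hedged instantiation sketch; the class name and both paths are hypothetical placeholders (-u expects a compiled user dictionary file, -d a system dictionary directory):

t = Tokenizer(user_dic_path='/path/to/user.dic',               # hypothetical path
              sys_dic_path='/usr/local/lib/mecab/dic/ipadic')  # hypothetical path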
Example 12: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_dict_path=None):
    if mecab_dict_path is not None:
        self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
    else:
        self.mecab = MeCab.Tagger()
Example 13: data_preprocess_filtering
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
# Python 2 snippet (note the ur'' literals and the str/unicode juggling).
def data_preprocess_filtering(self, line, iter_i):
    fields = line.strip().split("\t")
    valid, emojis = self.validated_tweet(fields)
    # Japanese text is stored unicode-escaped.
    mecab = MeCab.Tagger('-Owakati')
    try:
        if JAPAN:
            re_url = re.compile(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
            re_at = re.compile(ur'@[a-zA-Z0-9_]+')
            # ASSUMPTION: the original snippet used `text` and `at_start` without
            # defining them; the three lines below are a plausible reconstruction
            # that keeps leading @-mentions verbatim and segments only the free text.
            text = fields[2].decode('unicode-escape') if valid else u''
            at_match = re_at.match(text)
            at_start = at_match.end() if at_match else 0
            start = text.find("https")
            if start != -1:
                # Keep the mention prefix and the URL suffix untouched;
                # run MeCab only on the span in between.
                text = text[:at_start].encode('utf-8') + mecab.parse(text[at_start:start].encode('utf-8')) + text[start:].encode('utf-8')
            else:
                text = mecab.parse(text.encode('utf-8'))
            text = text.decode('utf-8')
        else:
            text = fields[2].decode('unicode-escape') \
                            .replace(u'\\n', u'') \
                            .replace(u'\\r', u'') \
                            .replace(u'&amp;', u'&') if valid else ''
    except Exception:
        text = fields[2].replace(u'\\n', u'') \
                        .replace(u'\\r', u'') \
                        .replace(u'&amp;', u'&') if valid else ''
    return valid, text, {'emojis': emojis}
Example 14: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_option='', do_lower_case=False, preserved_pattern=None):
    super(MeCabTokenizer, self).__init__(do_lower_case, preserved_pattern)

    import MeCab
    self.mecab_option = mecab_option
    self.mecab = MeCab.Tagger(self.mecab_option)
Example 15: __init__
# Required import: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_args="mecabrc"):
    """Initialize tokenizer.

    Args:
        mecab_args: arguments passed to MeCab,
            e.g. '-Ochasen', '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'
    """
    self.tokenizer = MeCab.Tagger(mecab_args)
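A hedged usage sketch; '-Ochasen' switches the output to the ChaSen-compatible column format, and the class name is a hypothetical stand-in for whatever class holds this method:

tokenizer = SomeTokenizer(mecab_args='-Ochasen')        # hypothetical class name
print(tokenizer.tokenizer.parse('今日はいい天気です'))  # ChaSen-style columns, one morpheme per line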