

Python MeCab.Tagger Method Code Examples

This article collects typical usage examples of Python's MeCab.Tagger method from open-source projects. If you are wondering what MeCab.Tagger does, how to call it, or how it is used in practice, the hand-picked examples below should help. You can also explore further usage examples from the MeCab module.


The following presents 15 code examples of the MeCab.Tagger method, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
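For reference, here is a minimal usage sketch of the two patterns that recur throughout the examples below (assuming mecab-python3 and a system dictionary such as IPAdic are installed):

import MeCab

tagger = MeCab.Tagger()  # default output: one "surface\tfeature" line per morpheme
print(tagger.parse("すもももももももものうち"))

node = tagger.parseToNode("すもももももももものうち")  # node-based traversal (see Examples 4 and 9)
while node:
    print(node.surface, node.feature)
    node = node.next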

Example 1: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        import MeCab

        self.mecab = MeCab.Tagger() 
Author: bhoov, Project: exbert, Lines: 22, Source: tokenization_bert_japanese.py

Example 2: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, do_lower_case=False, never_split=None,
                 mecab_dict_path=None, preserve_spaces=False):
        """Constructs a MecabBasicTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
            **preserve_spaces**: (`optional`) boolean (default False)
                Whether to preserve whitespaces in the output tokens.
        """
        if never_split is None:
            never_split = []

        self.do_lower_case = do_lower_case
        self.never_split = never_split

        import MeCab
        if mecab_dict_path is not None:
            self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
        else:
            self.mecab = MeCab.Tagger()

        self.preserve_spaces = preserve_spaces 
Author: cl-tohoku, Project: bert-japanese, Lines: 27, Source: tokenization.py
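A hedged usage sketch of the tokenizer above; the NEologd path is purely illustrative (it mirrors the example path quoted in Example 15's docstring), so point mecab_dict_path at wherever your dictionary is actually installed:

# hypothetical dictionary location; adjust to your installation
tokenizer = MecabBasicTokenizer(
    mecab_dict_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
print(tokenizer.mecab.parse('形態素解析のテスト'))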

Example 3: response

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def response(self):
        if random.randint(0, 200) > self.PROBABLY:
            return None
        # Python 2 snippet: parse() is given and returns byte strings here;
        # in Python 3 you would pass str directly and drop the encode/decode calls
        mecab = MeCab.Tagger().parse(self.text.encode('utf-8'))
        nouns = []
        for l in mecab.split("\n"):
            if l == 'EOS' or l == '':
                break
            word, wordclass = l.split("\t")
            wordclass = wordclass.split(",")
            if wordclass[0] == "名詞":  # keep only nouns
                nouns.append(word)
        random.shuffle(nouns)
        if len(nouns) > 0:
            return (nouns.pop()).decode('utf-8')
        return None 
Author: tinbotu, Project: sub, Lines: 19, Source: sun.py
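For comparison, a Python 3 sketch of the same noun extraction (assuming mecab-python3 with IPAdic, whose first feature field is the part of speech):

import random
import MeCab

def random_noun(text):
    nouns = []
    for line in MeCab.Tagger().parse(text).split("\n"):
        if line == "EOS" or line == "":
            break
        surface, feature = line.split("\t")
        if feature.split(",")[0] == "名詞":  # noun
            nouns.append(surface)
    return random.choice(nouns) if nouns else None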

Example 4: _set_tokenizer

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def _set_tokenizer(self):
        try:
            import MeCab

            class Tokenizer():

                def __init__(self):
                    self.tagger = MeCab.Tagger("-Ochasen")

                def tokenize(self, text):
                    # calling parse() once first works around an old mecab-python3
                    # bug where node.surface could come back garbled
                    self.tagger.parse("")
                    node = self.tagger.parseToNode(text)
                    tokens = []
                    while node:
                        if node.surface:  # skip BOS/EOS nodes, whose surface is empty
                            tokens.append(node)
                        node = node.next
                    return tokens

            self.tokenizer = Tokenizer()

        except Exception:  # fall back to the pure-Python janome tokenizer if MeCab is unavailable
            from janome.tokenizer import Tokenizer
            self.tokenizer = Tokenizer() 
Author: chakki-works, Project: sumeval, Lines: 26, Source: lang_ja.py

Example 5: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, dic_path):
        try:
            import MeCab
        except ImportError:
            raise ImportError("could not import `MeCab`; make sure that "
                              "`mecab-python` is installed by running "
                              "`install_mecab_ko.sh` in the repository.")
        self._dic_path = dic_path
        self._tagger = MeCab.Tagger("-d {}".format(
            dic_path
        )) 
Author: kaniblu, Project: hangul-utils, Lines: 13, Source: preprocess.py

Example 6: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self):
        self.tagger = MeCab.Tagger("-Owakati")

        # make sure the dictionary is IPA
        # sacreBLEU is only compatible with 0.996.5 for now
        # Please see: https://github.com/mjpost/sacrebleu/issues/94
        d = self.tagger.dictionary_info()
        assert d.size == 392126, \
            "Please make sure to use IPA dictionary for MeCab"
        assert d.next is None 
Author: mjpost, Project: sacrebleu, Lines: 12, Source: tokenizer_ja_mecab.py
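For context, dictionary_info() walks a linked list describing the loaded dictionaries; a minimal inspection sketch (attribute names per the mecab-python3 bindings):

d = MeCab.Tagger("-Owakati").dictionary_info()
while d:
    print(d.filename, d.size)  # dictionary path and number of entries
    d = d.next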

Example 7: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: Optional[bool] = None,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja' and MeCab is None:
            raise ModuleNotFoundError(
                '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
            )

        self.lang = lang
        self.lower_case = lower_case
        self.romanize = romanize if romanize is not None else lang == 'el'
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
        self.mecab_tokenizer = MeCab.Tagger(
            "-O wakati -b 50000") if lang == 'ja' else None 
Author: yannvgn, Project: laserembeddings, Lines: 32, Source: preprocessing.py
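Worth noting: "-O wakati" makes parse() return space-separated tokens, and "-b 50000" raises MeCab's input buffer size so long lines are not truncated. A minimal sketch of how such a tagger is typically consumed:

tagger = MeCab.Tagger("-O wakati -b 50000")
tokens = tagger.parse("これは長い入力にも耐える設定です。").rstrip(" \n").split(" ")
print(tokens)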

Example 8: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self):
    try:
      import MeCab
      self.tagger = MeCab.Tagger('-Ochasen')
    except ImportError:
      logging.error(
          ('mecab-python3 is not installed. Install the module by running '
           '`$ pip install mecab-python3`. If MeCab is not installed in your '
           'system yet, run `$ make install-mecab` instead.'))
      sys.exit(1) 
Author: google, Project: budou, Lines: 12, Source: mecabsegmenter.py

Example 9: analyze_morph

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def analyze_morph(sent):
    surfaces = []
    features = []
    t = MeCab.Tagger()
    t.parse('')  # workaround for an old mecab-python3 bug where node.surface was garbled without a prior parse() call
    m = t.parseToNode(sent)
    while m:
        if m.feature.startswith('BOS/EOS'):
            m = m.next
            continue
        surfaces.append(m.surface)
        features.append(m.feature)
        m = m.next
    return surfaces, features 
Author: Hironsan, Project: HotPepperGourmetDialogue, Lines: 16, Source: analyzer.py
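A short usage sketch of analyze_morph; the exact segmentation and features depend on the installed dictionary (the output shown is what IPAdic typically produces):

surfaces, features = analyze_morph('今日はいい天気です。')
print(surfaces)     # e.g. ['今日', 'は', 'いい', '天気', 'です', '。']
print(features[0])  # e.g. '名詞,副詞可能,*,*,*,*,今日,キョウ,キョー'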

Example 10: tokenize

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def tokenize(self, sentence_str):
        '''
        Tokenize str.

        Args:
            sentence_str:   string to be tokenized.

        Returns:
            [token, token, token, ...]
        '''
        mt = MeCab.Tagger("-Owakati")
        wordlist = mt.parse(sentence_str)
        token_list = wordlist.rstrip(" \n").split(" ")
        return token_list 
Author: chimera0, Project: accel-brain-code, Lines: 16, Source: mecab_tokenizer.py
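The same "-Owakati" + rstrip/split pattern works standalone; a quick sketch (segmentation shown assumes IPAdic):

mt = MeCab.Tagger("-Owakati")
print(mt.parse("吾輩は猫である").rstrip(" \n").split(" "))  # e.g. ['吾輩', 'は', '猫', 'で', 'ある']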

Example 11: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, user_dic_path='', sys_dic_path=''):
        option = ''
        if user_dic_path:
            option += ' -u {0}'.format(user_dic_path)
        if sys_dic_path:
            option += ' -d {0}'.format(sys_dic_path)
        self._t = MeCab.Tagger(option) 
Author: Hironsan, Project: natural-language-preprocessings, Lines: 9, Source: tokenizer.py

Example 12: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_dict_path=None):
        if mecab_dict_path is not None:
            self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
        else:
            self.mecab = MeCab.Tagger() 
Author: cl-tohoku, Project: bert-japanese, Lines: 7, Source: make_corpus.py

Example 13: data_preprocess_filtering

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def data_preprocess_filtering(self, line, iter_i):
        fields = line.strip().split("\t")
        valid, emojis = self.validated_tweet(fields)
        # Japanese text is stored unicode-escaped; Python 2 snippet (ur'' literals, byte strings)
        mecab = MeCab.Tagger('-Owakati')
        try:
            if JAPAN:
                re_url = re.compile(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')  # compiled but unused in this excerpt
                re_at = re.compile(ur'@[a-zA-Z0-9_]+')
                text = fields[2]  # tweet body (the original excerpt used `text` before assigning it)
                start = text.find("https")
                if start != -1:
                    # `at_start` is not defined in the original excerpt; here we assume it marks
                    # the end of the leading @mentions, recovered with re_at (hypothetical reconstruction)
                    m_at = re_at.match(text)
                    at_start = m_at.end() if m_at else 0
                    # only the span between the mentions and the URL is segmented with MeCab
                    text = text[:at_start].encode('utf-8') + mecab.parse(text[at_start:start].encode('utf-8')) + text[start:].encode('utf-8')
                else:
                    text = mecab.parse(text.encode('utf-8'))
                text = text.decode('utf-8')
            else:
                text = fields[2].decode('unicode-escape') \
                        .replace(u'\\n', u'') \
                        .replace(u'\\r', u'') \
                        .replace(u'&amp', u'&') if valid else ''
        except Exception:  # fall back to the raw field if decoding/segmentation fails
            text = fields[2].replace(u'\\n', u'') \
                        .replace(u'\\r', u'') \
                        .replace(u'&amp', u'&') if valid else ''
        return valid, text, {'emojis': emojis} 
Author: sIncerass, Project: ELSA, Lines: 28, Source: word_generator.py

Example 14: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_option='', do_lower_case=False, preserved_pattern=None):
        super(MeCabTokenizer, self).__init__(do_lower_case, preserved_pattern)
        import MeCab
        self.mecab_option = mecab_option
        self.mecab = MeCab.Tagger(self.mecab_option) 
Author: singletongue, Project: WikiEntVec, Lines: 7, Source: tokenization.py

Example 15: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_args="mecabrc"):
        """Initialize tokenizer.

        Args:
          mecab_args: Arguments passed to MeCab,
            e.g. '-Ochasen' or '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'

        """
        self.tokenizer = MeCab.Tagger(mecab_args) 
Author: ymym3412, Project: position-rank, Lines: 11, Source: tokenizer.py
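For reference, "-Ochasen" formats each morpheme as tab-separated columns (surface, reading, base form, part of speech, conjugation type, conjugation form); a minimal sketch, with typical IPAdic output shown in the comments:

tokenizer = MeCab.Tagger('-Ochasen')
print(tokenizer.parse('走った'))
# 走っ    ハシッ  走る    動詞-自立       五段・ラ行      連用タ接続
# た      タ      た      助動詞  特殊・タ        基本形
# EOS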


Note: The MeCab.Tagger method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; the copyright of the source code belongs to the original authors. Please follow each project's license when distributing or using the code, and do not republish without permission.