

Python MeCab.Tagger Method Code Examples

This article collects typical usage examples of the MeCab.Tagger method in Python. If you are wondering how to call MeCab.Tagger, what its concrete usage looks like, or want real-world examples of it, the curated code examples below may help. You can also explore further usage examples of the MeCab module that this method belongs to.


The following presents 15 code examples of the MeCab.Tagger method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
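Before turning to the examples, here is a minimal, self-contained sketch of the two most common ways to call MeCab.Tagger. It assumes mecab-python3 and the default IPADIC dictionary are installed; the sample sentence and expected output are illustrative.

import MeCab

# '-Owakati' prints the input as space-separated tokens
wakati = MeCab.Tagger('-Owakati')
print(wakati.parse('すもももももももものうち'))
# expected with IPADIC: すもも も もも も もも の うち

# the default format emits one 'surface<TAB>features' line per token
tagger = MeCab.Tagger()
for line in tagger.parse('すもももももももものうち').splitlines():
    if line != 'EOS':
        print(line)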

Example 1: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        import MeCab

        self.mecab = MeCab.Tagger() 
Developer: bhoov, Project: exbert, Lines: 22, Source: tokenization_bert_japanese.py

Example 2: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, do_lower_case=False, never_split=None,
                 mecab_dict_path=None, preserve_spaces=False):
        """Constructs a MecabBasicTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
            **preserve_spaces**: (`optional`) boolean (default False)
                Whether to preserve whitespaces in the output tokens.
        """
        if never_split is None:
            never_split = []

        self.do_lower_case = do_lower_case
        self.never_split = never_split

        import MeCab
        if mecab_dict_path is not None:
            self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
        else:
            self.mecab = MeCab.Tagger()

        self.preserve_spaces = preserve_spaces 
Developer: cl-tohoku, Project: bert-japanese, Lines: 27, Source: tokenization.py

Example 3: response

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def response(self):
        if random.randint(0, 200) > self.PROBABLY:
            return None
        # Python 2 style: parse() receives and returns UTF-8 byte strings here
        parsed = MeCab.Tagger().parse(self.text.encode('utf-8'))
        node = parsed.split("\n")
        noword = []
        for l in node:
            if l == 'EOS' or l == '':
                break
            word, wordclass = l.split("\t")
            wordclass = wordclass.split(",")
            if wordclass[0] == "名詞":  # 名詞 = noun
                noword.append(word)
        random.shuffle(noword)
        if len(noword) > 0:
            return (noword.pop()).decode('utf-8')
        return None 
Developer: tinbotu, Project: sub, Lines: 19, Source: sun.py

Example 4: _set_tokenizer

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def _set_tokenizer(self):
        try:
            import MeCab

            class Tokenizer():

                def __init__(self):
                    self.tagger = MeCab.Tagger("-Ochasen")

                def tokenize(self, text):
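                    # parse('') first: a known mecab-python3 workaround so that
                    # node.surface is not garbled by the parseToNode call below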
                    self.tagger.parse("")
                    node = self.tagger.parseToNode(text)
                    tokens = []
                    while node:
                        if node.surface:
                            tokens.append(node)
                        node = node.next
                    return tokens

            self.tokenizer = Tokenizer()

        except Exception:
            # fall back to the pure-Python janome tokenizer if MeCab is unavailable
            from janome.tokenizer import Tokenizer
            self.tokenizer = Tokenizer()
Developer: chakki-works, Project: sumeval, Lines: 26, Source: lang_ja.py

Example 5: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, dic_path):
        try:
            import MeCab
        except ImportError:
            raise ImportError("could not import `MeCab`; make sure that "
                              "`mecab-python` is installed by running "
                              "`install_mecab_ko.sh` in the repository. ")
        self._dic_path = dic_path
        self._tagger = MeCab.Tagger("-d {}".format(
            dic_path
        )) 
Developer: kaniblu, Project: hangul-utils, Lines: 13, Source: preprocess.py

Example 6: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self):
        self.tagger = MeCab.Tagger("-Owakati")

        # make sure the dictionary is IPA
        # sacreBLEU is only compatible with 0.996.5 for now
        # Please see: https://github.com/mjpost/sacrebleu/issues/94
        d = self.tagger.dictionary_info()
        assert d.size == 392126, \
            "Please make sure to use IPA dictionary for MeCab"
        assert d.next is None 
Developer: mjpost, Project: sacrebleu, Lines: 12, Source: tokenizer_ja_mecab.py

Example 7: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self,
                 lang: str = 'en',
                 lower_case: bool = True,
                 romanize: Optional[bool] = None,
                 descape: bool = False):
        assert lower_case, 'lower case is needed by all the models'

        if lang in ('cmn', 'wuu', 'yue'):
            lang = 'zh'
        if lang == 'jpn':
            lang = 'ja'

        if lang == 'zh' and jieba is None:
            raise ModuleNotFoundError(
                '''No module named 'jieba'. Install laserembeddings with 'zh' extra to fix that: "pip install laserembeddings[zh]"'''
            )
        if lang == 'ja' and MeCab is None:
            raise ModuleNotFoundError(
                '''No module named 'MeCab'. Install laserembeddings with 'ja' extra to fix that: "pip install laserembeddings[ja]"'''
            )

        self.lang = lang
        self.lower_case = lower_case
        self.romanize = romanize if romanize is not None else lang == 'el'
        self.descape = descape

        self.normalizer = MosesPunctNormalizer(lang=lang)
        self.tokenizer = MosesTokenizer(lang=lang)
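        # '-b 50000' raises MeCab's input buffer size (in bytes) for long inputs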
        self.mecab_tokenizer = MeCab.Tagger(
            "-O wakati -b 50000") if lang == 'ja' else None 
Developer: yannvgn, Project: laserembeddings, Lines: 32, Source: preprocessing.py

Example 8: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self):
    try:
      import MeCab
      self.tagger = MeCab.Tagger('-Ochasen')
    except ImportError:
      logging.error(
          ('mecab-python3 is not installed. Install the module by running '
           '`$ pip install mecab-python3`. If MeCab is not installed in your '
           'system yet, run `$ make install-mecab` instead.'))
      sys.exit(1) 
Developer: google, Project: budou, Lines: 12, Source: mecabsegmenter.py

Example 9: analyze_morph

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def analyze_morph(sent):
    surfaces = []
    features = []
    t = MeCab.Tagger()
    t.parse('')
    m = t.parseToNode(sent)
    while m:
        if m.feature.startswith('BOS/EOS'):
            m = m.next
            continue
        surfaces.append(m.surface)
        features.append(m.feature)
        m = m.next
    return surfaces, features 
Developer: Hironsan, Project: HotPepperGourmetDialogue, Lines: 16, Source: analyzer.py
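For reference, a call to the analyze_morph helper above might look like the following; the sample sentence and the printed feature string are illustrative and assume the IPADIC dictionary.

surfaces, features = analyze_morph('今日は晴れです')
print(surfaces)     # e.g. ['今日', 'は', '晴れ', 'です']
print(features[0])  # e.g. '名詞,副詞可能,*,*,*,*,今日,キョウ,キョー'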

Example 10: tokenize

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def tokenize(self, sentence_str):
        '''
        Tokenize a string.

        Args:
            sentence_str:   string to tokenize.
        
        Returns:
            [token, token, token, ...]
        '''
        mt = MeCab.Tagger("-Owakati")
        wordlist = mt.parse(sentence_str)
        token_list = wordlist.rstrip(" \n").split(" ")
        return token_list 
Developer: chimera0, Project: accel-brain-code, Lines: 16, Source: mecab_tokenizer.py
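A quick illustrative call; the class name MeCabTokenizer is an assumption (the enclosing class is not shown in this excerpt), and the segmentation assumes IPADIC.

tokenizer = MeCabTokenizer()  # hypothetical name for the enclosing class
print(tokenizer.tokenize('今日はいい天気です'))
# e.g. ['今日', 'は', 'いい', '天気', 'です']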

Example 11: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, user_dic_path='', sys_dic_path=''):
        option = ''
        if user_dic_path:
            option += ' -u {0}'.format(user_dic_path)
        if sys_dic_path:
            option += ' -d {0}'.format(sys_dic_path)
        self._t = MeCab.Tagger(option) 
Developer: Hironsan, Project: natural-language-preprocessings, Lines: 9, Source: tokenizer.py

Example 12: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_dict_path=None):
        if mecab_dict_path is not None:
            self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
        else:
            self.mecab = MeCab.Tagger() 
Developer: cl-tohoku, Project: bert-japanese, Lines: 7, Source: make_corpus.py

Example 13: data_preprocess_filtering

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def data_preprocess_filtering(self, line, iter_i):
        fields = line.strip().split("\t")
        valid, emojis = self.validated_tweet(fields)
        # Japanese text is stored unicode-escaped
        mecab = MeCab.Tagger('-Owakati')
        # NOTE: `text` and `at_start` below are referenced before being
        # assigned in this excerpt
        try:
            if JAPAN:
                re_url = re.compile(ur'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
                re_at = re.compile(ur'@[a-zA-Z0-9_]+')
                start = text.find("https")
                if start != -1:
                    text = text[:at_start].encode('utf-8')+mecab.parse(text[at_start:start].encode('utf-8'))+text[start:].encode('utf-8')
                else:
                    text = mecab.parse(text.encode('utf-8'))
                # print("wori", at_start, text)
                text = text.decode('utf-8')
            else:
                text = fields[2].decode('unicode-escape') \
                        .replace(u'\\n', u'') \
                        .replace(u'\\r', u'') \
                        .replace(u'&amp', u'&') if valid else ''
        except:
            text = fields[2].replace(u'\\n', u'') \
                        .replace(u'\\r', u'') \
                        .replace(u'&amp', u'&') if valid else ''
        return valid, text, {'emojis': emojis} 
Developer: sIncerass, Project: ELSA, Lines: 28, Source: word_generator.py

Example 14: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_option='', do_lower_case=False, preserved_pattern=None):
        super(MeCabTokenizer, self).__init__(do_lower_case, preserved_pattern)
        import MeCab
        self.mecab_option = mecab_option
        self.mecab = MeCab.Tagger(self.mecab_option) 
Developer: singletongue, Project: WikiEntVec, Lines: 7, Source: tokenization.py

Example 15: __init__

# Required module: import MeCab [as alias]
# Or: from MeCab import Tagger [as alias]
def __init__(self, mecab_args="mecabrc"):
        """Initialize tokenizer.

        Args:
          mecab_args: Argument of mecab.
            i.e. '-Ochasen', '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'

        """
        self.tokenizer = MeCab.Tagger(mecab_args) 
Developer: ymym3412, Project: position-rank, Lines: 11, Source: tokenizer.py
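Both argument styles named in the docstring are ordinary MeCab.Tagger options. As an illustration (the NEologd path is the one from the docstring and must exist on your system):

import MeCab
chasen = MeCab.Tagger('-Ochasen')  # ChaSen-formatted output
neologd = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')  # custom system dictionary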


Note: The MeCab.Tagger method examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please follow the corresponding project's license when distributing or using this code; do not reproduce without permission.