

Python tag.Mecab Method Code Examples

This article collects typical usage examples of the Python method konlpy.tag.Mecab. If you are unsure what konlpy.tag.Mecab does, or how and when to use it, the curated examples below may help. You can also explore further usage examples from the enclosing konlpy.tag module.


The following presents 8 code examples of the tag.Mecab method, sorted by popularity by default.
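Before turning to the examples, here is a minimal usage sketch of the tagger itself. It is not taken from any of the projects below; it assumes mecab-ko and its dictionary (mecab-ko-dic) are installed, and the sample sentence is arbitrary:

# Minimal sketch (assumption: mecab-ko and mecab-ko-dic are installed).
from konlpy.tag import Mecab

mecab = Mecab()
print(mecab.morphs('나는 학교에 간다'))  # surface-form morphemes
print(mecab.pos('나는 학교에 간다'))     # (morpheme, POS-tag) pairs
print(mecab.nouns('나는 학교에 간다'))   # nouns only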

Example 1: get_tokenizer

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def get_tokenizer(tokenizer_name):
    if tokenizer_name == "komoran":
        tokenizer = Komoran()
    elif tokenizer_name == "okt":
        tokenizer = Okt()
    elif tokenizer_name == "mecab":
        tokenizer = Mecab()
    elif tokenizer_name == "hannanum":
        tokenizer = Hannanum()
    elif tokenizer_name == "kkma":
        tokenizer = Kkma()
    elif tokenizer_name == "khaiii":
        tokenizer = KhaiiiApi()
    else:
        tokenizer = Mecab()
    return tokenizer 
Developer: ratsgo, Project: embedding, Lines: 18, Source: supervised_nlputils.py
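A possible call site for get_tokenizer (hypothetical, not part of the project): all konlpy taggers share the morphs method, though the KhaiiiApi branch exposes a different, analyze-based interface.

# Hypothetical usage of get_tokenizer with a konlpy backend.
tokenizer = get_tokenizer("mecab")
tokens = tokenizer.morphs("자연어 처리")  # -> list of morpheme strings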

Example 2: morphTag

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def morphTag(in_fname, out_fname):
    mec = Mecab()
    corpus = readfileUTF8(in_fname)
    concat_sent = []
    for n in range(0, len(corpus)):
        tagged = mec.pos(corpus[n])
        concat = ''
        for m in range(0, len(tagged)):
            if m < len(tagged) - 1:
                concat = concat + tagged[m][0] + '/' + tagged[m][1] + ' '
            else:  # Final item: no trailing space
                concat = concat + tagged[m][0] + '/' + tagged[m][1]

        concat_sent.append(concat)
    writefile(concat_sent, out_fname)

    return concat_sent 
Developer: scarletcho, Project: KoLM, Lines: 19, Source: tag.py
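For context, mec.pos() yields (surface, tag) pairs, so each output line is a space-separated run of token/TAG items. An illustrative call (the exact tags depend on the installed mecab-ko-dic version):

# Illustrative only: tag names vary with the dictionary version.
mec = Mecab()
print(mec.pos('학교에 간다'))
# e.g. [('학교', 'NNG'), ('에', 'JKB'), ('간다', 'VV+EF')]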

Example 3: __init__

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def __init__(self):
        try:
            from konlpy.tag import Mecab
        except ImportError:
            raise ImportError(
                'Mecab is not installed. '
                'You can install Mecab with "sh scripts/install_mecab.sh". '
                'See the installation guide at https://github.com/lyeoni/prenlp/blob/master/scripts/install_mecab.sh or https://bitbucket.org/eunjeon/mecab-ko-dic/src')
        self.tokenizer = Mecab() 
Developer: lyeoni, Project: prenlp, Lines: 11, Source: tokenizer.py

Example 4: get_mecab

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def get_mecab(self, dict_path):
        try:
            if dict_path:
                return Mecab(dict_path)  # load a custom dictionary (e.g. for annotation)
            else:
                return Mecab()
        except Exception as e:
            raise Exception(
                'If you want to install mecab, the command is: bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)'
            ) from e
Developer: Kyubyong, Project: g2pK, Lines: 12, Source: g2pk.py

Example 5: _mecab_ko

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def _mecab_ko(self, text):
        if self.word_tokenizer is None:
            from konlpy.tag import Mecab

            self.word_tokenizer = Mecab()

        return self.word_tokenizer.morphs(text) 
Developer: naver, Project: claf, Lines: 9, Source: word.py
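Worth noting is the lazy-import pattern: konlpy is only imported on the first call, and the Mecab instance is cached on self.word_tokenizer, so subsequent calls reuse the loaded dictionary instead of re-initializing the tagger.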

Example 6: main

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def main(config):
    print(config)
    
    list_of_tokens = []
    if config.is_tokenized:
        # Read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += line.strip().split()
    else:
        # Select tokenizer
        if config.tokenizer == 'mecab':
            from konlpy.tag import Mecab
            tokenizer = Tokenizer(tokenization_fn=Mecab().morphs)
        else:
            # Guard: without this branch, `tokenizer` is undefined below
            raise ValueError('Unsupported tokenizer: ' + config.tokenizer)

        # Tokenization & read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += tokenizer.tokenize(line.strip())

    # Build vocabulary                
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()
    print('Vocabulary size: ', len(vocab))

    # Save vocabulary
    with open(config.vocab, 'wb') as writer:
        pickle.dump(vocab, writer)
    print('Vocabulary saved to', config.vocab) 
Developer: lyeoni, Project: pretraining-for-language-understanding, Lines: 37, Source: build_vocab.py

Example 7: __init__

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def __init__(self, tagger, useful_tags, delimiters, min_token_length, stopwords, **kwargs):
        if tagger == 'twitter':
            self.tagger = taggers.Twitter()
            self.tagger_options = {
                'norm': bool(kwargs.get('norm', True)),
                'stem': bool(kwargs.get('stem', True)),
            }
        elif tagger == 'komoran':
            self.tagger = taggers.Komoran()
            self.tagger_options = {
                'flatten': bool(kwargs.get('flatten', True)),
            }
        elif tagger == 'hannanum':
            self.tagger = taggers.Hannanum()
            self.tagger_options = {
                'ntags': int(kwargs.get('ntags', 9)),
                'flatten': bool(kwargs.get('flatten', True)),
            }
        elif tagger == 'kkma':
            self.tagger = taggers.Kkma()
            self.tagger_options = {
                'flatten': bool(kwargs.get('flatten', True)),
            }
        elif tagger == 'mecab':
            self.tagger = taggers.Mecab()
            self.tagger_options = {
                'flatten': bool(kwargs.get('flatten', True)),
            }
        else:
            raise LexRankError("available taggers are: twitter, komoran, hannanum, kkma, mecab")
        self.useful_tags = useful_tags
        self.delimiters = delimiters
        self.stopwords = stopwords
        self.min_token_length = min_token_length
        self.splitter = self.splitterer()
        self.pos = lambda text: self.tagger.pos(text, **self.tagger_options) 
Developer: theeluwin, Project: lexrankr, Lines: 38, Source: lexrankr.py
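Note how the per-tagger options are bound once in the constructor and then forwarded on every call through the self.pos lambda, so downstream code can call self.pos(text) uniformly regardless of which backend was selected.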

Example 8: load_tokenizer

# Required import: from konlpy import tag [as alias]
# Or: from konlpy.tag import Mecab [as alias]
def load_tokenizer(lang):
    if lang == "ko":
        from konlpy.tag import Mecab
        tokenizer = Mecab()
    elif lang == "ja":
        import Mykytea
        opt = "-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_tw":
        import jieba
        tokenizer = jieba
    elif lang == "vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang == "th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize
    elif lang == "ar":
        import pyarabic.araby as araby
        tokenizer = araby
    # elif lang=="en":
    #     from nltk import word_tokenize
    #     tokenizer = word_tokenize
    else:
        from nltk.tokenize import ToktokTokenizer
        tokenizer = ToktokTokenizer()

    return tokenizer 
Developer: kakaobrain, Project: word2word, Lines: 34, Source: tokenization.py
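Note that the returned objects are deliberately heterogeneous: some branches return a class instance (Mecab, Mykytea, ToktokTokenizer), others a module (jieba, araby), and others a bare function (word_tokenize), so the caller is expected to know each backend's tokenization entry point.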


Note: The konlpy.tag.Mecab method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by their respective authors, and copyright remains with the original authors; refer to each project's license before distributing or reusing the code. Do not reproduce without permission.