

Python tokenizer.Tokenizer Class Code Examples

This article collects typical usage examples of the janome.tokenizer.Tokenizer class in Python. If you are wondering what the Tokenizer class does, how to use it, or what real-world usage looks like, the hand-picked class code examples below should help.


A total of 15 Tokenizer class code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
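Before the examples, here is a minimal, self-contained sketch of basic Tokenizer usage (assuming janome is installed, for example via pip install janome). It shows the Token attributes (surface, part_of_speech, base_form and reading) that the examples below rely on; the sample sentence is only an illustration.

from janome.tokenizer import Tokenizer

t = Tokenizer()
for token in t.tokenize('すもももももももものうち'):
    print(token.surface)         # surface form as it appears in the text
    print(token.part_of_speech)  # comma-separated POS features, e.g. '名詞,一般,*,*'
    print(token.base_form)       # dictionary (base) form
    print(token.reading)         # reading in katakana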

Example 1: tokenize

import re
from janome.tokenizer import Tokenizer

def tokenize(text):
    t = Tokenizer()
    # Strip the header and footer that surround the body text
    text = re.split(r'\-{5,}',text)[2]
    text = re.split(r'底本:', text)[0]
    text = text.strip()
    # Remove ruby (furigana) markup
    text = text.replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # Remove inline annotations in the text (Aozora Bunko style ［＃...］ markers)
    text = re.sub(r'［＃.+?］', '', text)
    # Process the text line by line
    lines = text.split("\r\n")
    results = []
    for line in lines:
        res = []
        tokens = t.tokenize(line)
        for tok in tokens:
            bf = tok.base_form # base form
            if bf == "*": bf = tok.surface
            ps = tok.part_of_speech # part-of-speech info
            hinsi = ps.split(',')[0]
            if hinsi in ['名詞', '動詞', '形容詞', '記号']:
                res.append(bf)
        l = " ".join(res)
        results.append(l)
    return results
Author: kaneshirok, Project: ml-plactice, Lines of code: 27, Source file: word2vec-split.py

Example 2: separatewords

# Python 2 example (uses the unicode type and print statements)
from janome.tokenizer import Tokenizer

def separatewords(text):
    separatedWord=[]
    t=Tokenizer()
    tokens=t.tokenize(unicode(text, "utf-8"))
    
    for token in tokens:
        posList=token.part_of_speech.split(",")

        pos1=posList[0]
        if isinstance(pos1, unicode):
          pos1=pos1.encode("utf-8")

        pos2=posList[1]
        if isinstance(pos2, unicode):
          pos2=pos2.encode("utf-8")

        ruby=token.reading
        if isinstance(ruby, unicode):
          ruby=ruby.encode("utf-8")

        if pos1=="名詞":
            if pos2!="接尾" and pos2!="代名詞" and pos2!="非自立" and pos2!="数" and pos2!="形容動詞語幹":
                if ruby!="*":
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()
                elif pos2!="サ変接続" and len(token.surface)>3:
                    # For English words, keep only words of four or more characters
                    separatedWord.append(token.surface.lower())
                    print token.surface.lower()

    return separatedWord
Author: camberbridge, Project: camberbridge.github.io, Lines of code: 31, Source file: newsfeatures.py

Example 3: chunk_with_kanji

from janome.tokenizer import Tokenizer

# judge_jifu, insert_chunkflg, KUGIRI and KUTOU are project-local helpers not shown here
def chunk_with_kanji(istr):
    t = Tokenizer()
    tokens = t.tokenize(istr)

    # give each element flags (jiritsu or fuzoku)
    flags = [judge_jifu(x.part_of_speech) for x in tokens]
    
    surface = [x.surface for x in tokens]

    # split to chunks, delimited by KUGIRI flag
    # very ugly. should be rewritten using tree structure etc.
    cflags = insert_chunkflg(flags)
    rstr = u""
    i = 0
    for j, f in enumerate(flags):
        if i >= len(cflags): break
        if cflags[i] == KUGIRI:
            if f == KUTOU: 
                rstr += surface[j]
                i += 1
            else:
                rstr += u" "
                rstr += surface[j]
                i += 2
        else:
            rstr += surface[j]
            i += 1

    # don't know why this is necessary
    if flags != [] and j == 0 and len(surface) != 1: 
        while j  < len(surface):
            rstr += surface[j]    
            j += 1

    return rstr
Author: yawara, Project: pwakati, Lines of code: 35, Source file: decompose.py

Example 4: text_to_array_ja

from janome.tokenizer import Tokenizer

def text_to_array_ja(textdata, wordtypes):
    textdata = filter(textdata)  # presumably a project-local filter() helper, not the builtin
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface
                    for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    return words
Author: mr-justdoit, Project: harvest2ch, Lines of code: 8, Source file: textmining.py

Example 5: _tokenize

def _tokenize(text):
    from collections import namedtuple
    from janome.tokenizer import Tokenizer
    Token = namedtuple("Token", ["t", "surface", "pos"])

    t = Tokenizer()
    tokens = t.tokenize(text)
    # note: the loop variable reuses the name `t`, shadowing the Tokenizer instance
    for t in tokens:
        nt = Token(t, t.surface, t.part_of_speech.split(","))
        yield nt
Author: icoxfog417, Project: kanaria, Lines of code: 9, Source file: interpreter.py

Example 6: output_ja_text

import sys

import pyaml
from janome.tokenizer import Tokenizer

def output_ja_text(data, wordtypes):
    textdata = filter(data)  # filter() and count_words() are project-local helpers
    t = Tokenizer()
    tokens = t.tokenize(textdata)
    words = sorted([token.surface
                    for token in tokens
                    if token.part_of_speech.split(',')[0] in wordtypes])
    dictionary = count_words(words)
    return pyaml.dump(dictionary, sys.stdout, vspacing=[0, 1])
Author: mr-justdoit, Project: tweetanalyzer, Lines of code: 9, Source file: textmining.py

Example 7: makekeywords

def makekeywords(text):
    from janome.tokenizer import Tokenizer
    t = Tokenizer()
    tokens = t.tokenize(text)
    keywords = []
    for token in tokens:
        if (token.part_of_speech.find("名詞") >= 0
                and token.part_of_speech.find("数") == -1
                and token.part_of_speech.find("非自立") == -1
                and token.part_of_speech.find("接尾") == -1):
            keywords.append(token.surface)
    return keywords
Author: TMats, Project: newsclassification, Lines of code: 9, Source file: makekeywords.py

Example 8: split

    def split(self, text):
        result = []
        t = Tokenizer()
        malist = t.tokenize(text)
        for w in malist:
            sf = w.surface   # the segmented word as-is (surface form)
            bf = w.base_form # base form of the word
            if bf == '' or bf == "*": bf = sf
            result.append(bf)
        return result
Author: kaneshirok, Project: ml-plactice, Lines of code: 10, Source file: bayes.py

Example 9: test_func

import re
from janome.tokenizer import Tokenizer

def test_func():
    t = Tokenizer()
    temp = ""
    for token in t.tokenize(u'この腫瘍は間葉系組織から生ずると考えられ、ビメンチンを発現する。'):
        if (not re.search('^(助詞|助動詞|記号)', token.part_of_speech)):
            temp = temp + token.surface
        else:
            temp = temp + token.surface
            print(temp)
            temp = ""
Author: okkn, Project: Omsoba, Lines of code: 10, Source file: main.py

Example 10: run

    def run(self, force=None):
        print('start')
        # Fetch all sites and deduplicate by URL
        sites = {}
        for site in Site.get_all():
            sites[site.url] = site

        # Exclude ignore-list and keyword matches from the list
        sure = []
        for key in sites:
            site = sites[key]
            response = requests.get(site.subjects_url)
            assert (response.status_code == 200), response.text

            # parse
            data = list(response.text.split('\n'))
            for line in data:
                try:
                    _ = Subject(site, line)
                    sure.append(_)
                except:
                    pass

        print(sure)

        # Output the list
        t = Tokenizer()
        r = defaultdict(int)
        r2 = defaultdict(list)
        r3 = defaultdict(int)
        for _sure in sure:
            try:
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        r[token.surface] += 1
                        r2[token.surface] += [_sure]
                        r3[token] += 0
            except:
                pass

        # sort
        sure = sorted(sure, key=lambda x: x.title)

        for _sure in sure:
            try:
                point = 0
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        point += r[token.surface]
                if not filter_title(point, _sure):
                    print(_sure.title, _sure.count_res)

            except:
                pass
Author: subc, Project: flask_template, Lines of code: 54, Source file: pickup.py

Example 11: understand_move

    def understand_move(self, text):
        generator = Tokenizer()
        tokens = []

        for t in generator.tokenize(text):
            tokens.append(t)

        direction = self._understand_direction(tokens)
        distance = self._understand_distance(tokens)

        return direction, distance
Author: icoxfog417, Project: TravelPenguinBot, Lines of code: 11, Source file: penguin_nlu.py

Example 12: get_morphs

# Python 2 example (uses the unicode built-in)
from janome.tokenizer import Tokenizer

def get_morphs(string):
    t = Tokenizer()
    dicts=[]
    for token in t.tokenize(unicode(string, 'utf-8')):
        dic = {}
        # str(token) is "surface<TAB>features"; normalize the tab to a comma before splitting
        token_list = str(token).replace("\t", ",").split(",")
        dic["surface"] = token_list[0]
        dic["base"] = token_list[7]
        dic["pos"] = token_list[1]
        dic["pos1"] = token_list[2]

        dicts.append(dic)

    return dicts
Author: mizukmb, Project: NLP100Knock, Lines of code: 14, Source file: mymodule.py

Example 13: main

from janome.tokenizer import Tokenizer

def main():
    """
    >>> main()
    すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    も	助詞,係助詞,*,*,*,*,も,モ,モ
    もも	名詞,一般,*,*,*,*,もも,モモ,モモ
    の	助詞,連体化,*,*,*,*,の,ノ,ノ
    うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
    """
    t = Tokenizer()
    for token in t.tokenize(u'すもももももももものうち'):
        print(token)
Author: t2y, Project: learnnlp, Lines of code: 14, Source file: janome_sample1.py

Example 14: MainTranslator

class MainTranslator(object):
	def __init__ (self):
		self.janome= Tokenizer()
		
	def get_gobi(self, n):
		f = n.part_of_speech.split(',')	
		if n.surface in ['だ','です','た','だろ','ある']:
			if f[0] == '助動詞': 
				return 'ハゲ'
		
		if n.surface in ['無い','ない','ぬ']:
			if f[0] == '助動詞':
				return 'ぬハゲ' 
			if f[0] == '形容詞':
				return 'なしハゲ'
			
	
	def Translator(self, text):
		tokens = self.janome.tokenize(text)
		text = ''
		for n in tokens:
			f = n.part_of_speech.split(',')
			if n.surface in converter:
				text += converter[n.surface]
			elif len(f) > 3:
				gobi = self.get_gobi(n)
				if gobi is not None:
					text += gobi
				else:
					text += n.surface
			else:
				text += n.surface
		
		return text
Author: PetitPocky, Project: TextEditor-for-Pythonista, Lines of code: 34, Source file: Hage.py

Example 15: JapaneseTokenizer

class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome "
                              "library: https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
    def to_bytes(self, **exclude):
        return b''

    def from_bytes(self, bytes_data, **exclude):
        return self

    def to_disk(self, path, **exclude):
        return None

    def from_disk(self, path, **exclude):
        return self
Author: AvinashGupta, Project: spaCy, Lines of code: 27, Source file: __init__.py


Note: the janome.tokenizer.Tokenizer class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers; copyright of the source code belongs to the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce this article without permission.