當前位置: 首頁>>代碼示例>>Python>>正文


Python Tokenizer.tokenize方法代碼示例

本文整理匯總了Python中janome.tokenizer.Tokenizer.tokenize方法的典型用法代碼示例。如果您正苦於以下問題:Python Tokenizer.tokenize方法的具體用法?Python Tokenizer.tokenize怎麼用?Python Tokenizer.tokenize使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在janome.tokenizer.Tokenizer的用法示例。


在下文中一共展示了Tokenizer.tokenize方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: run

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
    def run(self, force=None):
        """Collect subjects from every site and print those that pass the filter.

        Steps:
          1. Load all sites, de-duplicated by URL.
          2. Download each site's subject list and build a ``Subject`` per line,
             skipping lines that ``Subject`` refuses to parse.
          3. Count how often each non-filtered token surface occurs across all
             subject titles.
          4. Score every subject by summing those counts for its own title and
             print the title and ``count_res`` of the ones for which
             ``filter_title(point, subject)`` is falsy.

        Args:
            force: Not used by this method.

        Raises:
            AssertionError: If a subject list request does not return HTTP 200;
                the response body is used as the message.
        """
        print('start')
        # Fetch every site; keying by URL removes duplicates.
        sites = {}
        for site in Site.get_all():
            sites[site.url] = site

        # Build the subject list, dropping lines that cannot be parsed.
        sure = []
        for site in sites.values():
            response = requests.get(site.subjects_url)
            # Raise explicitly instead of using `assert` so the check is not
            # stripped under `python -O`; the exception type stays the same.
            if response.status_code != 200:
                raise AssertionError(response.text)

            for line in response.text.split('\n'):
                try:
                    subject = Subject(site, line)
                except Exception:
                    # Best effort: unparsable lines are skipped on purpose.
                    continue
                sure.append(subject)

        print(sure)

        # Frequency of each accepted token surface over all titles.
        t = Tokenizer()
        r = defaultdict(int)
        for _sure in sure:
            try:
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        r[token.surface] += 1
            except Exception:
                # Best effort: a title that fails to tokenize keeps whatever
                # counts were accumulated before the failure.
                pass

        # Report in title order.
        sure = sorted(sure, key=lambda x: x.title)

        for _sure in sure:
            try:
                point = 0
                for token in t.tokenize(_sure.title):
                    if not token_filter(token):
                        point += r[token.surface]
                if not filter_title(point, _sure):
                    print(_sure.title, _sure.count_res)
            except Exception:
                # Best effort: skip subjects that cannot be scored or printed.
                pass
開發者ID:subc,項目名稱:flask_template,代碼行數:56,代碼來源:pickup.py

示例2: MainTranslator

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
class MainTranslator(object):
    """Rewrite Japanese text by substituting words and sentence endings.

    Tokens whose surface is a key of the module-level ``converter`` mapping
    are replaced by the mapped value; certain auxiliary-verb / adjective
    endings are replaced by fixed strings (see ``get_gobi``); everything else
    is copied through unchanged.
    """

    def __init__(self):
        # One shared janome tokenizer per translator instance.
        self.janome = Tokenizer()

    def get_gobi(self, n):
        """Return the replacement ending for token ``n``, or ``None``.

        Args:
            n: Token exposing ``surface`` and a comma-separated
               ``part_of_speech`` string.

        Returns:
            'ハゲ', 'ぬハゲ' or 'なしハゲ' when the surface / leading
            part-of-speech combination matches, otherwise ``None``.
        """
        major_pos = n.part_of_speech.split(',')[0]
        surface = n.surface

        if surface in ['だ', 'です', 'た', 'だろ', 'ある'] and major_pos == '助動詞':
            return 'ハゲ'

        if surface in ['無い', 'ない', 'ぬ']:
            if major_pos == '助動詞':
                return 'ぬハゲ'
            if major_pos == '形容詞':
                return 'なしハゲ'

        return None

    def Translator(self, text):
        """Tokenize ``text`` and return the rewritten string."""
        pieces = []
        for n in self.janome.tokenize(text):
            features = n.part_of_speech.split(',')
            if n.surface in converter:
                pieces.append(converter[n.surface])
                continue

            # Only tokens with more than three part-of-speech fields are
            # considered for an ending replacement.
            replacement = self.get_gobi(n) if len(features) > 3 else None
            pieces.append(n.surface if replacement is None else replacement)

        return ''.join(pieces)
開發者ID:PetitPocky,項目名稱:TextEditor-for-Pythonista,代碼行數:36,代碼來源:Hage.py

示例3: separatewords

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def separatewords(text):
    """Extract lower-cased feature nouns from Japanese text.

    A token is kept when its leading part-of-speech is 名詞 (noun), its
    sub-category is not one of suffix / pronoun / dependent / numeral /
    adjectival-noun stem, and either

    * it has a reading (i.e. the reading is not ``*``), or
    * it has no reading, is not a サ変接続 noun and its surface is longer
      than three characters (in practice: alphabetic words of 4+ letters).

    Every kept word is also printed, as before.

    Args:
        text: UTF-8 encoded bytes; already-decoded text is accepted as well.

    Returns:
        list: Lower-cased surfaces of the kept tokens, in document order.
    """
    def _to_text(value):
        # Token fields may arrive as UTF-8 bytes or as text; compare as text.
        return value.decode("utf-8") if isinstance(value, bytes) else value

    # Noun sub-categories that are ignored. The numeral entry must be the
    # Japanese form "数" used in janome's part-of-speech strings.
    excluded_pos2 = (u"接尾", u"代名詞", u"非自立", u"数", u"形容動詞語幹")

    separatedWord = []
    t = Tokenizer()
    tokens = t.tokenize(_to_text(text))

    for token in tokens:
        posList = token.part_of_speech.split(",")
        pos1 = _to_text(posList[0])
        pos2 = _to_text(posList[1])
        ruby = _to_text(token.reading)

        if pos1 != u"名詞" or pos2 in excluded_pos2:
            continue

        if ruby != u"*":
            separatedWord.append(token.surface.lower())
            print(token.surface.lower())
        elif pos2 != u"サ変接続" and len(token.surface) > 3:
            # For English words, only handle words of four or more characters.
            separatedWord.append(token.surface.lower())
            print(token.surface.lower())

    return separatedWord
開發者ID:camberbridge,項目名稱:camberbridge.github.io,代碼行數:33,代碼來源:newsfeatures.py

示例4: janome_tokenizer

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def janome_tokenizer(sentence):
    """Tokenize a UTF-8 encoded sentence into UTF-8 encoded surfaces.

    Tokenization is attempted on the decoded sentence first, then with
    non-breaking spaces (U+00A0) replaced by "、", then with them removed.
    The first attempt that succeeds wins.

    Args:
        sentence: Encoded sentence; it is decoded with ``decode("utf-8")``.

    Returns:
        list: ``surface.encode("utf-8")`` for every token. If all attempts
        fail, an error line is printed and a one-element list holding the
        decoded sentence is returned.
        NOTE(review): the failure path returns decoded text while the success
        path returns encoded surfaces; kept as-is for callers.
    """
    t = Janome_Tokenizer()
    sentence = sentence.decode("utf-8")

    attempts = (
        sentence,
        sentence.replace(u"\xa0", u"、"),
        sentence.replace(u"\xa0", u""),
    )
    for candidate in attempts:
        try:
            # Materialize inside the try: when tokenize() is lazy, errors only
            # surface while iterating, which would bypass the fallbacks.
            tokens = list(t.tokenize(candidate))
        except Exception:
            continue
        return [dic.surface.encode("utf-8") for dic in tokens]

    print ("Tokenization error at sentence: "+sentence.encode("utf-8"))
    return  [sentence]
開發者ID:ttamada,項目名稱:MachineTranslationWithVisualContexts,代碼行數:18,代碼來源:data_utils.py

示例5: JapaneseTokenizer

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
class JapaneseTokenizer(object):
    """Callable that splits Japanese text with Janome and wraps it in a ``Doc``."""

    def __init__(self, cls, nlp=None):
        """Set up the vocabulary and the Janome tokenizer.

        Args:
            cls: Object providing ``create_vocab(nlp)``; used when ``nlp`` is
                ``None``.
            nlp: Optional object whose ``vocab`` attribute is reused.

        Raises:
            ImportError: If Janome is not installed.
        """
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)

        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            message = ("The Japanese tokenizer requires the Janome library: "
                       "https://github.com/mocobeta/janome")
            raise ImportError(message)
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Return a ``Doc`` of the token surfaces, none followed by a space."""
        words = []
        for token in self.tokenizer.tokenize(text):
            words.append(token.surface)
        no_spaces = [False] * len(words)
        return Doc(self.vocab, words=words, spaces=no_spaces)

    # Dummy to_bytes / from_bytes / to_disk / from_disk implementations so
    # that serialization can proceed (see #1557).
    def to_bytes(self, **exclude):
        """Return an empty byte string."""
        return bytes()

    def from_bytes(self, bytes_data, **exclude):
        """Ignore ``bytes_data`` and return this instance."""
        return self

    def to_disk(self, path, **exclude):
        """Do nothing and return ``None``."""
        return

    def from_disk(self, path, **exclude):
        """Ignore ``path`` and return this instance."""
        return self
開發者ID:AvinashGupta,項目名稱:spaCy,代碼行數:29,代碼來源:__init__.py

示例6: chunk_with_kanji

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def chunk_with_kanji(istr):
    """Return ``istr`` with a space inserted in front of each new chunk.

    Each token is classified by ``judge_jifu`` and the resulting flags are
    passed to ``insert_chunkflg``, whose output (``cflags``) marks chunk
    boundaries with ``KUGIRI``. The surfaces are then concatenated, with a
    space emitted before a token at a boundary unless that token's own flag
    is ``KUTOU``.

    Args:
        istr: Text to tokenize.

    Returns:
        The re-assembled text string.
    """
    t = Tokenizer()
    # Materialize the tokens: they are traversed twice below, which would
    # leave `surface` empty if tokenize() returns a one-shot generator.
    tokens = list(t.tokenize(istr))

    # give each element flags (jiritsu or fuzoku)
    flags = [judge_jifu(x.part_of_speech) for x in tokens]

    surface = [x.surface for x in tokens]

    # split to chunks, delimited by KUGIRI flag
    # very ugly. should be rewritten using tree structure etc.
    cflags = insert_chunkflg(flags)
    rstr = u""
    i = 0  # cursor into cflags; j (below) indexes flags / surface
    for j, f in enumerate(flags):
        if i >= len(cflags): break
        if cflags[i] == KUGIRI:
            if f == KUTOU:
                rstr += surface[j]
                i += 1
            else:
                rstr += u" "
                rstr += surface[j]
                # skip the boundary marker and this token's own entry
                i += 2
        else:
            rstr += surface[j]
            i += 1

    # Original author's note: "don't know why this is necessary".
    # When the loop above ended with j == 0 although there are several
    # tokens, every surface is appended from index 0 onwards.
    if flags != [] and j == 0 and len(surface) != 1:
        while j < len(surface):
            rstr += surface[j]
            j += 1

    return rstr
開發者ID:yawara,項目名稱:pwakati,代碼行數:37,代碼來源:decompose.py

示例7: tokenize

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def tokenize(text):
    """Turn an Aozora-Bunko style text into space-separated base forms per line.

    The header (everything up to the second run of five or more hyphens) and
    the footer (from "底本:" onwards) are dropped, ruby and bracketed editor
    annotations are removed, and each remaining line is tokenized. Only
    nouns, verbs, adjectives and symbols are kept, in base form.

    Args:
        text: Full text of the document.

    Returns:
        list: One string per line, holding the kept base forms joined by a
        single space.

    Raises:
        IndexError: If ``text`` contains fewer than two hyphen separator runs.
    """
    t = Tokenizer()
    # Strip the header and footer surrounding the body text.
    text = re.split(r'\-{5,}', text)[2]
    # The footer marker is written with a full-width colon; the ASCII colon
    # is accepted too.
    text = re.split(r'底本[::]', text)[0]
    text = text.strip()
    # Remove ruby: the start marker (full-width bar, ASCII bar kept for
    # compatibility) and the 《...》 readings.
    text = text.replace('|', '').replace('|', '')
    text = re.sub(r'《.+?》', '', text)
    # Remove in-text annotations of the form [#...]. The brackets must be
    # matched literally; an ASCII "[#.+?]" is a character class that would
    # instead delete every '#', '.', '+' and '?'.
    text = re.sub(r'[[##].+?]', '', text)
    # Process one line at a time (CRLF or bare LF).
    lines = re.split(r'\r?\n', text)
    wanted = ('名詞', '動詞', '形容詞', '記号')
    results = []
    for line in lines:
        res = []
        for tok in t.tokenize(line):
            bf = tok.base_form  # base form
            if bf == "*":
                bf = tok.surface
            # Leading field of the part-of-speech string.
            hinsi = tok.part_of_speech.split(',')[0]
            if hinsi in wanted:
                res.append(bf)
        results.append(" ".join(res))
    return results
開發者ID:kaneshirok,項目名稱:ml-plactice,代碼行數:29,代碼來源:word2vec-split.py

示例8: text_to_array_ja

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def text_to_array_ja(textdata, wordtypes):
    """Return the sorted surfaces of tokens whose leading POS is in ``wordtypes``.

    Args:
        textdata: Input passed through ``filter`` before tokenizing.
            NOTE(review): ``filter`` is called with one argument, so it is
            presumably a project helper shadowing the builtin — confirm.
        wordtypes: Container of accepted leading part-of-speech values.

    Returns:
        list: Matching token surfaces in sorted order.
    """
    cleaned = filter(textdata)
    tokenizer = Tokenizer()
    surfaces = []
    for tok in tokenizer.tokenize(cleaned):
        major_pos = tok.part_of_speech.split(',')[0]
        if major_pos in wordtypes:
            surfaces.append(tok.surface)
    surfaces.sort()
    return surfaces
開發者ID:mr-justdoit,項目名稱:harvest2ch,代碼行數:10,代碼來源:textmining.py

示例9: makekeywords

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def makekeywords(text):
    """Return the surfaces of keyword-like nouns found in ``text``.

    A token is kept when its part-of-speech string contains "名詞" (noun) and
    contains none of "数" (numeral), "非自立" (dependent) or "接尾" (suffix).

    Args:
        text: Text to tokenize.

    Returns:
        list: Surfaces of the kept tokens, in document order.
    """
    from janome.tokenizer import Tokenizer

    # The numeral tag must be the Japanese form "数" used in janome's
    # part-of-speech strings; any other variant never matches.
    excluded_tags = ("数", "非自立", "接尾")

    keywords = []
    for token in Tokenizer().tokenize(text):
        pos = token.part_of_speech
        if "名詞" in pos and not any(tag in pos for tag in excluded_tags):
            keywords.append(token.surface)
    return keywords
開發者ID:TMats,項目名稱:newsclassification,代碼行數:11,代碼來源:makekeywords.py

示例10: output_ja_text

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def output_ja_text(data, wordtypes):
    """Dump word counts for the selected parts of speech to stdout as YAML.

    Args:
        data: Input passed through ``filter`` before tokenizing.
            NOTE(review): ``filter`` is called with one argument, so it is
            presumably a project helper shadowing the builtin — confirm.
        wordtypes: Container of accepted leading part-of-speech values.

    Returns:
        Whatever ``pyaml.dump`` returns.
    """
    cleaned = filter(data)
    tokenizer = Tokenizer()
    selected = []
    for tok in tokenizer.tokenize(cleaned):
        if tok.part_of_speech.split(',')[0] in wordtypes:
            selected.append(tok.surface)
    selected.sort()
    counts = count_words(selected)
    return pyaml.dump(counts, sys.stdout, vspacing=[0, 1])
開發者ID:mr-justdoit,項目名稱:tweetanalyzer,代碼行數:11,代碼來源:textmining.py

示例11: _tokenize

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def _tokenize(text):
    """Yield a ``Token`` namedtuple for every janome token in ``text``.

    Each yielded item has three fields: ``t`` (the janome token itself),
    ``surface`` (its surface string) and ``pos`` (its part-of-speech string
    split on commas).
    """
    from collections import namedtuple

    Token = namedtuple("Token", ["t", "surface", "pos"])
    tokenizer = Tokenizer()
    for raw in tokenizer.tokenize(text):
        yield Token(t=raw, surface=raw.surface, pos=raw.part_of_speech.split(","))
開發者ID:icoxfog417,項目名稱:kanaria,代碼行數:11,代碼來源:interpreter.py

示例12: test_func

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
def test_func():
    """Print a sample sentence split into phrase-like chunks.

    Token surfaces are accumulated until a particle (助詞), auxiliary verb
    (助動詞) or symbol (記号) is reached; that token closes the chunk, which
    is then printed.
    """
    t = Tokenizer()
    temp = ""
    for token in t.tokenize(u'この腫瘍は間葉系組織から生ずると考えられ、ビメンチンを発現する。'):
        # The surface is always appended; only chunk-closing tokens flush.
        temp = temp + token.surface
        # "記号" must be the Japanese form used in janome's part-of-speech
        # strings, otherwise punctuation never closes a chunk.
        if re.search('^(助詞|助動詞|記号)', token.part_of_speech):
            print(temp)
            temp = ""
開發者ID:okkn,項目名稱:Omsoba,代碼行數:12,代碼來源:main.py

示例13: split

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
 def split(self, text):
     """Return the base form of every token in ``text``.

     Tokens whose base form is empty or ``*`` contribute their surface
     instead.
     """
     tokenizer = Tokenizer()
     words = []
     for tok in tokenizer.tokenize(text):
         surface = tok.surface    # the word exactly as segmented
         base = tok.base_form     # dictionary (base) form of the word
         words.append(surface if base in ('', '*') else base)
     return words
開發者ID:kaneshirok,項目名稱:ml-plactice,代碼行數:12,代碼來源:bayes.py

示例14: understand_move

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
    def understand_move(self, text):
        """Tokenize ``text`` and return ``(direction, distance)``.

        The full token list is handed to ``_understand_direction`` and then to
        ``_understand_distance``; their results are returned as a pair.
        """
        tokenizer = Tokenizer()
        tokens = list(tokenizer.tokenize(text))

        direction = self._understand_direction(tokens)
        distance = self._understand_distance(tokens)
        return (direction, distance)
開發者ID:icoxfog417,項目名稱:TravelPenguinBot,代碼行數:13,代碼來源:penguin_nlu.py

示例15: JapaneseTokenizer

# 需要導入模塊: from janome.tokenizer import Tokenizer [as 別名]
# 或者: from janome.tokenizer.Tokenizer import tokenize [as 別名]
class JapaneseTokenizer(object):
    """Callable that splits Japanese text with Janome and wraps it in a ``Doc``."""

    def __init__(self, cls, nlp=None):
        """Set up the vocabulary and the Janome tokenizer.

        Args:
            cls: Object providing ``create_vocab(nlp)``; used when ``nlp`` is
                ``None``.
            nlp: Optional object whose ``vocab`` attribute is reused.

        Raises:
            ImportError: If Janome is not installed.
        """
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)

        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            message = ("The Japanese tokenizer requires the Janome library: "
                       "https://github.com/mocobeta/janome")
            raise ImportError(message)
        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Return a ``Doc`` of the token surfaces, none followed by a space."""
        words = []
        for token in self.tokenizer.tokenize(text):
            words.append(token.surface)
        no_spaces = [False] * len(words)
        return Doc(self.vocab, words=words, spaces=no_spaces)
開發者ID:kunbud1989,項目名稱:spaCy,代碼行數:15,代碼來源:__init__.py


注:本文中的janome.tokenizer.Tokenizer.tokenize方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。