Python tokenizer.tokenize方法代碼示例

本文整理匯總了Python中tokenizer.tokenize方法的典型用法代碼示例。如果您正苦於以下問題：Python tokenizer.tokenize方法的具體用法？Python tokenizer.tokenize怎麼用？Python tokenizer.tokenize使用的例子？那麼，這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊tokenizer的用法示例。

在下文中一共展示了tokenizer.tokenize方法的8個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: calculate_batchsize_maxlen

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def calculate_batchsize_maxlen(texts):
    """ Derive a padded sequence length and a matching batch size.

    The length is the 80th percentile of the token counts of `texts`,
    rounded up to the next multiple of ten.

    # Arguments:
        texts: List of inputs; each one is passed to `tokenize`.

    # Returns:
        Batch size (250 when the length is at most 100, otherwise 50),
        max length
    """
    # Token count of every input text.
    token_counts = []
    for text in texts:
        token_counts.append(len(tokenize(text)))

    # Only the 80th percentile is considered, so the longest inputs do not
    # set the length on their own.
    percentile_80 = np.percentile(token_counts, 80.0)
    maxlen = int(math.ceil(percentile_80 / 10.0)) * 10

    # Longer sequences get a smaller batch (the original author's note:
    # this is to prevent GPU overflow).
    if maxlen <= 100:
        batch_size = 250
    else:
        batch_size = 50
    return batch_size, maxlen

開發者ID:bfelbo，項目名稱:DeepMoji，代碼行數:22，代碼來源:finetuning.py

示例2: numericTagger

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def numericTagger(instr):
    """Tag every token that contains a number with the label ``'num'``.

    A list is modified in place: each element is either a bare token or an
    already tagged tuple whose first element is the token. When the token's
    string form contains a run of Devanagari or ``\\d`` digits, the element
    is replaced by ``(token, 'num')``; all other elements are left alone.

    A string is first split with ``tok.tokenize`` (defined outside this
    block) and the resulting value is tagged the same way.

    Arguments:
        instr: list of tokens / tagged tuples, or a string to tokenize.

    Returns:
        The tagged list (the same list object when a list was passed in).
        Any other input type prints "not supported" and is returned
        unchanged.
    """
    num_match = re.compile(r'([०१२३४५६७८९]+[\.\,]*)+[०१२३४५६७८९]+|([-+]*\d+[\.\,]*)+\d+|([०१२३४५६७८९]+|\d+)')

    if isinstance(instr, list):
        for index, item in enumerate(instr):
            # For a tagged tuple the token is element 0. The previous code
            # matched on element 0 but then stored element 1 (the old tag)
            # as the token, which dropped the word itself.
            word = item[0] if isinstance(item, tuple) else item
            if num_match.search(str(word)):
                instr[index] = (word, 'num')
        return instr

    if isinstance(instr, str):
        # Tokenize, then tag the token list via the list branch above.
        return numericTagger(tok.tokenize(instr))

    print("not supported")
    return instr

開發者ID:SangitaNLP，項目名稱:sangita，代碼行數:23，代碼來源:genderclassifer.py

示例3: get_words

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def get_words(self, sentence):
    """ Turn one sentence into a list of processed words.

        The sentence is stripped and lower-cased, optionally passed through
        `convert_linebreaks` and `remove_variation_selectors` (depending on
        the `break_replacement` / `remove_variation_selectors` options on
        self), split on whitespace, and every piece is handed to
        `self.convert_unicode_word`. If any piece is rejected there, an
        empty list is returned. Otherwise the converted pieces are joined
        again, tokenized with `tokenize` and each token is run through
        `process_word`.

        Raises ValueError when `sentence` is not a `unicode` object.
    """
    if not isinstance(sentence, unicode):
        raise ValueError("All sentences should be Unicode-encoded!")

    text = sentence.strip().lower()
    if self.break_replacement:
        text = convert_linebreaks(text)
    if self.remove_variation_selectors:
        text = remove_variation_selectors(text)

    # Plain whitespace splitting comes first so that the per-word Unicode
    # conversion happens before the real tokenizer sees the text (the
    # original author's note: this avoids word-splitting issues with
    # twokenize and Unicode).
    accepted = []
    for raw_word in text.split():
        is_allowed, converted = self.convert_unicode_word(raw_word)
        if not is_allowed:
            # A disallowed Unicode word rejects the whole sentence.
            return []
        accepted.append(converted)

    rebuilt = ' '.join(accepted)
    return [process_word(token) for token in tokenize(rebuilt)]

開發者ID:bfelbo，項目名稱:DeepMoji，代碼行數:37，代碼來源:word_generator.py

示例4: process_questions

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def process_questions(questions, return_score_modifiers=False):
    """Run inference for one or more questions and score the answers.

    Arguments:
        questions: a single question or a list of questions; a non-list
            value is wrapped in a one-element list.
        return_score_modifiers: when true, each result dict also carries
            the 'score_modifiers' entry produced by `score_answers`.

    Returns:
        A list with one entry per element of the inference output: None
        for a question that was empty after stripping, otherwise a dict
        with the keys 'answers', 'scores', 'best_index', 'best_score'
        (and 'score_modifiers' when requested).
    """
    empty_marker = '##emptyquestion##'

    # Accept a bare question as well as a list of them.
    if not isinstance(questions, list):
        questions = [questions]

    # Strip each question; non-empty ones are tokenized and BPE-encoded,
    # empty ones are replaced by the marker string.
    prepared_questions = []
    for raw_question in questions:
        stripped = raw_question.strip()
        if stripped:
            prepared_questions.append(apply_bpe(tokenize(stripped)))
        else:
            prepared_questions.append(empty_marker)

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Post-process and score every answer set, in inference-output order.
    results = []
    for index, answers in enumerate(answers_list):
        answers = normalize_new_lines(replace_in_answers(detokenize(answers)))
        answers_score = score_answers(questions[index], answers)
        best_index, best_score = get_best_score(answers_score['score'])

        # Empty questions are still scored above, but report no result.
        if prepared_questions[index] == empty_marker:
            results.append(None)
            continue

        entry = {
            'answers': answers,
            'scores': answers_score['score'],
            'best_index': best_index,
            'best_score': best_score,
        }
        if return_score_modifiers:
            entry['score_modifiers'] = answers_score['score_modifiers']
        results.append(entry)

    return results

# interactive mode

開發者ID:daniel-kukiela，項目名稱:nmt-chatbot，代碼行數:36，代碼來源:inference.py

示例5: defaultTagger

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def defaultTagger(instr):
    """Give every still-untagged token the fallback tag ``'any'``.

    A list is modified in place: each element that is not a tuple is
    replaced by ``(element, 'any')``; tuples are left as they are. A
    string is first split with ``tok.tokenize`` (defined outside this
    block) and the result is tagged the same way. Any other input prints
    "not supported".

    Returns:
        The list that was tagged, or the unchanged input when its type is
        not supported.
    """
    if type(instr) is list:
        for position, entry in enumerate(instr):
            if type(entry) is not tuple:
                instr[position] = (entry, 'any')
        return instr

    if type(instr) is str:
        tokens = tok.tokenize(instr)
        # The nested call tags `tokens` in place; its return value is not used.
        defaultTagger(tokens)
        return tokens

    print("not supported")
    return instr

開發者ID:SangitaNLP，項目名稱:sangita，代碼行數:17，代碼來源:genderclassifer.py

示例6: lookupTagger

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def lookupTagger(instr):
    """Tag tokens with the gender listed for them in the lexicon.

    The lexicon comes from ``gndr.drawlist()`` (defined outside this
    block), which is read here as an iterable of tab-separated lines whose
    first field is the word. A line with exactly two fields supplies the
    gender in its second field; a line with more than two fields is
    recorded as ``'any'``. Lines without a tab carry no gender and are
    skipped. When a word occurs on several lines, the first one wins.

    A list is modified in place: each element is either a bare token or an
    already tagged tuple whose first element is the token. Elements whose
    token is in the lexicon are replaced by ``(token, gender)``; the rest
    are left alone. A string is first split with ``tok.tokenize`` (also
    external) and the result is tagged the same way.

    Arguments:
        instr: list of tokens / tagged tuples, or a string to tokenize.

    Returns:
        The tagged list (the same list object when a list was passed in).
        Any other input type prints "not supported" and is returned
        unchanged.
    """
    # Build one word -> gender mapping so each token costs a single dict
    # lookup instead of a linear search through the word list.
    gender_by_word = {}
    for entry in gndr.drawlist():
        fields = entry.split("\t")
        if len(fields) < 2:
            # No gender column on this line; nothing to record.
            continue
        gender = "any" if len(fields) > 2 else fields[1]
        # setdefault keeps the first occurrence of a word.
        gender_by_word.setdefault(fields[0], gender)

    if isinstance(instr, list):
        for index, item in enumerate(instr):
            if isinstance(item, tuple):
                if not item:
                    continue
                # The previous code looked the whole tuple up in a set of
                # strings (which can never match) and would have stored
                # element 1 as the token; the token is element 0.
                word = item[0]
            else:
                word = item
            if word in gender_by_word:
                instr[index] = (word, gender_by_word[word])
        return instr

    if isinstance(instr, str):
        # Tokenize, then tag the token list via the list branch above.
        return lookupTagger(tok.tokenize(instr))

    print("not supported")
    return instr

開發者ID:SangitaNLP，項目名稱:sangita，代碼行數:39，代碼來源:genderclassifer.py

示例7: parse_paragraph

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def parse_paragraph(parag, mim_tags, fast_p):
    """ Tokenize one free-text paragraph, parse it and print a summary.

    The token list is handed to `parse_tokens` together with `mim_tags`
    and `fast_p`; the paragraph and the "num_sent" / "num_parsed_sent"
    entries of the result are then printed. Nothing is returned.
    """
    token_list = list(tokenize(parag))
    outcome = parse_tokens(token_list, mim_tags, fast_p)
    summary = "{0}\n--> {1} sentences, {2} parsed".format(
        parag, outcome["num_sent"], outcome["num_parsed_sent"]
    )
    print(summary)

開發者ID:mideind，項目名稱:Greynir，代碼行數:9，代碼來源:mim.py

示例8: get_tokens_iterator

# 需要導入模塊: import tokenizer [as 別名]
# 或者: from tokenizer import tokenize [as 別名]
def get_tokens_iterator(tagger, iter_docs):
    """Build a generator function that tokenizes documents one at a time.

    Arguments:
        tagger: passed to ``tokenizer.tokenize`` as the ``tagger`` keyword.
        iter_docs: zero-argument callable returning an iterable of documents.

    Returns:
        A zero-argument generator function; each call starts a new pass
        over ``iter_docs()`` and yields the tokenizer result per document.
    """
    # Bind the tagger once so the generator below only has to supply the doc.
    tokenize_with_tagger = partial(tokenizer.tokenize, tagger=tagger)

    def iter_tokens():
        documents = iter_docs()
        for document in documents:
            yield tokenize_with_tagger(document)

    return iter_tokens

開發者ID:shiroyagicorp，項目名稱:japanese-word2vec-model-builder，代碼行數:10，代碼來源:main.py

注：本文中的tokenizer.tokenize方法示例由純淨天空整理自GitHub/MSDocs等開源代碼及文檔管理平台，相關代碼片段篩選自各路編程大神貢獻的開源項目，源碼版權歸原作者所有，傳播和使用請參考對應項目的License；未經允許，請勿轉載。