

Python text.Tokenizer Method Code Examples

This article collects typical usage examples of the keras.preprocessing.text.Tokenizer method in Python. If you are wondering what text.Tokenizer does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples from the containing module, keras.preprocessing.text.


The following shows 14 code examples of the text.Tokenizer method, sorted by popularity.
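
Before the individual examples, here is a minimal usage sketch (not taken from any of the projects below, written against the standard Keras API) of the fit_on_texts / texts_to_sequences workflow that most of the snippets follow:

from keras.preprocessing.text import Tokenizer

texts = ['The cat sat on the mat.', 'The dog sat on the log.']
tokenizer = Tokenizer(num_words=100)        # keep at most the 100 most frequent words
tokenizer.fit_on_texts(texts)               # build the word -> index vocabulary
print(tokenizer.word_index)                 # e.g. {'the': 1, 'sat': 2, 'on': 3, ...}
print(tokenizer.texts_to_sequences(texts))  # each text becomes a list of 1-based word indices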

Example 1: doc_vec_feature

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def doc_vec_feature(self, data_set, max_sentences=16):
        from keras.preprocessing.text import Tokenizer, text_to_word_sequence
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(data_set)
        data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
        sentence_symbols = "".join(self.sentence_symbol)
        split = "[" + sentence_symbols + "]"
        for i, sentence in enumerate(data_set):
            short_sents = re.split(split, sentence)
            for j, sent in enumerate(short_sents):
                if j < max_sentences and sent.strip():
                    words = text_to_word_sequence(sent)
                    k = 0
                    for w in words:
                        if k < self.max_len:
                            if w in tokenizer.word_index:
                                data_feature[i, j, k] = tokenizer.word_index[w]
                            k += 1
        word_index = tokenizer.word_index
        logger.info('Number of Unique Tokens: %d' % len(word_index))
        print('Shape of Data Tensor:', data_feature.shape)
        return data_feature 
Developer: shibing624, Project: text-classifier, Lines of code: 24, Source file: feature.py

Example 2: _handle_rare_words

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def _handle_rare_words(self, captions):
        if self._rare_words_handling == 'nothing':
            return captions
        elif self._rare_words_handling == 'discard':
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(captions)
            new_captions = []
            for caption in captions:
                words = text_to_word_sequence(caption)
                new_words = [w for w in words
                             if tokenizer.word_counts.get(w, 0) >=
                             self._words_min_occur]
                new_captions.append(' '.join(new_words))
            return new_captions

        raise NotImplementedError('rare_words_handling={} is not implemented '
                                  'yet!'.format(self._rare_words_handling)) 
Developer: danieljl, Project: keras-image-captioning, Lines of code: 19, Source file: preprocessors.py

Example 3: main

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_data_path', default='../data/train.json', type=str, required=False, help='path to the raw training corpus')
    parser.add_argument('--vocab_file', default='vocab_processed.txt', type=str, required=False, help='path of the generated vocab file')
    parser.add_argument('--vocab_size', default=50000, type=int, required=False, help='vocabulary size')
    args = parser.parse_args()

    lac = thulac.thulac(seg_only=True)
    tokenizer = Tokenizer(num_words=args.vocab_size)
    print('args:\n' + args.__repr__())
    print('This script is extremely slow especially for large corpus. Take a break.')

    with open(args.raw_data_path, 'r') as f:
        lines = json.load(f)
    for i, line in enumerate(tqdm(lines)):
        lines[i] = lac.cut(line, text=True)

    tokenizer.fit_on_texts(lines)
    vocab = list(tokenizer.index_word.values())
    pre = ['[SEP]', '[CLS]', '[MASK]', '[PAD]', '[UNK]']
    vocab = pre + vocab
    with open(args.vocab_file, 'w') as f:
        for word in vocab[:args.vocab_size + 5]:
            f.write(word + '\n') 
Developer: Morizeyao, Project: GPT2-Chinese, Lines of code: 26, Source file: make_vocab.py

Example 4: create_embedding_matrix

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    """
    Create embedding matrix containing word indexes and respective vectors from word vectors
    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object containing word indexes
        word_vectors (dict): dict containing word and their respective vectors
        embedding_dim (int): dimension of word vector

    Returns:
        embedding_matrix (numpy.ndarray): array of shape (nb_words, embedding_dim)
            whose i-th row is the pre-trained vector for the word with index i
    """
    nb_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    for word, i in word_index.items():
        try:
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            print("vector not found for word - %s" % word)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix 
Developer: amansrivastava17, Project: lstm-siamese-text-similarity, Lines of code: 26, Source file: inputHandler.py
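
A hedged usage sketch (not part of the original project): the matrix returned by create_embedding_matrix is typically fed to a Keras Embedding layer with frozen weights. Here `tokenizer`, `word_vectors` and `embedding_dim` are assumed to be defined as in the function above.

from keras.layers import Embedding

embedding_matrix = create_embedding_matrix(tokenizer, word_vectors, embedding_dim)
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],   # nb_words
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],            # initialize with the pre-trained vectors
                            trainable=False)                       # keep the vectors fixed during training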

Example 5: word_embed_meta_data

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def word_embed_meta_data(documents, embedding_dim):
    """
    Fit a Keras tokenizer on the given documents and build the word-embedding matrix
    Args:
        documents (list): list of documents
        embedding_dim (int): embedding dimension
    Returns:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        embedding_matrix (numpy.ndarray): matrix mapping word indexes to their vectors
    """
    documents = [x.lower().split() for x in documents]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix 
Developer: amansrivastava17, Project: lstm-siamese-text-similarity, Lines of code: 20, Source file: inputHandler.py

Example 6: create_test_data

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create test dataset from sentence pairs
    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuples of sentence pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (numpy.ndarray): padded index sequences for the first sentences
        test_data_2 (numpy.ndarray): padded index sequences for the second sentences
        leaks_test (numpy.ndarray): leaky features (unique-token counts and overlap) per pair
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test 
Developer: amansrivastava17, Project: lstm-siamese-text-similarity, Lines of code: 27, Source file: inputHandler.py
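
A hypothetical end-to-end call sequence for the three inputHandler.py helpers shown above (Examples 4-6); `documents`, `test_pairs`, `embedding_dim` and `max_sequence_length` are assumed to be provided by the caller:

# fit the tokenizer and build the embedding matrix, then vectorize the test pairs
tokenizer, embedding_matrix = word_embed_meta_data(documents, embedding_dim)
test_data_1, test_data_2, leaks_test = create_test_data(tokenizer, test_pairs, max_sequence_length)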

Example 7: test_tokenizer

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode) 
Developer: hello-sea, Project: DeepLearning_Wavelet-LSTM, Lines of code: 19, Source file: text_test.py

Example 8: test_sequential_fit

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def test_sequential_fit():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    word_sequences = [
        ['The', 'cat', 'is', 'sitting'],
        ['The', 'dog', 'is', 'standing']
    ]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    assert tokenizer.document_count == 5

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences) 
Developer: hello-sea, Project: DeepLearning_Wavelet-LSTM, Lines of code: 19, Source file: text_test.py

Example 9: test_tokenizer_oov_flag

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def test_tokenizer_oov_flag():
    """
    Test of Out of Vocabulary (OOV) flag in Tokenizer
    """
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 4  # discards 2 OOVs

    # With OOV feature
    tokenizer = Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 6  # OOVs marked in place 
Developer: hello-sea, Project: DeepLearning_Wavelet-LSTM, Lines of code: 20, Source file: text_test.py

Example 10: train_wordtoVect

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def train_wordtoVect(train_inputTexts):
    """
    Tokenize the training texts and build padded word-index sequences (word-vector preprocessing)
    """
    texts=[]
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))
        d =" ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()                            # tokenizer (MAX_NB_WORDS can be passed as num_words)
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)    # limited by num_words
    word_index = tokenizer.word_index                       # word -> index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return word_index, data 
Developer: jarvisqi, Project: deep_learning, Lines of code: 18, Source file: textAnalysis.py

Example 11: _transform_request

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def _transform_request(request):
    request_str = request.decode('utf-8')

    # tokenize the csv request and create json
    X = pandas.read_csv(io.StringIO(request_str), engine='python', quotechar='|', header=None).values[:,0]
    for index, item in enumerate(X):
        reqJson = json.loads(item, object_pairs_hook=OrderedDict)
        del reqJson['http']['timestamp']
        del reqJson['http']['headers']
        del reqJson['http']['source']
        del reqJson['http']['route']
        del reqJson['http']['responsePayload']
        X[index] = json.dumps(reqJson, separators=(',', ':'))

    tokenizer = Tokenizer(filters='\t\n', char_level=True)
    tokenizer.fit_on_texts(X)
    # this used to be [log_entry]
    seq = tokenizer.texts_to_sequences([request_str])
    max_log_length = 1024
    log_entry_processed = sequence.pad_sequences(seq, maxlen=max_log_length)

    return log_entry_processed 
Developer: PipelineAI, Project: models, Lines of code: 24, Source file: pipeline_invoke_python.py

Example 12: load_retures_keras

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def load_retures_keras():
    from keras.preprocessing.text import Tokenizer
    from keras.datasets import reuters
    max_words = 1000

    print('Loading data...')
    (x, y), (_, _) = reuters.load_data(num_words=max_words, test_split=0.)
    print(len(x), 'train sequences')

    num_classes = np.max(y) + 1
    print(num_classes, 'classes')

    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x = tokenizer.sequences_to_matrix(x, mode='binary')
    print('x_train shape:', x.shape)

    return x.astype(float), y 
Developer: XifengGuo, Project: DEC-keras, Lines of code: 20, Source file: datasets.py

Example 13: load_imdb

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def load_imdb():
    from keras.preprocessing.text import Tokenizer
    from keras.datasets import imdb
    max_words = 1000

    print('Loading data...')
    (x1, y1), (x2, y2) = imdb.load_data(num_words=max_words)
    x = np.concatenate((x1, x2))
    y = np.concatenate((y1, y2))
    print(len(x), 'train sequences')

    num_classes = np.max(y) + 1
    print(num_classes, 'classes')

    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x = tokenizer.sequences_to_matrix(x, mode='binary')
    print('x_train shape:', x.shape)

    return x.astype(float), y 
Developer: XifengGuo, Project: DEC-keras, Lines of code: 22, Source file: datasets.py

Example 14: create_tokenizer

# Required import: from keras.preprocessing import text [as alias]
# Or: from keras.preprocessing.text import Tokenizer [as alias]
def create_tokenizer(captions):
	lines = to_lines(captions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# Calculate the length of the captions with the most words 
Developer: dabasajay, Project: Image-Caption-Generator, Lines of code: 9, Source file: load_data.py
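
The trailing comment points at the next helper in the source file; a minimal sketch of that step, using a hypothetical max_caption_length name and assuming the same to_lines helper used by create_tokenizer, might look like this:

def max_caption_length(captions):
	# length, in whitespace-separated words, of the longest caption
	lines = to_lines(captions)
	return max(len(line.split()) for line in lines)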


Note: The keras.preprocessing.text.Tokenizer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors; consult each project's License before redistributing or reusing the code. Do not reproduce without permission.