This article collects typical usage examples of the LabelBinarizer.transform method from Python's sklearn.preprocessing.label module. If you are wondering what LabelBinarizer.transform does, how to call it, or want worked examples, the curated code samples below should help. See also the containing class, sklearn.preprocessing.label.LabelBinarizer (note that sklearn.preprocessing.label is an internal module; the class is publicly exposed as sklearn.preprocessing.LabelBinarizer).
The sections below present four code examples of LabelBinarizer.transform, sorted by popularity by default.
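For orientation, here is a minimal sketch of the fit/transform pattern, using the public import path sklearn.preprocessing (which exposes the same class):

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb.fit(["spam", "ham", "eggs"])
print(lb.classes_)                     # ['eggs' 'ham' 'spam'] -- classes are sorted
print(lb.transform(["spam", "eggs"]))  # [[0 0 1]
                                       #  [1 0 0]] -- one indicator column per class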
Example 1: test_label_binarizer_unseen_labels
# Required import: from sklearn.preprocessing.label import LabelBinarizer
# Method under test: LabelBinarizer.transform
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.preprocessing.label import LabelBinarizer

def test_label_binarizer_unseen_labels():
    lb = LabelBinarizer()
    # Fitting on three labels yields one indicator column per class.
    expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    got = lb.fit_transform(["b", "d", "e"])
    assert_array_equal(expected, got)
    # Labels unseen at fit time ("a", "c", "f") map to all-zero rows.
    expected = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0],
                         [0, 1, 0], [0, 0, 1], [0, 0, 0]])
    got = lb.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(expected, got)
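A consequence of the behavior tested above: since unseen labels binarize to all-zero rows, the mapping is not invertible for them. A small sketch (label values chosen for illustration):

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb.fit(["b", "d", "e"])
rows = lb.transform(["a", "b"])      # "a" was never seen at fit time
print(rows)                          # [[0 0 0]
                                     #  [1 0 0]]
print(lb.inverse_transform(rows))    # the all-zero row decodes via argmax to 'b',
                                     # the first known class -- not back to 'a'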
Example 2: dbpedia_convgemb
# Required import: from sklearn.preprocessing.label import LabelBinarizer
# Method under test: LabelBinarizer.transform
# Imports assumed by Examples 2-4 (legacy Keras 1.x and scikit-learn < 0.18 APIs):
import logging
from multiprocessing import cpu_count
import numpy as np
from gensim.corpora import Dictionary
from keras.models import Sequential
from keras.layers import Embedding, Convolution1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing.label import LabelBinarizer
# get_dbpedia_data, DataframeSentences, pad_sentences, load_w2v_weights,
# CHAR_MAP and char_embedding are helpers local to the source repository.

def dbpedia_convgemb(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)

    # Fit the binarizer on the training labels only, then reuse it on the test labels.
    label_bin = LabelBinarizer()
    x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in train_docs],
                                     max_length=100, padding_word=0))
    y_train = label_bin.fit_transform(train_df.category.values)
    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                     for s in test_docs],
                                    max_length=100, padding_word=0))
    y_test = label_bin.transform(test_df.category.values)

    # Frozen pretrained word2vec weights feed a single conv + max-pool block.
    emb_weights = load_w2v_weights(vocab)
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100, dropout=.2,
                        weights=[emb_weights], trainable=False))
    model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x_train, y_train)

    # Recover class indices from the one-hot rows to compute accuracy.
    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
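pad_sentences is a helper from the source repository, not a library function. A plausible minimal equivalent, assuming it truncates each token-id list to max_length and right-pads shorter ones with padding_word (the name and signature mirror the calls above; the body is an assumption):

def pad_sentences(docs, max_length, padding_word):
    # Hypothetical stand-in: truncate to max_length, right-pad with padding_word.
    padded = []
    for doc in docs:
        doc = list(doc)[:max_length]
        padded.append(doc + [padding_word] * (max_length - len(doc)))
    return padded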
Example 3: dbpedia_smallcharconv
# Required import: from sklearn.preprocessing.label import LabelBinarizer
# Method under test: LabelBinarizer.transform
def dbpedia_smallcharconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    # Map each character to its CHAR_MAP index; unknown characters map to len(CHAR_MAP).
    train_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text]
                  for text in train_df[['title', 'abstract']]
                      .apply(lambda cols: u'\n'.join(cols), axis=1).values]
    label_bin = LabelBinarizer()
    x_train = np.array(pad_sentences(train_docs, max_length=1014,
                                     padding_word=CHAR_MAP.index(' ')))
    y_train = label_bin.fit_transform(train_df.category.values)
    test_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP) for c in text]
                 for text in test_df[['title', 'abstract']]
                     .apply(lambda cols: u'\n'.join(cols), axis=1).values]
    # Pad test docs with the same space character as the training docs
    # (the original passed padding_word=0 here, which looks like an oversight).
    x_test = np.array(pad_sentences(test_docs, max_length=1014,
                                    padding_word=CHAR_MAP.index(' ')))
    y_test = label_bin.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    # Frozen identity embedding: each character id becomes a one-hot vector.
    model.add(Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                        weights=[char_embedding()], trainable=False))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])
    print(model.summary())
    model.fit(x_train, y_train, batch_size=64, nb_epoch=5, validation_data=[x_test, y_test])
    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
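CHAR_MAP and char_embedding() are likewise repo-local. Given that the Embedding layer above is frozen with len(CHAR_MAP) + 1 rows and columns, the weights are presumably an identity matrix that one-hot encodes each character id, with the extra row covering the unknown-character id. A hypothetical reconstruction; the alphabet shown is a guess modeled on Zhang & LeCun's character-level CNNs:

import numpy as np

CHAR_MAP = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+=<>()[]{} \n"

def char_embedding():
    # Identity matrix: row i is the one-hot vector for character id i;
    # the final row belongs to the unknown-character id len(CHAR_MAP).
    return np.eye(len(CHAR_MAP) + 1, dtype=np.float32)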
Example 4: dbpedia_smallwordconv
# Required import: from sklearn.preprocessing.label import LabelBinarizer
# Method under test: LabelBinarizer.transform
def dbpedia_smallwordconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)

    # Fit on the training labels, reuse the fitted binarizer for the test labels.
    label_bin = LabelBinarizer()
    x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in train_docs],
                                     max_length=100, padding_word=0))
    y_train = label_bin.fit_transform(train_df.category.values)
    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                     for s in test_docs],
                                    max_length=100, padding_word=0))
    y_test = label_bin.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))  # trainable word embeddings this time
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])
    model.fit(x_train, y_train, batch_size=32, nb_epoch=5, validation_data=[x_test, y_test])
    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
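All of these examples recover integer class ids from the one-hot y_test with np.argwhere(y_test)[:, 1], which works because every row contains exactly one 1 and np.argwhere reports nonzero positions in row order. The fitted binarizer offers an equivalent, more explicit route; a small sketch with made-up labels:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
y = lb.fit_transform(["Animal", "Plant", "Animal", "Mineral"])

idx = np.argwhere(y)[:, 1]        # column of the 1 in each row -> [0 2 0 1]
print(lb.classes_[idx])           # ['Animal' 'Plant' 'Animal' 'Mineral']
print(lb.inverse_transform(y))    # same labels, without the index trick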