本文整理汇总了Python中sklearn.preprocessing.MultiLabelBinarizer类的典型用法代码示例。如果您正苦于以下问题：Python preprocessing.MultiLabelBinarizer类的具体用法？Python preprocessing.MultiLabelBinarizer怎么用？Python preprocessing.MultiLabelBinarizer使用的例子？那么恭喜您，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块sklearn.preprocessing的用法示例。
在下文中一共展示了preprocessing.MultiLabelBinarizer类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
    """Load the test metadata and allocate the prediction buffers.

    The metadata CSV at ``IRMAS_TESTING_META_PATH`` is read with explicitly
    supplied column names:

    filename : string
    class_ids: string of ints with space as a delimiter

    Args:
        model_module: Object whose ``BASE_NAME`` attribute selects the
            feature directory and the stored dataset mean.
        weights_path: Stored unchanged on the instance.
        evaluation_strategy: Stored unchanged on the instance.
    """
    meta = pd.read_csv(IRMAS_TESTING_META_PATH, names=["filename", "class_ids"])
    self.X = list(meta["filename"])

    # One list of integer class ids per metadata row.
    targets = []
    for raw_ids in meta["class_ids"]:
        targets.append([int(class_id) for class_id in raw_ids.split()])

    self.ml_binarizer = MultiLabelBinarizer().fit(targets)
    self.y_true = self.ml_binarizer.transform(targets)

    # All prediction buffers share the shape of the binarized ground truth.
    buffer_shape = self.y_true.shape
    self.y_pred = np.zeros(shape=buffer_shape)
    self.y_pred_raw = np.zeros(shape=buffer_shape)
    self.y_pred_raw_average = np.zeros(shape=buffer_shape)

    self.model_module = model_module
    self.weights_path = weights_path

    feature_dir = os.path.join(IRMAS_TEST_FEATURE_BASEPATH, model_module.BASE_NAME)
    self.feature_filenames = os.listdir(feature_dir)
    mean_file = "{}_mean.npy".format(model_module.BASE_NAME)
    self.dataset_mean = np.load(os.path.join(MODEL_MEANS_BASEPATH, mean_file))

    self.evaluation_strategy = evaluation_strategy
    self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
    self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
示例2: cat_onehot_encoder_m
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def cat_onehot_encoder_m(df, y, col, selection=True):
    """Multi-hot encode ``df`` into a sparse matrix and optionally score it.

    Args:
        df: Object exposing ``.values``; the rows of ``df.values`` are handed
            to ``MultiLabelBinarizer`` as the label collections.
        y: Target passed to ``train_lightgbm_for_feature_selection``.
        col: Column name; used only in the printed AUC message.
        selection: The scoring step runs only when this is exactly ``True``
            (identity check, kept from the original behaviour).

    Returns:
        Tuple ``(new_feature, mlbs, models, auc_score)``: the float CSR
        feature matrix, the fitted binarizer, and the model(s) / AUC returned
        by the selection helper (both ``None`` when selection did not run).
    """
    from scipy.sparse import csr_matrix

    rows = df.values
    mlbs = MultiLabelBinarizer(sparse_output=True).fit(rows)
    # csr_matrix(...) is already in CSR format, so no extra conversion is needed.
    new_feature = csr_matrix(mlbs.transform(rows), dtype=float)

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(new_feature, y)
        print(col, "auc", auc_score)

    return new_feature, mlbs, models, auc_score
示例3: setUp
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def setUp(self):
    """Prepare the ``DataSource`` fixture from the raw OverFeat label file.

    The 'Labels' column is parsed with ``ast.literal_eval``, checked to hold
    five entries per row, and replaced by one indicator column per class.
    Sets ``self.SENS``, ``self.EXPL``, ``self.data`` and ``self.TARGET``.
    """
    source_path = "../data/images/overfeat_raw.txt"
    data = prepare.data_from_csv(source_path, sep='\\t')
    label_column = 'Labels'
    self.SENS = ['Race']
    self.EXPL = []

    parsed_labels = [ast.literal_eval(cell) for cell in data[label_column]]
    for entry in parsed_labels:
        assert len(entry) == 5

    encoder = preprocessing.MultiLabelBinarizer()
    indicator = encoder.fit_transform(parsed_labels)
    class_names = encoder.classes_
    label_frame = pd.DataFrame(indicator, columns=class_names)

    # Swap the raw label column for the per-class indicator columns.
    without_labels = data.drop(label_column, axis=1)
    self.data = DataSource(pd.concat([without_labels, label_frame], axis=1))
    self.TARGET = class_names.tolist()
示例4: evaluate
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def evaluate(self, preds):
    """Score predictions against the reference labels in ``self.label_dict``.

    Args:
        preds: Mapping from source id to its predicted label collection.
            Every key must also be present in ``self.label_dict``.

    Returns:
        dict with keys ``'accuracy'`` (from ``eval_func.sequential_accuracy``),
        ``'f1'`` (weighted F1) and ``'macro_f1'`` (macro F1).
    """
    # Build the aligned true/predicted lists once and reuse them for every metric.
    srcids = list(preds.keys())
    true = [self.label_dict[srcid] for srcid in srcids]
    pred = [preds[srcid] for srcid in srcids]

    acc = eval_func.sequential_accuracy(true, pred)

    # Fit on both sides so labels that occur only in predictions (or only in
    # the references) still get their own column.
    mlb = MultiLabelBinarizer()
    mlb.fit(pred + true)
    encoded_true = mlb.transform(true)
    encoded_pred = mlb.transform(pred)

    macro_f1 = f1_score(encoded_true, encoded_pred, average='macro')
    f1 = f1_score(encoded_true, encoded_pred, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'macro_f1': macro_f1,
    }
示例5: feature_vectorizer
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def feature_vectorizer(X_train, X_test, y_train, y_test):
    """Turn text into TF-IDF features and labels into a multi-hot matrix.

    Both transformers are fitted on the training split only, and each one is
    handed to ``_save_data`` immediately after fitting.

    Returns:
        Tuple ``(X_train_features, X_test_features, y_train_features,
        y_test_features)``.
    """
    tfidf_options = dict(
        analyzer="word",
        min_df=0.0,
        max_df=1.0,
        strip_accents=None,
        encoding="utf-8",
        preprocessor=None,
        token_pattern=r"(?u)\S\S+",
        max_features=1000,
    )
    tfidf = TfidfVectorizer(**tfidf_options)
    # The test split is never shown to fit().
    tfidf.fit(X_train)
    _save_data(tfidf, "/workdir/models/X_vectorizer.pk")
    X_train_features, X_test_features = (
        tfidf.transform(split) for split in (X_train, X_test)
    )

    # Multi-hot encoding of the label collections, again fitted on train only.
    label_binarizer = MultiLabelBinarizer()
    label_binarizer.fit(y_train)
    _save_data(label_binarizer, "/workdir/models/label_binarizer.pk")
    y_train_features, y_test_features = (
        label_binarizer.transform(split) for split in (y_train, y_test)
    )

    return X_train_features, X_test_features, y_train_features, y_test_features
示例6: _build_label_dict
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def _build_label_dict(self, labels: List[str]):
    """Build the label/index lookup tables and the multi-label binarizer.

    Args:
        labels: When ``self.multi_label`` is true, an iterable whose items
            are themselves iterables of labels; otherwise an iterable of
            labels.

    Sets ``self.label2idx`` (label -> index, indices assigned in sorted label
    order), ``self.idx2label`` (the inverse mapping),
    ``self.dataset_info['label_count']`` and ``self.multi_label_binarizer``
    (constructed with the classes in index order; it is not fitted here).
    """
    from sklearn.preprocessing import MultiLabelBinarizer

    if self.multi_label:
        # Update one set in place instead of rebuilding it for every sample.
        label_set = set()
        for sample_labels in labels:
            label_set.update(sample_labels)
    else:
        label_set = set(labels)

    self.label2idx = {label: idx for idx, label in enumerate(sorted(label_set))}
    self.idx2label = {idx: label for label, idx in self.label2idx.items()}
    self.dataset_info['label_count'] = len(self.label2idx)
    self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx))
示例7: text_similarity
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def text_similarity(df, col):
    """Encode each distinct string of ``df[col]`` by the characters it contains.

    Every distinct value is mapped to the list of code points (``ord``) of
    its characters, and a ``MultiLabelBinarizer`` turns those lists into one
    indicator column per code point, named ``<col>_<code point>``.
    This can be useful when similarity between strings is significant.

    Returns:
        DataFrame indexed by the distinct values of ``col`` with one
        indicator column per code point seen.
    """
    distinct = pd.DataFrame(df[col].unique(), columns=[col])
    code_points = distinct[col].apply(lambda text: [ord(ch) for ch in text])

    binarizer = preprocessing.MultiLabelBinarizer()
    indicator = pd.DataFrame(
        binarizer.fit_transform(code_points),
        columns=binarizer.classes_,
        index=distinct.index,
    )
    indicator = indicator.add_prefix(col + "_")

    return distinct.join(indicator).set_index(col)
示例8: prepVect
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    """Fit the caption tokenizer and, optionally, a multi-label binarizer.

    Args:
        min_df: Forwarded to ``Tokenizer``.
        max_features: Forwarded to ``Tokenizer``.
        n_captions: Forwarded to ``coco`` and ``sampleCaptions``.
        n_sbu: When truthy, the second value returned by
            ``sbuXYFilenames(n_sbu)`` is appended to the COCO training targets.
        multilabel: When truthy, a ``MultiLabelBinarizer`` is also fitted on
            the tokenized captions.

    Returns:
        ``vect`` alone, or the tuple ``(vect, mlb)`` when ``multilabel`` is
        truthy.
    """
    # The parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3, unlike the bare print statement.
    print("prepping the Word Tokenizer...")

    _, _, trY, _ = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _, sbuY, _ = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)

    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)

    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    return vect
示例9: test_objectmapper
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def test_objectmapper(self):
    """Check that ``ModelFrame.preprocessing`` exposes the very same objects as ``pp``."""
    df = pdml.ModelFrame([])
    mapped_names = (
        'Binarizer',
        'FunctionTransformer',
        'Imputer',
        'KernelCenterer',
        'LabelBinarizer',
        'LabelEncoder',
        'MultiLabelBinarizer',
        'MaxAbsScaler',
        'MinMaxScaler',
        'Normalizer',
        'OneHotEncoder',
        'PolynomialFeatures',
        'RobustScaler',
        'StandardScaler',
    )
    # Identity (not equality) is asserted, in the listed order.
    for name in mapped_names:
        self.assertIs(getattr(df.preprocessing, name), getattr(pp, name))
示例10: NodeClassification
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def NodeClassification(embedding_look_up, node_list, labels, testing_ratio, seed):
    """Train a one-vs-rest logistic regression on node embeddings and score it.

    Args:
        embedding_look_up: Forwarded to ``split_train_test_classify``.
        node_list: Forwarded to ``split_train_test_classify``.
        labels: Forwarded to ``split_train_test_classify``.
        testing_ratio: Forwarded to ``split_train_test_classify``.
        seed: Used for the split and as the classifier's ``random_state``.

    Returns:
        Tuple ``(accuracy, micro_f1, macro_f1)`` on the test split. The same
        numbers are also printed.
    """
    X_train, y_train, X_test, y_test = split_train_test_classify(
        embedding_look_up, node_list, labels,
        testing_ratio=testing_ratio, seed=seed)

    binarizer = MultiLabelBinarizer(sparse_output=True)
    # Fit on the label collections of both splits so every class gets a
    # column. Plain concatenation keeps each sample's collection intact;
    # np.append flattens equal-length label lists into scalars and cannot
    # build an array from ragged ones on recent NumPy.
    y_all = list(y_train) + list(y_test)
    binarizer.fit(y_all)
    # Dense ndarrays rather than np.matrix, which recent scikit-learn
    # releases reject as estimator input.
    y_train = binarizer.transform(y_train).toarray()
    y_test = binarizer.transform(y_test).toarray()

    model = OneVsRestClassifier(LogisticRegression(random_state=seed, solver='lbfgs'))
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)

    ## small trick : we assume that we know how many label to predict
    y_pred = get_y_pred(y_test, y_pred_prob)

    accuracy = accuracy_score(y_test, y_pred)
    micro_f1 = f1_score(y_test, y_pred, average="micro")
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    print('#' * 9 + ' Node Classification Performance ' + '#' * 9)
    print(f'Accuracy: {accuracy:.3f}, Micro-F1: {micro_f1:.3f}, Macro-F1: {macro_f1:.3f}')
    print('#' * 50)

    return accuracy, micro_f1, macro_f1
示例11: build_input_label_data
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def build_input_label_data(labels, class_order):
    """Convert label collections into per-sample lists of class indices.

    Args:
        labels: Sized sequence of label collections, one per sample.
        class_order: Class ordering handed to ``MultiLabelBinarizer``; the
            returned indices are column positions in the binarized matrix.

    Returns:
        List with ``len(labels)`` entries; entry ``i`` lists the column
        indices of the non-zero cells in row ``i`` (empty when the row has
        none).
    """
    from sklearn.preprocessing import MultiLabelBinarizer

    bml = MultiLabelBinarizer(classes=class_order, sparse_output=True)
    # Presumably ``sp`` is scipy.sparse, whose find() returns
    # (row indices, column indices, values) - TODO confirm against the imports.
    indexes = sp.find(bml.fit_transform(labels))

    y = [[] for _ in range(len(labels))]
    for row, column in zip(indexes[0], indexes[1]):
        y[row].append(column)
    return y
# padding operation
# =========================================================
示例12: __init__
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Bundle everything needed to run an experiment into one object.

    The instance is a simple bag of items so that all of them can be
    serialized and deserialized together.

    Args:
        inputs: The raw model inputs. This can be set to None if you do not
            want to serialize this value when you save the dataset.
        labels: The raw output labels. The type of the first label decides
            which encoder is used: a set, list or tuple selects
            ``MultiLabelBinarizer``, anything else ``LabelBinarizer``.
        test_indices: The optional test indices to use. Ideally these are
            generated once and reused across experiments so that results stay
            comparable. `generate_test_indices` can be used to generate them
            the first time.
        **kwargs: Additional key value items, each stored as an attribute.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)

    for attr_name, attr_value in kwargs.items():
        setattr(self, attr_name, attr_value)

    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices

    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    if self.is_multi_label:
        self.label_encoder = MultiLabelBinarizer()
    else:
        self.label_encoder = LabelBinarizer()

    # NOTE(review): flatten() turns the encoder output into a 1-D array; for
    # multi-label data that drops the per-sample rows - confirm this is intended.
    encoded = self.label_encoder.fit_transform(self.y)
    self.y = encoded.flatten()
示例13: __init__
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def __init__(self, vectors, clf):
    """Keep the embeddings and set up the classifier wrapper and binarizer.

    Args:
        vectors: Stored unchanged as ``self.embeddings``.
        clf: Classifier that gets wrapped in ``TopKRanker``.
    """
    self.embeddings = vectors
    ranker = TopKRanker(clf)
    self.clf = ranker
    # The binarizer is configured for sparse output; it is not fitted here.
    label_binarizer = MultiLabelBinarizer(sparse_output=True)
    self.binarizer = label_binarizer
示例14: setUp
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def setUp(self):
    """Load the yeast training set next to this file as plain Python lists.

    Sets ``self.X`` (dense feature rows), ``self.y`` (binarized multi-label
    targets) and ``self.quota``.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(here, 'datasets/yeast_train.svm')

    features, label_sets = load_svmlight_file(dataset_filepath, multilabel=True)
    dense_features = features.todense()
    self.X = dense_features.tolist()

    indicator = MultiLabelBinarizer().fit_transform(label_sets)
    self.y = indicator.tolist()
    self.quota = 10
示例15: main
# 需要导入模块: from sklearn import preprocessing [as 别名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
def main(argv=sys.argv):
    """Run the discovery experiment on the recommender data set.

    Loads the recommendations file, replaces the 'Types' column by one
    indicator column per class, then trains, tests and reports a single
    ``Discovery`` investigation with 'Gender' as the sensitive attribute.

    Args:
        argv: Command-line arguments; ``usage(argv)`` is called unless exactly
            one element is present.
    """
    if len(argv) != 1:
        usage(argv)

    input_path = "../../../data/recommender/recommendations.txt"
    output_dir = "."
    dropped_columns = ['RMSE', 'Avg Movie Age',
                       'Avg Recommended Rating',
                       'Avg Seen Rating', 'Occupation']
    data = prepare.data_from_csv(input_path, sep='\\t', to_drop=dropped_columns)

    label_column = 'Types'
    sensitive = ['Gender']
    explanatory = []

    parsed_labels = [ast.literal_eval(cell) for cell in data[label_column]]
    for entry in parsed_labels:
        assert len(entry) == 5

    encoder = preprocessing.MultiLabelBinarizer()
    indicator = encoder.fit_transform(parsed_labels)
    class_names = encoder.classes_
    label_frame = pd.DataFrame(indicator, columns=class_names)
    # Swap the raw label column for the per-class indicator columns.
    data = pd.concat([data.drop(label_column, axis=1), label_frame], axis=1)
    targets = class_names.tolist()

    data_source = DataSource(data)

    # Instantiate the experiment
    experiment = Discovery(data_source, sensitive, targets, explanatory,
                           topk=10, random_state=0)

    # Train the classifier
    train([experiment])

    # Evaluate on the testing set
    test([experiment])

    # Create the report
    report([experiment], "discovery", output_dir)