本文整理匯總了Python中sklearn.preprocessing.MultiLabelBinarizer類的典型用法代碼示例。如果您正苦於以下問題:Python preprocessing.MultiLabelBinarizer的具體用法?Python preprocessing.MultiLabelBinarizer怎麼用?Python preprocessing.MultiLabelBinarizer使用的例子?那麼,這裏精選的代碼示例或許可以為您提供幫助。您也可以進一步了解該類所在模塊sklearn.preprocessing的用法示例。
在下文中一共展示了preprocessing.MultiLabelBinarizer的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: __init__
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
    """Load the test metadata and allocate the prediction buffers.

    The metadata CSV at ``IRMAS_TESTING_META_PATH`` is read without a header
    row and is given two column names:

    filename : string
    class_ids: string of ints with space as a delimiter

    Args:
        model_module: Object exposing ``BASE_NAME``; the name selects the
            feature directory and the ``<BASE_NAME>_mean.npy`` file.
        weights_path: Stored as-is on the instance.
        evaluation_strategy: Stored as-is on the instance (default "s2").
    """
    meta = pd.read_csv(IRMAS_TESTING_META_PATH, names=["filename", "class_ids"])
    self.X = list(meta.filename)

    # Every row carries its class ids as one whitespace-separated string.
    targets = [[int(token) for token in ids.split()] for ids in meta.class_ids]
    self.ml_binarizer = MultiLabelBinarizer().fit(targets)
    self.y_true = self.ml_binarizer.transform(targets)

    # All prediction buffers share the shape of the ground-truth matrix.
    buffer_shape = self.y_true.shape
    self.y_pred = np.zeros(shape=buffer_shape)
    self.y_pred_raw = np.zeros(shape=buffer_shape)
    self.y_pred_raw_average = np.zeros(shape=buffer_shape)

    self.model_module = model_module
    self.weights_path = weights_path

    base_name = model_module.BASE_NAME
    feature_dir = os.path.join(IRMAS_TEST_FEATURE_BASEPATH, base_name)
    self.feature_filenames = os.listdir(feature_dir)
    mean_path = os.path.join(MODEL_MEANS_BASEPATH, "{}_mean.npy".format(base_name))
    self.dataset_mean = np.load(mean_path)

    self.evaluation_strategy = evaluation_strategy
    # Candidate thresholds; presumably one grid per evaluation strategy
    # ("s1" / "s2") -- confirm against the code that consumes them.
    self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
    self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
示例2: cat_onehot_encoder_m
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def cat_onehot_encoder_m(df, y, col, selection=True):
    """Binarize multi-valued rows into a sparse float matrix.

    Args:
        df: Object whose ``.values`` is handed to ``MultiLabelBinarizer``;
            each row is treated as a collection of labels.
        y: Target forwarded to ``train_lightgbm_for_feature_selection``.
        col: Column name; only used in the printed progress line.
        selection: The feature-selection model is trained only when this is
            exactly ``True`` (identity check kept for backward compatibility).

    Returns:
        tuple: ``(features, mlbs, models, auc_score)`` where ``features`` is a
        float ``csr_matrix``, ``mlbs`` is the fitted binarizer, and ``models``
        / ``auc_score`` are ``None`` unless selection ran.
    """
    from scipy.sparse import csr_matrix

    mlbs = MultiLabelBinarizer(sparse_output=True).fit(df.values)
    # csr_matrix(...) already yields CSR, so no further conversion is needed.
    features = csr_matrix(mlbs.transform(df.values), dtype=float)

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features, y)
        print(col, "auc", auc_score)

    return features, mlbs, models, auc_score
示例3: setUp
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def setUp(self):
    """Load the overfeat data set and expand its 'Labels' column.

    Each entry of the 'Labels' column is parsed as a Python literal, checked
    to hold exactly five items, and binarized into one indicator column per
    class. The indicator columns replace the original column in ``self.data``
    and their names become ``self.TARGET``.
    """
    FILENAME = "../data/images/overfeat_raw.txt"
    TARGET = 'Labels'

    data = prepare.data_from_csv(FILENAME, sep='\\t')
    self.SENS = ['Race']
    self.EXPL = []

    label_lists = [ast.literal_eval(raw) for raw in data[TARGET]]
    for label_list in label_lists:
        assert len(label_list) == 5

    label_encoder = preprocessing.MultiLabelBinarizer()
    indicator = label_encoder.fit_transform(label_lists)
    class_names = label_encoder.classes_

    df_labels = pd.DataFrame(indicator, columns=class_names)
    remaining = data.drop(TARGET, axis=1)
    self.data = DataSource(pd.concat([remaining, df_labels], axis=1))
    self.TARGET = class_names.tolist()
示例4: evaluate
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def evaluate(self, preds):
    """Score predictions against ``self.label_dict``.

    Args:
        preds: Mapping from source id to its predicted labels. Every key is
            also looked up in ``self.label_dict`` for the true labels.

    Returns:
        dict: ``'accuracy'`` (from ``eval_func.sequential_accuracy``), ``'f1'``
        (weighted average) and ``'macro_f1'``.
    """
    srcids = list(preds.keys())
    true = [self.label_dict[srcid] for srcid in srcids]
    pred = [preds[srcid] for srcid in srcids]

    # Hand the helper its own list objects, as the original call did.
    acc = eval_func.sequential_accuracy(list(true), list(pred))

    # Fit on both sides so labels seen on only one side still get a column.
    mlb = MultiLabelBinarizer()
    mlb.fit(pred + true)
    encoded_true = mlb.transform(true)
    encoded_pred = mlb.transform(pred)

    macro_f1 = f1_score(encoded_true, encoded_pred, average='macro')
    weighted_f1 = f1_score(encoded_true, encoded_pred, average='weighted')
    return {
        'accuracy': acc,
        'f1': weighted_f1,
        'macro_f1': macro_f1
    }
示例5: feature_vectorizer
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def feature_vectorizer(X_train, X_test, y_train, y_test):
    """Vectorize text with TF-IDF and labels with a multi-label binarizer.

    Both transformers are fitted on the training split only, saved through
    ``_save_data``, and then applied to the training and test splits.

    Returns:
        tuple: ``(X_train_features, X_test_features, y_train_features,
        y_test_features)``.
    """
    tfidf_options = dict(
        analyzer="word",
        min_df=0.0,
        max_df=1.0,
        strip_accents=None,
        encoding="utf-8",
        preprocessor=None,
        token_pattern=r"(?u)\S\S+",
        max_features=1000,
    )
    tfidf = TfidfVectorizer(**tfidf_options)

    # The test split must not influence the vocabulary.
    tfidf.fit(X_train)
    _save_data(tfidf, "/workdir/models/X_vectorizer.pk")
    train_text_features = tfidf.transform(X_train)
    test_text_features = tfidf.transform(X_test)

    # Likewise, the label set is learned from the training labels alone.
    label_binarizer = MultiLabelBinarizer()
    label_binarizer.fit(y_train)
    _save_data(label_binarizer, "/workdir/models/label_binarizer.pk")
    train_label_features = label_binarizer.transform(y_train)
    test_label_features = label_binarizer.transform(y_test)

    return (
        train_text_features,
        test_text_features,
        train_label_features,
        test_label_features,
    )
示例6: _build_label_dict
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def _build_label_dict(self,
                      labels: List[str]):
    """Build the label/index lookups and the matching binarizer.

    Args:
        labels: One entry per sample. When ``self.multi_label`` is truthy each
            entry is iterated and its items are collected as labels;
            otherwise every entry is itself a label.

    Side effects:
        Sets ``self.label2idx`` (labels numbered in sorted order),
        ``self.idx2label`` (its inverse), ``self.dataset_info['label_count']``
        and ``self.multi_label_binarizer``.
    """
    from sklearn.preprocessing import MultiLabelBinarizer

    if self.multi_label:
        label_set = set()
        for sample_labels in labels:
            # In-place update instead of building a new set per sample.
            label_set.update(sample_labels)
    else:
        label_set = set(labels)

    self.label2idx = {label: idx for idx, label in enumerate(sorted(label_set))}
    self.idx2label = {idx: label for label, idx in self.label2idx.items()}
    self.dataset_info['label_count'] = len(self.label2idx)
    # Passing classes explicitly pins the column order to label2idx.
    self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx))
示例7: text_similarity
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def text_similarity(df, col):
    """One-hot encode the characters that occur in each unique string.

    Every distinct value of ``df[col]`` is mapped to the list of its
    characters' code points (``ord``), and those lists are binarized so that
    each code point seen in the column becomes one indicator feature named
    ``"<col>_<code point>"``. Useful when similarity between strings matters.

    Returns:
        DataFrame indexed by the unique values of ``col`` with one indicator
        column per code point.
    """
    unique = pd.DataFrame(df[col].unique(), columns=[col])
    code_points = unique[col].apply(lambda text: [ord(ch) for ch in text])

    mlb = preprocessing.MultiLabelBinarizer()
    indicator = mlb.fit_transform(code_points)
    encoded = pd.DataFrame(indicator, columns=mlb.classes_, index=unique.index)
    encoded = encoded.add_prefix(col + "_")

    return unique.join(encoded).set_index(col)
示例8: prepVect
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    """Fit the caption tokenizer and, optionally, a multi-label binarizer.

    Args:
        min_df: Forwarded to ``Tokenizer``.
        max_features: Forwarded to ``Tokenizer``.
        n_captions: Forwarded to ``coco`` and ``sampleCaptions``.
        n_sbu: When truthy, forwarded to ``sbuXYFilenames`` and the returned
            captions are appended to the COCO captions.
        multilabel: When truthy, also fit a ``MultiLabelBinarizer`` on the
            tokenized captions.

    Returns:
        ``(vect, mlb)`` when ``multilabel`` is truthy, otherwise ``vect`` alone.
    """
    # Call form: valid on Python 3, and prints the same text on Python 2.
    print("prepping the Word Tokenizer...")
    _0, _1, trY, _3 = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _4, sbuY, _5 = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)

    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)

    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    return vect
示例9: test_objectmapper
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def test_objectmapper(self):
    """Check that ``ModelFrame.preprocessing`` exposes the ``pp`` classes.

    Each listed attribute on ``df.preprocessing`` must be the very same
    object as the attribute of that name on ``pp``.
    """
    df = pdml.ModelFrame([])
    exposed_names = (
        'Binarizer',
        'FunctionTransformer',
        'Imputer',
        'KernelCenterer',
        'LabelBinarizer',
        'LabelEncoder',
        'MultiLabelBinarizer',
        'MaxAbsScaler',
        'MinMaxScaler',
        'Normalizer',
        'OneHotEncoder',
        'PolynomialFeatures',
        'RobustScaler',
        'StandardScaler',
    )
    for name in exposed_names:
        self.assertIs(getattr(df.preprocessing, name), getattr(pp, name))
示例10: NodeClassification
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def NodeClassification(embedding_look_up, node_list, labels, testing_ratio, seed):
    """Train a one-vs-rest logistic regression on node embeddings and score it.

    Args:
        embedding_look_up, node_list, labels: Forwarded unchanged to
            ``split_train_test_classify``.
        testing_ratio: Forwarded as ``testing_ratio``.
        seed: Forwarded to the split and used as the classifier's
            ``random_state``.

    Returns:
        tuple: ``(accuracy, micro_f1, macro_f1)`` on the test split.
    """
    X_train, y_train, X_test, y_test = split_train_test_classify(
        embedding_look_up, node_list, labels,
        testing_ratio=testing_ratio, seed=seed)

    binarizer = MultiLabelBinarizer(sparse_output=True)
    # Concatenate as plain lists: np.append(axis=None) would flatten the
    # per-sample label collections (or fail on ragged ones), so the binarizer
    # would no longer see one iterable of labels per sample.
    y_all = list(y_train) + list(y_test)
    binarizer.fit(y_all)
    # toarray() gives ndarrays; todense() returns np.matrix instead.
    # NOTE(review): get_y_pred now receives an ndarray -- confirm it does not
    # rely on np.matrix semantics.
    y_train = binarizer.transform(y_train).toarray()
    y_test = binarizer.transform(y_test).toarray()

    model = OneVsRestClassifier(LogisticRegression(random_state=seed, solver='lbfgs'))
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)

    ## small trick : we assume that we know how many label to predict
    y_pred = get_y_pred(y_test, y_pred_prob)

    accuracy = accuracy_score(y_test, y_pred)
    micro_f1 = f1_score(y_test, y_pred, average="micro")
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    print('#' * 9 + ' Node Classification Performance ' + '#' * 9)
    print(f'Accuracy: {accuracy:.3f}, Micro-F1: {micro_f1:.3f}, Macro-F1: {macro_f1:.3f}')
    print('#' * 50)

    return accuracy, micro_f1, macro_f1
示例11: build_input_label_data
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def build_input_label_data(labels, class_order):
    """Convert per-sample label collections into lists of class indices.

    Args:
        labels: Sized collection with one iterable of labels per sample.
        class_order: Class list handed to ``MultiLabelBinarizer(classes=...)``;
            a label's index in the output is its column in that binarizer.

    Returns:
        list[list]: One list per sample (empty when the sample has no label
        from ``class_order``) holding the column indices of its labels.
    """
    from sklearn.preprocessing import MultiLabelBinarizer

    bml = MultiLabelBinarizer(classes=class_order, sparse_output=True)
    # sp.find yields (row indices, column indices, values) of the non-zeros.
    rows, cols, _ = sp.find(bml.fit_transform(labels))

    y = [[] for _ in range(len(labels))]
    for row, col in zip(rows, cols):
        y[row].append(col)
    return y
# padding operation
# =========================================================
示例12: __init__
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Encapsulates all pieces of data to run an experiment.

    This is basically a bag of items that makes it easy to serialize and
    deserialize everything as a unit.

    Args:
        inputs: The raw model inputs. This can be set to None if you don't
            want to serialize this value when you save the dataset.
        labels: The raw output labels. If the first entry is a set, list or
            tuple the data is treated as multi-label.
        test_indices: The optional test indices to use. Ideally, this should
            be generated one time and reused across experiments to make
            results comparable. `generate_test_indices` can be used to
            generate first time indices.
        **kwargs: Additional key value items to store as attributes.
    """
    is_multi_label = isinstance(labels[0], (set, list, tuple))

    self.X = np.array(inputs)
    # Multi-label rows may differ in length; np.array() on such ragged input
    # raises on NumPy >= 1.24, so the raw collection is kept and handed to the
    # binarizer directly.
    self.y = labels if is_multi_label else np.array(labels)

    for key, value in kwargs.items():
        setattr(self, key, value)

    self._test_indices = None
    self._train_indices = None
    # Assigned after the private fields exist (presumably a property setter
    # defined elsewhere on the class -- confirm).
    self.test_indices = test_indices

    self.is_multi_label = is_multi_label
    self.label_encoder = MultiLabelBinarizer() if is_multi_label else LabelBinarizer()
    self.y = self.label_encoder.fit_transform(self.y).flatten()
示例13: __init__
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def __init__(self, vectors, clf):
    """Store the embeddings and set up the ranking classifier and binarizer.

    Args:
        vectors: Kept as ``self.embeddings``.
        clf: Estimator wrapped in ``TopKRanker`` and kept as ``self.clf``.
    """
    self.binarizer = MultiLabelBinarizer(sparse_output=True)
    self.clf = TopKRanker(clf)
    self.embeddings = vectors
示例14: setUp
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def setUp(self):
    """Load the yeast training set next to this file as plain Python lists.

    ``self.X`` holds the dense feature rows, ``self.y`` the binarized
    multi-label targets, and ``self.quota`` is fixed at 10.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(here, 'datasets/yeast_train.svm')

    features, label_sets = load_svmlight_file(dataset_filepath, multilabel=True)
    dense_features = features.todense()
    indicator = MultiLabelBinarizer().fit_transform(label_sets)

    self.X = dense_features.tolist()
    self.y = indicator.tolist()
    self.quota = 10
示例15: main
# 需要導入模塊: from sklearn import preprocessing [as 別名]
# 或者: from sklearn.preprocessing import MultiLabelBinarizer [as 別名]
def main(argv=sys.argv):
    """Run the recommender discovery experiment and write its report.

    The 'Types' column is parsed as Python literals (five items each),
    binarized into one indicator column per class, and those columns become
    the targets of a ``Discovery`` investigation with 'Gender' as the
    sensitive attribute. The report is written to the current directory.

    Args:
        argv: Argument vector; ``usage`` is called unless it has exactly one
            element.
    """
    if len(argv) != 1:
        usage(argv)

    FILENAME = "../../../data/recommender/recommendations.txt"
    OUTPUT_DIR = "."
    dropped_columns = ['RMSE', 'Avg Movie Age',
                       'Avg Recommended Rating',
                       'Avg Seen Rating', 'Occupation']
    data = prepare.data_from_csv(FILENAME, sep='\\t', to_drop=dropped_columns)

    TARGET = 'Types'
    SENS = ['Gender']
    EXPL = []

    label_lists = [ast.literal_eval(raw) for raw in data[TARGET]]
    for label_list in label_lists:
        assert len(label_list) == 5

    label_encoder = preprocessing.MultiLabelBinarizer()
    indicator = label_encoder.fit_transform(label_lists)
    class_names = label_encoder.classes_
    df_labels = pd.DataFrame(indicator, columns=class_names)

    # Swap the raw label column for its indicator columns.
    data = pd.concat([data.drop(TARGET, axis=1), df_labels], axis=1)
    TARGET = class_names.tolist()
    data_source = DataSource(data)

    # Instantiate the experiment
    inv = Discovery(data_source, SENS, TARGET, EXPL, topk=10, random_state=0)

    # Train the classifier
    train([inv])

    # Evaluate on the testing set
    test([inv])

    # Create the report
    report([inv], "discovery", OUTPUT_DIR)