本文整理汇总了Python中sklearn.preprocessing.MultiLabelBinarizer.fit方法的典型用法代码示例。如果您正苦于以下问题：Python MultiLabelBinarizer.fit方法的具体用法？Python MultiLabelBinarizer.fit怎么用？Python MultiLabelBinarizer.fit使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.MultiLabelBinarizer的用法示例。
在下文中一共展示了MultiLabelBinarizer.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
def __init__(self, inter_filePath = "inter/technology_companies_of_the_united_states/"):
    """Load the category data, train a one-vs-rest SVM and print its F1 scores.

    Parameters
    ----------
    inter_filePath : str
        Prefix prepended to ``category_page.txt`` (training split) and
        ``category_page_test.txt`` (test split).
    """
    # Pre-trained vectors stored in the binary word2vec format.
    self.m = Word2Vec.load_word2vec_format(
        "vectors/technology_companies_of_the_united_states/"
        "cat_train_neg5size400min_count5",
        binary=True)
    # presumably the vector width ("size400" in the file name) -- TODO confirm
    self.dim = 400

    # Each call returns a (correct categories, context categories) pair;
    # per the original note both look like [[cat, cat, ...], ...].
    (correct_categories_train, context_categories_train) = self.load_category_page(
        inter_filePath + "category_page.txt")
    (correct_categories_test, context_categories_test) = self.load_category_page(
        inter_filePath + "category_page_test.txt")

    # ---- Mean-vector features: the ones actually fed to the classifier ----
    Xvectors = np.array(self.predict_vector_by_mean(context_categories_train))
    Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test))

    # ---- Bag-of-words features: only needed by the disabled alternatives ----
    corpus_train = [" ".join(i) for i in context_categories_train]
    corpus_test = [" ".join(i) for i in context_categories_test]
    cv = CountVectorizer(min_df = 1)
    X = cv.fit_transform(corpus_train)
    transformer = TfidfTransformer()
    X_tfidf = transformer.fit_transform(X)
    X_test = cv.transform(corpus_test)

    # ---- Labels ----
    # Fitted on train + test so every label of either split gets a column.
    mlb = MultiLabelBinarizer()
    mlb.fit(correct_categories_train + correct_categories_test)
    Y = mlb.transform(correct_categories_train)  # multilabel indicator matrix
    Y_test = mlb.transform(correct_categories_test)

    # Disabled alternatives: self.ovrSVM(X, Y, X_test), self.ovoSVM(X, Y, X_test)
    Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test)

    # Single-argument print calls emit the same text on Python 2 and 3.
    macro_f1 = f1_score(Y_test, Y_predict_ovr, average='macro')
    micro_f1 = f1_score(Y_test, Y_predict_ovr, average='micro')
    print("---One versus rest---")
    print("Macro F-1: %s" % (macro_f1,))
    print("Micro F-1: %s" % (micro_f1,))
示例2: fit_images
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
def fit_images():
    """Fit a 5-nearest-neighbour classifier on stored histograms and pickle it.

    Reads the documents of the ``labels_binary`` and ``mapped_responses``
    collections of the ``image_annotation`` database on localhost:27017,
    trains a ``KNeighborsClassifier`` on ``hist['0']`` against
    ``binary_results`` and writes the model to ``model.data`` inside the
    directory returned by ``getBuildDir()``.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        db = client['image_annotation']
        responses = db['mapped_responses'].find()
        no_labels = db['labels_binary'].find()

        # One single-element label set per known label number.
        numbers = [{int(doc["number"])} for doc in no_labels]
        mlb = MultiLabelBinarizer()
        mlb.fit(numbers)

        train_data = []
        labels = []
        for index, instance in enumerate(responses):
            t_data = instance['hist']['0']
            # `indexes` is not defined in this function; presumably a
            # module-level mapping of row position -> image number (TODO confirm).
            indexes[index] = instance['image_no']
            train_data.append(t_data)
            label = instance['binary_results']
            # NOTE(review): the binarized labels are computed but never used;
            # the raw `label` is what the classifier is trained on. The call
            # is kept so the behaviour on unexpected values is unchanged.
            mlb.transform([{int(value)} for value in label])
            labels.append(label)

        classifier = KNeighborsClassifier(n_neighbors = 5, weights='uniform')
        classifier.fit(train_data, labels)

        build_dir = getBuildDir()
        # Pickle protocol 1 is binary: open in 'wb' and close deterministically.
        with open(join(build_dir, 'model.data'), 'wb') as model_file:
            pickle.dump(classifier, model_file, protocol=1)
    finally:
        # Release the connection even when reading or training fails.
        client.close()
示例3: test_multilabel_classification_report
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
def test_multilabel_classification_report():
    """Check the text of ``classification_report`` for multilabel targets.

    The same expected report must come out for the targets as generated and
    for their binarized (indicator) form.
    """
    n_classes = 4
    n_samples = 50
    shared = dict(n_features=1, n_classes=n_classes, n_samples=n_samples)

    # Two differently seeded draws play the roles of truth and prediction.
    _, y_true_ll = make_multilabel_classification(random_state=0, **shared)
    _, y_pred_ll = make_multilabel_classification(random_state=1, **shared)

    expected_report = """\
precision recall f1-score support
0 0.50 0.67 0.57 24
1 0.51 0.74 0.61 27
2 0.29 0.08 0.12 26
3 0.52 0.56 0.54 27
avg / total 0.45 0.51 0.46 104
"""

    # Fit on one sample holding every class so all columns exist.
    binarizer = MultiLabelBinarizer()
    binarizer.fit([range(n_classes)])
    y_true_bi = binarizer.transform(y_true_ll)
    y_pred_bi = binarizer.transform(y_pred_ll)

    cases = [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]
    for y_true, y_pred in cases:
        assert_equal(classification_report(y_true, y_pred), expected_report)
示例4: TimeSeriesLabelTransformer
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
class TimeSeriesLabelTransformer(BaseTaskTransformer):
    '''Task transformer that encodes interval annotations as label vectors.'''

    def __init__(self, namespace, name, labels=None):
        '''Initialize a time-series label transformer

        Parameters
        ----------
        namespace
            Forwarded to the base-class constructor.

        name
            Suffix used to build the ``output_*`` / ``mask_*`` result keys.

        labels
            Label vocabulary; fitted into the encoder as a single sample.
        '''
        super(TimeSeriesLabelTransformer, self).__init__(namespace, 0)

        binarizer = MultiLabelBinarizer()
        binarizer.fit([labels])

        self.encoder = binarizer
        self._classes = set(binarizer.classes_)
        self.name = name

    def transform(self, jam):
        '''Encode the annotation found in ``jam``

        Returns
        -------
        dict
            ``output_<name>`` holds the result of ``encode_intervals`` and
            ``mask_<name>`` is True only when an annotation was found.
        '''
        annotation = self.find_annotation(jam)
        duration = jam.file_metadata.duration

        # Always start with one untagged interval covering the whole track.
        intervals = np.asarray([[0.0, duration]])
        values = [None]
        mask = False

        if annotation:
            ann_int, ann_val = annotation.data.to_interval_values()
            intervals = np.vstack([intervals, ann_int])
            values.extend(ann_val)
            mask = True

        # Values the encoder does not know are encoded as an empty label set.
        rows = []
        for value in values:
            known = [value] if value in self._classes else []
            rows.extend(self.encoder.transform([known]))
        tags = np.asarray(rows)

        target = self.encode_intervals(duration, intervals, tags)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
示例5: test_multilabelbinarizer_vs_sklearn
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
def test_multilabelbinarizer_vs_sklearn():
    """Compare msmbuilder's MultiLabelBinarizer with scikit-learn's.

    ``MultiLabelBinarizerR`` is presumably the scikit-learn reference class
    (TODO confirm against the module imports).
    """
    # Reference: fitted on all trajectories concatenated, applied to the first.
    reference = MultiLabelBinarizerR()
    reference.fit(np.concatenate(trajs))

    # Candidate: fitted on the list of trajectories itself.
    candidate = MultiLabelBinarizer()
    candidate.fit(trajs)

    expected = reference.transform(trajs[0])
    actual = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected, actual)
示例6: load_data
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
def load_data():
    """Build the labelled photo table and a label binarizer fitted on it.

    Returns
    -------
    tuple
        ``(train_ids, mlb)``: the frame of photos found in
        ``train_photos/train244`` joined with their label lists, and a
        ``MultiLabelBinarizer`` fitted on those label lists.
    """
    business_labels = pd.read_csv("train.csv")
    photo_to_business = pd.read_csv("train_photo_to_biz_ids.csv")

    # Attach each photo's business labels and drop photos that have none.
    photo_labels = photo_to_business.merge(business_labels, how='left', on='business_id')
    photo_labels = photo_labels[photo_labels['labels'].notnull()]
    # "1 2 5" -> [1, 2, 5]
    photo_labels['labels'] = photo_labels['labels'].map(
        lambda text: [int(token) for token in text.split(" ")])

    # The photo id is the file name up to its first dot.
    photo_ids = [int(file_name.split(".")[0])
                 for file_name in os.listdir("train_photos/train244")]
    train_ids = pd.DataFrame({"photo_id": photo_ids})
    train_ids = train_ids.merge(photo_labels, on='photo_id', how='inner')

    mlb = MultiLabelBinarizer()
    mlb.fit(train_ids['labels'].tolist())
    return train_ids, mlb
示例7: ACMClassificator
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
class ACMClassificator(BaseACMClassificator):
    """Multi-label tagger: bag-of-words features into one-vs-rest extra trees."""

    def __init__(self):
        """Create the (unfitted) vectorizer, label binarizer and classifier."""
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        base_tree = ExtraTreeClassifier(
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            class_weight=None,
        )
        self.classificator = OneVsRestClassifier(base_tree, n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorize the ``statement`` attribute of every problem."""
        statements = [problem.statement for problem in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit the vectorizer, the label binarizer and the classifier."""
        # The 'punkt' data is fetched before the vectorizer's tokenizer runs.
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([problem.statement for problem in problems])
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        targets = self.mlb.transform(tags)
        self.classificator.fit(features.toarray(), targets)

    def predict(self, problems):
        """Return the predicted label sets for ``problems``."""
        dense_features = self._prepare_problems(problems).toarray()
        indicator = self.classificator.predict(dense_features)
        return self.mlb.inverse_transform(indicator)
示例8: prepVect
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    """Fit a caption tokenizer and, optionally, a label binarizer on its output.

    Parameters
    ----------
    min_df, max_features
        Forwarded to ``Tokenizer``.
    n_captions
        Forwarded to ``coco`` and to ``sampleCaptions``.
    n_sbu
        When truthy, captions from ``sbuXYFilenames(n_sbu)`` are appended to
        the COCO captions before sampling.
    multilabel : bool
        When true, also fit a ``MultiLabelBinarizer`` on the tokenized captions.

    Returns
    -------
    The fitted tokenizer, or ``(tokenizer, binarizer)`` when ``multilabel``
    is true.
    """
    # A single-argument print call works on both Python 2 and Python 3.
    print("prepping the Word Tokenizer...")

    # Only the third element of the COCO tuple is used here.
    _, _, trY, _ = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _, sbuY, _ = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)

    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)

    if not multilabel:
        return vect

    mlb = MultiLabelBinarizer()
    mlb.fit(vect.transform(captions))
    return vect, mlb
示例9: run_classifierAccuracy
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
    """Train a one-vs-rest random forest on TF-IDF features and print its scores.

    Parameters
    ----------
    trainSentences, testSentences
        Inputs handed to ``tf_idf_fit_transform`` and to the fitted
        vectorizer's ``transform`` respectively.
    trainLabels, testLabels
        One iterable of label names per sentence; names outside the active
        label set are dropped before binarization.

    Prints micro, macro and per-class precision, recall and F1 for the test
    split. Nothing is returned.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer

    all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane",
                  "Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations",
                  "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                  "injured_or_dead_people", "missing_trapped_or_found_people"]
    # The three subsets below are not used by the current run; they are kept
    # as ready-made alternatives for `curr_labels`.
    disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane",
                       "Tornado", "Tsunami", "displaced_people_and_evacuations",
                       "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]
    health_labels = ["Epidemic", "displaced_people_and_evacuations",
                     "donation_needs_or_offers_or_volunteering_services",
                     "injured_or_dead_people"]
    conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations",
                       "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]

    curr_labels = all_labels

    # Keep only the labels that belong to the active label set.
    trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
    testLabels = [list(set(l).intersection(curr_labels)) for l in testLabels]

    # `classes=` fixes the column order to that of `curr_labels`.
    mlb = MultiLabelBinarizer(classes=curr_labels)
    mlb.fit(trainLabels)
    print("Labels : ", mlb.classes_)
    train_label_matrix = mlb.transform(trainLabels)
    test_label_matrix = mlb.transform(testLabels)
    print("Shape of label matrix : ", test_label_matrix.shape)

    train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
    test_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", test_matrix.shape)

    estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs = -1)
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, train_label_matrix)
    predictions = classifier.predict(test_matrix)

    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
    # average=None yields one score per class, so these are labelled
    # per-class rather than (as before) "Macro-*".
    print("Per-class Precision", precision_score(test_label_matrix, predictions, average=None))
    print("Per-class Recall", recall_score(test_label_matrix, predictions, average=None))
    print("Per-class F1", f1_score(test_label_matrix, predictions, average=None))
示例10: GlobalLabelTransformer
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
class GlobalLabelTransformer(BaseTaskTransformer):
    '''Task transformer that encodes a whole annotation as one label vector.'''

    def __init__(self, namespace, name, labels=None):
        '''Initialize a global label transformer

        Parameters
        ----------
        namespace
            Forwarded to the base-class constructor.

        name
            Suffix used to build the ``output_*`` / ``mask_*`` result keys.

        labels
            Label vocabulary; fitted into the encoder as a single sample.
        '''
        super(GlobalLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):
        '''Encode the annotation found in ``jam`` as a single label vector

        Returns
        -------
        dict
            ``output_<name>`` is the encoded vector (all zeros when no known
            label is present) and ``mask_<name>`` is True only when an
            annotation was found.
        '''
        ann = self.find_annotation(jam)

        values = [None]
        mask = False

        if ann:
            values = list(ann.data.value)
            mask = True

        # Suppress all values not in the encoder
        tags = [v for v in values if v in self._classes]

        if tags:
            target = self.encoder.transform([tags]).max(axis=0)
        else:
            # The `np.int` alias was removed in NumPy 1.24; the builtin `int`
            # is the very type it used to refer to.
            target = np.zeros(len(self._classes), dtype=int)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
示例11: ACMClassificator
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
class ACMClassificator(BaseACMClassificator):
    """Multi-label tagger: bag-of-words features into one-vs-rest SVCs."""

    def __init__(self):
        """Create the (unfitted) vectorizer, label binarizer and classifier."""
        self.vectorizer = CountVectorizer(
            min_df=0.05,
            max_df=0.45,
            tokenizer=tokenize,
        )
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(SVC(), n_jobs=-1)

    def _prepare_problems(self, problems):
        """Turn the problems' ``statement`` texts into a count matrix."""
        texts = [item.statement for item in problems]
        return self.vectorizer.transform(texts)

    def fit(self, problems, tags):
        """Fit vectorizer, label binarizer and classifier on the given data."""
        # Fetched up front; the vectorizer's tokenizer runs right after.
        nltk.download('punkt', quiet=True)

        texts = [item.statement for item in problems]
        self.vectorizer.fit(texts)
        count_matrix = self._prepare_problems(problems)

        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(count_matrix.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        """Predict the label sets of ``problems``."""
        count_matrix = self._prepare_problems(problems)
        predicted = self.classificator.predict(count_matrix.toarray())
        return self.mlb.inverse_transform(predicted)
示例12: open
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
# unique_tags = []
# with open("../logs/tags.txt") as top_tag_list:
# for line in top_tag_list:
# line = line.split('\n')[0]
# if cnt[line] > 0:
# unique_tags.append(line)
# for key in data:
# for tag in data[key]:
# if tag not in unique_tags:
# data[key].remove(tag)
# Materialize the tag lists: on Python 3 `dict.values()` is a view that cannot
# be indexed, while train_test_split below needs an indexable sequence.
tags = list(data.values())

mlb = MultiLabelBinarizer()
mlb.fit(tags)

print("Saving trained LabelBinarizer to disk")
# The file stem is the first five characters of the estimator's repr.
joblib.dump(mlb, '../dump/pkl/' + str(mlb)[:5] + '.pkl')
print("")

# Split corpus into training and test sets.
# NOTE(review): the random_state is itself drawn at random, so the split
# differs from run to run.
questions_train, questions_test, tags_train, tags_test = train_test_split(questions, tags, test_size=0.2, random_state = random.randint(1, 100))

print("Extracting features from the training data using the vectorizer")
t0 = time()
# `vectorizer` is only applied here; presumably it was fitted earlier -- TODO confirm
X_train = vectorizer.transform(questions_train)
duration = time() - t0
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("")
示例13: return
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
a_=[loadbusimage(im_) for im_ in x]
return(np.array(a_))
# Business-level labels; businesses without any label are dropped.
labels = pd.read_csv("train.csv")
labels = labels[labels['labels'].notnull()]

# Photo -> business mapping joined with the labels of each business.
bismatch = pd.read_csv("train_photo_to_biz_ids.csv")
photo_labels = bismatch.merge(labels, how='left', on='business_id')
photo_labels = photo_labels[photo_labels['labels'].notnull()]
# "1 2 5" -> [1, 2, 5]
photo_labels['labels'] = photo_labels['labels'].map(
    lambda x: [int(i) for i in x.split(" ")])

# Seeded bucket (0..9) per business: buckets 0-7 go to train, 8-9 to test.
np.random.seed(42)
labels['assignment'] = np.random.randint(0, 10, size=(labels.shape[0], 1))
photo_labels = photo_labels.merge(labels[['business_id', 'assignment']],
                                  on='business_id')
train = photo_labels[photo_labels['assignment'] <= 7].reset_index(drop=True)
test = photo_labels[photo_labels['assignment'] > 7].reset_index(drop=True)

# Fit on both splits so every label seen in either one gets a column.
mlb = MultiLabelBinarizer()
mlb.fit(train['labels'].tolist() + test['labels'].tolist())

# TODO: insert normalization training here

n_images = 10
graph = Graph()
nfilters = 32

# One input per image. `range` replaces the Python 2-only `xrange`, and the
# input list of conv1 is tied to n_images instead of a second literal 10.
for i in range(n_images):
    graph.add_input(name="input" + str(i), input_shape=(3, size, size))

graph.add_shared_node(Convolution2D(nfilters, 3, 3, border_mode='same', activation='relu'),
                      name='conv1',
                      inputs=["input" + str(i) for i in range(n_images)])
graph.add_shared_node(BatchNormalization(), name='batch1', inputs=['conv1'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()),
                      name='conv2', inputs=['batch1'])
graph.add_shared_node(BatchNormalization(), name='batch2', inputs=['conv2'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()),
                      name='conv3', inputs=['batch2'])
graph.add_shared_node(BatchNormalization(), name='batch3', inputs=['conv3'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()),
                      name='conv4', inputs=['batch3'])
graph.add_shared_node(BatchNormalization(), name='batch4', inputs=['conv4'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()),
                      name='conv5', inputs=['batch4'])
示例14: del
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
# Read the test set chunk by chunk, dropping rows with missing values, and
# glue the cleaned chunks into one frame.
test = pd.concat([chunk.dropna() for chunk in reader])

# Split the tags by spaces
train_labels = train['Tags'].map(lambda tag_string: tag_string.split())
test_labels = test['Tags'].map(lambda tag_string: tag_string.split())

# The label binarizer learns the tags of both sets; only the training tags
# are turned into the label matrix here.
mlb = MultiLabelBinarizer()
all_labels = pd.concat([train_labels, test_labels])
mlb.fit(all_labels)
labels = mlb.transform(train_labels)

# Turn the tokens into a sparse matrix
vect = CountVectorizer(
    preprocessor=preprocess,  # get text from html
    tokenizer=tokenize,       # turn the text into tokens
    ngram_range=(1, 2),       # generate ngrams
    max_df=0.5,               # remove extremely common tokens
    min_df=0.001              # remove extremely uncommon tokens
)
示例15: load_train
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit [as 别名]
# val_ids=val_ids.merge(labels,on='photo_id',how='inner')
mlb=LabelEncoder()
mlb.fit(train_ids['business_id'].tolist())
# X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
# X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
return train_ids,mlb
def load_train(train_list):
    """Read the listed training photos into one float32 array divided by 255.

    Each entry of ``train_list`` is turned into the path
    ``train_photos/train244/<entry>.jpg``.
    """
    images = [imread('train_photos/train244/' + str(photo_id) + ".jpg")
              for photo_id in train_list]
    stacked = np.array(images).astype(np.float32)
    return stacked / 255.0
train_ids, mlb = load_data()

# Business-level labels with a uniform random number per business; it decides
# below which businesses go to validation (>= .9) and which to training.
# NOTE(review): this draw happens before np.random.seed(42) is called further
# down, so the split is not reproducible between runs.
labels = pd.read_csv("train.csv")
labels = labels[labels['labels'].notnull()].reset_index(drop=True)
labels['assignment'] = np.random.uniform(size=(labels.shape[0], 1))

MLB = MultiLabelBinarizer()
train_ids = train_ids.merge(labels[['business_id', 'assignment']],
                            on='business_id', how='left')
MLB.fit(train_ids['labels'].tolist())

# "1 2 5" -> [1, 2, 5]
labels['labels'] = labels['labels'].map(lambda x: [int(i) for i in x.split(" ")])
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
BETA = MLB.transform(labels.sort_values('business_id')['labels'])

val_ids = train_ids[train_ids['assignment'] >= .9].reset_index(drop=True)
val_Y = MLB.transform(val_ids['labels'])
train_ids = train_ids[train_ids['assignment'] < .9].reset_index(drop=True)
Y_test = mlb.transform(val_ids['business_id'].tolist())
# A single-argument print call works on both Python 2 and Python 3.
print(Y_test.shape)

np.random.seed(42)
# NOTE(review): reindex() returns a new frame and both results are discarded,
# so neither frame is actually shuffled. The calls are kept because they
# advance the seeded random state. Assigning the val_ids result would break
# its row alignment with val_Y and Y_test computed above.
train_ids.reindex(np.random.permutation(train_ids.index))
val_ids.reindex(np.random.permutation(val_ids.index))

# At most the first 10000 validation photos, as float32 divided by 255.
validate = np.array([imread('train_photos/train244/' + str(f_) + ".jpg")
                     for f_ in val_ids['photo_id'].tolist()[0:10000]]).astype(np.float32) / 255.0
datagen = ImageDataGenerator(
featurewise_center=True,