本文整理汇总了Python中sklearn.preprocessing.MultiLabelBinarizer.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python MultiLabelBinarizer.fit_transform方法的具体用法?Python MultiLabelBinarizer.fit_transform怎么用?Python MultiLabelBinarizer.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类sklearn.preprocessing.MultiLabelBinarizer
的用法示例。
在下文中一共展示了MultiLabelBinarizer.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: evaluate_solution
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def evaluate_solution(users, urecovered, observed_index, xs=None, E=None,
                      hidden_edges=None):
    """Evaluate the quality of the recovered user profile.

    Returns a ``(mse, f1)`` tuple: the mean squared error between the true
    and recovered profiles over the observed rows and, when
    ``hidden_edges`` is non-empty, the sample-averaged F1 score of the
    predicted edge directions (otherwise ``None``).
    """
    mse = mean_squared_error(users[observed_index, :],
                             urecovered[observed_index, :])
    if hidden_edges is None or len(hidden_edges) < 1:
        return mse, None
    # Sort once and reuse: gold labels and head/tail pairs must refer to
    # the hidden edges in the same order (was sorted twice before).
    eh = sorted(hidden_edges)
    labeler = MultiLabelBinarizer(classes=np.arange(xs.shape[1]))
    gold = labeler.fit_transform([E[e] for e in eh])
    heads, tails = zip(*eh)
    Cr = np.dot(urecovered, xs.T)
    Dr = np.abs(Cr[heads, :] - Cr[tails, :])
    # TODO prediction here could be better: instead of predicting the k best
    # directions all the time, look at revealed edges to compute a threshold
    # of similarity (i.e. replace 0.05)
    best_dirs = np.argsort(Dr, 1).astype(int)[:, :2]
    pred = []
    for all_dir, suggestion in zip(Dr, best_dirs):
        # Always keep the best direction; keep the runner-up only when its
        # profile difference is small enough.
        my_pred = [suggestion[0]]
        if all_dir[suggestion[1]] < 0.05:
            my_pred.append(suggestion[1])
        pred.append(my_pred)
    # The binarizer was fitted with an explicit class list above, so use
    # transform() rather than refitting on the predictions — this
    # guarantees the gold and predicted matrices share the same columns.
    pred = labeler.transform(pred)
    return mse, f1_score(gold, pred, average='samples')
示例2: read_all_data
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def read_all_data(p):
    """Load product images and the pickled dataframe, returning aligned
    arrays of image data, descriptions and a binarized category matrix.

    Only products (ASINs) present in both the dataframe index and the
    image directory are kept.
    """
    image_dir = "images/"
    frame = pd.read_pickle("frame_no_stem.pkl")
    image_map = __read_all_images(image_dir)
    print("Finished reading images")
    picture_rows = []
    descriptions = []
    category_lists = []
    seen_categories = set()
    for asin in frame.index.values:
        if asin not in image_map:
            continue
        picture_rows.append(image_map[asin])
        row = frame.loc[asin]
        descriptions.append(row.description)
        cats = row.categories
        category_lists.append(cats)
        seen_categories.update(cats)
    print("Finished reading dataframe")
    binarizer = MultiLabelBinarizer()
    label_matrix = binarizer.fit_transform(category_lists)
    return np.array(picture_rows), np.array(descriptions), label_matrix
示例3: load_data
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"))
    binarizer = MultiLabelBinarizer()
    # Split the corpus file ids by their directory prefix.
    all_ids = reuters.fileids()
    train_ids = [doc_id for doc_id in all_ids if doc_id.startswith('training/')]
    test_ids = [doc_id for doc_id in all_ids if doc_id.startswith('test/')]
    raw_train = [reuters.raw(doc_id) for doc_id in train_ids]
    raw_test = [reuters.raw(doc_id) for doc_id in test_ids]
    # Fit TF-IDF and label spaces on the training split only; the test
    # split is encoded with the already-fitted transformers.
    x_train = vectorizer.fit_transform(raw_train).toarray()
    x_test = vectorizer.transform(raw_test).toarray()
    y_train = binarizer.fit_transform([reuters.categories(doc_id)
                                       for doc_id in train_ids])
    y_test = binarizer.transform([reuters.categories(doc_id)
                                  for doc_id in test_ids])
    return {'x_train': x_train, 'y_train': y_train,
            'x_test': x_test, 'y_test': y_test,
            'labels': globals()["labels"]}
示例4: get_training_data
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def get_training_data(window_size_ms, train_time_sec=30):
    """Interactively record labelled spectral training data (Python 2).

    For each key, spectral frames are captured for ``train_time_sec``
    seconds in windows of ``window_size_ms`` milliseconds.

    Returns (X, y): X is an array of frequency spectra and y the
    binarized key-index labels (one indicator column per key).
    """
    #loop until empty input is detected
    X = []
    y = []
    print "Training time for each key is {} seconds".format(train_time_sec)
    i = 0
    while True:
        # Empty input starts a recording round for key i; any text quits.
        s = raw_input('Press <enter> to begin training key {} or q-<enter> to quit'.format(i))
        if s: break
        j = 0
        # Accumulate window durations until train_time_sec is reached.
        while j < train_time_sec:
            j += (window_size_ms / float(1000))
            freq_spect = read_spectral_data_for_time(window_size_ms)
            X.append(freq_spect)
            # Each sample gets a single-element label list for the binarizer.
            y.append([i])
        #increment key counter
        i += 1
    mb = MultiLabelBinarizer()
    y = mb.fit_transform(y)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
示例5: run_classifier
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
    """Train a one-vs-rest linear SVM on TF-IDF sentence features and tag
    the sentences of each test document, writing one XML file per document.

    Parameters
    ----------
    sentences, labels : training sentences and their label lists.
    test_doc_list : documents whose sentences will be classified.
    output_file_path_list : output XML path for each test document
        (zipped pairwise with ``test_doc_list``).
    """
    # Imports hoisted to the top of the function; `from lxml import etree`
    # was previously re-executed on every loop iteration, and the original
    # `import numpy as np` was never used.
    from lxml import etree
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    train_matrix, tfidf = tf_idf_fit_transform(sentences)
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)
    estimator = LinearSVC()
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)
    for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
        test_sentences = doc2sentences([test_doc])
        sentence_matrix = tfidf.transform(test_sentences)
        print("Shape of sentence matrix : ", sentence_matrix.shape)
        predictions = classifier.predict(sentence_matrix)
        document = etree.Element('doc')
        doc_tree = etree.ElementTree(document)
        # Map each indicator row back to class names for its sentence.
        for i in range(len(test_sentences)):
            curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
            etree.SubElement(document, "Sent", classes=", ".join(curr_pred)).text = test_sentences[i]
        doc_tree.write(output_file_path)
示例6: generateTrainFeatures
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def generateTrainFeatures(L):
    """
    This function generates the training data features and its target labels.
    Input: L : The number of training data
    Output: trainX -> a (L * 2000) numpy matrix representing the 2000 features for each of the
    L training samples
    trainY -> (L * 185) numpy matrix representing the target class of the training samples
    Logic:
    The input text is read, preprocessed to remove stop words, and is appended to a list.
    Similarly, each of the target class values are read into a list.
    Sklearn package TFIDF vectorizer is used for generating TFIDF matrix for the 2000 frequent
    words.
    The multi-label classification algorithms require a target Y variable of the form,
    (nsamples * nclasses), multilabel binarizer is used for converting the list of classes
    to a matrix form.
    """
    # classOrder is published as a module-level global so callers can map
    # trainY columns back to class ids.
    global classOrder
    X = []
    Y = []
    # read the input
    for i in range(L):
        categories = raw_input()
        target = [int(y) for y in categories.split(" ")]
        # Drop the leading value — presumably a count preceding the class
        # ids on each input line (verify against the input format).
        del target[0]
        meaningfulWords = readInput()
        Y.append(target)
        X.append(meaningfulWords)
    # construct TF-IDF matrix representing the features
    trainX = vectorizer.fit_transform(X).toarray()
    # convert the target label list to a suitable matrix form
    mlb = MultiLabelBinarizer()
    trainY = mlb.fit_transform(Y)
    # for representing the order of the classes
    classOrder = mlb.classes_
    return (trainX, trainY)
示例7: main
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def main():
    """Load a pickled review->usefulness dictionary, TF-IDF vectorize the
    review text, binarize the targets, and pickle the resulting feature
    vectors, targets and fitted binarizer (Python 2)."""
    #Explore the data for how many class labels
    reviewsDict = {}
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/reviewUsefulDict.pickle") as f:
        reviewsDict = pickle.load(f)
    print "Reviews Dictionary loaded .. "
    '''
    usefulCountDict = {}
    for key, value in reviewsDict.iteritems():
    if value not in usefulCountDict:
    usefulCountDict[value] = 1
    else:
    usefulCountDict[value] = usefulCountDict[value]+1
    pprint(usefulCountDict)
    '''
    corpus, target = DictToList(reviewsDict)
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True)
    XAll = vectorizer.fit_transform(corpus)
    mlb = MultiLabelBinarizer()
    yAll = mlb.fit_transform(target)
    # Persist features, targets and the binarizer (needed later to map
    # predictions back to the original label values).
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.fv", 'w') as f:
        pickle.dump(XAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.target2", 'w') as f:
        pickle.dump(yAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb", 'w') as f:
        pickle.dump(mlb, f)
    print "Dumped featrue vectors .... "
示例8: __init__
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.
    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        # Xdict/Ydict: pass the fitted vectorizers of a training set so that
        # a test set is encoded in the same feature/label space; when None,
        # new vectorizers are fitted on this data.
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]
        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)
        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict
            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)
            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.
        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted) """
        skl_score = cfier.score(self.X.toarray(), self.Y)
        # XXX: Matched paths might/could be weighted by their nMatches too...
        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # number of questions where all correct paths have been recalled
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # number of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # number of *PATHS* (not q.) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))
        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO
        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
示例9: perform_train_test_split
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME,
                             train_size=ds.DEFAULT_TRAININGSET_SIZE):
    """
    Get all document ids of the given database and split them according to
    the given train_size.

    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return: splitted_dataset - List of lists
             [[DEFAULT_DATASET_LIST_INDEX_TRAINING],
              [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    doc_ids_list = []
    all_tag_list = []
    i = 0
    for row in all_docs.rows:
        document = row.doc
        #append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])
        tag_list = []
        #if document has tags than split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            for tag in tags_list:
                #remove the closing tag (last item)
                tag_list.append(tag[:-1])
        #append the list of document tags to all_tag_list
        all_tag_list.append(tag_list)
        i += 1
        # NOTE(review): hard cap at 10000 documents — presumably to bound
        # memory usage; confirm this limit is intentional.
        if i > 10000:
            break
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)
    print(len(doc_ids_list))
    # BUG FIX: honour the train_size parameter (it was hard-coded to 0.8)
    # and actually return the split, as the docstring promises (the
    # original computed it and fell off the end, returning None).
    splitted_dataset = cross_validation.train_test_split(
        doc_ids_list, tags_encoded,
        train_size=train_size, random_state=42,
        stratify=tags_encoded)
    return splitted_dataset
示例10: createDataMatrix
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def createDataMatrix(ngram_features, character_gram_features,tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    # Assemble the full sparse feature matrix for tweet sentiment
    # classification: word/char n-grams, hand-made surface features, POS
    # features, word-cluster memberships and several lexicon scores.
    # Returns (ffeatures, y) with y in {1, -1, 0} for
    # positive/negative/UNKNOWN. (Python 2 code: `print i` below.)
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        # Surface/punctuation/emoticon features per tweet.
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet) #preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        # Lower-cased tokens, with negation suffixes stripped, for cluster lookup.
        words = tokenizer.tokenize(tweet)
        words = [word.strip("_NEG") for word in words]
        cll.append(getClusters(voca_clusters, words))
        #cll2.append(getClusters(voca_handmade, words))
    # Lexicon-based feature groups (each returned dense, wrapped sparse).
    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text ))
    mpqa_feat = csr_matrix(mpqa(tweetText,pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    # Fixed class list keeps the membership matrix width stable across calls.
    mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    #mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_handmade.values())))
    #cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
    # sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
    # sent140affBigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features,
    # sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    # Column order here defines the final feature layout.
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams ), dtype=float)
    # print ngram_features.shape, character_gram_features.shape, cluster_memberships_binarized.shape, handmade_features.shape, pos_features.shape,
    # sent140affBigrams.shape, hasht_bigrams, hasht.shape, sent140aff_data.shape, bl.shape, mpqa_feat.shape, nrc_emo.shape
    # Encode the sentiment category as an integer target.
    y=[]
    for i in categories:
        if i=='positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print i
    ffeatures = normalize(ffeatures)
    # ffeatures, y = shuffle(ffeatures,y)
    return ffeatures, y
示例11: xval
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def xval(clf, x, y, train_index, test_index):
    """Run one cross-validation fold.

    Fits ``clf`` on the training split and returns
    (mse of predicted probabilities vs. the binarized truth,
     accuracy of the argmax prediction,
     number of evaluations reported by the classifier).
    """
    x_tr, x_te = x[train_index], x[test_index]
    y_tr, y_te = y[train_index], y[test_index]
    clf.fit(x_tr, y_tr)
    binarizer = MultiLabelBinarizer()
    proba = clf.predict_proba(x_te)
    truth = binarizer.fit_transform(label_binarize(y_te, clf.classes_))
    fold_mse = mean_squared_error(truth, proba)
    fold_acc = accuracy_score(y_te, proba.argmax(axis=1))
    return fold_mse, fold_acc, clf.get_num_evals()
示例12: test_BRKnna_no_labels_take_closest
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def test_BRKnna_no_labels_take_closest(self):
    """Mode 'a' with threshold 0.6 and k=2: for a query identical to
    samples 0 and 3, only the shared label 'lid0' is expected."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    label_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    target = binarizer.fit_transform(label_ids)
    clf = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    clf.fit(features, target)
    prediction = clf.predict(csr.csr_matrix([[0, 1]])).todense()
    print(prediction)
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], prediction)
示例13: test_BRKnnb_predict_two_samples
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def test_BRKnnb_predict_two_samples(self):
    """Mode 'b' with k=3: two queries at once, each expected to pick up
    the label pair of its nearer cluster."""
    features = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
    label_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid5'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    target = binarizer.fit_transform(label_ids)
    clf = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    clf.fit(features, target)
    prediction = clf.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], prediction)
示例14: test_BRKnna_predict_dense
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def test_BRKnna_predict_dense(self):
    """Mode 'a' with threshold 0.5 and k=3 on a dense label matrix:
    labels 'lid3' and 'lid4' are expected for the given query."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer()
    target = binarizer.fit_transform(label_ids)
    clf = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    clf.fit(features, target)
    prediction = clf.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], prediction)
示例15: main
# 需要导入模块: from sklearn.preprocessing import MultiLabelBinarizer [as 别名]
# 或者: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as 别名]
def main():
    """Train a one-vs-rest linear SVC on tag-selected document sets,
    report per-class metrics, and compute per-class decision-score
    thresholds that keep the top 30% of test documents per class."""
    #sets = select_by_trait(10,2,tags=["Comedy","Human","Sad","Dark"])
    sets = select_sets_by_tag(20,4,tag_names)
    #sets = random_select_sets(30,6)
    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])#txt_to_list(sets["train"])
    #vectorize
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)
    #tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    #process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)
    #rint(processed_train_tags)
    #classifier
    #clf = OneVsRestClassifier(MultinomialNB())
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf,processed_train_tags)
    print("classes:{}".format(clf.classes_))
    #process test set (reuse the fitted vectorizer/transformer/binarizer)
    test_texts = id_to_filename(sets["test"])#txt_to_list(sets["test"])
    X_test_counts = count_vect.transform(test_texts)
    #print("X_test_counts inverse transformed: {}".format(count_vect.inverse_transform(X_test_counts)))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    predicted_probs = clf.decision_function(X_test_tfidf)
    #predicted_probs = clf.get_params(X_test_tfidf)
    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),predicted_tags,target_names=class_list)
    print(report)
    #retrieve top 30% for each class
    top_percentage = 30
    threshold_index = int( len(sets["test"]) *(top_percentage/100.0) )
    threshold_vals_dic = {}
    threshold_vals = []
    num_classes = len(class_list)
    for i in range(num_classes):
        # Decision scores of every test document for class i, descending.
        z = [ predicted_probs[j,i] for j in range(len(sets["test"]))]
        z.sort(reverse=True)
        # The score at the cut-off rank becomes this class's threshold.
        threshold_vals_dic[class_list[i]]= z[threshold_index]
        threshold_vals.append(z[threshold_index])
    print(threshold_vals_dic)
    print_predictions(sets["test"],predicted_tags_readable,class_list, class_probablities=predicted_probs,threshold_vals=threshold_vals)