

Python MultiLabelBinarizer.fit_transform Method Code Examples

This article collects typical usage examples of the sklearn.preprocessing.MultiLabelBinarizer.fit_transform method in Python. If you have been wondering what exactly MultiLabelBinarizer.fit_transform does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples of the class this method belongs to, sklearn.preprocessing.MultiLabelBinarizer.


The sections below present 15 code examples of MultiLabelBinarizer.fit_transform, sorted by popularity by default.
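
As a quick orientation before the collected examples: fit_transform learns the set of distinct labels from an iterable of label collections and returns the corresponding binary indicator matrix in one step. A minimal, self-contained sketch (toy labels invented for illustration):

from sklearn.preprocessing import MultiLabelBinarizer

# Toy label sets, one collection of labels per sample (illustrative only)
labels = [{"sci-fi", "thriller"}, {"comedy"}, {"comedy", "thriller"}]

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels)  # learns the classes and encodes in one step

print(mlb.classes_)  # ['comedy' 'sci-fi' 'thriller'] (sorted when not given explicitly)
print(Y)             # [[0 1 1]
                     #  [1 0 0]
                     #  [1 0 1]]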

Example 1: evaluate_solution

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def evaluate_solution(users, urecovered, observed_index, xs=None, E=None,
                      hidden_edges=None):
    """Evaluate the quality of the recovered user profile"""
    mse = mean_squared_error(users[observed_index, :],
                             urecovered[observed_index, :])
    if hidden_edges is None or len(hidden_edges) < 1:
        return mse, None
    labeler = MultiLabelBinarizer(classes=np.arange(xs.shape[1]))
    gold = labeler.fit_transform([E[e] for e in sorted(hidden_edges)])
    # gold = np.array([E[e] for e in sorted(hidden_edges)])
    eh = sorted(hidden_edges)
    heads, tails = zip(*eh)
    Cr = np.dot(urecovered, xs.T)
    Dr = np.abs(Cr[heads, :] - Cr[tails, :])
    # TODO: the prediction here could be better: instead of always predicting
    # the k best directions, look at the revealed edges to compute a similarity
    # threshold (i.e. replace the hard-coded 0.05)
    best_dirs = np.argsort(Dr, 1).astype(int)[:, :2]
    pred = []
    for all_dir, suggestion in zip(Dr, best_dirs):
        my_pred = [suggestion[0]]
        if all_dir[suggestion[1]] < 0.05:
            my_pred.append(suggestion[1])
        pred.append(my_pred)
    pred = labeler.fit_transform(pred)
    return mse, f1_score(gold, pred, average='samples')
Developer: daureg, Project: magnet, Lines: 28, Source: synth.py
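
A detail worth noting in Example 1: the binarizer is constructed with an explicit classes=np.arange(xs.shape[1]), which pins the column order up front. That is what makes the two separate fit_transform calls (on gold and on pred) produce aligned indicator matrices; with the default behavior, each call would infer columns only from the labels that happen to occur. A small sketch of the fixed-classes behavior (toy data):

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# With fixed classes, the columns are identical across calls,
# even when some labels never occur in a given batch.
fixed = MultiLabelBinarizer(classes=np.arange(4))
a = fixed.fit_transform([[0, 2], [1]])  # columns for 0..3
b = fixed.fit_transform([[3], [0, 3]])  # same columns for 0..3
print(fixed.classes_)                   # [0 1 2 3] both times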

Example 2: read_all_data

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def read_all_data(p):
    img_src = "images/"

    df = pd.read_pickle("frame_no_stem.pkl")
    images = __read_all_images(img_src) 
    print("Finished reading images")

    x_images = []
    x_desc = []
    y_category = []
    all_categories = set()

    for asin in df.index.values:
        if asin in images:
            data = images[asin]
            x_images.append(data)

            item = df.loc[asin]
            x_desc.append(item.description)
            cate = item.categories
            y_category.append(cate)
            for c in cate:
                all_categories.add(c)

    print("Finished reading dataframe")
    mlb = MultiLabelBinarizer()
    y_total = mlb.fit_transform(y_category)
    x_images = np.array(x_images)
    x_desc = np.array(x_desc)

    
    return x_images,x_desc, y_total
Developer: jeffwiroj, Project: ml_proj, Lines: 34, Source: image_classifier.py

Example 3: load_data

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
Developer: MartinThoma, Project: algorithms, Lines: 34, Source: reuters.py
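
Example 3 shows the canonical train/test pattern: fit_transform on the training labels fixes the class set and column layout, and plain transform reuses it for the test labels. A minimal sketch with toy Reuters-style categories:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform([["acq"], ["acq", "earn"]])  # learns the classes
y_test = mlb.transform([["earn"]])                       # reuses the same columns
print(mlb.classes_)  # ['acq' 'earn']
print(y_test)        # [[0 1]]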

Example 4: get_training_data

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def get_training_data(window_size_ms, train_time_sec=30):
	#loop until empty input is detected
	X = []
	y = []

	print "Training time for each key is {} seconds".format(train_time_sec)
	i = 0
	while True:
		s = input('Press <enter> to begin training key {} or q-<enter> to quit'.format(i))
		if s: break

		j = 0
		while j < train_time_sec:
			j += (window_size_ms / float(1000))
			freq_spect = read_spectral_data_for_time(window_size_ms)
			X.append(freq_spect)
			y.append([i])

		#increment key counter
		i += 1

	mb = MultiLabelBinarizer()
	y = mb.fit_transform(y)

	X = np.asarray(X)
	y = np.asarray(y)
	return X, y
Developer: johncava, Project: HackAZ_2016, Lines: 29, Source: serialize_training_data.py

Example 5: run_classifier

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer()
	label_matrix = mlb.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	estimator = LinearSVC()
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)

	for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
		test_sentences = doc2sentences([test_doc])
		sentence_matrix = tfidf.transform(test_sentences)
		print("Shape of sentence matrix : ", sentence_matrix.shape)
		predictions = classifier.predict(sentence_matrix)

		from lxml import etree
		document = etree.Element('doc')
		doc_tree = etree.ElementTree(document)
		for i in range(len(test_sentences)):
			curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
			etree.SubElement(document, "Sent", classes=", ".join(curr_pred)).text = test_sentences[i]
		doc_tree.write(output_file_path)
Developer: sarath1, Project: EventExtraction, Lines: 30, Source: sentence_classifier.py
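
The decoding loop above maps prediction columns back to label names by indexing mlb.classes_. MultiLabelBinarizer also offers inverse_transform, which performs the same mapping for a whole 0/1 matrix at once; a minimal sketch with toy labels:

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit_transform([["a"], ["b", "c"]])     # classes_ becomes ['a' 'b' 'c']

predictions = np.array([[1, 0, 1]])        # one row of binary predictions
print(mlb.inverse_transform(predictions))  # [('a', 'c')]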

Example 6: generateTrainFeatures

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def generateTrainFeatures(L):
    """
    This function generates the training data features and its target labels.
    Input: L : The number of training data
    Output: trainX -> a (L * 2000) numpy matrix representing the 2000 features for each of the
                        L training samples
            trainY -> (L * 185) numpy matrix representing the target class of the training samples
    Logic:
    The input text is read, preprocessed to remove stop words, and is appended to a list.
    Similarly, each of the target class values are read into a list.
    Sklearn's TF-IDF vectorizer is used to generate the TF-IDF matrix for the
    2000 most frequent words.
    The multi-label classification algorithms require a target Y variable of shape
    (nsamples, nclasses); MultiLabelBinarizer is used to convert the list of classes
    into that matrix form.
    """
    global classOrder
    X = []
    Y = []
    # read the input
    for i in range(L):
        categories = input()
        target = [int(y) for y in categories.split(" ")]
        del target[0]
        meaningfulWords = readInput()
        Y.append(target)
        X.append(meaningfulWords)
    # construct TF-IDF matrix representing the features
    trainX = vectorizer.fit_transform(X).toarray()
    # convert the target label list to a suitable matrix form
    mlb = MultiLabelBinarizer()
    trainY = mlb.fit_transform(Y)
    # for representing the order of the classes
    classOrder = mlb.classes_
    return (trainX, trainY)
Developer: hpam1, Project: Machine-Learning, Lines: 37, Source: labeler.py

Example 7: main

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def main():
    #Explore the data for how many class labels
    reviewsDict = {}
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/reviewUsefulDict.pickle") as f:
        reviewsDict = pickle.load(f)
    print "Reviews Dictionary loaded .. "
    '''
    usefulCountDict = {}
    for key, value in reviewsDict.items():
        if value not in usefulCountDict:
            usefulCountDict[value] = 1
        else:
            usefulCountDict[value] = usefulCountDict[value]+1
    pprint(usefulCountDict)
    '''
    corpus, target = DictToList(reviewsDict)
    
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True)
    XAll = vectorizer.fit_transform(corpus)
    mlb = MultiLabelBinarizer()
    yAll = mlb.fit_transform(target)
    
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.fv", 'w') as f:
        pickle.dump(XAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.target2", 'w') as f:
        pickle.dump(yAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb", 'w') as f:
        pickle.dump(mlb, f)
    
    print "Dumped featrue vectors .... "
Developer: yangyang861115, Project: Yelp-Project, Lines: 32, Source: createFeatureVectorsBinary.py
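
Example 7 pickles not only the feature vectors and targets but also the fitted MultiLabelBinarizer itself, so a later script can decode predicted indicator rows back into the original labels. A self-contained sketch of that round trip (toy labels; pickle.dumps/loads stand in for the files written above):

import pickle
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform([["funny"], ["funny", "useful"]])  # toy label sets

blob = pickle.dumps(mlb)          # what the example writes to disk
mlb2 = pickle.loads(blob)         # what a later script would load
print(mlb2.inverse_transform(y))  # [('funny',), ('funny', 'useful')]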

Example 8: __init__

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.

    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.

        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted) """
        skl_score = cfier.score(self.X.toarray(), self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # number of questions where all correct paths have been recalled
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # number of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # number of *PATHS* (not q.) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
Developer: AmitShah, Project: yodaqa, Lines: 61, Source: fbpathtrain.py
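
Example 8 filters out labels missing from Ydict.classes_ before calling transform because MultiLabelBinarizer cannot binarize labels it has never seen: depending on the scikit-learn version, transform on unseen labels either raises an error or ignores them with a warning. A toy sketch of the same manual filtering:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit_transform([["a", "b"], ["c"]])  # classes_ becomes ['a' 'b' 'c']

lset = [{"a", "zzz"}, {"c"}]            # "zzz" was never seen at fit time
known = set(mlb.classes_)
filtered = [ls & known for ls in lset]  # drop unknown labels, as Example 8 does
print(mlb.transform(filtered))          # [[1 0 0]
                                        #  [0 0 1]]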

Example 9: perform_train_test_split

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME,
                                        train_size=ds.DEFAULT_TRAININGSET_SIZE):
    
    """
    Get all document_ids of the given database and split them according to the
    given train_size.
    The tricky part is that we need to stratify the split by the (multi-label) tags.
    
    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists 
                    [[DEFAULT_DATASET_LIST_INDEX_TRAINING], 
                    [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    
    doc_ids_list = []
    all_tag_list = []
    
    i = 0
    
    for row in all_docs.rows:
        
        document = row.doc
        #append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])
        
        tag_list = []
        
        #if document has tags then split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
            
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            
            for tag in tags_list:
                
                #remove the closing tag (last item)
                tag_list.append(tag[:-1])
        #append the list of document tags to all_tag_list        
        all_tag_list.append(tag_list)
        
        i += 1
        
        if i > 10000:
            break
    
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)

    
    print(len(doc_ids_list))
    
    # note: use the train_size parameter instead of a hard-coded 0.8;
    # sklearn.cross_validation was later renamed sklearn.model_selection
    splitted_dataset = cross_validation.train_test_split(doc_ids_list, tags_encoded,
                                               train_size=train_size, random_state=42,
                                               stratify=tags_encoded)
    return splitted_dataset
Developer: davcem, Project: stackexchange_text_classification, Lines: 61, Source: classifier_inspect_splits.py

Example 10: createDataMatrix

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def createDataMatrix(ngram_features, character_gram_features,tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet) #preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        words = tokenizer.tokenize(tweet)
        words = [word.strip("_NEG") for word in words]
        cll.append(getClusters(voca_clusters, words))
        #cll2.append(getClusters(voca_handmade, words))


    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text ))
    mpqa_feat = csr_matrix(mpqa(tweetText,pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    #mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_handmade.values())))
    #cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
#    sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
#    sent140affBigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, 
#                             sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
#    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams ), dtype=float)

#     print ngram_features.shape, character_gram_features.shape, cluster_memberships_binarized.shape, handmade_features.shape, pos_features.shape, 
#     sent140affBigrams.shape, hasht_bigrams, hasht.shape, sent140aff_data.shape, bl.shape, mpqa_feat.shape, nrc_emo.shape
    y=[]
    for i in categories:
        if i=='positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print(i)
    ffeatures = normalize(ffeatures)
#     ffeatures, y = shuffle(ffeatures,y)
    return ffeatures, y
Developer: balikasg, Project: SemEval2016-Twitter_Sentiment_Evaluation, Lines: 60, Source: my_utils.py

Example 11: xval

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def xval(clf, x, y, train_index, test_index):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train)
    mlb = MultiLabelBinarizer()
    y_pred = clf.predict_proba(x_test)
    mse = mean_squared_error(mlb.fit_transform(label_binarize(y_test, clf.classes_)), y_pred)
    acc = accuracy_score(y_test, y_pred.argmax(axis=1))
    evals = clf.get_num_evals()
    return mse, acc, evals
Developer: shehzadqureshi, Project: NeuralNetDynamicOSI, Lines: 12, Source: test_basic_4bit_cv.py
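
A quirk worth flagging in Example 11: label_binarize collapses a two-class problem to a single 0/1 column, and the subsequent MultiLabelBinarizer.fit_transform then reads each one-element row as an iterable of labels, effectively re-expanding that column into the two-column one-hot layout that predict_proba returns. With more than two classes the shapes would no longer line up. A sketch of the two-class behavior:

from sklearn.preprocessing import MultiLabelBinarizer, label_binarize

# label_binarize collapses a 2-class problem to a single column...
lb = label_binarize([0, 1, 1], classes=[0, 1])
print(lb.ravel())  # [0 1 1]

# ...and MultiLabelBinarizer re-expands it to two columns, because each
# one-element row is read as an iterable containing the label 0 or 1.
mlb = MultiLabelBinarizer()
print(mlb.fit_transform(lb))  # [[1 0]
                              #  [0 1]
                              #  [0 1]]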

Example 12: test_BRKnna_no_labels_take_closest

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
    def test_BRKnna_no_labels_take_closest(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)
        knn = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
        print(pred)
        np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
Developer: quadflor, Project: Quadflor, Lines: 13, Source: test_BRKNN.py

Example 13: test_BRKnnb_predict_two_samples

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
    def test_BRKnnb_predict_two_samples(self):
        data = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid5'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer(sparse_output=True)
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], pred)
Developer: quadflor, Project: Quadflor, Lines: 13, Source: test_BRKNN.py
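
Examples 12 and 13 construct the binarizer with sparse_output=True, so fit_transform returns a scipy.sparse CSR matrix instead of a dense ndarray, which is the sensible choice when the label vocabulary is large and each sample carries only a few labels. A minimal sketch:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)
y = mlb.fit_transform([["lid0", "lid1"], ["lid2"]])
print(y.format)     # csr
print(y.todense())  # [[1 1 0]
                    #  [0 0 1]]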

Example 14: test_BRKnna_predict_dense

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
    def test_BRKnna_predict_dense(self):
        data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        knn.fit(data, y)

        pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
Developer: quadflor, Project: Quadflor, Lines: 13, Source: test_BRKNN.py

Example 15: main

# Required import: from sklearn.preprocessing import MultiLabelBinarizer [as alias]
# Or: from sklearn.preprocessing.MultiLabelBinarizer import fit_transform [as alias]
def main():
    #sets = select_by_trait(10,2,tags=["Comedy","Human","Sad","Dark"])
    sets = select_sets_by_tag(20,4,tag_names)
    #sets = random_select_sets(30,6)
    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])#txt_to_list(sets["train"])
    #vectorize
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)

    #tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    #process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)
    #print(processed_train_tags)
    #classifier
    #clf = OneVsRestClassifier(MultinomialNB())
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf,processed_train_tags)
    print("classes:{}".format(clf.classes_))
    #process test set

    test_texts = id_to_filename(sets["test"])#txt_to_list(sets["test"])
    X_test_counts = count_vect.transform(test_texts)
    #print("X_test_counts inverse transformed: {}".format(count_vect.inverse_transform(X_test_counts)))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    predicted_probs = clf.decision_function(X_test_tfidf)
    #predicted_probs = clf.get_params(X_test_tfidf)
    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),predicted_tags,target_names=class_list)
    print(report)
    #retrieve top 30% for each class
    top_percentage = 30
    threshold_index = int( len(sets["test"]) *(top_percentage/100.0) )
    threshold_vals_dic = {}
    threshold_vals = []
    num_classes = len(class_list)
    for i in range(num_classes):
        z = [ predicted_probs[j,i] for j in range(len(sets["test"]))]
        z.sort(reverse=True)
        threshold_vals_dic[class_list[i]]= z[threshold_index]
        threshold_vals.append(z[threshold_index])
    print(threshold_vals_dic)


    print_predictions(sets["test"],predicted_tags_readable,class_list, class_probablities=predicted_probs,threshold_vals=threshold_vals)
Developer: samkam, Project: Senior-Project, Lines: 55, Source: classify.py


Note: The sklearn.preprocessing.MultiLabelBinarizer.fit_transform examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are taken from open-source projects contributed by their developers; copyright remains with the original authors, and any use or redistribution must follow the corresponding project's license. Do not republish without permission.