本文整理汇总了 Python 中 sklearn.pipeline.Pipeline.fit 方法的典型用法代码示例。如果您正苦于以下问题:Python Pipeline.fit 方法的具体用法?Python Pipeline.fit 怎么用?Python Pipeline.fit 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 sklearn.pipeline.Pipeline 的用法示例。
在下文中一共展示了Pipeline.fit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: calcCSPLDA
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def calcCSPLDA(epochs_train, labels_train, nb):
    """Create the CSP+LDA pipeline and fit it on the training data.

    This is a thin wrapper around the MNE and scikit-learn processing
    functions.

    Parameters
    ----------
    epochs_train : epochs in mne data format
        Only ``epochs_train.get_data()`` is used.
    labels_train : labels of epochs in mne format
    nb : number of CSP components, must be even
        (6 implies the 3 top-most and bottom eigenvectors).

    Returns
    -------
    clf : the fitted model for the CSP+LDA approach
    csp.filters_ : CSP weight vector, shape (nchannels, nchannels)
    svc.coef_ : LDA weight vector, shape (1, nb)
    """
    svc = LDA()
    # Honour the caller-supplied component count; it used to be ignored in
    # favour of a hard-coded 4, contradicting the documented contract.
    csp = CSP(n_components=nb, reg=None, log=True, cov_est='epoch')
    # The step keeps its historical name 'SVC' although the estimator is LDA.
    clf = Pipeline([('CSP', csp), ('SVC', svc)])
    epochs_data = epochs_train.get_data()
    clf.fit(epochs_data, labels_train)
    return clf, csp.filters_, svc.coef_
示例2: KFOLDTEST
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def KFOLDTEST(self, text, sent):
    """Cross-validate an n-gram linear-SVM text pipeline and return the mean score.

    ``text`` and ``sent`` are indexed with the index arrays produced by
    ``KFold`` (6 folds).  The same pipeline object is re-fitted on every
    fold; the per-fold scores and their mean are printed, and the mean is
    returned.
    """
    folds = KFold(n=len(text), n_folds=6)
    model = Pipeline(
        [
            ("vectorizer", CountVectorizer(ngram_range=(1, 2), tokenizer=self.tokenize_data)),
            ("tfidf", TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)),
            ("classifier", OneVsOneClassifier(LinearSVC())),
        ]
    )

    def _score_fold(train_idx, test_idx):
        # Slice out both partitions first, then fit and evaluate.
        fit_text, fit_y = text[train_idx], sent[train_idx]
        held_text, held_y = text[test_idx], sent[test_idx]
        model.fit(fit_text, fit_y)
        return model.score(held_text, held_y)

    scores = [_score_fold(train_idx, test_idx) for train_idx, test_idx in folds]
    mean_score = sum(scores) / len(scores)
    print("scores ", scores, " Score ", mean_score)
    return mean_score
示例3: MachineLearning
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
class MachineLearning(object):
    """Text classifier built from a TF-IDF vectorizer and multinomial naive Bayes.

    Samples are collected one at a time with ``add_training_data`` and the
    pipeline is fitted on everything collected so far by ``train``.
    """

    def __init__(self):
        # Initialize classifier and vectorizer
        self.clf = Pipeline([
            ('tfidf', TfidfVectorizer(min_df=1, ngram_range=(1, 2))),
            ('clf', MultinomialNB(alpha=.01)),
        ])
        # Create the sample buffers up front so add_training_data() and
        # train() do not raise AttributeError when init_training() was
        # never called.
        self.x_train = []
        self.y_train = []

    def init_training(self):
        """Discard any collected samples and start over with empty buffers."""
        self.x_train = []
        self.y_train = []

    def add_training_data(self, data, label):
        """Append one sample ``data`` and its ``label`` to the training buffers."""
        self.x_train.append(data)
        self.y_train.append(label)

    # Train classifier
    # Can also use grid search to optimize accuracy, like
    #
    #     parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
    #                   'clf__alpha': (.01, .001),
    #                   }
    #     gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
    def train(self):
        """Fit the pipeline on every sample collected so far."""
        self.clf.fit(self.x_train, self.y_train)

    # Predict result
    # We can roughly estimate the accuracy using cross validation, like
    #
    #     result = clf.predict(test_dc + test_marvel)
    #     baseline = [0 for x in range(len(test_dc))] + [1 for x in range(len(test_marvel))]
    #     print np.sum(result == baseline) / float(len(result))
    def predict(self, data):
        """Return the pipeline's prediction for the single sample ``data``."""
        return self.clf.predict([data])[0]
示例4: test
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def test():
    """Train a bag-of-words naive Bayes intent classifier and query it interactively.

    Reads tab-separated ``text<TAB>label`` rows from ``./training_source.csv``,
    keeps rows whose label is one of the known targets, segments each text
    with ``seg``, prints an 8-fold cross-validation accuracy, fits the
    pipeline on all rows, then predicts for lines typed by the user until
    ``exit`` is entered.
    """
    wanted_labels = [u'weather', u'audio', u'pic', u'calculate', u'music', u'poem']
    raw_texts = []
    training_label = []
    with open('./training_source.csv', 'r') as source:
        for raw_line in source:
            fields = raw_line.strip().split('\t')
            # Skip rows without a label column or with an unknown label.
            if len(fields) <= 1 or fields[1] not in wanted_labels:
                continue
            raw_texts.append(unicode(fields[0], "utf-8"))
            training_label.append(fields[1])
    print(training_label)

    training_text = [seg(sample) for sample in raw_texts]

    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', MultinomialNB()),
    ])
    scores = cross_validation.cross_val_score(text_clf, training_text, training_label, cv=8)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    text_clf.fit(training_text, training_label)

    # Keep prompting until the user types the sentinel "exit".
    for query in iter(lambda: raw_input("\nPlease input:"), "exit"):
        print(text_clf.predict([seg(unicode(query, 'utf-8'))]))
示例5: Regressor
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
class Regressor(BaseEstimator):
    """Random-forest regressor over lagged grids, with scaling and feature agglomeration.

    The input ``X`` is unpacked as a 4-D array
    ``(n_samples, n_lags, n_lats, n_lons)``.
    """

    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        """Fit the scaler, the agglomeration and the forest.

        Parameters
        ----------
        X : array with shape (n_samples, n_lags, n_lats, n_lons)
        y : array of targets; flattened with ``ravel()`` before fitting

        Returns
        -------
        self : the fitted estimator, so calls can be chained like the other
            estimators in this file.
        """
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        # The scaler statistics come from the last lag of every sample only.
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        # One row per (sample, lag) grid, so each grid is scaled and
        # agglomerated on its own.
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        # Back to one row per sample, with all lags concatenated.
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Apply the fitted scaler and agglomeration to ``X``, then predict.

        ``X`` must be 4-D, ``(n_samples, n_lags, n_lats, n_lons)``, as in
        ``fit``.
        """
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
示例6: clasificador
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def clasificador(self, X_train, y_train, X_test, target_names, y_test, all_labels):
    """Train a multi-label naive Bayes text classifier and score its predictions.

    ``y_train`` is binarized with ``MultiLabelBinarizer``.  A
    count -> tf-idf -> dense -> one-vs-rest GaussianNB pipeline is fitted on
    ``X_train``, and the predictions for ``X_test`` are mapped back to label
    lists and passed to ``self.macro`` and ``self.micro`` together with
    ``y_test``.  ``target_names`` and ``all_labels`` are not used by the
    visible code.
    """
    binarizer = preprocessing.MultiLabelBinarizer()
    Y = binarizer.fit_transform(y_train)

    steps = [
        ('vectorizer', CountVectorizer(strip_accents='unicode')),
        ('tfidf', TfidfTransformer()),
        ('to_dense', DenseTransformer()),
        ('clf', OneVsRestClassifier(GaussianNB())),
    ]
    model = Pipeline(steps)
    model.fit(X_train, Y)

    decoded = binarizer.inverse_transform(model.predict(X_test))
    # Each decoded label group becomes a plain list.
    etiquetas = [list(tags) for tags in decoded]

    # NOTE(review): the results are computed but not returned in the visible
    # code; the snippet may be truncated.
    valoresMacro = self.macro(etiquetas, y_test)
    valoresMicro = self.micro(etiquetas, y_test)
示例7: svcDictVector
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def svcDictVector():
    """Fit a DictVectorizer + LinearSVC cuisine classifier and report test predictions.

    Every recipe becomes a ``{ingredient: 1}`` dict.  The fitted pipeline is
    printed, and the predictions for the test recipes are handed to
    ``outputPercentCorrect`` and ``copyAndOutput``.
    """
    def _one_hot(recipes):
        # One {ingredient: 1} mapping per recipe, duplicates removed.
        return [dict.fromkeys(sorted(set(r['ingredients'])), 1) for r in recipes]

    training_recipes = getRecipeData()
    cuisines = [r['cuisine'] for r in training_recipes]
    train_features = _one_hot(training_recipes)

    model = Pipeline([
        ('dict', DictVectorizer()),
        ('variance', VarianceThreshold()),
        ('tfidf', TfidfTransformer()),
        # The step is named 'bayes' but the estimator is a linear SVM.
        ('bayes', svm.LinearSVC()),
    ])
    model.fit(train_features, cuisines)
    print(model)

    testRecipes = getTestData()
    predictions = model.predict(_one_hot(testRecipes))
    outputPercentCorrect(predictions)
    copyAndOutput(predictions, testRecipes)
示例8: useTFIDF
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def useTFIDF():
    """Print 3-fold scores of a tf-idf + distance-weighted kNN pipeline.

    Loads ``data/multinomialTrain.csv``, converts each fold with
    ``transform_csv`` / ``transform_sklearn_dictionary`` (target column
    ``rating``, hand-crafted feature columns ignored), then fits and scores
    the pipeline per fold.
    """
    print("TFIDF")
    frame = pd.read_csv("data/multinomialTrain.csv", header=0)

    def _to_xy(rows):
        # A fresh ignore list is built on every call, as in the original code.
        return transform_sklearn_dictionary(transform_csv(
            rows, target_col="rating",
            ignore_cols=["01v234", "2v34", "words", "words_nostopwords",
                         "review", 'numDet', 'innerPunctuation', 'avgWordLength',
                         'numPresVerb', "numFirstPerson", 'numPropNoun',
                         "numOtherNoun", "numWords", "numAdj",
                         "numPastVerb", "numConj", "exclamationPoints"]))

    neighbours = KNeighborsClassifier(n_neighbors=21, weights='distance')
    # Constructed but not added to the pipeline, despite the pipeline's name.
    scaler = preprocessing.StandardScaler()
    tfidf_scaled_knn = Pipeline([('tfidf', TfidfTransformer()), ('knn', neighbours)])

    for train_idx, test_idx in KFold(len(frame), n_folds=3, shuffle=True):
        trainX, trainy = _to_xy(frame.iloc[train_idx])
        testX, testy = _to_xy(frame.iloc[test_idx])
        tfidf_scaled_knn.fit(trainX, trainy)
        print(tfidf_scaled_knn.score(testX, testy))
示例9: main
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def main():
    """Train a multi-label tf-idf + LinearSVC sentence classifier and save it.

    ``import_files(filenames)`` yields a mapping of category -> sentences.
    It is inverted to sentence -> list of categories (the ``'yn'`` category
    is skipped), the classifier is fitted on all sentences, and the result
    is written with ``save_classifier(classifier, outfile)``.
    """
    data = import_files(filenames)

    # Invert the dictionary: sentence -> every category it appears in.
    sentences = defaultdict(list)
    for cat in data:
        if cat != 'yn':
            for sentence in data[cat]:
                sentences[sentence].append(cat)

    X_list = list(sentences)
    y_train = [sentences[s] for s in X_list]
    # No train/test split: the whole data set is used for training.
    X_train = np.array(X_list)

    classifier = Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(LinearSVC())),
    ])
    classifier.fit(X_train, y_train)
    save_classifier(classifier, outfile)
示例10: Classifier
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
class Classifier(BaseEstimator):
    """AdaBoost over random-forest base estimators, wrapped in a one-step pipeline."""

    def __init__(self, rf_max_depth=10, rf_n_estimators=50, n_estimators=50, n_jobs=1):
        self.rf_max_depth = rf_max_depth
        self.rf_n_estimators = rf_n_estimators
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Build the boosted-forest pipeline from the stored parameters, fit it, return self."""
        forest = RandomForestClassifier(
            max_depth=self.rf_max_depth,
            n_estimators=self.rf_n_estimators,
            n_jobs=self.n_jobs,
        )
        booster = AdaBoostClassifier(base_estimator=forest,
                                     n_estimators=self.n_estimators)
        self.clf = Pipeline([('rf', booster)])
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the fitted pipeline's predictions for ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return the fitted pipeline's probability estimates for ``X``."""
        return self.clf.predict_proba(X)
示例11: allFeatureClassify
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def allFeatureClassify(cosine=False):
    """Print 3-fold scores of a standardised kNN classifier on all features.

    Parameters
    ----------
    cosine : bool, optional
        When true, neighbours are ranked by cosine distance instead of the
        default metric.

    Loads ``data/multinomialTrain.csv``, converts each fold with
    ``transform_csv`` / ``transform_sklearn_dictionary`` (target column
    ``rating``), then fits and scores the scaler + kNN pipeline per fold.
    """
    print("AllFeatureClassifier")
    if cosine:
        print("Cosine")
    trainData = pd.read_csv("data/multinomialTrain.csv", header=0)
    dat = trainData

    if cosine:
        # A callable metric must return a *distance* between two 1-D vectors.
        # pairwise.cosine_similarity returns a similarity matrix for 2-D
        # inputs, so it either fails or ranks the least similar points as
        # nearest.  Use the built-in cosine distance with brute-force search.
        knn = KNeighborsClassifier(n_neighbors=21, metric='cosine',
                                   algorithm='brute')
    else:
        knn = KNeighborsClassifier(n_neighbors=21)
    scaler = preprocessing.StandardScaler()
    scaled_knn = Pipeline([('scaler', scaler), ('knn', knn)])

    def _to_xy(rows):
        # A fresh ignore list is built on every call, as in the original code.
        return transform_sklearn_dictionary(transform_csv(
            rows, target_col="rating",
            ignore_cols=["01v234", "2v34", "words",
                         "words_nostopwords", "review"]))

    kf = KFold(len(trainData), n_folds=3, shuffle=True)
    for train, test in kf:
        trainX, trainy = _to_xy(dat.iloc[train])
        testX, testy = _to_xy(dat.iloc[test])
        scaled_knn.fit(trainX, trainy)
        print(scaled_knn.score(testX, testy))
示例12: cross_validation
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def cross_validation(self, X, Y, n_folds=10):
    """Run n-fold cross validation and return the best classifier.

    Parameters
    ----------
    X, Y : sliceable sequences of samples and labels of equal length
    n_folds : int, optional
        Number of folds handed to ``KFold``.

    Returns
    -------
    (best_classifier, training_accuracy, best_accuracy)
        The pipeline with the highest validation accuracy, its accuracy on
        its own training partition, and its validation accuracy.
    """
    kf = KFold(len(X), n_folds=n_folds)
    best_classifier = None
    best_accuracy = -1
    training_accuracy = 0
    for train, cv in kf:
        classifier = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('svm', LinearSVC(C=1))])
        # The validation fold is the contiguous index range
        # [cv[0], cv[-1]]; the training set is everything outside it.
        first, last = cv[0], cv[-1]
        X_train = []
        X_train.extend(X[0:first])
        # Start *after* the last validation index.  Slicing from ``last``
        # itself used to leak that validation sample into the training set.
        X_train.extend(X[last + 1:])
        Y_train = []
        Y_train.extend(Y[0:first])
        Y_train.extend(Y[last + 1:])
        X_cv = X[first:last + 1]
        Y_cv = Y[first:last + 1]
        classifier.fit(X_train, Y_train)
        accuracy = self.__accuracy(classifier, X_cv, Y_cv)
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            training_accuracy = self.__accuracy(
                classifier, X_train, Y_train)
    return best_classifier, training_accuracy, best_accuracy
示例13: run
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def run(training, validation, k, config=None):
    """Train a one-vs-rest SGD text classifier and collect ranking metrics.

    Parameters
    ----------
    training, validation : passed to ``load_data`` together with a shared
        ``Index``; each loaded item is unpacked as ``(doc, X, y)``.
    k, config : not used by this function.

    Returns
    -------
    (isError, OneError, nDocs, margins, AP)
        Summed ``is_error`` and ``one_error`` values, the number of
        validation documents, and the per-document margins and average
        precisions.
    """
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    class_index = Index()
    traindocs, train_X, train_y = zip(*load_data(training, class_index))
    testdocs, test_X, test_y = zip(*load_data(validation, class_index))
    clf = SGDClassifier(alpha=.000001, loss='log', n_iter=50, penalty='elasticnet')
    #clf = MultinomialNB(alpha=0.000001)
    classifier = Pipeline([
        # The documents are already tokenised, so the analyzer is the identity.
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0, analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm='l2')),
        ('clf', OneVsRestClassifier(clf, n_jobs=-1))])
    classifier.fit(train_X, train_y)
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        # Class indices ranked from the highest to the lowest probability.
        preds = sorted(range(len(prediction)), key=lambda i: prediction[i], reverse=True)
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
示例14: pipeline_test
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
def pipeline_test(params, data_path, dataset):
    """Fit the pipeline described by ``params`` and return its test error in percent.

    Loads ``train.arff`` and ``test.arff`` from ``<data_path>/<dataset>/``,
    assembles the optional rescaling and feature-preprocessing steps plus
    the classifier, and returns ``100 - 100 * accuracy`` on the test split.
    """
    def _arff_path(filename):
        return os.path.expanduser(os.path.join(data_path, dataset, filename))

    X_train, y_train = load_arff_data(_arff_path('train.arff'))
    X_test, y_test = load_arff_data(_arff_path('test.arff'))

    rescaler = get_data_preprocessor_rescaling(params)
    # Balancing may rewrite params, so the later factories see the updated copy.
    params = get_data_preprocessor_balancing(params, y_train)
    feature_prep = get_feature_preprocessor(params)
    estimator = get_classifier(params)

    optional_steps = [
        ('data_preprocessor_rescaling', rescaler),
        ('feature_preprocessor', feature_prep),
    ]
    steps = [(label, step) for label, step in optional_steps if step is not None]
    steps.append(('classifier', estimator))

    model = Pipeline(steps)
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    return 100.0 - 100.0 * accuracy
示例15: Model10
# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit [as 别名]
class Model10(Model):
    """Multinomial naive Bayes model fed through a project ``Transformer`` step."""

    def __init__(self):
        pass

    def fit(self, Xmask, y):
        """Load the prepared feature table, build the pipeline and fit it.

        ``Xmask`` is passed to the pipeline as-is and ``y`` is converted
        with ``np.asarray``.  Returns ``self``.
        """
        preparer = prepare.Prepare_0(model=10, preproc=1, min_df=1, use_svd=False,
                                     tfidf=2, stemmer=0)
        X_all_df, _, BP, params = preparer.load_transform(update=False)
        self.names = list(X_all_df.columns)
        self.X_all = np.asarray(X_all_df)

        # Three naive Bayes variants are instantiated; only the multinomial
        # one is used.
        candidates = (
            GaussianNB(),
            MultinomialNB(alpha=0.8),
            BernoulliNB(alpha=1, binarize=0.01),
        )
        estimator = candidates[1]

        self.rd = Pipeline([
            ("trans", Transformer(names=self.names, X_all=self.X_all, BP=BP)),
            #("scaler",StandardScaler(with_mean=False)),
            ("est", estimator),
        ])
        self.rd.fit(Xmask, np.asarray(y))
        return self

    def predict_proba(self, Xmask):
        """Return the fitted pipeline's probability estimates for ``Xmask``."""
        return self.rd.predict_proba(Xmask)

    def predict(self, Xmask):
        """Return the fitted pipeline's predictions for ``Xmask``."""
        return self.rd.predict(Xmask)

    def starter(self):
        """Smoke-run ``fit`` with ``np.arange(100)`` as both mask and target."""
        print("Model10 starter")
        self.fit(np.arange(100), np.arange(100))