This article collects typical usage examples of the Python method sklearn.pipeline.Pipeline.fit_transform. If you have been wondering what exactly Pipeline.fit_transform does and how to use it, the curated examples below may help. You can also explore further usage examples of the containing class, sklearn.pipeline.Pipeline.
The following shows 15 code examples of Pipeline.fit_transform, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
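Before the project examples, here is a minimal, self-contained sketch (not drawn from any project below) of what Pipeline.fit_transform does: it fits and applies each step in order, feeds each step's output into the next, and returns the output of the final step.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["the cat sat", "the dog barked", "the cat barked"]
pipe = Pipeline([
    ('count', CountVectorizer()),    # raw text -> term-count matrix
    ('tfidf', TfidfTransformer()),   # term counts -> TF-IDF weights
])
X = pipe.fit_transform(docs)         # fits both steps, returns the transformed matrix
print(X.shape)                       # (3, vocabulary size)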
Example 1: partial_dependence
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def partial_dependence(df, y):
    '''
    INPUT: X = features
           y = target variable, binary, imbalanced classes
    OUTPUT: X = features oversampled to have balanced target classes
            y = target variable oversampled to have balanced classes

    Discovers the minority class and then oversamples until each class makes up
    50% of your data.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # NB: this refits the pipeline on the test split; see the sketch after this example.
    X_test = feature_engineering.fit_transform(X_test.copy(), y_test)

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
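As flagged in the comment above, Example 1 refits the feature pipeline on the test split. The conventional pattern is to fit once on the training data and reuse the learned state on the test data; a minimal sketch with stand-in transformers (not the custom ones above):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

X_train, X_test = np.random.rand(100, 5), np.random.rand(20, 5)
pipe = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components=2))])
X_train_t = pipe.fit_transform(X_train)  # learn scaling/components from training data
X_test_t = pipe.transform(X_test)        # apply the same fitted state to test data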
Example 2: make_features
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def make_features(self):
    features = Pipeline([
        ('count', self.build_vectorizer()),
        ('tfidf', TfidfTransformer())
    ])
    doc_vecs = features.fit_transform(self.docs)
    # Refitting here gives self.rps its own vocabulary; call transform() instead
    # if both corpora should share one feature space.
    rp_vecs = features.fit_transform(self.rps)
    return (doc_vecs, rp_vecs)
Example 3: create_store_transforms
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def create_store_transforms(rl):
    trnsfrm = Pipeline([
        ('vbk', ValueByKey('wrd_list')),
        ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df=2, stop_words='english')),
    ])
    with open('transforms/just_txt.pickle', 'wb') as handle:
        pickle.dump(trnsfrm.fit_transform(rl), handle)

    trnsfrm = Pipeline([
        ('vbk', ValueByKey('wrd_list')),
        ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df=2, stop_words='english',
                                      tokenizer=brad_tokenizer_test)),
    ])
    with open('transforms/just_txt_chunks.pickle', 'wb') as handle:
        pickle.dump(trnsfrm.fit_transform(rl), handle)

    trnsfrm = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('cuisinetype', Pipeline([
                    ('vbk', ValueByKey('type_2')),
                    ('labels', preprocessing.LabelBinarizer()),
                ])),
                # ('price_lev', Pipeline([
                #     ('vbk', ValueByKey('price_level')),
                #     ('labels2', preprocessing.LabelBinarizer()),
                # ])),
                #
                # ('rating_lev', Pipeline([
                #     ('vbk', ValueByKey('rating_level')),
                #     ('labels3', preprocessing.LabelBinarizer()),
                # ])),
                ('nlp', Pipeline([
                    ('vbk', ValueByKey('wrd_list')),
                    ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df=2, stop_words='english'))
                ]))
            ]
        ))
    ])
    with open('transforms/txt_cat.pickle', 'wb') as handle:
        pickle.dump(trnsfrm.fit_transform(rl), handle)

    trnsfrm = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('cuisinetype', Pipeline([
                    ('vbk', ValueByKey('type_2')),
                    ('labels', preprocessing.LabelBinarizer()),
                ])),
                ('nlp', Pipeline([
                    ('vbk', ValueByKey('wrd_list')),
                    ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df=2, stop_words='english',
                                                  tokenizer=brad_tokenizer_test))
                ]))
            ]
        ))
    ])
    with open('transforms/txt_cat_chunks.pickle', 'wb') as handle:
        pickle.dump(trnsfrm.fit_transform(rl), handle)
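Example 3 nests Pipelines inside a FeatureUnion so that categorical and text features are built separately and concatenated column-wise. A minimal standalone sketch of that pattern, using stock transformers in place of the custom ValueByKey:

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
import numpy as np

X = np.random.rand(10, 6)
y = np.random.randint(0, 2, 10)

union = FeatureUnion(transformer_list=[
    ('pca', PCA(n_components=2)),   # first feature block
    ('kbest', SelectKBest(k=3)),    # second feature block
])
pipe = Pipeline([('scale', StandardScaler()), ('union', union)])
Xt = pipe.fit_transform(X, y)       # blocks are concatenated side by side
print(Xt.shape)                     # (10, 2 + 3)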
Example 4: makePlots
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def makePlots(Z):
    imp = Imputer()
    scal = StandardScaler()
    vart = VarianceThreshold()
    pipe = Pipeline([("imputer", imp), ("var threshold", vart), ("scaler", scal)])
    # Requires Z
    X1 = pipe.fit_transform(Z)
    pca = PCA(n_components=2)
    x2d = pca.fit_transform(X1.T)

    labels = {}
    centers = []
    for n in [2, 3, 5, 10]:
        agglo = FeatureAgglomeration(n_clusters=n).fit(X1)
        labels['ag%d' % n] = agglo.labels_
        plot(x2d, agglo.labels_, "Feature Agglomeration")

        km = KMeans(n_clusters=n).fit(X1.T)
        labels['km%d' % n] = km.labels_
        plot(x2d, km.labels_, "K-Means")
        centers = km.cluster_centers_

    dbs = DBSCAN(eps=100, min_samples=10).fit(X1.T)
    labels['DBSCAN'] = dbs.labels_
    plot(x2d, dbs.labels_, "DBSCAN")
    return labels, centers
Example 5: transformCorpus
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def transformCorpus(tdocuments, tentities):
    X1 = None
    # Treat the tasks as documents and calculate the TF-IDF vector.
    '''hasher = HashingVectorizer(stop_words='english', non_negative=True,
                                  norm=None,
                                  binary=False)
    vectorizer = Pipeline((
        ('hasher', hasher),
        ('tf_idf', TfidfTransformer())
    ))
    '''
    '''lsa = TruncatedSVD(1000)
    X = lsa.fit_transform(vectorizer.fit_transform(tdocuments))
    X1 = Normalizer(copy=False).fit_transform(X)
    '''
    #X1 = vectorizer.fit_transform(tdocuments)
    #print("n_samples: %d, n_features: %d" % X1.shape)
    #print()

    vec = Pipeline((('dictText', DictVectorizer()),
                    ('tfIdf', TfidfTransformer())))
    X2 = vec.fit_transform(tentities)
    lsa = TruncatedSVD(1000)
    X = lsa.fit_transform(X2)
    X1 = Normalizer(copy=False).fit_transform(X)
    #X2 = Normalizer(copy=False).fit_transform(X)
    print('n_samples: %d, n_features: %d' % X.shape)
    print()
    return X1, X2
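Example 5 applies TruncatedSVD and Normalizer by hand after the vectorizing Pipeline; the whole LSA reduction can equally live inside a single Pipeline. A minimal sketch on a toy dict-feature corpus:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

entities = [{'london': 2, 'paris': 1}, {'paris': 3}, {'rome': 1, 'london': 1}]
lsa = Pipeline([
    ('dictText', DictVectorizer()),
    ('tfIdf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=2)),  # 2 components for this tiny corpus
    ('norm', Normalizer(copy=False)),       # unit-length rows, as in the example
])
X1 = lsa.fit_transform(entities)
print(X1.shape)                             # (3, 2)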
Example 6: test_l2density_basic
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def test_l2density_basic():
    dim = 3
    bags = [np.random.randn(np.random.randint(30, 100), dim)
            for _ in range(50)]
    pipe = Pipeline([
        ('scale', BagMinMaxScaler([0, 1])),
        ('density', L2DensityTransformer(15)),
    ])
    l2ed = pipe.fit_transform(bags)

    assert np.all(np.isfinite(l2ed))
    # ||x - y||^2 = <x, x> - 2 <x, y> + <y, y>
    K = l2ed.dot(l2ed.T)
    row_norms_sq = np.diagonal(K)
    l2_dist_sq = row_norms_sq[:, None] - 2 * K + row_norms_sq[None, :]
    assert np.min(row_norms_sq) > 0
    assert np.min(l2_dist_sq) >= 0

    assert_raises(ValueError, lambda: L2DensityTransformer(10, basis='foo'))

    t = L2DensityTransformer(10)
    assert_raises(AttributeError, lambda: t.transform(bags))
    t.fit(dim)
    t.transform(BagMinMaxScaler([0, 1]).fit_transform(bags))
    assert_raises(ValueError, lambda: t.transform([b[:, :2] for b in bags]))
    assert_raises(ValueError, lambda: t.transform(bags))
    t.basis = 'haha snuck my way in'
    assert_raises(ValueError, lambda: t.transform(bags))
Example 7: main
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def main(opt):
    with codecs.open(opt.vocab, encoding='utf-8') as f:
        vocab = load_vocab(f)
    id2word = build_id2word(vocab)

    _, docs_train, _ = load_all_data(opt.train_jsons)
    lda = Pipeline([
        ('bow', BagOfWords(vocab=vocab)),
        ('lda', Lda(id2word=id2word, num_topics=opt.num_topics))])
    lda_vec_train = lda.fit_transform(docs_train)

    sent_set = set()
    tmp_path = opt.lda_vec_path + '.tmp'
    with codecs.open(tmp_path, encoding='utf-8', mode='w') as f:
        dump_lda_vec(docs_train, lda_vec_train, sent_set, f)

    if opt.test_jsons:
        _, docs_test, _ = load_all_data(opt.test_jsons)
        lda_vec_test = lda.transform(docs_test)
        with codecs.open(tmp_path, encoding='utf-8', mode='a') as f:
            dump_lda_vec(docs_test, lda_vec_test, sent_set, f)

    with codecs.open(tmp_path, encoding='utf-8') as fin, \
            codecs.open(opt.lda_vec_path, encoding='utf-8', mode='w') as fout:
        fout.write('{} {}\n'.format(len(sent_set), opt.num_topics))
        for line in fin:
            fout.write(line)
    os.remove(tmp_path)
Example 8: test_set_pipeline_step_none
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_dict_equal(
        pipeline.get_params(deep=True),
        {"steps": pipeline.steps,
         "m2": mult2,
         "m3": None,
         "last": mult5,
         "m2__mult": 2,
         "last__mult": 5},
    )

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # For other methods, ensure no AttributeErrors on None:
    other_methods = ["predict_proba", "predict_log_proba",
                     "decision_function", "transform", "score"]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError,
                         "'NoneType' object has no attribute 'predict'",
                         getattr, pipeline, "predict")

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", None), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
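This test comes from an older scikit-learn codebase. In recent releases the documented way to disable a step is the string 'passthrough' (accepted alongside None); a quick sketch:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

X = np.random.rand(20, 4)
pipe = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components=2))])
pipe.set_params(pca='passthrough')  # the PCA step is now skipped
print(pipe.fit_transform(X).shape)  # (20, 4): only the scaler ran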
Example 9: XY8
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def XY8():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['DepartmentDescription']
    keep_cols = ['Weekday', 'Returns']
    mul_col = 'ScanCount'

    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()
    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col,
                                                  keep_cols)
    transform_steps = [("imputer", ft.NGNAImputer()),
                       ("add_returns", add_returns), ('grouper', grouper)]
    ### DON'T CHANGE AFTER

    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
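Example 9 and the three that follow plug project-specific transformers (ft.NGNAImputer and friends) into Pipeline. Any object exposing fit and transform can fill a step; a minimal hypothetical transformer for illustration:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Hypothetical transformer: drops the given column indices."""
    def __init__(self, drop):
        self.drop = drop

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        keep = [i for i in range(X.shape[1]) if i not in self.drop]
        return X[:, keep]

pipe = Pipeline([('drop', ColumnDropper(drop=[0]))])
print(pipe.fit_transform(np.arange(12).reshape(3, 4)).shape)  # (3, 3)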
Example 10: XY7
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def XY7():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['DepartmentDescription']
    keep_cols = ['Weekday']
    mul_col = 'ScanCount'

    dfta = ft.DataFrameToArray()
    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col,
                                                  keep_cols)
    transform_steps = [("imputer", ft.NGNAImputer())] + \
        list(ft.wrapStep(('grouper', grouper)))
    ### DON'T CHANGE AFTER

    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)

    kh.start_pipeline()
    kh.record_metric("validation", "start", "NA", "transform_pipeline",
                     str(transform_pipe), "NA")

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
Example 11: XY9
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def XY9():
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dummy_cols = ['FinelineNumber']
    keep_cols = ['Weekday', 'Returns']
    mul_col = None

    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()
    print("starting grouping")
    grouper = ft.GDummyKeepAndMultiplierTransform(dummy_cols, mul_col,
                                                  keep_cols)
    print("done grouping")
    transform_steps = [("imputer", ft.NGNAImputer()),
                       ("add_returns", add_returns), ('grouper', grouper)]
    ### DON'T CHANGE AFTER

    transform_steps.append(("dfta", dfta))
    transform_pipe = Pipeline(steps=transform_steps)
    print("done with pipeline, now calculating")

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
Example 12: XY1
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def XY1():
    X, y, X_test, X_test_index = load_xy()

    ####### VARIABLES
    dummy_cols = ['Weekday', 'DepartmentDescription']
    keep_cols = ['ScanCount', 'Returns']
    funcs = [np.sum, np.count_nonzero]

    dfta = ft.DataFrameToArray()
    add_returns = ft.NGAddReturns()
    gdd = ft.GDummyAndKeepTransform(dummy_cols, keep_cols,
                                    funcs)  # Doesn't work!

    transform_steps = [("imputer", ft.NGNAImputer())] + \
        list(ft.wrapStep(("add_returns", add_returns))) + \
        list(ft.wrapStep(('grouper', gdd))) + \
        [("dfta", dfta)]
    transform_pipe = Pipeline(steps=transform_steps)

    kh.start_pipeline()
    kh.record_metric("validation", "start", "NA", "transform_pipeline",
                     str(transform_pipe), "NA")

    return {
        "X": transform_pipe.fit_transform(X),
        "y": y,
        "X_test": transform_pipe.transform(X_test),
        "X_test_index": X_test_index
    }
Example 13: load_data_template
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def load_data_template(argv):
    # Train set
    data = np.load("data/train.npz")
    y_train = data["y_train"]
    X_train = data["X_train"]

    fu = FeatureUnion([
        #('spec', FlattenTransformer(scale=1.0)),
        ('st1', StatsTransformer(axis=1)),
        #('st0', StatsTransformer(axis=0))
    ])

    tf = Pipeline(steps=[('specg', SpectrogramTransformer(NFFT=256, clip=500,
                                                          noverlap=0.5,
                                                          dtype=np.float32,
                                                          log=False, flatten=False)),
                         ('tm', TemplateMatcher(raw=True)),
                         #('flatten', FlattenTransformer()),
                         ('fu', fu),
                         ])
    X_train = tf.fit_transform(X_train, y_train)

    # Test set
    data = np.load("data/test.npz")
    y_test = None
    X_test = data['X_test']
    X_test = tf.transform(X_test)

    return X_train, X_test, y_train, y_test
Example 14: MultinomialNB
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
class MultinomialNB(Step):
    def __init__(self, percentile_threshold, bins):
        assert bins > 0
        bin_size = 1 / bins
        self.bins = np.arange(bin_size, 1, bin_size)
        self.lower = percentile_threshold
        self.upper = 100 - percentile_threshold
        scaler = MinMaxScaler()
        discretizer = FunctionTransformer(Discretizer(self.bins))
        self.pipeline = Pipeline(
            [('scaler', scaler), ('discretizer', discretizer)])

    def fit(self, vectors):
        self.lower_clip = np.percentile(vectors, self.lower, axis=0)
        self.upper_clip = np.percentile(vectors, self.upper, axis=0)
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        vectors = self.pipeline.fit_transform(vectors)
        n_docs = vectors.shape[0]
        self.distribution = np.array(
            [np.bincount(v, minlength=len(self.bins)) / n_docs
             for v in vectors.T])

    def transform(self, vectors):
        assert self.distribution is not None
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        probabilities = []
        n_dim = vectors.shape[1]
        vectors = self.pipeline.transform(vectors)
        for bins in vectors:
            pr = np.prod(self.distribution[np.arange(n_dim), bins])
            probabilities.append(pr)
        return -np.log(np.maximum(1e-10, np.array(probabilities)))
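Example 14 wraps a callable in FunctionTransformer so the discretization step can sit inside a Pipeline. The same trick turns any plain function into a pipeline-compatible transformer; a one-line sketch:

from sklearn.preprocessing import FunctionTransformer
import numpy as np

log1p = FunctionTransformer(np.log1p)  # wraps any callable as a transformer
print(log1p.fit_transform(np.array([[0.0, 1.0], [3.0, 7.0]])))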
Example 15: test_sklearn_pipeline
# Required module: from sklearn.pipeline import Pipeline [as alias]
# Or: from sklearn.pipeline.Pipeline import fit_transform [as alias]
def test_sklearn_pipeline(self):
    df = pd.DataFrame.from_dict([{"a": "something", "b": 1}, {"a": "something2"}])
    t = bt.Exclude_features_transform(excluded=["b"])
    transformers = [("exclude_transform", t)]
    p = Pipeline(transformers)
    df2 = p.fit_transform(df)
    self.assertEqual(len(df2.columns), 1)