当前位置: 首页>>代码示例>>Python>>正文


Python Pipeline.fit_transform方法代码示例

本文整理汇总了Python中sklearn.pipeline.Pipeline.fit_transform方法的典型用法代码示例。如果您正苦于以下问题:Python Pipeline.fit_transform方法的具体用法?Python Pipeline.fit_transform怎么用?Python Pipeline.fit_transform使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sklearn.pipeline.Pipeline的用法示例。


在下文中一共展示了Pipeline.fit_transform方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: partial_dependence

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def partial_dependence(df, y):
    '''
    Plot partial dependence for the six most important features of a
    gradient-boosted classifier trained on an oversampled train split.

    INPUT:  df = feature DataFrame
            y  = binary target variable (imbalanced classes)
    OUTPUT: None (renders a partial-dependence figure via
            plot_partial_dependence)

    The train/test split is produced by oversample_train_test, which
    oversamples the minority class until each class makes up 50% of the
    training data.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    # Fit the feature pipeline on the training data only, then apply the
    # SAME fitted transform to the test set.  The original called
    # fit_transform on the test set too, which leaks test information and
    # can yield a feature space inconsistent with the training matrix.
    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    # Indices of the six most important features (ascending importance).
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
开发者ID:dannyprikaz,项目名称:megans_law_project,代码行数:34,代码来源:functions.py

示例2: make_features

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
    def make_features(self):
        """Vectorize the documents and representative profiles into a
        shared TF-IDF feature space.

        Returns:
            tuple: (doc_vecs, rp_vecs) sparse matrices with identical
            feature columns, so they can be compared directly.
        """
        features = Pipeline([
            ('count', self.build_vectorizer()),
            ('tfidf', TfidfTransformer())
        ])

        # Fit the vocabulary and IDF weights on the documents, then reuse
        # the fitted pipeline for the RPs.  The original called
        # fit_transform twice, refitting on self.rps and producing vectors
        # in a different feature space than doc_vecs.
        doc_vecs = features.fit_transform(self.docs)
        rp_vecs = features.transform(self.rps)

        return (doc_vecs, rp_vecs)
开发者ID:ben-chin,项目名称:AuCTOR,代码行数:12,代码来源:labeller.py

示例3: create_store_transforms

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def create_store_transforms(rl):
    """Fit several TF-IDF / categorical feature pipelines on the
    restaurant list *rl* and pickle each transformed matrix under
    transforms/.

    Four variants are stored:
      - just_txt.pickle:        TF-IDF over 'wrd_list'
      - just_txt_chunks.pickle: same, tokenized with brad_tokenizer_test
      - txt_cat.pickle:         cuisine-type dummies + TF-IDF
      - txt_cat_chunks.pickle:  cuisine-type dummies + chunked TF-IDF
    """
    def _dump(pipeline, path):
        # Fit on rl, transform, and persist the resulting matrix.
        with open(path, 'wb') as handle:
            pickle.dump(pipeline.fit_transform(rl), handle)

    def _tfidf(tokenizer=None):
        # TF-IDF over the raw word list; optional custom tokenizer
        # (TfidfVectorizer's default tokenizer is None, so passing None
        # is identical to omitting the argument).
        return Pipeline([
            ('vbk', ValueByKey('wrd_list')),
            ('tfidf_tmp', TfidfVectorizer(lowercase=True, min_df=2,
                                          stop_words='english',
                                          tokenizer=tokenizer)),
        ])

    def _with_cuisine(nlp_pipeline):
        # Union of one-hot cuisine-type labels and the text features.
        return Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('cuisinetype', Pipeline([
                        ('vbk', ValueByKey('type_2')),
                        ('labels', preprocessing.LabelBinarizer()),
                    ])),
                    ('nlp', nlp_pipeline),
                ]
            ))
        ])

    _dump(_tfidf(), 'transforms/just_txt.pickle')
    _dump(_tfidf(brad_tokenizer_test), 'transforms/just_txt_chunks.pickle')
    _dump(_with_cuisine(_tfidf()), 'transforms/txt_cat.pickle')
    _dump(_with_cuisine(_tfidf(brad_tokenizer_test)),
          'transforms/txt_cat_chunks.pickle')
开发者ID:lesleymaraina,项目名称:Restaurants,代码行数:55,代码来源:roperations.py

示例4: makePlots

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def makePlots(Z):
    """Cluster the preprocessed data with several algorithms and plot each
    labelling on a 2-D PCA projection.

    INPUT:  Z = raw feature matrix (may contain NaNs)
    OUTPUT: (labels, centers) — labels maps '<algo><n>' / 'DBSCAN' to a
            label array; centers holds the K-Means cluster centers from
            the last n in the loop (n=10).
    """
    imp = Imputer()
    scal = StandardScaler()
    vart = VarianceThreshold()

    pipe = Pipeline([("imputer", imp), ("var theshold", vart), ("scaler", scal)])

    # Impute missing values, drop zero-variance columns, then scale.
    X1 = pipe.fit_transform(Z)
    pca = PCA(n_components=2)
    # 2-D projection of the transposed data, used purely for plotting.
    x2d = pca.fit_transform(X1.T)

    labels = {}
    centers = []

    for n in [2, 3, 5, 10]:
        agglo = FeatureAgglomeration(n_clusters=n).fit(X1)
        labels['ag%d' % n] = agglo.labels_
        plot(x2d, agglo.labels_, "Feature Agglomeration")

        km = KMeans(n_clusters=n).fit(X1.T)
        labels['km%d' % n] = km.labels_
        plot(x2d, km.labels_, "K-Means")
        # BUG FIX: original read `km.cluaster_centers_`, which raises
        # AttributeError; the sklearn attribute is `cluster_centers_`.
        centers = km.cluster_centers_

    dbs = DBSCAN(eps=100, min_samples=10).fit(X1.T)
    labels['DBSCAN'] = dbs.labels_
    plot(x2d, dbs.labels_, "DBSCAN")

    return labels, centers
开发者ID:orichardson,项目名称:mcm2016,代码行数:32,代码来源:cluster2.py

示例5: transformCorpus

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def transformCorpus(tdocuments, tentities):
  """Build TF-IDF vectors from per-task entity dicts and reduce them
  with latent semantic analysis.

  INPUT:  tdocuments = task documents (currently unused; kept for
                       interface compatibility with callers)
          tentities  = list of dicts (entity -> weight), one per task
  OUTPUT: (X1, X2) where X2 is the raw TF-IDF matrix and X1 is its
          1000-dimensional, row-normalized LSA projection.
  """
  # Vectorize the entity dicts, then apply TF-IDF weighting.
  vec = Pipeline((('dictText', DictVectorizer()),
                  ('tfIdf', TfidfTransformer())))
  X2 = vec.fit_transform(tentities)
  # LSA: project onto the top 1000 components, then L2-normalize rows so
  # cosine similarity reduces to a dot product.
  lsa = TruncatedSVD(1000)
  X = lsa.fit_transform(X2)
  X1 = Normalizer(copy=False).fit_transform(X)
  print('n_samples: %d, n_features: %d' % X.shape)
  print()

  return X1, X2
开发者ID:vmanisha,项目名称:QueryExpansion,代码行数:33,代码来源:clusterTasks.py

示例6: test_l2density_basic

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def test_l2density_basic():
    """Smoke-test the BagMinMaxScaler -> L2DensityTransformer pipeline and
    the transformer's error handling."""
    dim = 3
    # 50 bags, each a (n_i, 3) array with n_i drawn from [30, 100).
    # FIX: `range` replaces the Python-2-only `xrange`, which raises
    # NameError under Python 3.
    bags = [np.random.randn(np.random.randint(30, 100), dim)
            for _ in range(50)]
    pipe = Pipeline([
        ('scale', BagMinMaxScaler([0, 1])),
        ('density', L2DensityTransformer(15)),
    ])
    l2ed = pipe.fit_transform(bags)

    assert np.all(np.isfinite(l2ed))
    # Pairwise squared distances via the Gram matrix:
    # ||x - y||^2 = <x, x> - 2 <x, y> + <y, y>
    K = l2ed.dot(l2ed.T)
    row_norms_sq = np.diagonal(K)
    l2_dist_sq = row_norms_sq[:, None] - 2 * K + row_norms_sq[None, :]
    assert np.min(row_norms_sq) > 0
    assert np.min(l2_dist_sq) >= 0

    # Invalid basis name must be rejected at construction time.
    assert_raises(ValueError, lambda: L2DensityTransformer(10, basis='foo'))

    t = L2DensityTransformer(10)
    # transform() before fit() must fail.
    assert_raises(AttributeError, lambda: t.transform(bags))
    t.fit(dim)
    t.transform(BagMinMaxScaler([0, 1]).fit_transform(bags))
    # Wrong dimensionality and un-scaled inputs must be rejected.
    assert_raises(ValueError, lambda: t.transform([b[:, :2] for b in bags]))
    assert_raises(ValueError, lambda: t.transform(bags))
    # Corrupting the basis should make transform fail validation.
    t.basis = 'haha snuck my way in'
    assert_raises(ValueError, lambda: t.transform(bags))

示例7: main

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def main(opt):
    """Train an LDA topic model on the training JSONs and dump per-sentence
    topic vectors (train plus optional test) to opt.lda_vec_path.

    The output file starts with a "<n_sentences> <num_topics>" header, so
    vectors are first written to a temp file and copied over once the
    sentence count is known.
    """
    with codecs.open(opt.vocab, encoding='utf-8') as f:
        vocab = load_vocab(f)
    id2word = build_id2word(vocab)
    _, docs_train, _ = load_all_data(opt.train_jsons)
    # Bag-of-words followed by LDA; fitted on the training documents only.
    lda = Pipeline([
        ('bow', BagOfWords(vocab=vocab)),
        ('lda', Lda(id2word=id2word, num_topics=opt.num_topics))])
    lda_vec_train = lda.fit_transform(docs_train)

    # sent_set deduplicates sentences across the train and test dumps; its
    # final size becomes the header's sentence count.
    sent_set = set()
    tmp_path = opt.lda_vec_path + '.tmp'
    with codecs.open(tmp_path, encoding='utf-8', mode='w') as f:
        dump_lda_vec(docs_train, lda_vec_train, sent_set, f)

    if opt.test_jsons:
        _, docs_test, _ = load_all_data(opt.test_jsons)
        # Reuse the already-fitted model for test docs (transform only).
        lda_vec_test = lda.transform(docs_test)
        with codecs.open(tmp_path, encoding='utf-8', mode='a') as f:
            dump_lda_vec(docs_test, lda_vec_test, sent_set, f)

    # Write the header line, then copy the temp contents verbatim.
    with codecs.open(tmp_path, encoding='utf-8') as fin, \
            codecs.open(opt.lda_vec_path, encoding='utf-8', mode='w') as fout:
        fout.write('{} {}\n'.format(len(sent_set), opt.num_topics))
        for line in fin:
            fout.write(line)

    os.remove(tmp_path)
开发者ID:binghaobhw,项目名称:topic-segmentation,代码行数:30,代码来源:lda_vec.py

示例8: test_set_pipeline_step_none

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        # Fresh three-step pipeline: x -> x*2 -> x*3 -> x*5.
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    # All three steps active: combined multiplier is 2*3*5.
    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # Disabling a middle step via set_params must drop its factor.
    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    # get_params must still report the None step, and must NOT expose
    # sub-params (m3__mult) for a disabled step.
    assert_dict_equal(
        pipeline.get_params(deep=True),
        {"steps": pipeline.steps, "m2": mult2, "m3": None, "last": mult5, "m2__mult": 2, "last__mult": 5},
    )

    # Disabling the first step as well: only the final estimator remains.
    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ["predict_proba", "predict_log_proba", "decision_function", "transform", "score"]
    for method in other_methods:
        getattr(pipeline, method)(X)

    # Re-enabling a previously disabled step restores its factor.
    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # Disabling the FINAL estimator: pipeline degrades to a transformer.
    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    # predict must not exist when the final estimator is None.
    assert_raise_message(AttributeError, "'NoneType' object has no attribute 'predict'", getattr, pipeline, "predict")

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", None), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

示例9: XY8

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def XY8():
    """Feature set 8: DepartmentDescription dummies scaled by ScanCount,
    keeping Weekday and Returns, after NA imputation."""
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dfta = ft.DataFrameToArray()
    grouper = ft.GDummyKeepAndMultiplierTransform(
        ['DepartmentDescription'], 'ScanCount', ['Weekday', 'Returns'])

    steps = [
        ("imputer", ft.NGNAImputer()),
        ("add_returns", ft.NGAddReturns()),
        ('grouper', grouper),
    ]

    ### DON'T CHANGE AFTER
    steps.append(("dfta", dfta))
    pipe = Pipeline(steps=steps)

    return {
        "X": pipe.fit_transform(X),
        "y": y,
        # Test data goes through the already-fitted pipeline.
        "X_test": pipe.transform(X_test),
        "X_test_index": X_test_index,
    }
开发者ID:anabranch,项目名称:mlproject,代码行数:28,代码来源:loader.py

示例10: XY7

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def XY7():
    """Feature set 7: DepartmentDescription dummies scaled by ScanCount,
    keeping Weekday only; the grouper step is wrapped for validation
    metric recording."""
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dfta = ft.DataFrameToArray()
    grouper = ft.GDummyKeepAndMultiplierTransform(
        ['DepartmentDescription'], 'ScanCount', ['Weekday'])

    steps = [("imputer", ft.NGNAImputer())]
    steps.extend(ft.wrapStep(('grouper', grouper)))

    ### DON'T CHANGE AFTER
    steps.append(("dfta", dfta))
    pipe = Pipeline(steps=steps)

    # Record the pipeline definition for validation tracking.
    kh.start_pipeline()
    kh.record_metric("validation", "start", "NA", "transform_pipeline",
                     str(pipe), "NA")

    return {
        "X": pipe.fit_transform(X),
        "y": y,
        # Test data goes through the already-fitted pipeline.
        "X_test": pipe.transform(X_test),
        "X_test_index": X_test_index,
    }
开发者ID:anabranch,项目名称:mlproject,代码行数:31,代码来源:loader.py

示例11: XY9

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def XY9():
    """Feature set 9: FinelineNumber dummies (no multiplier column),
    keeping Weekday and Returns."""
    X, y, X_test, X_test_index = load_xy()

    #### DON'T CHANGE BEFORE
    dfta = ft.DataFrameToArray()

    print("starting grouping")
    grouper = ft.GDummyKeepAndMultiplierTransform(
        ['FinelineNumber'], None, ['Weekday', 'Returns'])
    print("done grouping")

    steps = [
        ("imputer", ft.NGNAImputer()),
        ("add_returns", ft.NGAddReturns()),
        ('grouper', grouper),
    ]

    ### DON'T CHANGE AFTER
    steps.append(("dfta", dfta))
    pipe = Pipeline(steps=steps)
    print("done with pipeline, now calculating")

    return {
        "X": pipe.fit_transform(X),
        "y": y,
        # Test data goes through the already-fitted pipeline.
        "X_test": pipe.transform(X_test),
        "X_test_index": X_test_index,
    }
开发者ID:anabranch,项目名称:mlproject,代码行数:29,代码来源:loader.py

示例12: XY1

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def XY1():
    """Feature set 1: Weekday/DepartmentDescription dummies with ScanCount
    and Returns aggregated by sum and count_nonzero; every transform step
    is wrapped for validation metric recording."""
    X, y, X_test, X_test_index = load_xy()

    ####### VARIABLES
    dfta = ft.DataFrameToArray()
    gdd = ft.GDummyAndKeepTransform(
        ['Weekday', 'DepartmentDescription'],
        ['ScanCount', 'Returns'],
        [np.sum, np.count_nonzero])  # Doesn't work!

    steps = [("imputer", ft.NGNAImputer())]
    steps.extend(ft.wrapStep(("add_returns", ft.NGAddReturns())))
    steps.extend(ft.wrapStep(('grouper', gdd)))
    steps.append(("dfta", dfta))
    pipe = Pipeline(steps=steps)

    # Record the pipeline definition for validation tracking.
    kh.start_pipeline()
    kh.record_metric("validation", "start", "NA", "transform_pipeline",
                     str(pipe), "NA")

    return {
        "X": pipe.fit_transform(X),
        "y": y,
        # Test data goes through the already-fitted pipeline.
        "X_test": pipe.transform(X_test),
        "X_test_index": X_test_index,
    }
开发者ID:anabranch,项目名称:mlproject,代码行数:31,代码来源:loader.py

示例13: load_data_template

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
def load_data_template(argv):
    """Load the train/test arrays and extract template-matching features.

    Returns (X_train, X_test, y_train, y_test); y_test is None because
    test labels are not available.
    """
    # Training set.
    data = np.load("data/train.npz")
    y_train = data["y_train"]
    X_train = data["X_train"]

    # Per-row statistics over the template-match output; only the axis=1
    # variant is enabled here.
    fu = FeatureUnion([
        ('st1', StatsTransformer(axis=1)),
    ])

    # Spectrogram -> template matching -> summary statistics.
    tf = Pipeline(steps=[
        ('specg', SpectrogramTransformer(NFFT=256, clip=500, noverlap=0.5,
                                         dtype=np.float32, log=False,
                                         flatten=False)),
        ('tm', TemplateMatcher(raw=True)),
        ('fu', fu),
    ])

    X_train = tf.fit_transform(X_train, y_train)

    # Test set: apply the already-fitted transform.
    data = np.load("data/test.npz")
    y_test = None
    X_test = tf.transform(data['X_test'])

    return X_train, X_test, y_train, y_test
开发者ID:Sandy4321,项目名称:kaggle-marinexplore,代码行数:33,代码来源:stacking.py

示例14: MultinomialNB

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
class MultinomialNB(Step):
    """Density model scoring vectors under per-dimension multinomials
    estimated from discretized training data.

    Training values are clipped to the [percentile_threshold,
    100 - percentile_threshold] percentile range, min-max scaled, and
    discretized into `bins` buckets; transform() returns negative
    log-probabilities (clamped away from log(0)).
    """

    def __init__(self, percentile_threshold, bins):
        assert bins > 0
        bin_size = 1 / bins
        # Interior bin edges in (0, 1); the Discretizer step maps scaled
        # values onto bucket indices using these edges.
        self.bins = np.arange(bin_size, 1, bin_size)
        self.lower = percentile_threshold
        self.upper = 100 - percentile_threshold
        scaler = MinMaxScaler()
        discretizer = FunctionTransformer(Discretizer(self.bins))
        self.pipeline = Pipeline(
            [('scaler', scaler), ('discretizer', discretizer)])
        # FIX: initialize so transform() before fit() trips the intended
        # assertion instead of raising AttributeError.
        self.distribution = None

    def fit(self, vectors):
        # Clip outliers at the configured percentiles (per dimension),
        # then scale and discretize.
        self.lower_clip = np.percentile(vectors, self.lower, axis=0)
        self.upper_clip = np.percentile(vectors, self.upper, axis=0)
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        vectors = self.pipeline.fit_transform(vectors)
        n_docs = vectors.shape[0]
        # Empirical bin frequencies, one row per dimension.
        self.distribution = np.array(
            [np.bincount(v, minlength=len(self.bins)) / n_docs
             for v in vectors.T])

    def transform(self, vectors):
        assert self.distribution is not None
        vectors = np.clip(vectors, self.lower_clip, self.upper_clip)
        probabilities = []
        n_dim = vectors.shape[1]
        vectors = self.pipeline.transform(vectors)
        for bins in vectors:
            # Joint probability under independence across dimensions.
            # FIX: np.prod replaces np.product, a deprecated alias that
            # was removed in NumPy 2.0.
            pr = np.prod(self.distribution[np.arange(n_dim), bins])
            probabilities.append(pr)
        # Floor at 1e-10 to avoid -inf from log(0).
        return -np.log(np.maximum(1e-10, np.array(probabilities)))

示例15: test_sklearn_pipeline

# 需要导入模块: from sklearn.pipeline import Pipeline [as 别名]
# 或者: from sklearn.pipeline.Pipeline import fit_transform [as 别名]
 def test_sklearn_pipeline(self):
     """Exclude_features_transform inside a Pipeline should drop the
     excluded column 'b', leaving a single column."""
     df = pd.DataFrame.from_dict([{"a":"something","b":1},{"a":"something2"}])
     t = bt.Exclude_features_transform(excluded=["b"])
     transformers = [("exclude_transform",t)]
     p = Pipeline(transformers)
     df2 = p.fit_transform(df)
     # FIX: assertEquals is a deprecated alias removed in Python 3.12;
     # use assertEqual.
     self.assertEqual(len(df2.columns), 1)


注:本文中的sklearn.pipeline.Pipeline.fit_transform方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。