

Python FeatureUnion.get_feature_names Method Code Examples

This article collects typical usage examples of the Python method sklearn.pipeline.FeatureUnion.get_feature_names. If you have been wondering what exactly FeatureUnion.get_feature_names does and how to use it, the curated examples below may help. You can also explore further usage examples of sklearn.pipeline.FeatureUnion, the class this method belongs to.


The following presents 15 code examples of the FeatureUnion.get_feature_names method, sorted by popularity by default.
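
Before diving in, here is a minimal sketch of what the method does (the toy corpus is an assumption). Note that these examples target older scikit-learn releases: get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favor of get_feature_names_out.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion

docs = ["the pizza beer", "the burger beer"]  # toy corpus (assumption)

union = FeatureUnion([
    ("words", CountVectorizer(analyzer="word")),
    ("chars", CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))),
])
union.fit(docs)

# Each returned name carries its transformer's name as a prefix,
# following the "<step>__<feature>" convention, e.g. "words__pizza".
print(union.get_feature_names())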

Example 1: test_set_feature_union_steps

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ["x2"]
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ["x3"]
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ["x5"]

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(["m2__x2", "m3__x3"], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [("m5", mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(["m5__x5"], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[("mock", mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(["mock__x3"], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(["mock__x5"], ft.get_feature_names())
Author: cheral, Project: scikit-learn, Lines: 28, Source: test_pipeline.py

Example 2: test_set_feature_union_step_none

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
Author: dsquareindia, Project: scikit-learn, Lines: 27, Source: test_pipeline.py

Example 3: train_model

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def train_model(trainset):
	word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2,2), binary = False, max_features= 2000,min_df=1,decode_error="ignore")
#	print word_vector	
	print "works fine"
	char_vector = TfidfVectorizer(ngram_range=(2,3), analyzer="char", binary = False, min_df = 1, max_features = 2000,decode_error= "ignore")
	vectorizer =FeatureUnion([ ("chars", char_vector),("words", word_vector) ])
	corpus = []
	classes = []

	for item in trainset:
		corpus.append(item['text'])
		classes.append(item['label'])

	print "Training instances : ", 0.8*len(classes)
	print "Testing instances : ", 0.2*len(classes) 
	
	matrix = vectorizer.fit_transform(corpus)
	print "feature count : ", len(vectorizer.get_feature_names())
	print "training model"
	X = matrix.toarray()
	y = numpy.asarray(classes)
	model =LinearSVC()
	X_train, X_test, y_train, y_test= train_test_split(X,y,train_size=0.8,test_size=.2,random_state=0)
	y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test)
	#y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test)
	#print y_prob
	#con_matrix = []
	#for row in range(len(y_prob)):
	#	temp = [y_pred[row]]	
	#	for prob in y_prob[row]:
	#		temp.append(prob)
	#	con_matrix.append(temp)
	#for row in con_matrix:
	#	output.write(str(row)+"\n")
	#print y_pred		
	#print y_test
	
	res1=[i for i, j in enumerate(y_pred) if j == 'anonEdited']
	res2=[i for i, j in enumerate(y_test) if j == 'anonEdited']
	reset=[]
	for r in res1:
		if y_test[r] != "anonEdited":
			reset.append(y_test[r])
	for r in res2:
		if y_pred[r] != "anonEdited":
			reset.append(y_pred[r])
	
	
	output=open(sys.argv[2],"w")
	for suspect in reset:
		output.write(str(suspect)+"\n")	
	cm = confusion_matrix(y_test, y_pred)
	print(cm)
	pl.matshow(cm)
	pl.title('Confusion matrix')
	pl.colorbar()
	pl.ylabel('True label')
	pl.xlabel('Predicted label')
	pl.show()
	print accuracy_score(y_pred,y_test)
Author: srini21, Project: Amazon-deceptive-reviews, Lines: 62, Source: anontesting.py

Example 4: test_feature_union_feature_names

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Author: Givonaldo, Project: scikit-learn, Lines: 11, Source: test_pipeline.py

Example 5: test_feature_union_feature_names

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
Author: dsquareindia, Project: scikit-learn, Lines: 16, Source: test_pipeline.py
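
Example 5 above shows that FeatureUnion.get_feature_names raises an AttributeError when a sub-transformer does not implement get_feature_names. Below is a minimal sketch of satisfying that contract on a custom transformer; this TextStats implementation is an assumption for illustration (Example 11 later uses a TextStats class whose definition is not shown).

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

class TextStats(BaseEstimator, TransformerMixin):
    # Hypothetical transformer: character and word counts per document.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([[len(doc), len(doc.split())] for doc in X])

    def get_feature_names(self):
        # Defining this method is what lets FeatureUnion collect and prefix the names.
        return ["n_chars", "n_words"]

union = FeatureUnion([("stats", TextStats())])
union.fit(["a short doc", "another slightly longer doc"])
print(union.get_feature_names())  # ['stats__n_chars', 'stats__n_words']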

Example 6: train_model

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def train_model(trainset):

  # create 2 blocks of features, word and character ngrams, size of 2 (using TF-IDF method)
  # we can also append here multiple other features in general

  word_vector = TfidfVectorizer( analyzer="word" , ngram_range=(2,2), binary = False, max_features= 2000 )
  char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=0 , max_features=2000 )

  # our vectors are the feature union of word/char ngrams
  vectorizer = FeatureUnion([  ("chars", char_vector),("words", word_vector)  ] )

  corpus, classes = [], []
    

  for item in trainset:    
    corpus.append( item['text'] )
    classes.append( item['label'] )

  print "num of training instances: ", len(classes)    
  print "num of training classes: ", len(set(classes))

  # fit the tf-idf vectorizer on the corpus
  matrix = vectorizer.fit_transform(corpus)
 
  print "num of features: " , len(vectorizer.get_feature_names())
  print "training model"
  X = matrix.toarray()
  y = np.asarray(classes)

  print X[0]

  # Here are results of several different models for Law corpus:

  # model  = SVC(kernel='sigmoid') # ->                       0.38
  # model  = KNeighborsClassifier(algorithm = 'kd_tree') # -> 0.41
  # model = AdaBoostClassifier() #->                            0.46
  # model  = RandomForestClassifier() # ->                    0.52
  # model  = LogisticRegression() # ->                        0.65 
  model  = LinearSVC( loss='l1', dual=True) # ->              0.70
  # Results of several different models for Enron corpus:
  # model  = LinearSVC( loss='l1', dual=True) # ->              0.6

  scores = cross_validation.cross_val_score(  estimator = model,
    X = matrix.toarray(), 
        y= np.asarray(classes), cv=10  )

  print "10-fold cross-validation results:", "mean score = ", scores.mean(), "std=", scores.std(), ", num folds =", len(scores)
Author: 0-1-0, Project: authorship, Lines: 49, Source: main.py

Example 7: test_same_result

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
    def test_same_result(self):
        X, Z = self.make_text_rdd(2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names()
        )
        # test same results
        X_transformed = loc_union.transform(X)
        Z_transformed = sp.vstack(dist_union.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
Author: KartikPadmanabhan, Project: sparkit-learn, Lines: 49, Source: test_pipeline.py

Example 8: test_feature_stacker_feature_names

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def test_feature_stacker_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
Author: PepGardiola, Project: scikit-learn, Lines: 19, Source: test_pipeline.py

Example 9: main

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def main():
    qtrain = read_set()
#   X_train = gen_features(qtrain)
    Y_train = get_ans(qtrain)
    qtest = read_set()
#   X_test = gen_features(qtest)
#   (X_train, X_test), featkeys = dictVec(X_train, X_test)

#   tfidf_word = TfidfVectorizer(preprocessor=lambda x: x['question_text'].lower(), ngram_range=(1, 3), analyzer="word", binary=False, min_df=3)
    tfidf_word = TfidfVectorizer(preprocessor=exa, ngram_range=(1, 3), analyzer="word", binary=False, min_df=0.05)
#   feat_select = SelectPercentile(score_func=f_regression_, percentile=0.15)
    feat_select = SelectKBest(score_func=f_regression_, k=QN_PARAMS[QUESTION]['features_select'])
    cf = CustomFeat()
    feat = FeatureUnion([('word_counts', tfidf_word), ('custom', cf)])
#   feat = FeatureUnion([('custom', cf)])
#   feat = FeatureUnion([('word_counts', tfidf_word)])
#   est = ESTIMATOR(**params[SETTINGS['EST']])
    w_model = Pipeline([('funion', feat), ('feat_select', feat_select)])  # , ('est', est)]
#   w_X_train = tfidf_word.fit_transform(qtrain)
#   w_X_test = tfidf_word.transform(qtest)
#   print_err(w_X_train[0])
#   X_train = w_X_train
#   X_test = w_X_test
#   featkeys = tfidf_word.get_feature_names()
#   feat_select
#   f_regression_(X_train[:,0],Y_train)
#   print_err('fitting')
#   w_model.fit(qtrain, Y_train)
#   print_err(feat_select.get_support(indices=True))
    X_train = w_model.fit_transform(qtrain, Y_train).toarray()
    X_test = w_model.transform(qtest).toarray()
    featkeys = np.asarray(feat.get_feature_names())[feat_select.get_support(indices=True)]
#   featkeys = []
#   Y_test = classify(w_model, qtest)
#   print_err(est.coef_.nonzero())

    clf = get_clf(X_train, Y_train, feat_indices=featkeys, clf_used=SETTINGS['EST'], grid_search=SETTINGS['GRIDSEARCH'])
    Y_test = classify(clf, X_test)
    for qn, pans in zip(qtest, Y_test):
        print json.dumps({
            'question_key': qn['question_key'].encode('ascii'),
            '__ans__': pans
        })
Author: wonglkd, Project: QuoraMLCodeSprint13, Lines: 45, Source: solution.py
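
The featkeys line in Example 9 illustrates a common pattern: index the array returned by FeatureUnion.get_feature_names with a feature selector's support mask to recover the names of the columns that survived selection. A self-contained sketch of that pattern (the toy corpus, labels, and k=5 are assumptions; chi2 stands in for the f_regression_ scorer used above):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import FeatureUnion

docs = ["the pizza beer", "the burger beer", "coke burger coke"]
labels = [0, 0, 1]

union = FeatureUnion([
    ("words", CountVectorizer(analyzer="word")),
    ("chars", CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))),
])
X = union.fit_transform(docs)

selector = SelectKBest(score_func=chi2, k=5).fit(X, labels)

# Map the kept column indices back to the "<step>__<feature>" names.
featkeys = np.asarray(union.get_feature_names())[selector.get_support(indices=True)]
print(featkeys)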

Example 10: dump_train

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
def dump_train():
    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()

    train_df = f.make_data_df(train_gray_data, labels)
    test_df = f.make_test_df(test_gray_data)

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()

    train_df.columns = ["pngname", "input", "label"]
    test_df.columns = ["pngname", "input"]

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    feature_name_list = [s.split("__")[1] for s in fu.get_feature_names()]
    feature_name_list.append("target")
    train_X = fu.fit_transform(train_df)
    train_y = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
    train_X, train_y = cl.downsampling_data(train_X, train_y, 0.2)
    train_dump = pd.DataFrame(np.c_[train_X, train_y], columns=feature_name_list)
    dump_path = os.path.abspath(os.path.dirname(__file__)) +\
        "/../tmp/train_dump"
    train_dump.to_csv(dump_path + "/train_dump.csv", index=False)
Author: haisland0909, Project: Denoising-Dirty-Documents, Lines: 24, Source: dump_train_feature.py

Example 11: SpecialWordCounter

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
	print ''
	word_count = SpecialWordCounter()
	word_count.fit(t)
	print word_count.get_feature_names()
	print word_count.transform(t)

	combined_features = FeatureUnion([
		('stats', TextStats())
		, ('special_word_stats', SpecialWordCounter())
		])

	# Use combined features to transform dataset:
	X_features = combined_features.fit(t).transform(t)
	print '\nfeature union'
	print 'X:', X_features
	print 'names:', combined_features.get_feature_names()
	print 

	pipeline = Pipeline([
	    # Use FeatureUnion to combine the features from subject and body
	    ('union', FeatureUnion(
	        transformer_list=[
	        ('scaled_text_stats', Pipeline([
                ('stats', TextStats())
               , ('scaling',  StandardScaler())
            ])
            )
	        , ('special_word_stats', SpecialWordCounter())
	        ]
	        )
	    )
Author: joyce-duan, Project: ml_helper, Lines: 33, Source: text_features.py

Example 12: Orchestrator

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]

#......... (part of the code omitted here) .........

        # TODO No hardcodear
        columns_names = self.headings
        columns_is_text = [False, False, True, False]
        columns_is_class = [False, False, False, True]

        train_y = []

        steps = []
        # steps.append(('numeric_feats', MyPipeline([
        #     ('selector', SelectNumerics(columns_is_text, columns_names, columns_is_class)),
        #     ('dict', DictVectorizer()),
        # ])))
        for column_i, column_is_text in enumerate(columns_is_text):
            if columns_is_class[column_i]:
                train_y = map(lambda x: float(x[column_i]), self.preprocessed_rows)
                train_y = np.array(list(train_y))
            else:
                if column_is_text:
                    steps.append(
                        (
                            columns_names[column_i],
                            MyPipeline(
                                [
                                    ("selector", SelectText(column_i=column_i)),
                                    ("count_vector", CountVectorizer(**kwargs)),
                                ]
                            ),
                        )
                    )

        self.feature_union = FeatureUnion(steps)
        self.featured_rows = self.feature_union.fit_transform(self.preprocessed_rows, train_y)
        self.featured_headings = deepcopy(self.feature_union.get_feature_names())
        self.train_y = train_y

        variance_too_high = False
        if variance_threshold is not None:
            thresholder = VarianceThreshold(threshold=variance_threshold)
            try:
                self.featured_rows = thresholder.fit_transform(self.featured_rows)
                self.featured_support = thresholder.get_support()
                self.featured_selected_headings = [
                    self.featured_headings[i] for i, v in enumerate(self.featured_support) if v
                ]
                self.main_pfcsamr_app.variance_warn_message = ""
            except ValueError:
                traceback.print_exc()
                self.featured_rows = np.empty_like(self.featured_rows)
                self.featured_support = []
                self.featured_selected_headings = []
                self.main_pfcsamr_app.variance_warn_message = "threshold too high!!!"
                variance_too_high = True
        else:
            self.main_pfcsamr_app.variance_warn_message = ""
            self.featured_support = [True] * self.featured_rows.shape[1]
            self.featured_selected_headings = deepcopy(self.featured_headings)

        if not variance_too_high:
            self.main_pfcsamr_app.learn_tab_enabled = True
            self.main_pfcsamr_app.current_model = MyTableModel(self.featured_selected_headings, self.featured_rows)
            self.main_pfcsamr_app.status_text = "Feature extraction done. Shape of useful features: %s. Removed %d." % (
                str(self.featured_rows.shape),
                len(self.featured_headings) - len(self.featured_selected_headings),
            )
Author: terrex, Project: SentimentAnalysis, Lines: 69, Source: orchestrator.py

Example 13: test_same_result_withdictrdd

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
    def test_same_result_withdictrdd(self):
        X, X_rdd = self.make_text_rdd(2)
        Y_rdd = ArrayRDD(self.sc.parallelize([None] * len(X), 4), bsize=2)
        Z = DictRDD([X_rdd, Y_rdd], columns=("X", "y"), bsize=2)

        loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
        dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))

        loc_word = CountVectorizer(analyzer="word")
        loc_word_2 = CountVectorizer(analyzer="word")
        dist_word = SparkCountVectorizer(analyzer="word")
        dist_word_2 = SparkCountVectorizer(analyzer="word")

        loc_union = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word),
            ("words2", loc_word_2)
        ])
        dist_union = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word),
            ("words2", dist_word_2)
        ])
        # test same feature names
        loc_union.fit(X)
        dist_union.fit(Z)
        converted_union = dist_union.to_scikit()

        assert_equal(
            loc_union.get_feature_names(),
            dist_union.get_feature_names(),
            converted_union.get_feature_names(),
        )

        # test same results
        Z_transformed = sp.vstack(dist_union.transform(Z)[:, 'X'].collect())
        assert_array_equal(loc_union.transform(X).toarray(), Z_transformed.toarray())
        assert_array_equal(loc_union.transform(X).toarray(),
                           converted_union.transform(X).toarray())
        # test same results with fit_transform
        X_transformed = loc_union.fit_transform(X)
        X_converted_transformed = converted_union.fit_transform(X)
        Z_transformed = sp.vstack(dist_union.fit_transform(Z)[:, 'X'].collect())

        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           X_converted_transformed.toarray())
        # test same results in parallel
        loc_union_par = FeatureUnion([
            ("chars", loc_char),
            ("words", loc_word)
        ], n_jobs=2)
        dist_union_par = SparkFeatureUnion([
            ("chars", dist_char),
            ("words", dist_word)
        ], n_jobs=2)

        loc_union_par.fit(X)
        dist_union_par.fit(Z)
        converted_union = dist_union_par.to_scikit()
        X_transformed = loc_union_par.transform(X)
        Z_transformed = sp.vstack(dist_union_par.transform(Z)[:, 'X'].collect())
        assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
        assert_array_equal(X_transformed.toarray(),
                           converted_union.transform(X).toarray())
Author: lensacom, Project: sparkit-learn, Lines: 67, Source: test_pipeline.py

Example 14: CountVectorizer

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]
        preprocessor=get_col('description'))),
    ('title', CountVectorizer(
        ngram_range=(1, 2),
        stop_words=russian_stop,
        # max_features=7000,
        preprocessor=get_col('title')))
])

start_vect = time.time()

# Fit my vectorizer on the entire dataset instead of the training rows
# Score improved by .0001
vectorizer.fit(df.to_dict('records'))

ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes" % ((time.time() - start_vect) / 60))

# Drop Text Cols
textfeats = ["description", "title"]
df.drop(textfeats, axis=1, inplace=True)

from sklearn.metrics import mean_squared_error
from math import sqrt

ridge_params = {'alpha': 30.0, 'fit_intercept': True, 'normalize': False, 'copy_X': True,
                'max_iter': None, 'tol': 0.001, 'solver': 'auto', 'random_state': SEED}

# Ridge oof method from Faron's kernel
# I was using this to analyze my vectorization, but figured it would be interesting to add the results back into the dataset
# It doesn't really add much to the score, but it does help lightgbm converge faster
Author: mengli, Project: PcmAudioRecorder, Lines: 33, Source: avito.py

Example 15: FeatureExtractor

# Required import: from sklearn.pipeline import FeatureUnion [as alias]
# Or: from sklearn.pipeline.FeatureUnion import get_feature_names [as alias]

#......... (part of the code omitted here) .........
        else:
            self._vectorizer = pickle.loads(base64.b64decode(vectorizer))
            self._transform_only = True

        if labelencoder is None:
            self._labelencoder = preprocessing.LabelEncoder()
        else:
            self._labelencoder = pickle.loads(base64.b64decode(labelencoder))
            self._transform_only = True

    def export(self):
        return {
            "settings": self.settings.export(),
            "vectorizer": base64.b64encode(pickle.dumps(self._vectorizer)).decode("ascii"),
            "labelencoder": base64.b64encode(pickle.dumps(self._labelencoder)).decode("ascii"),
        }

    @property
    def settings(self):
        return self._settings

    @property
    def vectorizer(self):
        return self._vectorizer

    @property
    def dataframe(self):
        return self._dataframe

    @property
    def settings(self):
        return self._settings

    @property
    def labelencoder(self):
        return self._labelencoder

    @property
    def strings(self):
        """Get feature strings.
        
        Returns
        -------
        list[unicode]
            Dataframe columns concatenated to a single string.
        """
        if "strings" not in self._cache:
            self._cache["strings"] = [
                " ".join(row) for row in self._dataframe[self._settings.features].fillna("").values
            ]
        return self._cache["strings"]

    @property
    def X(self):
        """Returns
        -------
        scipy.sparse
            Sparse matrix containing textual features for classification.
        """
        if "X" not in self._cache:
            X = self.strings
            if self._transform_only:
                self._cache["X"] = self._vectorizer.transform(X)
            else:
                self._cache["X"] = self._vectorizer.fit_transform(X)
        return self._cache["X"]

    @property
    def y(self):
        """Returns
        -------
        numpy.array
            Labels encoded as integer values. Use get_labels() for mapping them back to strings.
        """
        if "y" not in self._cache:
            y = list(self._dataframe[self._settings.label].fillna(""))
            if self._transform_only:
                self._cache["y"] = np.array(self._labelencoder.transform(y))
            else:
                self._cache["y"] = np.array(self._labelencoder.fit_transform(y))
        return self._cache["y"]

    @property
    def feature_names(self):
        """Returns
        -------
        list[unicode]
            Meaningful feature names for vectorized feature matrix columns.
        """
        return self._vectorizer.get_feature_names()

    @property
    def labels(self):
        """Returns
        -------
        list[unicode]
            Labels for for encoded labels (y).
        """
        self.y
        return [l for l in self._labelencoder.classes_]
Author: stacc-ee, Project: textclassifier, Lines: 104, Source: featureextractor.py


Note: The sklearn.pipeline.FeatureUnion.get_feature_names examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, and copyright remains with those authors; for distribution and use, please refer to each project's license. Do not reproduce without permission.