This article collects typical usage examples of the sklearn.feature_selection.chi2 method in Python. If you have been wondering what feature_selection.chi2 does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples for the module it belongs to, sklearn.feature_selection.

The following presents 15 code examples of the feature_selection.chi2 method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: chi_square
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def chi_square(X, y):
"""
    This function implements chi-square feature selection (an existing scikit-learn method for classification)
Input
-----
X: {numpy array}, shape (n_samples, n_features)
input data
    y: {numpy array}, shape (n_samples,)
input class labels
Output
------
F: {numpy array}, shape (n_features,)
chi-square score for each feature
"""
F, pval = chi2(X, y)
return F
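A quick usage sketch of the function above, on synthetic data (note that chi2 requires non-negative feature values, such as counts):

import numpy as np
from sklearn.feature_selection import chi2

X = np.array([[1, 0, 3],
              [2, 0, 1],
              [0, 5, 0],
              [1, 4, 0]])
y = np.array([0, 0, 1, 1])
F = chi_square(X, y)
print(F.shape)  # (3,): one chi-square score per feature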
Example 2: featuresFromFeatureSelection
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def featuresFromFeatureSelection(X, Y, columnNames):
    for f in columnNames:
        print(f)
    k_best = SelectKBest(chi2, k=34).fit(X, Y)  # fitted selector, not transformed data
    colors = getColorNames()
    counter = 0
    scores = k_best.scores_
    scores_scaled = np.divide(scores, 1000)
    for score in scores_scaled:
        #if(score > 10):
        #print('Feature {:>34}'.format(columnNames[counter]))
        print('{:>34}'.format(score))
        # plot one bar per feature
        plt.bar(counter, score, color=colors[counter])
        counter += 1
    plt.ylabel('Scores (thousands)')
    plt.title('Scores calculated by Chi-Square Test')
    plt.legend(columnNames, bbox_to_anchor=(0., 0.8, 1., .102), loc=3, ncol=5, mode="expand", borderaxespad=0.)
    plt.show()
    #print(feature_selection.chi2(X, Y))
Example 3: get_top_k
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def get_top_k(self):
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    # remove intercept from top_k
    if self.objective:
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except ValueError:
            # chi2 requires non-negative features; fall back to the ANOVA F-test
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
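The try/except above guards against chi2 rejecting negative feature values. A minimal sketch of that failure mode and the f_classif fallback (synthetic data):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif

X = np.array([[1.0, -0.5], [2.0, 0.3], [0.5, -1.2], [1.5, 0.8]])
y = np.array([0, 1, 0, 1])
try:
    selector = SelectKBest(chi2, k=1).fit(X, y)
except ValueError:
    # chi2 raises ValueError on negative input; the ANOVA F-test accepts it
    selector = SelectKBest(f_classif, k=1).fit(X, y)
print(selector.get_support(indices=True))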
Example 4: find_best_feature_selections
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def find_best_feature_selections(X, y):
    # select the best features using different techniques
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)
    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)
    print(X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print(y.shape)
    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)
#X, y = _dataset_sample()
################################ PARAMETER Selected ################################
#TODO: a problem occurs when using the max_leaf_nodes parameter in DTree and RandomForest
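For reference, SelectKBest and SelectPercentile are two views of the same scoring: with 400 features, percentile=20 keeps the same number of features as k=80 (and, barring score ties, the same set). A sketch with synthetic count data:

import numpy as np
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2

rng = np.random.RandomState(0)
X = rng.randint(0, 10, size=(100, 400))  # non-negative counts, as chi2 requires
y = rng.randint(0, 2, size=100)
k_best = SelectKBest(chi2, k=80).fit(X, y)
pct = SelectPercentile(chi2, percentile=20).fit(X, y)
print(k_best.get_support().sum(), pct.get_support().sum())  # 80 80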
Example 5: feature_select
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def feature_select(corpus, labels, k=1000):
"""
    Select the top k features through a chi-square test.
"""
bin_cv = CountVectorizer(binary=True)
le = LabelEncoder()
X = bin_cv.fit_transform(corpus)
    y = le.fit_transform(labels)  # keep the labels one-dimensional for SelectKBest
k = min(X.shape[1], k)
skb = SelectKBest(chi2, k=k)
skb.fit(X, y)
feature_ids = skb.get_support(indices=True)
feature_names = bin_cv.get_feature_names()
vocab = {}
for new_fid, old_fid in enumerate(feature_ids):
feature_name = feature_names[old_fid]
vocab[feature_name] = new_fid
# we only care about the final extracted feature vocabulary
return vocab
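A toy call of feature_select (hypothetical corpus; assumes CountVectorizer, LabelEncoder, SelectKBest and chi2 are imported as in the original module):

corpus = ["good movie", "bad movie", "good plot", "bad plot"]
labels = ["pos", "neg", "pos", "neg"]
vocab = feature_select(corpus, labels, k=2)
print(vocab)  # e.g. {'bad': 0, 'good': 1}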
Example 6: univariate_feature_selection
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def univariate_feature_selection(mode, predictors, target):
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, percentile=100)
    elif mode == 'f_classif':
        fselect = SelectPercentile(f_classif, percentile=100)
    elif mode == 'chi2':
        fselect = SelectPercentile(chi2, percentile=100)
    else:
        raise ValueError('unknown mode: %s' % mode)
    fselect.fit_transform(predictors, target)
    return fselect.pvalues_
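A quick check of the helper above on synthetic data (assumes the SelectPercentile and score-function imports from the header):

import numpy as np

rng = np.random.RandomState(0)
predictors = rng.randint(0, 5, size=(50, 10))  # non-negative, so the 'chi2' mode is valid
target = rng.randint(0, 2, size=50)
pvals = univariate_feature_selection('chi2', predictors, target)
print(pvals.shape)  # (10,)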
Example 7: test_feature_selection
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def test_feature_selection():
# make two feature dicts with two useful features and a bunch of useless
# ones, in terms of chi2
d1 = dict([("useless%d" % i, 10) for i in range(20)],
useful1=1, useful2=20)
d2 = dict([("useless%d" % i, 10) for i in range(20)],
useful1=20, useful2=1)
for indices in (True, False):
v = DictVectorizer().fit([d1, d2])
X = v.transform([d1, d2])
sel = SelectKBest(chi2, k=2).fit(X, [0, 1])
v.restrict(sel.get_support(indices=indices), indices=indices)
assert_equal(v.get_feature_names(), ["useful1", "useful2"])
Example 8: __select_features
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def __select_features(data_set):
dataset = [clean_en_text(data) for data in data_set[0]]
tf_idf_model = TfidfVectorizer(ngram_range=(1, 1),
binary=True,
sublinear_tf=True)
tf_vectors = tf_idf_model.fit_transform(dataset)
    # keep roughly the top 1/6 of the terms as features
    k = int(tf_vectors.shape[1] / 6)
chi_model = SelectKBest(chi2, k=k)
chi_features = chi_model.fit_transform(tf_vectors, data_set[1])
print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
print('chi:\t\t' + str(chi_features.shape[1]))
return chi_features, tf_idf_model, chi_model
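A usage sketch with a stand-in for clean_en_text, which is defined elsewhere in the project (both the cleaner and the corpus here are hypothetical):

def clean_en_text(text):
    return text.lower()  # hypothetical stand-in for the project's real cleaner

texts = ["good movie", "bad movie", "great plot", "awful plot", "good great", "bad awful"]
data_set = (texts, ["pos", "neg", "pos", "neg", "pos", "neg"])
chi_features, tf_idf_model, chi_model = __select_features(data_set)
print(chi_features.shape)  # (6, 1) here, since k = 6 // 6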
Example 9: _derive_entity_label_matrix
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _derive_entity_label_matrix(train_full_results, entities):
"""
Derive entity feature matrix for chi2 anaylsis using entity annotations from message api
:param train_full_results: pandas data frame outputed by inference
:param entities: list of entities that is defined in the workspace
:return entity_feature_matrix: numpy matrix of examples with entities x number of entities
:return labels: numpy array: number of labels correspond to number of examples
:return entity_average_confidence_dict: dict entity --> average confidence for entity
"""
entity_feature_matrix = list()
labels = list()
entity_conf_dict = dict()
entity_count_dict = dict()
entity_average_confidence_dict = dict()
for i in range(len(train_full_results)):
current_result = train_full_results.iloc[i]
if current_result["entities"]:
# create empty feature vector
current_feature = [0] * len(entities)
for entity_reference in current_result["entities"]:
e_ref = entity_reference["entity"]
e_conf = entity_reference["confidence"]
entity_idx = entities.index(e_ref)
current_feature[entity_idx] += 1
entity_conf_dict[e_ref] = entity_conf_dict.get(e_ref, 0) + e_conf
entity_count_dict[e_ref] = entity_count_dict.get(e_ref, 0) + 1
entity_feature_matrix.append(current_feature)
labels.append(current_result["correct_intent"])
entity_feature_matrix = np.array(entity_feature_matrix)
labels = np.array(labels)
for key in entity_conf_dict:
entity_average_confidence_dict[key] = (
entity_conf_dict[key] / entity_count_dict[key]
)
return entity_feature_matrix, labels, entity_average_confidence_dict
Example 10: entity_label_correlation_analysis
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def entity_label_correlation_analysis(train_full_results, entities_list, p_value=0.05):
"""
Apply chi2 analysis on entities of the training set
:param train_full_results: pandas data frame output by inference
:param entities_list: the list of entities that is defined in the workspace
    :param p_value: significance threshold for the chi2 test
    :return entity_label_df: pandas df with intents in the first column and correlated entities in the second
"""
(
entity_feature_matrix,
labels,
entity_average_confidence_dict,
) = _derive_entity_label_matrix(train_full_results, entities_list)
entities_list = np.array(entities_list)
unique_labels = list(set(labels))
final_labels = list()
final_entities = list()
for label in unique_labels:
chi2_statistics, pval = chi2(entity_feature_matrix, labels == label)
temp_entities_list = entities_list[pval < p_value]
chi2_statistics = chi2_statistics[pval < p_value]
ordered_entities = temp_entities_list[np.argsort(chi2_statistics)]
if len(ordered_entities) == 0:
continue
final_labels.append(label)
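        # N is assumed to be a module-level constant: how many top entities to report per intent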
final_entities.append(", ".join(ordered_entities[-N:]))
entity_label_df = pd.DataFrame(
{"Intent": final_labels, "Correlated Entities": final_entities}
)
return entity_label_df
Example 11: _preprocess_chi2
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _preprocess_chi2(workspace_pd):
"""
Preprocess dataframe for chi2 analysis
    :param workspace_pd: dataframe of utterances and intents to preprocess for chi2 analysis
:return labels: intents processed
:return count_vectorizer: vectorizer instance
:return features: features from transform
"""
stopword_list = skills_util.STOP_WORDS
workspace_pd["utterance_punc_stripped"] = workspace_pd["utterance"].apply(
strip_punctuations
)
count_vectorizer = CountVectorizer(
min_df=1,
encoding="utf-8",
ngram_range=(1, 2),
stop_words=stopword_list,
tokenizer=word_tokenize,
token_pattern="(?u)\b\w+\b",
)
features = count_vectorizer.fit_transform(
workspace_pd["utterance_punc_stripped"]
).toarray()
labels = workspace_pd["intent"]
return labels, count_vectorizer, features
Example 12: _compute_chi2_top_feature
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _compute_chi2_top_feature(
features, labels, vectorizer, cls, significance_level=0.05
):
"""
Perform chi2 analysis, punctuation filtering and deduplication
:param features: count vectorizer features
:param labels: intents processed
    :param vectorizer: count vectorizer instance
    :param cls: the class (intent) tested against all others
    :param significance_level: specify an alpha
    :return deduplicated_unigram: significant unigrams, deduplicated, in ascending order of chi2 score
    :return deduplicated_bigram: significant bigrams, deduplicated, in ascending order of chi2 score
"""
features_chi2, pval = chi2(features, labels == cls)
feature_names = np.array(vectorizer.get_feature_names())
features_chi2 = features_chi2[pval < significance_level]
feature_names = feature_names[pval < significance_level]
indices = np.argsort(features_chi2)
feature_names = feature_names[indices]
unigrams = [v.strip() for v in feature_names if len(v.strip().split()) == 1]
deduplicated_unigram = list()
for unigram in unigrams:
if unigram not in deduplicated_unigram:
deduplicated_unigram.append(unigram)
bigrams = [v.strip() for v in feature_names if len(v.strip().split()) == 2]
deduplicated_bigram = list()
for bigram in bigrams:
if bigram not in deduplicated_bigram:
deduplicated_bigram.append(bigram)
return deduplicated_unigram, deduplicated_bigram
Example 13: select_best_feature
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def select_best_feature(self, data_set, data_lbl):
ch2 = SelectKBest(chi2, k=10000)
return ch2.fit_transform(data_set, data_lbl), ch2
Example 14: feature_ranking
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def feature_ranking(F):
"""
Rank features in descending order according to chi2-score, the higher the chi2-score, the more important the feature is
"""
idx = np.argsort(F)
return idx[::-1]
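Combined with chi_square from Example 1, this yields a complete ranking pipeline (synthetic data):

import numpy as np
from sklearn.feature_selection import chi2

X = np.array([[1, 0, 3], [2, 0, 1], [0, 5, 0], [1, 4, 0]])
y = np.array([0, 0, 1, 1])
F, _ = chi2(X, y)
print(feature_ranking(F))  # feature indices, most informative first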
Example 15: _fit_transform_tfidf_vectorizer
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _fit_transform_tfidf_vectorizer(self, x, y, dataset):
from sklearn.feature_selection import chi2
self.tfidf_vectorizer = TfidfVectorizer(
config=self.config.tfidf_vectorizer_config,
builtin_entity_parser=self.builtin_entity_parser,
custom_entity_parser=self.custom_entity_parser,
resources=self.resources,
random_state=self.random_state,
)
x_tfidf = self.tfidf_vectorizer.fit_transform(x, dataset)
if not self.tfidf_vectorizer.vocabulary:
raise _EmptyDatasetUtterancesError(
"Dataset is empty or with empty utterances")
_, tfidf_pval = chi2(x_tfidf, y)
best_tfidf_features = set(i for i, v in enumerate(tfidf_pval)
if v < self.config.pvalue_threshold)
if not best_tfidf_features:
best_tfidf_features = set(
idx for idx, val in enumerate(tfidf_pval) if
val == tfidf_pval.min())
best_ngrams = [ng for ng, i in
iteritems(self.tfidf_vectorizer.vocabulary)
if i in best_tfidf_features]
self.tfidf_vectorizer.limit_vocabulary(best_ngrams)
# We can't return x_tfidf[:best_tfidf_features] because of the
# normalization in the transform of the tfidf_vectorizer
# this would lead to inconsistent result between: fit_transform(x, y)
# and fit(x, y).transform(x)
return self.tfidf_vectorizer.transform(x)
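Stripped of the snips-nlu wrapper classes, the p-value filtering pattern above reduces to plain scikit-learn. A sketch (the corpus is made up, and the 0.4 threshold is hypothetical; snips reads its threshold from config.pvalue_threshold):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

texts = ["book a flight", "play some music", "book a table", "play a song"]
y = np.array([0, 1, 0, 1])
vectorizer = TfidfVectorizer()
x_tfidf = vectorizer.fit_transform(texts)
_, pval = chi2(x_tfidf, y)
best = set(i for i, v in enumerate(pval) if v < 0.4)  # hypothetical threshold
if not best:
    # same fallback as above: keep the most discriminative n-grams
    best = set(i for i, v in enumerate(pval) if v == pval.min())
print(sorted(ng for ng, i in vectorizer.vocabulary_.items() if i in best))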