This article collects typical usage examples of the sklearn.feature_selection.chi2 method in Python. If you have been wondering what feature_selection.chi2 does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples for the module it belongs to, sklearn.feature_selection.

The following presents 15 code examples of the feature_selection.chi2 method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: chi_square
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def chi_square(X, y):
"""
    This function implements chi-square feature selection (an existing scikit-learn method for classification)
Input
-----
X: {numpy array}, shape (n_samples, n_features)
input data
    y: {numpy array}, shape (n_samples,)
input class labels
Output
------
F: {numpy array}, shape (n_features,)
chi-square score for each feature
"""
F, pval = chi2(X, y)
return F
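A quick usage sketch of the function above, on synthetic data (note that chi2 requires non-negative feature values, such as counts):

import numpy as np
from sklearn.feature_selection import chi2

X = np.array([[1, 0, 3],
              [2, 0, 1],
              [0, 5, 0],
              [1, 4, 0]])
y = np.array([0, 0, 1, 1])
F = chi_square(X, y)
print(F.shape)  # (3,): one chi-square score per feature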
Example 2: featuresFromFeatureSelection
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def featuresFromFeatureSelection(X, Y, columnNames):
    for f in columnNames:
        print(f)
    k_best = SelectKBest(chi2, k=34).fit(X, Y)  # fitted selector, not transformed data
    colors = getColorNames()
    counter = 0
    scores = k_best.scores_
    scores_scaled = np.divide(scores, 1000)
    for score in scores_scaled:
        #if(score > 10):
        #print('Feature {:>34}'.format(columnNames[counter]))
        print('{:>34}'.format(score))
        # plot one bar per feature
        plt.bar(counter, score, color=colors[counter])
        counter += 1
    plt.ylabel('Scores (thousands)')
    plt.title('Scores calculated by Chi-Square Test')
    plt.legend(columnNames, bbox_to_anchor=(0., 0.8, 1., .102), loc=3, ncol=5, mode="expand", borderaxespad=0.)
    plt.show()
    #print(feature_selection.chi2(X, Y))
Example 3: get_top_k
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def get_top_k(self):
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    # remove intercept from top_k
    if self.objective:
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except ValueError:
            # chi2 requires non-negative features; fall back to the ANOVA F-test
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
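The try/except above guards against chi2 rejecting negative feature values. A minimal sketch of that failure mode and the f_classif fallback (synthetic data):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif

X = np.array([[1.0, -0.5], [2.0, 0.3], [0.5, -1.2], [1.5, 0.8]])
y = np.array([0, 1, 0, 1])
try:
    selector = SelectKBest(chi2, k=1).fit(X, y)
except ValueError:
    # chi2 raises ValueError on negative input; the ANOVA F-test accepts it
    selector = SelectKBest(f_classif, k=1).fit(X, y)
print(selector.get_support(indices=True))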
Example 4: find_best_feature_selections
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def find_best_feature_selections(X, y):
    # select the best features using different techniques
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)
    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)
    print(X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print(y.shape)
    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)
#X, y = _dataset_sample()
################################ PARAMETER Selected ################################
#TODO: a problem occurs when using the max_leaf_nodes parameter in DTree and RandomForest
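For reference, SelectKBest and SelectPercentile are two views of the same scoring: with 400 features, percentile=20 keeps the same number of features as k=80 (and, barring score ties, the same set). A sketch with synthetic count data:

import numpy as np
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2

rng = np.random.RandomState(0)
X = rng.randint(0, 10, size=(100, 400))  # non-negative counts, as chi2 requires
y = rng.randint(0, 2, size=100)
k_best = SelectKBest(chi2, k=80).fit(X, y)
pct = SelectPercentile(chi2, percentile=20).fit(X, y)
print(k_best.get_support().sum(), pct.get_support().sum())  # 80 80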
Example 5: feature_select
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def feature_select(corpus, labels, k=1000):
"""
    Select the top k features through a chi-square test.
"""
bin_cv = CountVectorizer(binary=True)
le = LabelEncoder()
X = bin_cv.fit_transform(corpus)
    y = le.fit_transform(labels)  # keep the labels one-dimensional for SelectKBest
k = min(X.shape[1], k)
skb = SelectKBest(chi2, k=k)
skb.fit(X, y)
feature_ids = skb.get_support(indices=True)
feature_names = bin_cv.get_feature_names()
vocab = {}
for new_fid, old_fid in enumerate(feature_ids):
feature_name = feature_names[old_fid]
vocab[feature_name] = new_fid
# we only care about the final extracted feature vocabulary
return vocab
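A toy call of feature_select (hypothetical corpus; assumes CountVectorizer, LabelEncoder, SelectKBest and chi2 are imported as in the original module):

corpus = ["good movie", "bad movie", "good plot", "bad plot"]
labels = ["pos", "neg", "pos", "neg"]
vocab = feature_select(corpus, labels, k=2)
print(vocab)  # e.g. {'bad': 0, 'good': 1}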
Example 6: univariate_feature_selection
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def univariate_feature_selection(mode, predictors, target):
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, percentile=100)
    elif mode == 'f_classif':
        fselect = SelectPercentile(f_classif, percentile=100)
    elif mode == 'chi2':
        fselect = SelectPercentile(chi2, percentile=100)
    else:
        raise ValueError('unknown mode: %s' % mode)
    fselect.fit_transform(predictors, target)
    return fselect.pvalues_
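A quick check of the helper above on synthetic data (assumes the SelectPercentile and score-function imports from the header):

import numpy as np

rng = np.random.RandomState(0)
predictors = rng.randint(0, 5, size=(50, 10))  # non-negative, so the 'chi2' mode is valid
target = rng.randint(0, 2, size=50)
pvals = univariate_feature_selection('chi2', predictors, target)
print(pvals.shape)  # (10,)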
Example 7: test_feature_selection
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def test_feature_selection():
# make two feature dicts with two useful features and a bunch of useless
# ones, in terms of chi2
d1 = dict([("useless%d" % i, 10) for i in range(20)],
useful1=1, useful2=20)
d2 = dict([("useless%d" % i, 10) for i in range(20)],
useful1=20, useful2=1)
for indices in (True, False):
v = DictVectorizer().fit([d1, d2])
X = v.transform([d1, d2])
sel = SelectKBest(chi2, k=2).fit(X, [0, 1])
v.restrict(sel.get_support(indices=indices), indices=indices)
assert_equal(v.get_feature_names(), ["useful1", "useful2"])
Example 8: __select_features
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def __select_features(data_set):
dataset = [clean_en_text(data) for data in data_set[0]]
tf_idf_model = TfidfVectorizer(ngram_range=(1, 1),
binary=True,
sublinear_tf=True)
tf_vectors = tf_idf_model.fit_transform(dataset)
    # keep roughly the top 1/6 of the terms as features
    k = int(tf_vectors.shape[1] / 6)
chi_model = SelectKBest(chi2, k=k)
chi_features = chi_model.fit_transform(tf_vectors, data_set[1])
print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
print('chi:\t\t' + str(chi_features.shape[1]))
return chi_features, tf_idf_model, chi_model
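A usage sketch with a stand-in for clean_en_text, which is defined elsewhere in the project (both the cleaner and the corpus here are hypothetical):

def clean_en_text(text):
    return text.lower()  # hypothetical stand-in for the project's real cleaner

texts = ["good movie", "bad movie", "great plot", "awful plot", "good great", "bad awful"]
data_set = (texts, ["pos", "neg", "pos", "neg", "pos", "neg"])
chi_features, tf_idf_model, chi_model = __select_features(data_set)
print(chi_features.shape)  # (6, 1) here, since k = 6 // 6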
Example 9: _derive_entity_label_matrix
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _derive_entity_label_matrix(train_full_results, entities):
"""
Derive entity feature matrix for chi2 anaylsis using entity annotations from message api
:param train_full_results: pandas data frame outputed by inference
:param entities: list of entities that is defined in the workspace
:return entity_feature_matrix: numpy matrix of examples with entities x number of entities
:return labels: numpy array: number of labels correspond to number of examples
:return entity_average_confidence_dict: dict entity --> average confidence for entity
"""
entity_feature_matrix = list()
labels = list()
entity_conf_dict = dict()
entity_count_dict = dict()
entity_average_confidence_dict = dict()
for i in range(len(train_full_results)):
current_result = train_full_results.iloc[i]
if current_result["entities"]:
# create empty feature vector
current_feature = [0] * len(entities)
for entity_reference in current_result["entities"]:
e_ref = entity_reference["entity"]
e_conf = entity_reference["confidence"]
entity_idx = entities.index(e_ref)
current_feature[entity_idx] += 1
entity_conf_dict[e_ref] = entity_conf_dict.get(e_ref, 0) + e_conf
entity_count_dict[e_ref] = entity_count_dict.get(e_ref, 0) + 1
entity_feature_matrix.append(current_feature)
labels.append(current_result["correct_intent"])
entity_feature_matrix = np.array(entity_feature_matrix)
labels = np.array(labels)
for key in entity_conf_dict:
entity_average_confidence_dict[key] = (
entity_conf_dict[key] / entity_count_dict[key]
)
return entity_feature_matrix, labels, entity_average_confidence_dict
Example 10: entity_label_correlation_analysis
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def entity_label_correlation_analysis(train_full_results, entities_list, p_value=0.05):
"""
Apply chi2 analysis on entities of the training set
:param train_full_results: pandas data frame output by inference
:param entities_list: the list of entities that is defined in the workspace
    :param p_value: significance threshold for the chi2 test
    :return entity_label_df: pandas df with intents in the first column and correlated entities in the second
"""
(
entity_feature_matrix,
labels,
entity_average_confidence_dict,
) = _derive_entity_label_matrix(train_full_results, entities_list)
entities_list = np.array(entities_list)
unique_labels = list(set(labels))
final_labels = list()
final_entities = list()
for label in unique_labels:
chi2_statistics, pval = chi2(entity_feature_matrix, labels == label)
temp_entities_list = entities_list[pval < p_value]
chi2_statistics = chi2_statistics[pval < p_value]
ordered_entities = temp_entities_list[np.argsort(chi2_statistics)]
if len(ordered_entities) == 0:
continue
final_labels.append(label)
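        # N is assumed to be a module-level constant: how many top entities to report per intent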
final_entities.append(", ".join(ordered_entities[-N:]))
entity_label_df = pd.DataFrame(
{"Intent": final_labels, "Correlated Entities": final_entities}
)
return entity_label_df
Example 11: _preprocess_chi2
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _preprocess_chi2(workspace_pd):
"""
Preprocess dataframe for chi2 analysis
    :param workspace_pd: dataframe of utterances and intents to preprocess for chi2 analysis
:return labels: intents processed
:return count_vectorizer: vectorizer instance
:return features: features from transform
"""
stopword_list = skills_util.STOP_WORDS
workspace_pd["utterance_punc_stripped"] = workspace_pd["utterance"].apply(
strip_punctuations
)
count_vectorizer = CountVectorizer(
min_df=1,
encoding="utf-8",
ngram_range=(1, 2),
stop_words=stopword_list,
tokenizer=word_tokenize,
token_pattern="(?u)\b\w+\b",
)
features = count_vectorizer.fit_transform(
workspace_pd["utterance_punc_stripped"]
).toarray()
labels = workspace_pd["intent"]
return labels, count_vectorizer, features
Example 12: _compute_chi2_top_feature
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _compute_chi2_top_feature(
features, labels, vectorizer, cls, significance_level=0.05
):
"""
Perform chi2 analysis, punctuation filtering and deduplication
:param features: count vectorizer features
:param labels: intents processed
    :param vectorizer: count vectorizer instance
    :param cls: the class (intent) tested against all others
    :param significance_level: specify an alpha
    :return deduplicated_unigram: significant unigrams, deduplicated, in ascending order of chi2 score
    :return deduplicated_bigram: significant bigrams, deduplicated, in ascending order of chi2 score
"""
features_chi2, pval = chi2(features, labels == cls)
feature_names = np.array(vectorizer.get_feature_names())
features_chi2 = features_chi2[pval < significance_level]
feature_names = feature_names[pval < significance_level]
indices = np.argsort(features_chi2)
feature_names = feature_names[indices]
unigrams = [v.strip() for v in feature_names if len(v.strip().split()) == 1]
deduplicated_unigram = list()
for unigram in unigrams:
if unigram not in deduplicated_unigram:
deduplicated_unigram.append(unigram)
bigrams = [v.strip() for v in feature_names if len(v.strip().split()) == 2]
deduplicated_bigram = list()
for bigram in bigrams:
if bigram not in deduplicated_bigram:
deduplicated_bigram.append(bigram)
return deduplicated_unigram, deduplicated_bigram
Example 13: select_best_feature
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def select_best_feature(self, data_set, data_lbl):
ch2 = SelectKBest(chi2, k=10000)
return ch2.fit_transform(data_set, data_lbl), ch2
Example 14: feature_ranking
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def feature_ranking(F):
"""
Rank features in descending order according to chi2-score, the higher the chi2-score, the more important the feature is
"""
idx = np.argsort(F)
return idx[::-1]
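Combined with chi_square from Example 1, this yields a complete ranking pipeline (synthetic data):

import numpy as np
from sklearn.feature_selection import chi2

X = np.array([[1, 0, 3], [2, 0, 1], [0, 5, 0], [1, 4, 0]])
y = np.array([0, 0, 1, 1])
F, _ = chi2(X, y)
print(feature_ranking(F))  # feature indices, most informative first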
Example 15: _fit_transform_tfidf_vectorizer
# Required import: from sklearn import feature_selection [as alias]
# Or: from sklearn.feature_selection import chi2 [as alias]
def _fit_transform_tfidf_vectorizer(self, x, y, dataset):
from sklearn.feature_selection import chi2
self.tfidf_vectorizer = TfidfVectorizer(
config=self.config.tfidf_vectorizer_config,
builtin_entity_parser=self.builtin_entity_parser,
custom_entity_parser=self.custom_entity_parser,
resources=self.resources,
random_state=self.random_state,
)
x_tfidf = self.tfidf_vectorizer.fit_transform(x, dataset)
if not self.tfidf_vectorizer.vocabulary:
raise _EmptyDatasetUtterancesError(
"Dataset is empty or with empty utterances")
_, tfidf_pval = chi2(x_tfidf, y)
best_tfidf_features = set(i for i, v in enumerate(tfidf_pval)
if v < self.config.pvalue_threshold)
if not best_tfidf_features:
best_tfidf_features = set(
idx for idx, val in enumerate(tfidf_pval) if
val == tfidf_pval.min())
best_ngrams = [ng for ng, i in
iteritems(self.tfidf_vectorizer.vocabulary)
if i in best_tfidf_features]
self.tfidf_vectorizer.limit_vocabulary(best_ngrams)
# We can't return x_tfidf[:best_tfidf_features] because of the
# normalization in the transform of the tfidf_vectorizer
# this would lead to inconsistent result between: fit_transform(x, y)
# and fit(x, y).transform(x)
return self.tfidf_vectorizer.transform(x)
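Stripped of the snips-nlu wrapper classes, the p-value filtering pattern above reduces to plain scikit-learn. A sketch (the corpus is made up, and the 0.4 threshold is hypothetical; snips reads its threshold from config.pvalue_threshold):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

texts = ["book a flight", "play some music", "book a table", "play a song"]
y = np.array([0, 1, 0, 1])
vectorizer = TfidfVectorizer()
x_tfidf = vectorizer.fit_transform(texts)
_, pval = chi2(x_tfidf, y)
best = set(i for i, v in enumerate(pval) if v < 0.4)  # hypothetical threshold
if not best:
    # same fallback as above: keep the most discriminative n-grams
    best = set(i for i, v in enumerate(pval) if v == pval.min())
print(sorted(ng for ng, i in vectorizer.vocabulary_.items() if i in best))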