本文整理匯總了Python中category_encoders.ordinal.OrdinalEncoder.fit方法的典型用法代碼示例。如果您正苦於以下問題:Python OrdinalEncoder.fit方法的具體用法?Python OrdinalEncoder.fit怎麽用?Python OrdinalEncoder.fit使用的例子?那麽,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類category_encoders.ordinal.OrdinalEncoder的用法示例。
在下文中一共展示了OrdinalEncoder.fit方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: HelmertEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class HelmertEncoder(BaseEstimator, TransformerMixin):
    """Helmert contrast encoder built on an ordinal pre-encoding step.

    Input is first passed through an ``OrdinalEncoder`` and then through
    ``helmert_coding``. When ``drop_invariant`` is set, output columns whose
    variance on the training data is at most ``10e-5`` are removed from
    every transformed frame.
    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False):
        """Store the configuration and create the ordinal pre-encoder.

        :param verbose: verbosity level, forwarded to the ordinal encoder.
        :param cols: columns to encode; forwarded to the ordinal encoder and
            to ``helmert_coding``. An empty list makes ``transform`` a no-op.
        :param drop_invariant: whether ``fit`` should detect near-constant
            output columns so that ``transform`` can drop them.
        :return: None
        """
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)

    def fit(self, X, y=None, **kwargs):
        """Fit the ordinal pre-encoder and, optionally, find invariant columns.

        :param X: training data, handed to ``OrdinalEncoder.fit``.
        :param y: ignored by this method.
        :param kwargs: ignored by this method.
        :return: self
        """
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        if self.drop_invariant:
            # Clear the list first so the transform below returns every
            # generated column; only then can their variances be inspected.
            self.drop_cols = []
            X_temp = self.transform(X)
            # NOTE: 10e-5 equals 1e-4; threshold kept exactly as it was.
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """Apply ordinal encoding followed by Helmert coding.

        :param X: data to encode; wrapped in a ``pandas.DataFrame`` when it
            is not one already.
        :return: the encoded ``pandas.DataFrame``, without the columns listed
            in ``self.drop_cols`` when ``drop_invariant`` is enabled.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # An explicitly empty column list means there is nothing to encode.
        if self.cols == []:
            return X

        X = self.ordinal_encoder.transform(X)
        X = helmert_coding(X, cols=self.cols)

        if self.drop_invariant and self.drop_cols:
            # ``axis`` must be a keyword: pandas 2.0 no longer accepts it
            # positionally. A single non-inplace drop also avoids mutating a
            # frame that might still be referenced elsewhere.
            X = X.drop(self.drop_cols, axis=1)

        return X
示例2: BinaryEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """Encode categories as binary code, one output column per digit.

    The data is first run through an ``OrdinalEncoder``; the result is then
    handed to the ``binary`` helper.
    """

    def __init__(self, verbose=0, cols=None):
        """Keep the settings and build the ordinal pre-encoder.

        :param verbose: verbosity level, forwarded to the ordinal encoder.
        :param cols: columns to encode; forwarded to the ordinal encoder and
            to ``binary``.
        :return: None
        """
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)
        self.cols = cols
        self.verbose = verbose

    def fit(self, X, y=None, **kwargs):
        """Fit the ordinal pre-encoder on ``X``.

        :param X: training data, handed to ``OrdinalEncoder.fit``.
        :param y: ignored by this method.
        :param kwargs: ignored by this method.
        :return: self
        """
        fitted = self.ordinal_encoder.fit(X)
        self.ordinal_encoder = fitted
        return self

    def transform(self, X):
        """Ordinal-encode ``X`` and expand the result with ``binary``.

        :param X: data to encode; wrapped in a ``pandas.DataFrame`` when it
            is not one already.
        :return: whatever ``binary`` returns for the ordinal-encoded frame.
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        ordinal = self.ordinal_encoder.transform(frame)
        return binary(ordinal, cols=self.cols)
示例3: BackwardDifferenceEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
    """Backward-difference contrast encoder built on an ordinal pre-encoder.

    The data is first run through an ``OrdinalEncoder``; the result is then
    handed to ``backward_difference_coding``.
    """

    def __init__(self, verbose=0, cols=None):
        """Keep the settings and build the ordinal pre-encoder.

        :param verbose: verbosity level, forwarded to the ordinal encoder.
        :param cols: columns to encode; forwarded to the ordinal encoder and
            to ``backward_difference_coding``.
        :return: None
        """
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)
        self.cols = cols
        self.verbose = verbose

    def fit(self, X, y=None, **kwargs):
        """Fit the ordinal pre-encoder on ``X``.

        :param X: training data, handed to ``OrdinalEncoder.fit``.
        :param y: ignored by this method.
        :param kwargs: ignored by this method.
        :return: self
        """
        fitted = self.ordinal_encoder.fit(X)
        self.ordinal_encoder = fitted
        return self

    def transform(self, X):
        """Ordinal-encode ``X`` and apply backward-difference coding.

        :param X: data to encode; wrapped in a ``pandas.DataFrame`` when it
            is not one already.
        :return: whatever ``backward_difference_coding`` returns for the
            ordinal-encoded frame.
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        ordinal = self.ordinal_encoder.transform(frame)
        return backward_difference_coding(ordinal, cols=self.cols)
示例4: HelmertEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class HelmertEncoder(BaseEstimator, TransformerMixin):
"""Helmert contrast coding for encoding categorical features
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
Example
-------
>>>from category_encoders import *
>>>import pandas as pd
>>>from sklearn.datasets import load_boston
>>>bunch = load_boston()
>>>y = bunch.target
>>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>>enc = HelmertEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>>numeric_dataset = enc.transform(X)
>>>print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 22 columns):
col_CHAS_0 506 non-null float64
col_CHAS_1 506 non-null float64
col_RAD_0 506 non-null float64
col_RAD_1 506 non-null float64
col_RAD_2 506 non-null float64
col_RAD_3 506 non-null float64
col_RAD_4 506 non-null float64
col_RAD_5 506 non-null float64
col_RAD_6 506 non-null float64
col_RAD_7 506 non-null float64
col_RAD_8 506 non-null float64
col_CRIM 506 non-null float64
col_ZN 506 non-null float64
col_INDUS 506 non-null float64
col_NOX 506 non-null float64
col_RM 506 non-null float64
col_AGE 506 non-null float64
col_DIS 506 non-null float64
col_TAX 506 non-null float64
col_PTRATIO 506 non-null float64
col_B 506 non-null float64
col_LSTAT 506 non-null float64
dtypes: float64(22)
memory usage: 87.0 KB
None
References
----------
.. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
http://www.ats.ucla.edu/stat/r/library/contrast_coding.
.. [2] Gregory Carey (2003). Coding Categorical Variables, from
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df

    # State that is not known at construction time.
    self.drop_cols = []
    self.ordinal_encoder = None
    self._dim = None  # assigned from X.shape[1] in fit()
def fit(self, X, y=None, **kwargs):
"""Fit encoder according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : encoder
Returns self.
"""
# first check the type
X = convert_input(X)
self._dim = X.shape[1]
#.........這裏部分代碼省略.........
示例5: MEstimateEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class MEstimateEncoder(BaseEstimator, TransformerMixin):
"""M-probability estimate of likelihood.
This is a simplified version of target encoder. In comparison to target encoder, m-probability estimate
has only one tunable parameter ('m'), while target encoder has two tunable parameters ('min_samples_leaf'
and 'smoothing').
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
boolean for whether or not to drop encoded columns with 0 variance.
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
handle_missing: str
options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability.
handle_unknown: str
options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability.
randomized: bool,
adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
sigma: float
standard deviation (spread or "width") of the normal distribution.
m: float
this is the "m" in the m-probability estimate. Higher value of m results into stronger shrinking.
M is non-negative.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target > 22.5
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = MEstimateEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
CHAS 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
RAD 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB
None
References
----------
.. [1] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems, equation 7, from
https://dl.acm.org/citation.cfm?id=507538.
.. [2] Additive smoothing, from
https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
             handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, m=1.0):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df
    self.handle_unknown = handle_unknown
    self.handle_missing = handle_missing
    self.random_state = random_state
    self.randomized = randomized
    self.sigma = sigma
    self.m = m

    # State that is not known at construction time
    # (presumably populated during fit -- the fit body is not shown here).
    self.drop_cols = []
    self.ordinal_encoder = None
    self.mapping = None
    self.feature_names = None
    self._dim = None
    self._sum = None
    self._count = None
# noinspection PyUnusedLocal
def fit(self, X, y, **kwargs):
"""Fit encoder according to X and binary y.
Parameters
----------
#.........這裏部分代碼省略.........
示例6: BaseNEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BaseNEncoder(BaseEstimator, TransformerMixin):
"""Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to
one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual
categories is equivalent to vanilla ordinal encoding.
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
Example
-------
>>>from category_encoders import *
>>>import pandas as pd
>>>from sklearn.datasets import load_boston
>>>bunch = load_boston()
>>>y = bunch.target
>>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>>enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>>numeric_dataset = enc.transform(X)
>>>print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 16 columns):
CHAS_0 506 non-null int64
RAD_0 506 non-null int64
RAD_1 506 non-null int64
RAD_2 506 non-null int64
RAD_3 506 non-null int64
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(11), int64(5)
memory usage: 63.3 KB
None
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df
    self.base = base

    # State that is not known at construction time.
    self.drop_cols = []
    self.ordinal_encoder = None  # created and fitted in fit()
    self._dim = None  # assigned from X.shape[1] in fit()
    self._encoded_columns = None
def fit(self, X, y=None, **kwargs):
"""Fit encoder according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : encoder
Returns self.
"""
# if the input dataset isn't already a dataframe, convert it to one (using default column names)
# first check the type
X = convert_input(X)
self._dim = X.shape[1]
# if columns aren't passed, just use every string column
if self.cols is None:
self.cols = get_obj_cols(X)
# train an ordinal pre-encoder
self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
self.ordinal_encoder = self.ordinal_encoder.fit(X)
# do a transform on the training data to get a column list
X_t = self.transform(X, override_return_df=True)
#.........這裏部分代碼省略.........
示例7: SumEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class SumEncoder(BaseEstimator, TransformerMixin):
"""Sum contrast coding for the encoding of categorical features.
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance.
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
handle_unknown: str
options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used,
an extra column will be added in if the transform matrix has unknown categories. This can cause
unexpected changes in the dimension in some cases.
handle_missing: str
options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'value'. Warning: if indicator is used,
an extra column will be added in if the transform matrix has unknown categories. This can cause
unexpected changes in dimension in some cases.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = SumEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 21 columns):
intercept 506 non-null int64
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
CHAS_0 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
RAD_0 506 non-null float64
RAD_1 506 non-null float64
RAD_2 506 non-null float64
RAD_3 506 non-null float64
RAD_4 506 non-null float64
RAD_5 506 non-null float64
RAD_6 506 non-null float64
RAD_7 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(20), int64(1)
memory usage: 83.1 KB
None
References
----------
.. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
.. [2] Gregory Carey (2003). Coding Categorical Variables, from
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
"""
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
             handle_unknown='value', handle_missing='value'):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.mapping = mapping
    self.drop_invariant = drop_invariant
    self.return_df = return_df
    self.handle_unknown = handle_unknown
    self.handle_missing = handle_missing

    # State that is not known at construction time
    # (presumably populated during fit -- the fit body is not shown here).
    self.drop_cols = []
    self.ordinal_encoder = None
    self.feature_names = None
    self._dim = None
def fit(self, X, y=None, **kwargs):
"""Fit encoder according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
#.........這裏部分代碼省略.........
示例8: HelmertEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class HelmertEncoder(BaseEstimator, TransformerMixin):
"""
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
Example
-------
>>> from category_encoders import HelmertEncoder
>>> from sklearn.datasets import fetch_20newsgroups_vectorized
>>> bunch = fetch_20newsgroups_vectorized(subset="all")
>>> X, y = bunch.data, bunch.target
>>> enc = HelmertEncoder(return_df=False).fit(X, y)
>>> numeric_dataset = enc.transform(X)
References
----------
.. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
http://www.ats.ucla.edu/stat/r/library/contrast_coding.
.. [2] Gregory Carey (2003). Coding Categorical Variables, from
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df

    # State that is not known at construction time.
    self.drop_cols = []
    self.ordinal_encoder = None  # created and fitted in fit()
    self._dim = None  # assigned from X.shape[1] in fit()
def fit(self, X, y=None, **kwargs):
    """Fit the encoder on X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data: a DataFrame, a list, or a numpy array/scalar.
    y : array-like, shape = [n_samples]
        Target values; not used by this method.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X is none of the accepted input types.
    """
    # Normalise the input to a DataFrame; DataFrames pass through untouched.
    if isinstance(X, list):
        X = pd.DataFrame(np.array(X))
    elif isinstance(X, (np.generic, np.ndarray)):
        X = pd.DataFrame(X)
    elif not isinstance(X, pd.DataFrame):
        raise ValueError('Unexpected input type: %s' % (str(type(X))))

    # Remember the input width.
    self._dim = X.shape[1]

    # Without an explicit column list, let get_obj_cols pick the columns.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # Build and train the ordinal pre-encoder.
    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if self.drop_invariant:
        # Reset before transforming, then record the output columns whose
        # training variance does not exceed the threshold.
        self.drop_cols = []
        encoded = self.transform(X)
        self.drop_cols = [name for name in encoded.columns.values if encoded[name].var() <= 10e-5]

    return self
def transform(self, X):
"""Perform the transformation to new categorical data.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Returns
-------
#.........這裏部分代碼省略.........
示例9: BinaryEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BinaryEncoder(BaseEstimator, TransformerMixin):
"""
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
Example
-------
>>> from category_encoders import BinaryEncoder
>>> from sklearn.datasets import fetch_20newsgroups_vectorized
>>> bunch = fetch_20newsgroups_vectorized(subset="all")
>>> X, y = bunch.data, bunch.target
>>> enc = BinaryEncoder(return_df=False).fit(X, y)
>>> numeric_dataset = enc.transform(X)
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df

    # State that is not known at construction time.
    self.drop_cols = []
    self.ordinal_encoder = None  # created and fitted in fit()
    self._dim = None  # None means "not fitted"; transform() checks this
def fit(self, X, y=None, **kwargs):
    """Fit the encoder on X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data: a DataFrame, a list, or a numpy array/scalar.
    y : array-like, shape = [n_samples]
        Target values; not used by this method.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X is none of the accepted input types.
    """
    # Convert non-DataFrame input to a DataFrame (default column names);
    # DataFrames pass through untouched.
    if isinstance(X, list):
        X = pd.DataFrame(np.array(X))
    elif isinstance(X, (np.generic, np.ndarray)):
        X = pd.DataFrame(X)
    elif not isinstance(X, pd.DataFrame):
        raise ValueError('Unexpected input type: %s' % (str(type(X))))

    # Remember the input width; transform() treats None as "not fitted".
    self._dim = X.shape[1]

    # Without an explicit column list, let get_obj_cols pick the columns.
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # Build and train the ordinal pre-encoder.
    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    if self.drop_invariant:
        # Reset before transforming, then record the output columns whose
        # training variance does not exceed the threshold.
        self.drop_cols = []
        encoded = self.transform(X)
        self.drop_cols = [name for name in encoded.columns.values if encoded[name].var() <= 10e-5]

    return self
def transform(self, X):
"""Perform the transformation to new categorical data.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Returns
-------
p : array, shape = [n_samples, n_numeric + N]
Transformed values with encoding applied.
"""
if self._dim is None:
raise ValueError('Must train encoder before it can be used to transform data.')
#.........這裏部分代碼省略.........
示例10: BaseNEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BaseNEncoder(BaseEstimator, TransformerMixin):
"""Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to
one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual
categories is equivalent to vanilla ordinal encoding.
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance.
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
base: int
when the downstream model copes well with nonlinearities (like decision tree), use higher base.
handle_unknown: str
options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used,
an extra column will be added in if the transform matrix has unknown categories. This can cause
unexpected changes in dimension in some cases.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 18 columns):
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
CHAS_0 506 non-null int64
CHAS_1 506 non-null int64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
RAD_0 506 non-null int64
RAD_1 506 non-null int64
RAD_2 506 non-null int64
RAD_3 506 non-null int64
RAD_4 506 non-null int64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(11), int64(7)
memory usage: 71.2 KB
None
"""
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2,
             handle_unknown='value', handle_missing='value'):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.mapping = mapping
    self.drop_invariant = drop_invariant
    self.return_df = return_df
    self.base = base
    self.handle_unknown = handle_unknown
    self.handle_missing = handle_missing

    # State that is not known at construction time
    # (presumably populated during fit -- the fit body is not shown here).
    self.drop_cols = []
    self.ordinal_encoder = None
    self.feature_names = None
    self._dim = None
    self._encoded_columns = None
def fit(self, X, y=None, **kwargs):
"""Fit encoder according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : encoder
Returns self.
"""
# if the input dataset isn't already a dataframe, convert it to one (using default column names)
# first check the type
X = util.convert_input(X)
#.........這裏部分代碼省略.........
示例11: BaseNEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BaseNEncoder(BaseEstimator, TransformerMixin):
"""Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to
one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual
categories is equivalent to vanilla ordinal encoding.
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
impute_missing: bool
boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
handle_unknown: str
options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause
unexpected changes in dimension in some cases.
Example
-------
>>>from category_encoders import *
>>>import pandas as pd
>>>from sklearn.datasets import load_boston
>>>bunch = load_boston()
>>>y = bunch.target
>>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>>enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>>numeric_dataset = enc.transform(X)
>>>print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 16 columns):
CHAS_0 506 non-null int64
RAD_0 506 non-null int64
RAD_1 506 non-null int64
RAD_2 506 non-null int64
RAD_3 506 non-null int64
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(11), int64(5)
memory usage: 63.3 KB
None
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2, impute_missing=True,
             handle_unknown='impute'):
    """Record the constructor arguments and initialise the unfitted state."""
    # Configuration supplied by the caller.
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df
    self.base = base
    self.impute_missing = impute_missing
    self.handle_unknown = handle_unknown

    # State that is not known at construction time.
    self.drop_cols = []
    self.ordinal_encoder = None
    self.digits_per_col = {}
    self._dim = None  # assigned from X.shape[1] in fit()
    self._encoded_columns = None
def fit(self, X, y=None, **kwargs):
"""Fit encoder according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : encoder
Returns self.
"""
# if the input dataset isn't already a dataframe, convert it to one (using default column names)
# first check the type
X = convert_input(X)
self._dim = X.shape[1]
#.........這裏部分代碼省略.........
示例12: BackwardDifferenceEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
    """Backward-difference contrast encoder built on an ordinal pre-encoder.

    Input is first passed through an ``OrdinalEncoder`` and then through
    ``backward_difference_coding``. When ``drop_invariant`` is set, output
    columns whose variance on the training data is at most ``10e-5`` are
    removed from every transformed frame.
    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False):
        """Store the configuration and create the ordinal pre-encoder.

        :param verbose: verbosity level, forwarded to the ordinal encoder.
        :param cols: columns to encode; when None, ``fit`` fills it in with
            ``get_obj_cols``. An empty list makes ``transform`` a no-op.
        :param drop_invariant: whether ``fit`` should detect near-constant
            output columns so that ``transform`` can drop them.
        :return: None
        """
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)

    def fit(self, X, y=None, **kwargs):
        """Fit the ordinal pre-encoder and, optionally, find invariant columns.

        Fitting the ordinal encoder gives a consistent mapping across later
        ``transform`` calls; the invariant-column list makes the same columns
        get dropped every time.

        :param X: training data; wrapped in a ``pandas.DataFrame`` when it is
            not one already.
        :param y: ignored by this method.
        :param kwargs: ignored by this method.
        :return: self
        """
        # Convert non-DataFrame input (default column names are used).
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Without an explicit column list, let get_obj_cols pick the columns.
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # Train the ordinal pre-encoder.
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        if self.drop_invariant:
            # Clear the list first so the transform below returns every
            # generated column; only then can their variances be inspected.
            self.drop_cols = []
            X_temp = self.transform(X)
            # NOTE: 10e-5 equals 1e-4; threshold kept exactly as it was.
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """Apply ordinal encoding followed by backward-difference coding.

        :param X: data to encode; wrapped in a ``pandas.DataFrame`` when it
            is not one already.
        :return: the encoded ``pandas.DataFrame``, without the columns listed
            in ``self.drop_cols`` when ``drop_invariant`` is enabled.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # An explicitly empty column list means there is nothing to encode.
        if self.cols == []:
            return X

        X = self.ordinal_encoder.transform(X)
        X = backward_difference_coding(X, cols=self.cols)

        if self.drop_invariant and self.drop_cols:
            # ``axis`` must be a keyword: pandas 2.0 no longer accepts it
            # positionally. A single non-inplace drop also avoids mutating a
            # frame that might still be referenced elsewhere.
            X = X.drop(self.drop_cols, axis=1)

        return X
示例13: JamesSteinEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class JamesSteinEncoder(BaseEstimator, TransformerMixin):
"""James-Stein estimator.
For feature value i, James-Stein estimator returns a weighted average of:
1) The mean target value for the observed feature value i.
2) The mean target value (regardless of the feature value).
This can be written as:
JS_i = (1-B)*mean(y_i) + B*mean(y)
The question is, what should be the weight B?
If we put too much weight on the conditional mean value, we will overfit.
If we put too much weight on the global mean, we will underfit.
The canonical solution in machine learning is to perform cross-validation.
However, Charles Stein came with a closed-form solution to the problem.
The intuition is: If the estimate of mean(y_i) is unreliable (y_i has high variance),
we should put more weight on mean(y). Stein put it into an equation as:
B = var(y_i) / (var(y_i)+var(y))
The only remaining issue is that we do not know var(y), let alone var(y_i).
Hence, we have to estimate the variances. But how can we reliably estimate the
variances, when we already struggle with the estimation of the mean values?!
There are multiple solutions:
1) If we have the same count of observations for each feature value i and all
y_i are close to each other, we can pretend that all var(y_i) are identical.
This is called a pooled model.
2) If the observation counts are not equal, it makes sense to replace the variances
with squared standard errors, which penalize small observation counts:
SE^2 = var(y)/count(y)
This is called an independent model.
James-Stein estimator has, however, one practical limitation - it was defined
only for normal distributions. If you want to apply it for binary classification,
which allows only values {0, 1}, it is better to first convert the mean target value
from the bound interval <0,1> into an unbounded interval by replacing mean(y)
with log-odds ratio:
log-odds_ratio_i = log(mean(y_i)/mean(y_not_i))
This is called binary model. The estimation of parameters of this model is, however,
tricky and sometimes it fails fatally. In these situations, it is better to use beta
model, which generally delivers slightly worse accuracy than binary model but does
not suffer from fatal failures.
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
boolean for whether or not to drop encoded columns with 0 variance.
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
handle_missing: str
options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability.
handle_unknown: str
options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability.
model: str
options are 'pooled', 'beta', 'binary' and 'independent', defaults to 'independent'.
randomized: bool
adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
sigma: float
standard deviation (spread or "width") of the normal distribution.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = JamesSteinEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
CHAS 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
RAD 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB
None
References
----------
.. [1] Parametric empirical Bayes inference: Theory and applications, equations 1.19 & 1.20, from
https://www.jstor.org/stable/2287098
.. [2] Empirical Bayes for multiple sample sizes, from
#.........這裏部分代碼省略.........
示例14: BinaryEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class BinaryEncoder(BaseEstimator, TransformerMixin):
    """Binary encoding for categorical features.

    Binary encoding encodes the integers as binary code with one column per
    digit. Columns are first passed through an ``OrdinalEncoder`` and the
    result is then expanded by the ``binary`` helper for the columns in
    ``cols``.
    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False):
        """Store the configuration and build the (unfitted) ordinal pre-encoder.

        :param verbose: verbosity level; also forwarded to the ordinal pre-encoder.
        :param cols: columns to encode; when None, ``fit`` picks them from the data.
        :param drop_invariant: when True, ``fit`` records the output columns whose
            variance is at most ``10e-5`` and ``transform`` drops them.
        :return: None
        """
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        self.cols = cols
        self.ordinal_encoder = OrdinalEncoder(verbose=verbose, cols=cols)

    def fit(self, X, y=None, **kwargs):
        """Fit the ordinal pre-encoder and, optionally, find invariant columns.

        :param X: training data; anything that is not a DataFrame is wrapped in one.
        :param y: accepted for API compatibility, not used.
        :param kwargs: accepted for API compatibility, not used.
        :return: self
        """
        # If the input dataset isn't already a dataframe, convert it to one
        # (using default column names).
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # If columns aren't passed, just use every string column.
        # NOTE(review): relies on get_obj_cols (defined elsewhere) for that selection.
        if self.cols is None:
            self.cols = get_obj_cols(X)

        # Train an ordinal pre-encoder.
        self.ordinal_encoder = self.ordinal_encoder.fit(X)

        # Record all output columns with (near-)zero variance.
        if self.drop_invariant:
            # Reset first so the transform below returns every output column.
            self.drop_cols = []
            X_temp = self.transform(X)
            # The threshold literal 10e-5 is 1e-4.
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self

    def transform(self, X):
        """Encode ``X`` with the fitted ordinal pre-encoder and binary expansion.

        :param X: data to encode; anything that is not a DataFrame is wrapped in one.
        :return: the encoded DataFrame, or ``X`` itself (as a DataFrame) when
            ``cols`` is an empty list.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Nothing to encode.
        if self.cols == []:
            return X

        X = self.ordinal_encoder.transform(X)
        X = binary(X, cols=self.cols)

        if self.drop_invariant and self.drop_cols:
            # ``axis`` must be passed by keyword: the positional form
            # ``drop(col, 1, ...)`` was deprecated in pandas 1.3 and raises
            # TypeError since pandas 2.0.
            X.drop(self.drop_cols, axis=1, inplace=True)

        return X
示例15: OneHotEncoder
# 需要導入模塊: from category_encoders.ordinal import OrdinalEncoder [as 別名]
# 或者: from category_encoders.ordinal.OrdinalEncoder import fit [as 別名]
class OneHotEncoder(BaseEstimator, TransformerMixin):
"""Onehot (or dummy) coding for categorical features, produces one feature per category, each binary.
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
impute_missing: bool
boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
handle_unknown: str
options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
impute is used, an extra column will be added if the transform matrix has unknown categories. This can cause
unexpected changes in dimension in some cases.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = OneHotEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 22 columns):
CHAS_0 506 non-null int64
CHAS_1 506 non-null int64
RAD_0 506 non-null int64
RAD_1 506 non-null int64
RAD_2 506 non-null int64
RAD_3 506 non-null int64
RAD_4 506 non-null int64
RAD_5 506 non-null int64
RAD_6 506 non-null int64
RAD_7 506 non-null int64
RAD_8 506 non-null int64
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(11), int64(11)
memory usage: 87.0 KB
None
References
----------
.. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
.. [2] Gregory Carey (2003). Coding Categorical Variables, from
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True,
             handle_unknown='impute'):
    """Record the encoder options; no fitting happens at construction time."""
    # Options supplied by the caller, stored verbatim.
    self.verbose = verbose
    self.cols = cols
    self.drop_invariant = drop_invariant
    self.return_df = return_df
    self.impute_missing = impute_missing
    self.handle_unknown = handle_unknown

    # Placeholders that start out empty.
    # NOTE(review): presumably filled in by fit(), which is not fully visible here.
    self.drop_cols = []
    self.ordinal_encoder = None
    self._dim = None
@property
def category_mapping(self):
    """Expose the ``category_mapping`` of the wrapped ordinal encoder."""
    # Plain delegation; raises AttributeError while ``ordinal_encoder`` is still None.
    encoder = self.ordinal_encoder
    return encoder.category_mapping
def fit(self, X, y=None, **kwargs):
"""Fit encoder according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
#.........這裏部分代碼省略.........