"""Leave one out coding"""
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
import category_encoders.utils as util
from sklearn.utils.random import check_random_state
__author__ = 'hbghhy'
[docs]class LeaveOneOutEncoder(BaseEstimator, util.TransformerWithTargetMixin):
"""Leave one out coding for categorical features.
This is very similar to target encoding but excludes the current row's
target when calculating the mean target for a level to reduce the effect
of outliers.
Parameters
----------
verbose: int
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode; if None, all string columns will be encoded.
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance.
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
handle_missing: str
options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
handle_unknown: str
options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean.
sigma: float
adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing
data are untouched). Sigma gives the standard deviation (spread or "width") of the normal distribution.
The optimal value is commonly between 0.05 and 0.6. The default is to not add noise, but that leads
to significantly suboptimal results.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = LeaveOneOutEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
CHAS 506 non-null float64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
RAD 506 non-null float64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB
None
References
----------
.. [1] Strategies to encode categorical variables with many categories, from
https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154.
"""
def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
handle_unknown='value', handle_missing='value', random_state=None, sigma=None):
self.return_df = return_df
self.drop_invariant = drop_invariant
self.drop_cols = []
self.verbose = verbose
self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X
self.cols = cols
self._dim = None
self.mapping = None
self.handle_unknown = handle_unknown
self.handle_missing = handle_missing
self._mean = None
self.random_state = random_state
self.sigma = sigma
self.feature_names = None
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y have a different number of rows, or if
        ``handle_missing`` is 'error' and a column to be encoded
        contains null values.
    """
    # unite the input into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.use_default_cols:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.mapping = self.fit_leave_one_out(X, y, cols=self.cols)

    # transform the training data once to learn the output column names
    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    if self.drop_invariant:
        # clear any value left over from a previous fit
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        # note: 10e-5 is 1e-4
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        for col in self.drop_cols:
            try:
                self.feature_names.remove(col)
            except ValueError as e:
                # list.remove signals a missing item with ValueError (not
                # KeyError); handle it per column so the remaining columns
                # are still removed.
                if self.verbose > 0:
                    print("Could not remove column from feature names. "
                          "Not found in generated cols.\n{}".format(e))

    return self
[docs] def fit_leave_one_out(self, X_in, y, cols=None):
X = X_in.copy(deep=True)
if cols is None:
cols = X.columns.values
self._mean = y.mean()
return {col: self.fit_column_map(X[col], y) for col in cols}
def fit_column_map(self, series, y):
    """Aggregate the target per category of a single column.

    Returns a DataFrame with the columns 'sum' and 'count' of ``y`` for
    each category present in ``series``, indexed by the category value.
    Rows belonging to missing values, if any, are indexed by NaN.
    """
    categorical = pd.Categorical(series)
    levels = categorical.categories
    level_codes = categorical.codes.copy()

    # entries coded -1 are moved to an extra code one past the last level
    level_codes[level_codes == -1] = len(levels)

    # lookup from integer code to label, with NaN as label of the extra code
    labels = np.append(levels, np.nan)
    code_to_label = pd.Series(dict(enumerate(labels)))

    stats = y.groupby(level_codes).agg(['sum', 'count'])
    return stats.rename(code_to_label)
def get_feature_names(self):
    """
    Return the feature names stored on the encoder.

    Returns
    -------
    feature_names: list
        The list held in ``self.feature_names``.
        Note: potentially dropped features are not included!

    Raises
    ------
    ValueError
        If ``self.feature_names`` is not a list, i.e. the encoder has
        not been fitted yet.
    """
    names = self.feature_names
    if isinstance(names, list):
        return names
    raise ValueError('Must fit data first. Affected feature names are not known before.')