# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import List, Union
import numpy as np
import pandas as pd
from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
from feature_engine.validation import _return_tags
from feature_engine.variable_manipulation import _check_input_parameter_variables
class WoEEncoder(BaseCategoricalTransformer):
"""
    The WoEEncoder() replaces categories by the weight of evidence (WoE). The
    WoE was used primarily in the financial sector to create credit risk
    scorecards.
    The encoder will encode only categorical variables by default
    (type 'object' or 'categorical'). You can pass a list of variables to
    encode. Alternatively, the encoder will find and encode all categorical
    variables (type 'object' or 'categorical').
    With `ignore_format=True` you have the option to encode numerical
    variables as well. The procedure is identical: you can either enter the
    list of variables to encode, or the transformer will automatically select
    all variables.
The encoder first maps the categories to the weight of evidence for each variable
(fit). The encoder then transforms the categories into the mapped numbers
(transform).
    **Note**
    This encoder is designed exclusively for binary classification.
**The weight of evidence is given by:**
    .. math::

        WoE = \log \left( \frac{p(X=x_j|Y=1)}{p(X=x_j|Y=0)} \right)
**The WoE is determined as follows:**
    We calculate the percentage of positive cases in each category with
    respect to the total number of positive cases. For example, 20 positive
    cases in category A out of 100 total positive cases equals 20%. Next, we
    calculate the percentage of negative cases in each category with respect
    to the total number of negative cases; for example, 5 negative cases in
    category A out of a total of 50 negative cases equals 10%. Then we
    calculate the WoE by dividing the category's percentage of positive cases
    by its percentage of negative cases, and taking the logarithm. For
    category A in our example, WoE = log(0.20 / 0.10) = log(2).
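    As a rough illustration, the same calculation can be reproduced with plain
    pandas (the data and column name below are made up for this sketch):

    .. code-block:: python

        import numpy as np
        import pandas as pd

        X = pd.Series(["A", "A", "B", "B", "A", "B"], name="var")
        y = pd.Series([1, 0, 0, 1, 1, 0])

        pos = X[y == 1].value_counts() / (y == 1).sum()  # p(X=xj|Y=1)
        neg = X[y == 0].value_counts() / (y == 0).sum()  # p(X=xj|Y=0)
        woe = np.log(pos / neg)  # A -> log(2), B -> -log(2)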
    **Note**
    - If the WoE is negative, negative cases outweigh the positive cases in
      that category.
    - If the WoE is positive, positive cases outweigh the negative cases.
    - If the WoE is 0, the category contains an equal proportion of positive
      and negative cases.
    **Encoding into WoE**:
    - Creates a monotonic relationship between the encoded variable and the
      target.
    - Returns variables on a similar scale.
**Note**
    Neither log(0) nor division by 0 is defined. Thus, if any of the terms in
    the WoE equation is 0 for a given category, the encoder will raise an
    error. If this happens, try grouping less frequent categories.
Parameters
----------
variables: list, default=None
The list of categorical variables that will be encoded. If None, the
encoder will find and transform all variables of type object or categorical by
default. You can also make the transformer accept numerical variables, see the
next parameter.
ignore_format: bool, default=False
        Whether the format in which the categorical variables are cast should
        be ignored. If False, the encoder will automatically select variables
        of type object or categorical, or check that the variables entered by
        the user are of type object or categorical. If True, the encoder will
        select all variables or accept all variables entered by the user,
        including those cast as numeric.
Attributes
----------
encoder_dict_:
Dictionary with the WoE per variable.
variables_:
The group of variables that will be transformed.
n_features_in_:
The number of features in the train set used in fit.
Methods
-------
fit:
Learn the WoE per category, per variable.
transform:
Encode the categories to numbers.
fit_transform:
Fit to the data, then transform it.
    inverse_transform:
        Convert the encoded numbers back into the original categories.
Notes
-----
For details on the calculation of the weight of evidence visit:
https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
In credit scoring, continuous variables are also transformed using the WoE. To do
this, first variables are sorted into a discrete number of bins, and then these
bins are encoded with the WoE as explained here for categorical variables. You can
do this by combining the use of the equal width, equal frequency or arbitrary
discretisers.
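    As a rough sketch of that workflow (the column name and parameters below
    are illustrative; `return_object=True` casts the bins to object so that
    this encoder will accept them):

    .. code-block:: python

        from feature_engine.discretisation import EqualFrequencyDiscretiser
        from feature_engine.encoding import WoEEncoder

        disc = EqualFrequencyDiscretiser(
            q=5, variables=["age"], return_object=True
        )
        woe = WoEEncoder(variables=["age"])

        X_t = disc.fit_transform(X)  # X and y assumed defined
        X_t = woe.fit_transform(X_t, y)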
    NaN values are introduced when transforming categories that were not
    present in the training dataset. If this happens, try grouping infrequent
    categories using the RareLabelEncoder().
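    For example, a minimal sketch of grouping infrequent categories before
    encoding (the tolerance and column name are illustrative):

    .. code-block:: python

        from feature_engine.encoding import RareLabelEncoder, WoEEncoder

        rare = RareLabelEncoder(tol=0.05, n_categories=2, variables=["colour"])
        woe = WoEEncoder(variables=["colour"])

        X_t = rare.fit_transform(X)  # X and y assumed defined
        X_t = woe.fit_transform(X_t, y)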
See Also
--------
feature_engine.encoding.RareLabelEncoder
feature_engine.discretisation
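    Examples
    --------
    A minimal usage sketch with made-up data (all names and values below are
    illustrative):

    >>> import pandas as pd
    >>> from feature_engine.encoding import WoEEncoder
    >>> X = pd.DataFrame({"colour": ["red", "red", "blue", "blue", "red", "blue"]})
    >>> y = pd.Series([1, 0, 0, 0, 1, 1])
    >>> encoder = WoEEncoder(variables=["colour"])
    >>> X_t = encoder.fit_transform(X, y)
    >>> # in this toy data, 'red' maps to log(2) and 'blue' to -log(2)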
"""
def __init__(
self,
variables: Union[None, int, str, List[Union[str, int]]] = None,
ignore_format: bool = False,
) -> None:
if not isinstance(ignore_format, bool):
raise ValueError("ignore_format takes only booleans True and False")
self.variables = _check_input_parameter_variables(variables)
self.ignore_format = ignore_format
    def fit(self, X: pd.DataFrame, y: pd.Series):
"""
Learn the WoE.
Parameters
----------
X: pandas dataframe of shape = [n_samples, n_features]
The training input samples.
Can be the entire dataframe, not just the categorical variables.
y: pandas series.
Target, must be binary.
        Raises
        ------
        TypeError
            - If the input is not a pandas DataFrame.
            - If the user enters non-categorical variables (unless
              `ignore_format=True`).
        ValueError
            - If there are no categorical variables in the dataframe, or the
              dataframe is empty.
            - If the variable(s) contain null values.
            - If y is not binary with values 0 and 1.
            - If p(0) = 0 or p(1) = 0.
Returns
-------
self
"""
X = self._check_fit_input_and_variables(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)
# check that y is binary
if y.nunique() != 2:
raise ValueError(
"This encoder is designed for binary classification. The target "
"used has more than 2 unique values."
)
temp = pd.concat([X, y], axis=1)
temp.columns = list(X.columns) + ["target"]
# if target does not have values 0 and 1, we need to remap, to be able to
# compute the averages.
        if any(x not in [0, 1] for x in y.unique()):
temp["target"] = np.where(temp["target"] == y.unique()[0], 0, 1)
self.encoder_dict_ = {}
total_pos = temp["target"].sum()
total_neg = len(temp) - total_pos
temp["non_target"] = np.where(temp["target"] == 1, 0, 1)
for var in self.variables_:
pos = temp.groupby([var])["target"].sum() / total_pos
neg = temp.groupby([var])["non_target"].sum() / total_neg
t = pd.concat([pos, neg], axis=1)
t["woe"] = np.log(t["target"] / t["non_target"])
if (
not t.loc[t["target"] == 0, :].empty
or not t.loc[t["non_target"] == 0, :].empty
):
raise ValueError(
"The proportion of one of the classes for a category in "
"variable {} is zero, and log of zero is not defined".format(var)
)
self.encoder_dict_[var] = t["woe"].to_dict()
self._check_encoding_dictionary()
self.n_features_in_ = X.shape[1]
return self
    # Ugly workaround to import the docstring for Sphinx, otherwise not necessary
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        return super().transform(X)

    transform.__doc__ = BaseCategoricalTransformer.transform.__doc__

    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        return super().inverse_transform(X)

    inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__
def _more_tags(self):
tags_dict = _return_tags()
        # In the current format, the tests are performed using continuous
        # np.arrays. This means that when we encode some of the values, the
        # denominator is 0, and thus the transformer raises an error and the
        # test fails. For this reason, most sklearn checks would fail, which
        # has nothing to do with the class not being compatible; it is just
        # that the inputs passed are not suitable.
tags_dict["_skip_test"] = True
return tags_dict