# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import Optional
import pandas as pd
from feature_engine.dataframe_checks import (
_check_contains_inf,
_check_contains_na,
_is_dataframe,
)
from feature_engine.outliers.base_outlier import BaseOutlier
from feature_engine.parameter_checks import _define_numerical_dict
from feature_engine.validation import _return_tags
from feature_engine.variable_manipulation import _find_or_check_numerical_variables
class ArbitraryOutlierCapper(BaseOutlier):
"""
The ArbitraryOutlierCapper() caps the maximum or minimum values of a variable
at an arbitrary value indicated by the user.
    You must provide the maximum or minimum values that will be used to cap each
    variable in a dictionary of the form {feature: capping value}.
Parameters
----------
max_capping_dict: dictionary, default=None
Dictionary containing the user specified capping values for the right tail of
the distribution of each variable (maximum values).
min_capping_dict: dictionary, default=None
        Dictionary containing user specified capping values for the left tail of the
distribution of each variable (minimum values).
    missing_values: string, default='raise'
        Indicates whether missing values should be ignored or should raise an error.
        If `missing_values='raise'`, the transformer will raise an error if the train
        set or the datasets to transform contain missing values.
Attributes
----------
right_tail_caps_:
Dictionary with the maximum values at which variables will be capped.
left_tail_caps_:
Dictionary with the minimum values at which variables will be capped.
variables_:
The group of variables that will be transformed.
n_features_in_:
The number of features in the train set used in fit.
Methods
-------
fit:
This transformer does not learn any parameter.
transform:
Cap the variables.
fit_transform:
Fit to the data. Then transform it.
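
    Examples
    --------
    A minimal usage sketch; the variable names and capping values below are
    illustrative only:

    >>> import pandas as pd
    >>> from feature_engine.outliers import ArbitraryOutlierCapper
    >>> X = pd.DataFrame({"age": [25, 40, 150], "balance": [-200, 500, 1000]})
    >>> capper = ArbitraryOutlierCapper(
    ...     max_capping_dict={"age": 100},
    ...     min_capping_dict={"balance": 0},
    ... )
    >>> Xt = capper.fit_transform(X)
    >>> print(Xt["age"].max())  # values above 100 are capped at 100
    100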
"""
def __init__(
self,
max_capping_dict: Optional[dict] = None,
min_capping_dict: Optional[dict] = None,
missing_values: str = "raise",
) -> None:
if not max_capping_dict and not min_capping_dict:
raise ValueError(
"Please provide at least 1 dictionary with the capping values."
)
if missing_values not in ["raise", "ignore"]:
raise ValueError("missing_values takes only values 'raise' or 'ignore'")
self.max_capping_dict = _define_numerical_dict(max_capping_dict)
self.min_capping_dict = _define_numerical_dict(min_capping_dict)
self.missing_values = missing_values
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
This transformer does not learn any parameter.
Parameters
----------
X: pandas dataframe of shape = [n_samples, n_features]
The training input samples.
y: pandas Series, default=None
y is not needed in this transformer. You can pass y or None.
Raises
------
TypeError
If the input is not a Pandas DataFrame
Returns
-------
self
"""
X = _is_dataframe(X)
# find variables to be capped
if self.min_capping_dict is None and self.max_capping_dict:
self.variables_ = [x for x in self.max_capping_dict.keys()]
elif self.max_capping_dict is None and self.min_capping_dict:
self.variables_ = [x for x in self.min_capping_dict.keys()]
elif self.min_capping_dict and self.max_capping_dict:
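            # when both dictionaries are passed, cap the union of the variables
            # indicated in the two dictionaries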
tmp = self.min_capping_dict.copy()
tmp.update(self.max_capping_dict)
self.variables_ = [x for x in tmp.keys()]
if self.missing_values == "raise":
            # check if dataset contains na or inf
_check_contains_na(X, self.variables_)
_check_contains_inf(X, self.variables_)
# find or check for numerical variables
self.variables_ = _find_or_check_numerical_variables(X, self.variables_)
if self.max_capping_dict is not None:
self.right_tail_caps_ = self.max_capping_dict
else:
self.right_tail_caps_ = {}
if self.min_capping_dict is not None:
self.left_tail_caps_ = self.min_capping_dict
else:
self.left_tail_caps_ = {}
self.n_features_in_ = X.shape[1]
return self
    # Ugly workaround to import the docstring for Sphinx, otherwise not necessary
transform.__doc__ = BaseOutlier.transform.__doc__
def _more_tags(self):
tags_dict = _return_tags()
# add additional test that fails
tags_dict["_xfail_checks"][
"check_parameters_default_constructible"
] = "transformer has 1 mandatory parameter"
return tags_dict