# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import List, Optional, Union
import pandas as pd
from feature_engine.dataframe_checks import _is_dataframe
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.parameter_checks import _define_numerical_dict
from feature_engine.variable_manipulation import (
_check_input_parameter_variables,
_find_or_check_numerical_variables,
)
class ArbitraryNumberImputer(BaseImputer):
    """
    The ArbitraryNumberImputer() replaces missing data in each variable by an
    arbitrary value determined by the user. It works only with numerical variables.

    You can impute all variables with the same number, in which case you need to
    define the variables to impute in `variables` and the imputation number in
    `arbitrary_number`. Alternatively, you can pass a dictionary of variables and
    the numbers to use for their imputation in `imputer_dict`.

    For example, you can impute varA and varB with 99 like this:

    .. code-block:: python

        transformer = ArbitraryNumberImputer(
            variables = ['varA', 'varB'],
            arbitrary_number = 99
        )

        Xt = transformer.fit_transform(X)

    Alternatively, you can impute varA with 1 and varB with 99 like this:

    .. code-block:: python

        transformer = ArbitraryNumberImputer(
            imputer_dict = {'varA' : 1, 'varB': 99}
        )

        Xt = transformer.fit_transform(X)

    Parameters
    ----------
    arbitrary_number: int or float, default=999
        The number to be used to replace missing data.

    variables: list, default=None
        The list of variables to be imputed. If None, the imputer will find and
        select all numerical variables. This parameter is used only if
        `imputer_dict` is None.

    imputer_dict: dict, default=None
        The dictionary of variables and the arbitrary numbers for their imputation.

    Attributes
    ----------
    imputer_dict_:
        Dictionary with the values to replace NAs in each variable.

    variables_:
        The group of variables that will be transformed.

    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        This transformer does not learn parameters.
    transform:
        Impute missing data.
    fit_transform:
        Fit to the data, then transform it.

    See Also
    --------
    feature_engine.imputation.EndTailImputer
    """

    def __init__(
        self,
        arbitrary_number: Union[int, float] = 999,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        imputer_dict: Optional[dict] = None,
    ) -> None:
        """
        Store and validate the imputation settings.

        Raises
        ------
        ValueError
            If `arbitrary_number` is not an int or a float.
        """
        # Reject non-numeric fill values up front, before touching other params.
        if not isinstance(arbitrary_number, (int, float)):
            raise ValueError("arbitrary_number must be numeric of type int or float")

        self.arbitrary_number = arbitrary_number
        # Both helpers are project utilities; presumably they validate and
        # normalise their input (and pass None through) -- confirm in
        # feature_engine.variable_manipulation / feature_engine.parameter_checks.
        self.variables = _check_input_parameter_variables(variables)
        self.imputer_dict = _define_numerical_dict(imputer_dict)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This method does not learn any parameter. Checks dataframe and finds
        numerical variables, or checks that the variables entered by user are
        numerical.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        if self.imputer_dict:
            # A user-supplied dictionary takes precedence over `variables`.
            # Its keys are materialised into a list so the helper receives a
            # plain sequence rather than a live view of the user's dict.
            self.variables_ = _find_or_check_numerical_variables(
                X, list(self.imputer_dict)
            )
            self.imputer_dict_ = self.imputer_dict
        else:
            # No dictionary given: find (or check) the numerical variables and
            # map every one of them to the single arbitrary number.
            self.variables_ = _find_or_check_numerical_variables(X, self.variables)
            self.imputer_dict_ = {
                var: self.arbitrary_number for var in self.variables_
            }

        self.n_features_in_ = X.shape[1]

        return self

    # Ugly work around to import the docstring for Sphinx, otherwise not necessary.
    # The override only delegates to the parent; it has to exist in this class
    # body so that the `transform.__doc__` assignment below has a name to bind to.
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = super().transform(X)

        return X

    transform.__doc__ = BaseImputer.transform.__doc__