# Source code for feature_engine.imputation.mean_median
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import List, Optional, Union
import pandas as pd
from feature_engine.dataframe_checks import _is_dataframe
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.variable_manipulation import (
_check_input_parameter_variables,
_find_or_check_numerical_variables,
)
class MeanMedianImputer(BaseImputer):
    """
    Replace missing data by the mean or median value of the variable.

    The MeanMedianImputer() works only with numerical variables.

    You can pass a list of variables to be imputed. Alternatively, the
    MeanMedianImputer() will automatically select all variables of type numeric
    in the training set.

    The imputer:

    - first calculates the mean / median values of the variables (fit).
    - Then replaces the missing data with the estimated mean / median
      (transform).

    Parameters
    ----------
    imputation_method: str, default=median
        Desired method of imputation. Can take 'mean' or 'median'.

    variables: list, default=None
        The list of variables to be imputed. If None, the imputer will select
        all variables of type numeric.

    Attributes
    ----------
    imputer_dict_:
        Dictionary with the mean or median values per variable.

    variables_:
        The group of variables that will be transformed.

    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        Learn the mean or median values.
    transform:
        Impute missing data.
    fit_transform:
        Fit to the data, then transform it.
    """

    def __init__(
        self,
        imputation_method: str = "median",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        # Fail early on an unsupported method so fit() never has to guess.
        if imputation_method not in ["median", "mean"]:
            raise ValueError("imputation_method takes only values 'median' or 'mean'")

        self.imputation_method = imputation_method
        self.variables = _check_input_parameter_variables(variables)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the mean or median values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # find imputation parameters: mean or median
        # (imputation_method is restricted to these two values in __init__)
        if self.imputation_method == "mean":
            self.imputer_dict_ = X[self.variables_].mean().to_dict()
        elif self.imputation_method == "median":
            self.imputer_dict_ = X[self.variables_].median().to_dict()

        self.n_features_in_ = X.shape[1]

        return self

    # Ugly work around to import the docstring for Sphinx, otherwise not necessary.
    # The override must exist in this class body: the assignment below looks up
    # `transform` in the class namespace, which does not include inherited names.
    # NOTE(review): assumes BaseImputer.transform takes the dataframe as its only
    # argument -- confirm against base_imputer.
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = super().transform(X)

        return X

    transform.__doc__ = BaseImputer.transform.__doc__