# Source code for feature_engine.imputation.mean_median
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause
from typing import List, Optional, Union
import pandas as pd
from feature_engine.dataframe_checks import _is_dataframe
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.variable_manipulation import (
    _check_input_parameter_variables,
    _find_or_check_numerical_variables,
)
class MeanMedianImputer(BaseImputer):
    """
    The MeanMedianImputer() replaces missing data by the mean or median value of the
    variable. It works only with numerical variables.

    You can pass a list of variables to be imputed. Alternatively, the
    MeanMedianImputer() will automatically select all variables of type numeric in the
    training set.

    The imputer:

    - first calculates the mean / median values of the variables (fit).
    - Then replaces the missing data with the estimated mean / median (transform).

    Parameters
    ----------
    imputation_method: str, default=median
        Desired method of imputation. Can take 'mean' or 'median'.

    variables: list, default=None
        The list of variables to be imputed. If None, the imputer will select
        all variables of type numeric.

    Attributes
    ----------
    imputer_dict_:
        Dictionary with the mean or median values per variable.

    variables_:
        The group of variables that will be transformed.

    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        Learn the mean or median values.
    transform:
        Impute missing data.
    fit_transform:
        Fit to the data, then transform it.
    """

    def __init__(
        self,
        imputation_method: str = "median",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        # Fail early on an unsupported method so that fit() never has to
        # deal with an unknown value.
        if imputation_method not in ("median", "mean"):
            raise ValueError("imputation_method takes only values 'median' or 'mean'")

        self.imputation_method = imputation_method
        self.variables = _check_input_parameter_variables(variables)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the mean or median values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # find imputation parameters: mean or median, stored as
        # {variable name: statistic}
        if self.imputation_method == "mean":
            self.imputer_dict_ = X[self.variables_].mean().to_dict()
        elif self.imputation_method == "median":
            self.imputer_dict_ = X[self.variables_].median().to_dict()

        # number of columns of the frame seen during fit
        self.n_features_in_ = X.shape[1]

        return self

    # Ugly work around to import the docstring for Sphinx, otherwise not necessary.
    # The override only delegates to BaseImputer.transform. It has to exist because
    # an inherited method is not a bare name inside the class body, so without it
    # the ``transform.__doc__`` assignment below raises NameError at class creation.
    def transform(self, X: pd.DataFrame):
        return super().transform(X)

    transform.__doc__ = BaseImputer.transform.__doc__