# Source code for feature_engine.imputation.drop_missing_data

# Authors: Pradumna Suryawanshi <pradumnasuryawanshi@gmail.com>
# License: BSD 3 clause

from typing import List, Optional, Union

import pandas as pd

from feature_engine.dataframe_checks import _is_dataframe
from feature_engine.imputation.base_imputer import BaseImputer
from feature_engine.variable_manipulation import _check_input_parameter_variables


class DropMissingData(BaseImputer):
    """
    DropMissingData() deletes rows containing missing values. It provides
    similar functionality to pandas.DataFrame.dropna().

    It works for both numerical and categorical variables. You can enter the list of
    variables for which missing values should be removed from the dataframe.
    Alternatively, the imputer will automatically select all variables in the dataframe.

    **Note**
    The transformer will first select all variables or all user entered
    variables and, if `missing_only=True`, it will re-select from that group
    only those that show missing data during fit, that is, in the train set.

    Parameters
    ----------
    missing_only: bool, default=True
        If True, missing observations will be dropped only for the variables that have
        missing data in the train set, during fit. If False, observations with NA
        will be dropped from all variables indicated by the user.

    variables: list, str or int, default=None
        The variable or list of variables in which to look for missing values. If
        None (or an empty list), all variables in the dataframe are selected.


    Attributes
    ----------
    variables_:
        List of variables for which the rows with NA will be deleted.
    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        Learn the variables for which the rows with NA will be deleted
    transform:
        Remove observations with NA
    fit_transform:
        Fit to the data, then transform it.
    return_na_data:
        Returns the dataframe with the rows that contain NA.
    """

    def __init__(
        self,
        missing_only: bool = True,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:

        # Reject truthy/falsy non-bool values (e.g. 1, "yes") explicitly.
        if not isinstance(missing_only, bool):
            raise ValueError("missing_only takes values True or False")

        # Validation of `variables` is delegated to the project helper.
        self.variables = _check_input_parameter_variables(variables)
        self.missing_only = missing_only

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the variables for which the rows with NA will be deleted.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame (raised by the input check).

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # Normalise the user selection to a list of candidate columns. The
        # signature allows a single str/int, which must not be iterated
        # character by character nor be mistaken for "nothing selected" when the
        # column label is 0.
        selected = self.variables
        if selected is None:
            candidates = list(X.columns)
        elif isinstance(selected, (str, int)):
            candidates = [selected]
        else:
            # an empty selection falls back to every column in the dataframe
            candidates = list(selected) or list(X.columns)

        # find the variables whose missing values trigger the row removal
        if self.missing_only:
            # keep only the candidates that actually show NA in the train set
            self.variables_ = [var for var in candidates if X[var].isnull().any()]
        else:
            self.variables_ = candidates

        self.n_features_in_ = X.shape[1]

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove rows with missing values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Returns
        -------
        X_transformed: pandas dataframe
            The complete case dataframe for the selected variables, of shape
            [n_samples - rows_with_na, n_features]
        """

        # input and fitted-state validation is delegated to the inherited check
        X = self._check_transform_input_and_state(X)

        # Return a new frame rather than dropping in place, so the result never
        # depends on whether the check above handed back a copy of the input.
        return X.dropna(axis=0, how="any", subset=self.variables_)

    def return_na_data(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Returns the subset of the dataframe which contains the rows with missing values.
        This method could be useful in production, in case we want to store the
        observations that will not be fed into the model.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame (raised by the input check).

        Returns
        -------
        X: pandas dataframe of shape = [obs_with_na, features]
            The dataframe containing only the rows with missing values.
        """

        X = self._check_transform_input_and_state(X)

        # `axis` must be passed by keyword: the positional form was deprecated
        # and later removed from pandas.
        has_na = X[self.variables_].isnull().any(axis=1)

        # Select with a positional boolean mask instead of index labels, so
        # repeated index labels cannot pull in rows without NA or duplicate rows.
        return X.loc[has_na.values, :]