Source code for feature_engine.transformation.yeojohnson

# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

from typing import List, Optional, Union

import pandas as pd
import scipy.stats as stats

from feature_engine.base_transformers import BaseNumericalTransformer
from feature_engine.variable_manipulation import _check_input_parameter_variables


class YeoJohnsonTransformer(BaseNumericalTransformer):
    """
    The YeoJohnsonTransformer() applies the Yeo-Johnson transformation to the
    numerical variables.

    The Yeo-Johnson transformation implemented by this transformer is that of
    SciPy.stats:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.yeojohnson.html

    The YeoJohnsonTransformer() works only with numerical variables.

    A list of variables can be passed as an argument. Alternatively, the
    transformer will automatically select and transform all numerical
    variables.

    Parameters
    ----------
    variables: list, default=None
        The list of numerical variables to transform. If None, the transformer
        will automatically find and select all numerical variables.

    Attributes
    ----------
    lambda_dict_:
        Dictionary containing the best lambda for the Yeo-Johnson per variable.

    variables_:
        The group of variables that will be transformed.

    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        Learn the optimal lambda for the Yeo-Johnson transformation.
    transform:
        Apply the Yeo-Johnson transformation.
    fit_transform:
        Fit to data, then transform it.

    References
    ----------
    .. [1] Weisberg S. "Yeo-Johnson Power Transformations".
        https://www.stat.umn.edu/arc/yjpower.pdf
    """

    def __init__(
        self, variables: Union[None, int, str, List[Union[str, int]]] = None
    ) -> None:
        # Validation / normalisation of the user input is delegated to the
        # shared helper.
        self.variables = _check_input_parameter_variables(variables)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the optimal lambda for the Yeo-Johnson transformation.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just
            the variables to transform.

        y: pandas Series, default=None
            It is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            - If there are no numerical variables in the df or the df is empty
            - If the variable(s) contain null values

        Returns
        -------
        self
        """
        # Input checks are delegated to the parent class; `self.variables_`
        # is read below but never assigned here, so it is expected to be set
        # by this call.
        X = super().fit(X)

        self.lambda_dict_ = {}

        for var in self.variables_:
            # Cast to float to avoid NumPy errors on integer columns. The cast
            # is done on the column handed to SciPy rather than by writing
            # back into X, so the dataframe itself is left untouched.
            # With no `lmbda` given, SciPy returns (transformed, best_lambda);
            # only the lambda is kept.
            _, self.lambda_dict_[var] = stats.yeojohnson(X[var].astype("float"))

        self.n_features_in_ = X.shape[1]

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the Yeo-Johnson transformation.

        Parameters
        ----------
        X: Pandas DataFrame of shape = [n_samples, n_features]
            The data to be transformed.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            - If the variable(s) contain null values
            - If the df has different number of features than the df used in
              fit()

        Returns
        -------
        X: pandas dataframe
            The dataframe with the transformed variables.
        """
        # check input dataframe and if class was fitted
        X = super().transform(X)

        for feature in self.variables_:
            # Same float cast as in fit(): integer columns must not reach
            # SciPy's power computation as integers.
            X[feature] = stats.yeojohnson(
                X[feature].astype("float"), lmbda=self.lambda_dict_[feature]
            )

        return X