Source code for feature_engine.encoding.count_frequency

# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

from typing import List, Optional, Union

import pandas as pd

from feature_engine.encoding.base_encoder import BaseCategoricalTransformer
from feature_engine.variable_manipulation import _check_input_parameter_variables


[docs]class CountFrequencyEncoder(BaseCategoricalTransformer): """ The CountFrequencyEncoder() replaces categories by either the count or the percentage of observations per category. For example in the variable colour, if 10 observations are blue, blue will be replaced by 10. Alternatively, if 10% of the observations are blue, blue will be replaced by 0.1. The CountFrequencyEncoder() will encode only categorical variables by default (type 'object' or 'categorical'). You can pass a list of variables to encode. Alternatively, the encoder will find and encode all categorical variables (type 'object' or 'categorical'). With `ignore_format=True` you have the option to encode numerical variables as well. The procedure is identical, you can either enter the list of variables to encode, or the transformer will automatically select all variables. The encoder first maps the categories to the counts or frequencies for each variable (fit). The encoder then replaces the categories with those numbers (transform). Parameters ---------- encoding_method: str, default='count' Desired method of encoding. 'count': number of observations per category 'frequency': percentage of observations per category variables: list, default=None The list of categorical variables that will be encoded. If None, the encoder will find and transform all variables of type object or categorical by default. You can also make the transformer accept numerical variables, see the next parameter. ignore_format: bool, default=False Whether the format in which the categorical variables are cast should be ignored. If false, the encoder will automatically select variables of type object or categorical, or check that the variables entered by the user are of type object or categorical. If True, the encoder will select all variables or accept all variables entered by the user, including those cast as numeric. Attributes ---------- encoder_dict_: Dictionary with the count or frequency per category, per variable. variables_: The group of variables that will be transformed. n_features_in_: The number of features in the train set used in fit. Methods ------- fit: Learn the count or frequency per category, per variable. transform: Encode the categories to numbers. fit_transform: Fit to the data, then transform it. inverse_transform: Encode the numbers into the original categories. Notes ----- NAN are introduced when encoding categories that were not present in the training dataset. If this happens, try grouping infrequent categories using the RareLabelEncoder(). See Also -------- feature_engine.encoding.RareLabelEncoder """ def __init__( self, encoding_method: str = "count", variables: Union[None, int, str, List[Union[str, int]]] = None, ignore_format: bool = False, ) -> None: if encoding_method not in ["count", "frequency"]: raise ValueError( "encoding_method takes only values 'count' and 'frequency'" ) if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") self.encoding_method = encoding_method self.variables = _check_input_parameter_variables(variables) self.ignore_format = ignore_format
[docs] def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ Learn the counts or frequencies which will be used to replace the categories. Parameters ---------- X: pandas dataframe of shape = [n_samples, n_features] The training dataset. Can be the entire dataframe, not just the variables to be transformed. y: pandas Series, default = None y is not needed in this encoder. You can pass y or None. Raises ------ TypeError - If the input is not a Pandas DataFrame. - f user enters non-categorical variables (unless ignore_format is True) ValueError - If there are no categorical variables in the df or the df is empty - If the variable(s) contain null values Returns ------- self """ X = self._check_fit_input_and_variables(X) self.encoder_dict_ = {} # learn encoding maps for var in self.variables_: if self.encoding_method == "count": self.encoder_dict_[var] = X[var].value_counts().to_dict() elif self.encoding_method == "frequency": n_obs = float(len(X)) self.encoder_dict_[var] = (X[var].value_counts() / n_obs).to_dict() self._check_encoding_dictionary() self.n_features_in_ = X.shape[1] return self
# Ugly work around to import the docstring for Sphinx, otherwise not necessary
[docs] def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = super().transform(X) return X
transform.__doc__ = BaseCategoricalTransformer.transform.__doc__
[docs] def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: X = super().inverse_transform(X) return X
inverse_transform.__doc__ = BaseCategoricalTransformer.inverse_transform.__doc__