class ArbitraryOutlierCapper(BaseOutlier):
    """
    The ArbitraryOutlierCapper() caps the maximum or minimum values of a variable
    at an arbitrary value indicated by the user.

    You must provide the maximum or minimum values that will be used to cap each
    variable in a dictionary {feature: capping value}.

    Parameters
    ----------
    max_capping_dict: dictionary, default=None
        Dictionary containing the user specified capping values for the right tail
        of the distribution of each variable (maximum values).

    min_capping_dict: dictionary, default=None
        Dictionary containing user specified capping values for the left tail of
        the distribution of each variable (minimum values).

    missing_values : string, default='raise'
        Indicates if missing values should be ignored or raised. If
        `missing_values='raise'` the transformer will return an error if the
        training or the datasets to transform contain missing values.

    Attributes
    ----------
    right_tail_caps_:
        Dictionary with the maximum values at which variables will be capped.

    left_tail_caps_:
        Dictionary with the minimum values at which variables will be capped.

    variables_:
        The group of variables that will be transformed.

    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        This transformer does not learn any parameter.
    transform:
        Cap the variables.
    fit_transform:
        Fit to the data. Then transform it.
    """

    def __init__(
        self,
        max_capping_dict: Optional[dict] = None,
        min_capping_dict: Optional[dict] = None,
        missing_values: str = "raise",
    ) -> None:

        # At least one non-empty dictionary of caps is mandatory.
        if not max_capping_dict and not min_capping_dict:
            raise ValueError(
                "Please provide at least 1 dictionary with the capping values."
            )

        if missing_values not in ["raise", "ignore"]:
            raise ValueError("missing_values takes only values 'raise' or 'ignore'")

        self.max_capping_dict = _define_numerical_dict(max_capping_dict)
        self.min_capping_dict = _define_numerical_dict(min_capping_dict)
        self.missing_values = missing_values

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            If neither capping dictionary contains any variable.

        Returns
        -------
        self
        """
        X = _is_dataframe(X)

        # Find variables to be capped. A missing side may be None or an empty
        # dict; both mean "no caps on that tail". Merging keeps the keys of the
        # minimum dictionary first, followed by any keys only present in the
        # maximum dictionary.
        lower_caps = self.min_capping_dict or {}
        upper_caps = self.max_capping_dict or {}
        self.variables_ = list({**lower_caps, **upper_caps})

        if not self.variables_:
            # Only reachable if the dictionaries were emptied after __init__.
            raise ValueError(
                "Please provide at least 1 dictionary with the capping values."
            )

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

        # Expose the user supplied caps under the fitted-attribute names;
        # a tail without caps gets an empty dictionary.
        if self.max_capping_dict is not None:
            self.right_tail_caps_ = self.max_capping_dict
        else:
            self.right_tail_caps_ = {}

        if self.min_capping_dict is not None:
            self.left_tail_caps_ = self.min_capping_dict
        else:
            self.left_tail_caps_ = {}

        self.n_features_in_ = X.shape[1]

        return self

    # Ugly work around to import the docstring for Sphinx, otherwise not necessary
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        # Pure delegation: the override exists only so that a ``transform``
        # name is available in this class body to attach the docstring to.
        return super().transform(X)

    transform.__doc__ = BaseOutlier.transform.__doc__

    def _more_tags(self):
        """Return the estimator tags, marking the check that is expected to fail."""
        tags_dict = _return_tags()
        # add additional test that fails
        tags_dict["_xfail_checks"][
            "check_parameters_default_constructible"
        ] = "transformer has 1 mandatory parameter"
        return tags_dict