class CatBoostEncoder(BaseEstimator, util.TransformerWithTargetMixin):
    """CatBoost coding for categorical features.

    Supported targets: binomial and continuous. For polynomial target support,
    see PolynomialWrapper.

    This is very similar to leave-one-out encoding, but calculates the values
    "on-the-fly". Consequently, the values naturally vary during the training
    phase and it is not necessary to add random noise.

    Beware, the training data have to be randomly permutated. E.g.::

        # Random permutation
        perm = np.random.permutation(len(X))
        X = X.iloc[perm].reset_index(drop=True)
        y = y.iloc[perm].reset_index(drop=True)

    This is necessary because some data sets are sorted based on the target
    value and this coder encodes the features on-the-fly in a single pass.

    Parameters
    ----------
    verbose: int
        integer indicating verbosity of the output. 0 for none.
    cols: list
        a list of columns to encode, if None, all string columns will be encoded.
    drop_invariant: bool
        boolean for whether or not to drop columns with 0 variance.
    return_df: bool
        boolean for whether to return a pandas DataFrame from transform
        (otherwise it will be a numpy array).
    handle_missing: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target mean.
    handle_unknown: str
        options are 'error', 'return_nan' and 'value', defaults to 'value',
        which returns the target mean.
    sigma: float
        adds normal (Gaussian) distribution noise into training data in order
        to decrease overfitting (testing data are untouched). sigma gives the
        standard deviation (spread or "width") of the normal distribution.
    a: float
        additive smoothing (it is the same variable as "m" in m-probability
        estimate). By default set to 1.

    Example
    -------
    >>> from category_encoders import *
    >>> import pandas as pd
    >>> from sklearn.datasets import fetch_california_housing
    >>> bunch = fetch_california_housing()
    >>> y = bunch.target
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = CatBoostEncoder(cols=['HouseAge']).fit(X, y)
    >>> numeric_dataset = enc.transform(X)

    References
    ----------

    .. [1] Transforming categorical features to numerical features, from
    https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/

    .. [2] CatBoost: unbiased boosting with categorical features, from
    https://arxiv.org/abs/1706.09516

    """

    def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
                 handle_unknown='value', handle_missing='value', random_state=None,
                 sigma=None, a=1):
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.drop_cols = []
        self.verbose = verbose
        # if True, even a repeated call of fit() will select string columns from X
        self.use_default_cols = cols is None
        self.cols = cols
        self._dim = None
        self.mapping = None
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        self._mean = None
        self.random_state = random_state
        self.sigma = sigma
        self.feature_names = None
        self.a = a
[docs]deffit(self,X,y,**kwargs):"""Fit encoder according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : encoder Returns self. """# unite the input into pandas typesX=util.convert_input(X)y=util.convert_input_vector(y,X.index).astype(float)ifX.shape[0]!=y.shape[0]:raiseValueError("The length of X is "+str(X.shape[0])+" but length of y is "+str(y.shape[0])+".")self._dim=X.shape[1]# if columns aren't passed, just use every string columnifself.use_default_cols:self.cols=util.get_obj_cols(X)else:self.cols=util.convert_cols_to_list(self.cols)ifself.handle_missing=='error':ifX[self.cols].isnull().any().any():raiseValueError('Columns to be encoded can not contain null')categories=self._fit(X,y,cols=self.cols)self.mapping=categoriesX_temp=self.transform(X,y,override_return_df=True)self.feature_names=X_temp.columns.tolist()ifself.drop_invariant:self.drop_cols=[]generated_cols=util.get_generated_cols(X,X_temp,self.cols)self.drop_cols=[xforxingenerated_colsifX_temp[x].var()<=10e-5]try:[self.feature_names.remove(x)forxinself.drop_cols]exceptKeyErrorase:ifself.verbose>0:print("Could not remove column from feature names.""Not found in generated cols.\n{}".format(e))returnself
[docs]deftransform(self,X,y=None,override_return_df=False):"""Perform the transformation to new categorical data. Parameters ---------- X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] when transform by leave one out None, when transform without target information (such as transform test set) Returns ------- p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ifself.handle_missing=='error':ifX[self.cols].isnull().any().any():raiseValueError('Columns to be encoded can not contain null')ifself._dimisNone:raiseValueError('Must train encoder before it can be used to transform data.')# unite the input into pandas typesX=util.convert_input(X)# then make sure that it is the right sizeifX.shape[1]!=self._dim:raiseValueError('Unexpected input dimension %d, expected %d'%(X.shape[1],self._dim,))# if we are encoding the training data, we have to check the targetifyisnotNone:y=util.convert_input_vector(y,X.index).astype(float)ifX.shape[0]!=y.shape[0]:raiseValueError("The length of X is "+str(X.shape[0])+" but length of y is "+str(y.shape[0])+".")ifnotlist(self.cols):returnXX=self._transform(X,y,mapping=self.mapping)ifself.drop_invariant:forcolinself.drop_cols:X.drop(col,1,inplace=True)ifself.return_dforoverride_return_df:returnXelse:returnX.values
def_fit(self,X_in,y,cols=None):X=X_in.copy(deep=True)ifcolsisNone:cols=X.columns.valuesself._mean=y.mean()return{col:self._fit_column_map(X[col],y)forcolincols}def_fit_column_map(self,series,y):category=pd.Categorical(series)categories=category.categoriescodes=category.codes.copy()codes[codes==-1]=len(categories)categories=np.append(categories,np.nan)return_map=pd.Series(dict([(code,category)forcode,categoryinenumerate(categories)]))result=y.groupby(codes).agg(['sum','count'])returnresult.rename(return_map)def_transform(self,X_in,y,mapping=None):""" The model uses a single column of floats to represent the means of the target variables. """X=X_in.copy(deep=True)random_state_=check_random_state(self.random_state)# Prepare the dataifyisnotNone:# Convert bools to numbers (the target must be summable)y=y.astype('double')forcol,colmapinmapping.items():level_notunique=colmap['count']>1unique_train=colmap.indexunseen_values=pd.Series([xforxinX_in[col].unique()ifxnotinunique_train],dtype=unique_train.dtype)is_nan=X_in[col].isnull()is_unknown_value=X_in[col].isin(unseen_values.dropna().astype(object))ifself.handle_unknown=='error'andis_unknown_value.any():raiseValueError('Columns to be encoded can not contain new values')ifyisNone:# Replace level with its mean target; if level occurs only once, use global meanlevel_means=((colmap['sum']+self._mean)/(colmap['count']+self.a)).where(level_notunique,self._mean)X[col]=X[col].map(level_means)else:# Simulation of CatBoost implementation, which calculates leave-one-out on the fly.# The nice thing about this is that it helps to prevent overfitting. The bad thing# is that CatBoost uses many iterations over the data. 
But we run just one iteration.# Still, it works better than leave-one-out without any noise.# See:# https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/# Cumsum does not work nicely with None (while cumcount does).# As a workaround, we cast the grouping column as string.# See: issue #209temp=y.groupby(X[col].astype(str)).agg(['cumsum','cumcount'])X[col]=(temp['cumsum']-y+self._mean)/(temp['cumcount']+self.a)ifself.handle_unknown=='value':ifX[col].dtype.name=='category':X[col]=X[col].astype(float)X.loc[is_unknown_value,col]=self._meanelifself.handle_unknown=='return_nan':X.loc[is_unknown_value,col]=np.nanifself.handle_missing=='value':X.loc[is_nan&unseen_values.isnull().any(),col]=self._meanelifself.handle_missing=='return_nan':X.loc[is_nan,col]=np.nanifself.sigmaisnotNoneandyisnotNone:X[col]=X[col]*random_state_.normal(1.,self.sigma,X[col].shape[0])returnX
[docs]defget_feature_names(self):""" Returns the names of all transformed / added columns. Returns ------- feature_names: list A list with all feature names transformed or added. Note: potentially dropped features are not included! """ifnotisinstance(self.feature_names,list):raiseValueError('Must fit data first. Affected feature names are not known before.')else:returnself.feature_names