"""Binary encoding"""
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import category_encoders as ce
__author__ = 'willmcginnis'
[docs]class BinaryEncoder(BaseEstimator, TransformerMixin):
"""Binary encoding for categorical variables, similar to onehot, but stores categories as binary bitstrings.
Parameters
----------
verbose: int
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
boolean for whether or not to drop columns with 0 variance.
return_df: bool
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
handle_unknown: str
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
an extra column will be added in if the transform matrix has unknown categories. This can cause
unexpected changes in dimension in some cases.
handle_missing: str
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
an extra column will be added in if the transform matrix has nan values. This can cause
unexpected changes in dimension in some cases.
Example
-------
>>> from category_encoders import *
>>> import pandas as pd
>>> from sklearn.datasets import load_boston
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = BinaryEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 18 columns):
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
CHAS_0 506 non-null int64
CHAS_1 506 non-null int64
NOX 506 non-null float64
RM 506 non-null float64
AGE 506 non-null float64
DIS 506 non-null float64
RAD_0 506 non-null int64
RAD_1 506 non-null int64
RAD_2 506 non-null int64
RAD_3 506 non-null int64
RAD_4 506 non-null int64
TAX 506 non-null float64
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
dtypes: float64(11), int64(7)
memory usage: 71.3 KB
None
"""
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
handle_unknown='value', handle_missing='value'):
self.verbose = verbose
self.cols = cols
self.mapping = mapping
self.drop_invariant = drop_invariant
self.return_df = return_df
self.handle_unknown = handle_unknown
self.handle_missing = handle_missing
self.base_n_encoder = ce.BaseNEncoder(base=2, verbose=self.verbose, cols=self.cols, mapping=self.mapping,
drop_invariant=self.drop_invariant, return_df=self.return_df,
handle_unknown=self.handle_unknown, handle_missing=self.handle_missing)
[docs] def fit(self, X, y=None, **kwargs):
"""Fit encoder according to X and y.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, shape = [n_samples]
Target values.
Returns
-------
self : encoder
Returns self.
"""
self.base_n_encoder.fit(X, y, **kwargs)
return self
[docs] def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""
return self.base_n_encoder.get_feature_names()