Source code for ballet.eng.base

from typing import Callable, Optional

import funcy as fy
import numpy as np
import pandas as pd
import sklearn.base
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils.validation import check_is_fitted
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import __version__ as sklearn_pandas_version

import ballet.transformer  # avoid circular import
from ballet.exc import BalletError
from ballet.util import get_arr_desc
from ballet.util.typing import OneOrMore, TransformerLike

# Public names of this module, i.e. what a star-import exports.
__all__ = (
    'BaseTransformer',
    'ConditionalTransformer',
    'GroupedFunctionTransformer',
    'GroupwiseTransformer',
    'NoFitMixin',
    'SimpleFunctionTransformer',
    'SubsetTransformer',
)


class NoFitMixin:
    """Mix-in for transformations that have no fit stage

    Provides a ``fit`` method that performs no work.
    """

    def fit(self, X, y=None, **fit_kwargs):
        """Do nothing and hand back the instance

        Args:
            X: training data (ignored)
            y: training target (ignored)
            **fit_kwargs: additional fit parameters (ignored)

        Returns:
            the instance itself
        """
        return self
class BaseTransformer(NoFitMixin, TransformerMixin, BaseEstimator):
    """Base class from which new transformers are developed

    Combines the no-op ``fit`` of :py:class:`NoFitMixin` with scikit-learn's
    ``TransformerMixin`` and ``BaseEstimator``; it adds no behavior of its
    own.
    """
class SimpleFunctionTransformer(FunctionTransformer):
    """Transformer that applies a callable to its input

    In the transform stage the callable is invoked on the input X,
    optionally together with extra keyword arguments. This is a thin wrapper
    around :py:class:`FunctionTransformer`.

    Args:
        func: callable to apply
        func_kwargs: keyword arguments to pass
    """

    def __init__(self,
                 func: Callable,
                 func_kwargs: Optional[dict] = None):
        # A missing (or empty) mapping is normalized to a fresh empty dict
        kwargs = func_kwargs if func_kwargs else {}
        self.func = func
        self.func_kwargs = kwargs
        super().__init__(func=func, kw_args=kwargs)
class GroupedFunctionTransformer(FunctionTransformer):
    """Transformer that applies a callable to each group of a groupby

    Args:
        func: callable to apply
        func_kwargs: keyword arguments to pass
        groupby_kwargs: keyword arguments to ``pd.DataFrame.groupby``. If
            omitted, no grouping is performed and the function is called on
            the entire DataFrame.
    """

    def __init__(self,
                 func: Callable,
                 func_kwargs: Optional[dict] = None,
                 groupby_kwargs: Optional[dict] = None):
        # Missing (or empty) mappings are normalized to fresh empty dicts
        kwargs = func_kwargs if func_kwargs else {}
        self.func = func
        self.func_kwargs = kwargs
        self.groupby_kwargs = groupby_kwargs if groupby_kwargs else {}
        super().__init__(func=func, kw_args=kwargs)

    def transform(self, X, **transform_kwargs):
        """Apply the wrapped function group by group

        Args:
            X: object providing ``groupby`` and ``pipe`` (e.g. a DataFrame)
            **transform_kwargs: accepted but not used

        Returns:
            the result of ``groupby(...).apply`` when ``groupby_kwargs`` is
            non-empty, otherwise the result of ``X.pipe``
        """
        if not self.groupby_kwargs:
            # No grouping requested: one call on the whole input
            return X.pipe(super().transform)

        grouped = X.groupby(**self.groupby_kwargs)
        return grouped.apply(super().transform)
class GroupwiseTransformer(BaseTransformer):
    """Transformer that does something different for every group

    For each group identified in the training set by the groupby operation,
    a separate transformer is cloned and fit. This is useful to learn
    group-wise transformers that do not leak data between the training and
    test sets. Consider the case of imputing missing values with the mean of
    some group. A normal, pure-pandas implementation, such as
    ``X_te.groupby(by='foo').apply('mean')`` would leak information about
    the test set means, which might differ from the training set means.

    Args:
        transformer: the transformer to apply to each group. If transformer
            is a transformer-like instance (i.e. has fit, transform methods
            etc.), then it is cloned for each group. If transformer is a
            transformer-like class (i.e. instances of the class are
            transformer-like), then it is initialized with no arguments for
            each group. If it is a callable, then it is called with no
            arguments for each group.
        groupby_kwargs: keyword arguments to pd.DataFrame.groupby
        column_selection: column, or list of columns, to select after the
            groupby. Equivalent to ``df.groupby(...)[column_selection]``.
            Defaults to None, i.e. no column selection is performed.
        handle_unknown: 'error' or 'ignore', default='error'. Whether to
            raise an error or ignore if an unknown group is encountered
            during transform. When this parameter is set to 'ignore' and an
            unknown group is encountered during transform, the group's
            values will be passed through unchanged.
        handle_error: 'error' or 'ignore', default='error'. Whether to raise
            an error or ignore if an error is raised during transforming an
            individual group. When this parameter is set to 'ignore' and an
            error is raised when calling the transformer's transform method
            on an individual group, the group's values will be passed
            through unchanged.

    Example usage:

        In this example, we create a groupwise transformer that fits a
        separate imputer for each group encountered. For new data points,
        values will be imputed according to the mean of its group on the
        training set, avoiding any data leakage.

        .. code-block:: python

            >>> from sklearn.impute import SimpleImputer
            >>> transformer = GroupwiseTransformer(
            ...     SimpleImputer(strategy='mean'),
            ...     groupby_kwargs = {'level': 'name'}
            ... )

    Raises:
        ballet.exc.BalletError: if handle_unknown=='error' and an unknown
            group is encountered at transform-time.
    """

    def __init__(self,
                 transformer: TransformerLike,
                 groupby_kwargs: Optional[dict] = None,
                 column_selection: Optional[OneOrMore[str]] = None,
                 handle_unknown: str = 'error',
                 handle_error: str = 'error'):
        self.transformer = transformer
        self.groupby_kwargs = groupby_kwargs
        self.column_selection = column_selection
        self.handle_unknown = handle_unknown
        self.handle_error = handle_error

    def _make_transformer(self):
        """Create a new, unfitted transformer for a single group"""
        # Classes are themselves callable, so one check covers both the
        # "transformer class" and the "factory callable" cases.
        if callable(self.transformer):
            return self.transformer()
        return sklearn.base.clone(self.transformer)

    def fit(self, X, y=None, **fit_kwargs):
        """Fit one transformer per group found in X

        Args:
            X: data to group with ``X.groupby(**groupby_kwargs)``
            y: optional target, aligned row-by-row (by position) with X
            **fit_kwargs: accepted but not forwarded to the per-group
                transformers

        Returns:
            the fitted instance

        Raises:
            ValueError: if handle_unknown or handle_error is not one of
                'error' or 'ignore'
        """
        # validation on inputs
        self.groupby_kwargs_ = self.groupby_kwargs or {}
        if self.handle_unknown not in ['error', 'ignore']:
            raise ValueError(
                f'Invalid value for handle_unknown: {self.handle_unknown}')
        if self.handle_error not in ['error', 'ignore']:
            raise ValueError(
                f'Invalid value for handle_error: {self.handle_error}')

        # Get the groups
        grouper = X.groupby(**self.groupby_kwargs_)
        self.groups_ = set(grouper.groups.keys())

        # Create and fit a transformer for each group
        self.transformers_ = {}
        for group_name, x_group in grouper:
            transformer = self._make_transformer()
            if self.column_selection is not None:
                x_group = x_group[self.column_selection]

            if y is not None:
                # ``grouper.indices`` holds *positional* row numbers, so y
                # must be sliced by position. Plain ``y[positions]`` on a
                # pandas object is label-based for an integer index and
                # would select the wrong rows (or raise KeyError) whenever
                # the index is not the default 0..n-1 range.
                positions = grouper.indices[group_name]
                if hasattr(y, 'iloc'):
                    y_group = y.iloc[positions]
                else:
                    y_group = np.asarray(y)[positions]
                transformer.fit(x_group, y_group)
            else:
                transformer.fit(x_group)

            self.transformers_[group_name] = transformer

        return self

    def transform(self, X, **transform_kwargs):
        """Transform each group of X with the transformer fit for that group

        Args:
            X: data to group with the same groupby arguments used in fit
            **transform_kwargs: forwarded through ``groupby(...).apply`` to
                each per-group transformer's ``transform`` method

        Returns:
            the result of ``X.groupby(...).apply`` over the transformed
            groups

        Raises:
            ballet.exc.BalletError: if handle_unknown=='error' and a group
                is encountered that was not seen during fit
        """
        check_is_fitted(self, ['groups_', 'transformers_'])

        def _transform(x_group, *args, **kwargs):
            # If the group is not a DataFrame, there are two problems
            # 1. We can't rely on group.name to lookup the right transformer
            # 2. We can't "reassemble" the transformed
            # However, the contract of ``pandas.core.groupby.GroupBy.apply``
            # is that the input is a DataFrame, so this should never occur.
            if not isinstance(x_group, pd.DataFrame):
                raise NotImplementedError

            group_name = x_group.name
            if self.column_selection is not None:
                x_group = x_group[self.column_selection]

            if group_name in self.transformers_:
                transformer = self.transformers_[group_name]
                try:
                    data = transformer.transform(x_group, *args, **kwargs)

                    # This post-processing step is required because sklearn
                    # transform converts a DataFrame to an array. This is my
                    # best attempt so far to approximate the following:
                    # >>> result = x_group.copy()
                    # >>> result.values = data
                    # which is an error as `values` cannot be set.
                    index = x_group.index
                    columns = x_group.columns
                    return pd.DataFrame(
                        data=data, index=index, columns=columns)
                except Exception:
                    # Deliberately broad: with handle_error == 'ignore' any
                    # failure on this group passes its values through.
                    if self.handle_error == 'ignore':
                        return x_group
                    else:
                        raise
            else:
                if self.handle_unknown == 'error':
                    raise BalletError(f'Unknown group: {group_name}')
                elif self.handle_unknown == 'ignore':
                    return x_group
                else:
                    # Unreachable code
                    raise RuntimeError

        return (
            X
            .groupby(**self.groupby_kwargs_)
            .apply(_transform, **transform_kwargs)
        )
class ConditionalTransformer(BaseTransformer):
    """Transform columns that satisfy a condition during training

    In the fit stage, determines which variables (columns) satisfy the
    condition. In the transform stage, applies the given transformation to
    the satisfied columns. If a second transformation is given, applies the
    second transformation to the complement of the satisfied columns (i.e.
    the columns that fail to satisfy the condition). Otherwise, these
    unsatisfied columns are passed through unchanged.

    Args:
        condition: condition function
        satisfy_transform: transform function for satisfied columns
        unsatisfy_transform: transform function for unsatisfied columns
            (defaults to identity)
    """

    def __init__(
        self,
        condition: Callable,
        satisfy_transform: Callable,
        unsatisfy_transform: Optional[Callable] = None
    ):
        super().__init__()
        self.condition = condition
        self.satisfy_transform = satisfy_transform
        self.unsatisfy_transform = unsatisfy_transform or fy.identity

    def fit(self, X, y=None, **fit_args):
        """Record which columns of X satisfy the condition

        Args:
            X: training data, passed as-is to ``condition``
            y: ignored
            **fit_args: ignored

        Returns:
            the fitted instance
        """
        # satisfied_columns_ is a bool or array[bool]
        self.satisfied_columns_ = self.condition(X)
        self.unsatisfied_columns_ = np.logical_not(self.satisfied_columns_)
        return self

    def transform(self, X, **transform_args):
        """Apply the satisfy/unsatisfy transforms to the matching columns

        Args:
            X: a DataFrame, a 1-d array-like, or a numpy array
            **transform_args: ignored

        Returns:
            the transformed data. DataFrame and ndarray inputs are copied
            rather than modified in place; ndarray inputs are converted to
            float.

        Raises:
            TypeError: if X is of an unsupported type and at least one
                column satisfied the condition
        """
        check_is_fitted(self, ['satisfied_columns_', 'unsatisfied_columns_'])

        if isinstance(X, pd.DataFrame):
            X = X.copy()
            X.loc[:, self.satisfied_columns_] = self.satisfy_transform(
                X.loc[:, self.satisfied_columns_])
            X.loc[:, self.unsatisfied_columns_] = self.unsatisfy_transform(
                X.loc[:, self.unsatisfied_columns_]
            )
            return X
        elif np.ndim(X) == 1:
            return (
                self.satisfy_transform(X)
                if self.satisfied_columns_
                else self.unsatisfy_transform(X)
            )
        elif isinstance(X, np.ndarray):
            # astype returns a new array, so the caller's data is untouched
            X = X.astype('float')

            # Write results back with a boolean column mask. np.putmask is
            # not suitable here: it takes values[n] for the *flat position*
            # n in X (repeating values when they are shorter), which puts
            # results in the wrong cells whenever only some of the columns
            # are selected.
            satisfied = np.asarray(self.satisfied_columns_, dtype=bool)
            if satisfied.any():
                X[:, satisfied] = self.satisfy_transform(X[:, satisfied])

            unsatisfied = np.asarray(self.unsatisfied_columns_, dtype=bool)
            if unsatisfied.any():
                X[:, unsatisfied] = self.unsatisfy_transform(
                    X[:, unsatisfied])

            return X
        elif not self.satisfied_columns_:
            # if we wouldn't otherwise have known what to do, we can pass
            # through X if transformation was not necessary anyways
            return self.unsatisfy_transform(X)
        else:
            raise TypeError(
                f'Couldn\'t apply transformer on features in '
                f'{get_arr_desc(X)}.')
class SubsetTransformer(DataFrameMapper):
    """Transform a subset of columns with another transformer

    Builds a ``DataFrameMapper`` with a single feature definition,
    ``default=None``, ``input_df=True`` and ``df_out=True``.

    Args:
        input: column, or list of columns, given to the mapper as the
            feature's column selector
        transformer: transformer for the selected columns; it is passed
            through ``ballet.transformer.desugar_transformer`` first
        alias: given to the mapper as the feature's ``alias`` option
    """

    def __init__(self,
                 input: OneOrMore[str],
                 transformer: TransformerLike,
                 alias: Optional[str] = None):
        self.input = input
        self.transformer = transformer
        self.alias = alias

        desugared = ballet.transformer.desugar_transformer(transformer)
        feature = (input, desugared, {'alias': alias})
        super().__init__(
            [feature],
            default=None,
            input_df=True,
            df_out=True,
        )

    if sklearn_pandas_version.startswith('1'):

        def __setstate__(self, state):
            # FIXME bug with sklearn-pandas 1.x that is fixed on 2.x (can
            #   delete after upgrade)
            # horrible hack - DataFrameMapper does not call
            # super.__setstate__ in 1.x, but we happen to know that its
            # parent is BaseEstimator
            BaseEstimator.__setstate__(self, state)
            DataFrameMapper.__setstate__(self, state)