add features transformation via category encoders and test set transformation based on train #300

Merged · 10 commits · Jun 20, 2024
Changes from all commits
208 changes: 208 additions & 0 deletions causaltune/dataset_processor.py
@@ -0,0 +1,208 @@
from typing import List, Optional

import copy
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import OneHotEncoder, OrdinalEncoder, TargetEncoder, WOEEncoder
from causaltune.data_utils import CausalityDataset


class CausalityDatasetProcessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder_type = None
        self.outcome = None
        self.encoder = None

    def fit(
        self,
        cd: CausalityDataset,
        encoder_type: Optional[str] = "onehot",
        outcome: Optional[str] = None,
    ):
        cd = copy.deepcopy(cd)
        self.preprocess_dataset(
            cd, encoder_type=encoder_type, outcome=outcome, fit_phase=True
        )
        return self

    def transform(self, cd: CausalityDataset):
        if self.encoder:
            cd = self.preprocess_dataset(
                cd,
                encoder_type=self.encoder_type,
                outcome=self.outcome,
                fit_phase=False,
            )
            return cd
        else:
            raise ValueError("CausalityDatasetProcessor has not been trained")

    def featurize(
        self,
        cd: CausalityDataset,
        df: pd.DataFrame,
        features: List[str],
        exclude_cols: List[str],
        drop_first: bool = False,
        encoder_type: str = "onehot",
        outcome: Optional[str] = None,
        fit_phase: bool = True,
    ) -> pd.DataFrame:
        # fill all the NaNs
        categ_columns = []
        for col, t in zip(df.columns, df.dtypes):
            if pd.api.types.is_float_dtype(t):
                df[col] = df[col].fillna(0.0).astype("float32")
            elif pd.api.types.is_integer_dtype(t):
                df[col] = df[col].fillna(-1)
            else:
                df[col] = df[col].fillna("NA").astype("category")
                categ_columns.append(col)

        float_features = [
            f for f in features if pd.api.types.is_float_dtype(df.dtypes[f])
        ]
        float_df = df[float_features].reset_index(drop=True)

        # cast 0/1 int columns to float single-column dummies
        for col, t in zip(df.columns, df.dtypes):
            if pd.api.types.is_integer_dtype(t):
                if len(df[col].unique()) <= 2:
                    df[col] = df[col].fillna(0.0).astype("float32")

        # for other categories, include first column dummy for easier interpretability
        cat_df = df.drop(columns=exclude_cols + float_features)
        if len(cat_df.columns) and encoder_type:
            if encoder_type == "onehot":
                if fit_phase:
                    encoder = OneHotEncoder(
                        cols=categ_columns, drop_invariant=drop_first
                    )
                    dummy_df = encoder.fit_transform(X=cat_df).reset_index(drop=True)
                else:
                    dummy_df = self.encoder.transform(X=cat_df).reset_index(drop=True)
            elif encoder_type == "label":
                if fit_phase:
                    encoder = OrdinalEncoder(cols=categ_columns)
                    dummy_df = encoder.fit_transform(X=cat_df).reset_index(drop=True)
                else:
                    dummy_df = self.encoder.transform(X=cat_df).reset_index(drop=True)
            elif encoder_type == "target":
                if outcome:
                    y = cd.data[outcome]
                else:
                    y = cd.data[cd.outcomes[0]]
                assert (
                    len(set(y)) < 10
                ), "Using TargetEncoder with continuous target is not allowed"
                if fit_phase:
                    encoder = TargetEncoder(cols=categ_columns)
                    dummy_df = encoder.fit_transform(X=cat_df, y=y).reset_index(
                        drop=True
                    )
                else:
                    dummy_df = self.encoder.transform(X=cat_df, y=y).reset_index(
                        drop=True
                    )
            elif encoder_type == "woe":
                if outcome:
                    y = cd.data[outcome]
                else:
                    y = cd.data[cd.outcomes[0]]
                assert (
                    len(set(y)) <= 2
                ), "WOEEncoder: the target column y must be binary"
                if fit_phase:
                    encoder = WOEEncoder(cols=categ_columns)
                    dummy_df = encoder.fit_transform(X=cat_df, y=y).reset_index(
                        drop=True
                    )
                else:
                    dummy_df = self.encoder.transform(X=cat_df, y=y).reset_index(
                        drop=True
                    )
            else:
                raise ValueError(f"Unsupported encoder type: {encoder_type}")
        else:
            # sentinel value: nothing to encode, but transform() should still work
            encoder = "no"
            dummy_df = pd.DataFrame()

        out = pd.concat(
            [df[exclude_cols].reset_index(drop=True), float_df, dummy_df], axis=1
        )
        if fit_phase:
            self.encoder = encoder
            self.encoder_type = encoder_type
            self.outcome = outcome

        return out

    def preprocess_dataset(
        self,
        cd: CausalityDataset,
        drop_first: Optional[bool] = False,
        fit_phase: bool = True,
        encoder_type: Optional[str] = "onehot",
        outcome: Optional[str] = None,
    ):
        """Preprocesses an input CausalityDataset for CausalTune by
        converting treatment and instrument columns to integer, filling NaNs,
        and encoding categorical features.

        Args:
            cd (CausalityDataset): dataset to preprocess
            drop_first (Optional[bool]): whether to drop the first dummy variable for each categorical feature (default False)
            fit_phase (bool): if True, fit new encoders; if False, reuse the encoders fitted on the training data
            encoder_type (Optional[str]): Type of encoder to use for categorical features (default 'onehot').
                Available options are:
                - 'onehot': OneHotEncoder
                - 'label': OrdinalEncoder
                - 'target': TargetEncoder
                - 'woe': WOEEncoder
            outcome (Optional[str]): target column used by TargetEncoder and WOEEncoder

        Returns:
            CausalityDataset: the dataset with cd.data replaced by the preprocessed dataframe
            and the feature lists remapped to the encoded column names.
        """

        cd.data[cd.treatment] = cd.data[cd.treatment].astype(int)
        cd.data[cd.instruments] = cd.data[cd.instruments].astype(int)

        # fill in nans and encode all the features
        new_chunks = []
        processed_cols = []
        original_columns = cd.data.columns.tolist()
        cols = (
            cd.__dict__["common_causes"]
            + cd.__dict__["effect_modifiers"]
            + cd.__dict__["propensity_modifiers"]
        )
        if cols:
            processed_cols += cols
            re_df = self.featurize(
                cd,
                cd.data[cols],
                features=cols,
                exclude_cols=[],
                drop_first=drop_first,
                fit_phase=fit_phase,
                encoder_type=encoder_type,
                outcome=outcome,
            )
            new_chunks.append(re_df)

        remainder = cd.data[[c for c in cd.data.columns if c not in processed_cols]]
        cd.data = pd.concat([remainder.reset_index(drop=True)] + new_chunks, axis=1)

        # Columns after encoding
        new_columns = cd.data.columns.tolist()
        fields = ["common_causes", "effect_modifiers", "propensity_modifiers"]
        # Mapping original columns to new (e.g. one-hot) encoded columns
        column_mapping = {}
        for original_col in original_columns:
            matches = [
                col
                for col in new_columns
                if col.startswith(original_col + "_") or original_col == col
            ]
            column_mapping[original_col] = matches
        for col_group in fields:
            updated_columns = []
            for col in cd.__dict__[col_group]:
                updated_columns.extend(column_mapping[col])
            cd.__dict__[col_group] = updated_columns

        return cd
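
Example usage (a minimal sketch, not part of the diff; the exact CausalityDataset constructor arguments are an assumption about causaltune.data_utils). The key point is that the encoders are fitted on the training set only and then reused on the test set:

import pandas as pd

from causaltune.data_utils import CausalityDataset
from causaltune.dataset_processor import CausalityDatasetProcessor

# hypothetical toy data: binary treatment "T", outcome "Y", mixed-type features
train_df = pd.DataFrame(
    {
        "T": [0, 1, 0, 1],
        "Y": [0, 1, 1, 0],
        "city": ["a", "b", "a", "c"],
        "age": [23.0, 31.0, 45.0, 52.0],
    }
)
test_df = train_df.copy()

train_cd = CausalityDataset(
    data=train_df, treatment="T", outcomes=["Y"], common_causes=["city", "age"]
)
test_cd = CausalityDataset(
    data=test_df, treatment="T", outcomes=["Y"], common_causes=["city", "age"]
)

processor = CausalityDatasetProcessor()
processor.fit(train_cd, encoder_type="onehot")  # fit() deep-copies, so train_cd is untouched
train_cd = processor.transform(train_cd)  # encode the training set
test_cd = processor.transform(test_cd)  # apply the same fitted encoding to the test set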
65 changes: 52 additions & 13 deletions causaltune/optimiser.py
@@ -1,6 +1,6 @@
import copy
from copy import deepcopy
import warnings
from typing import List, Optional, Union
from collections import defaultdict

@@ -29,6 +29,7 @@
    effect_stderr,
)
from causaltune.data_utils import CausalityDataset
from causaltune.dataset_processor import CausalityDatasetProcessor
from causaltune.models.passthrough import feature_filter


@@ -180,9 +181,9 @@ def __init__(
            resources_per_trial if resources_per_trial is not None else {"cpu": 0.5}
        )
        self._settings["try_init_configs"] = try_init_configs
        self._settings["include_experimental_estimators"] = (
            include_experimental_estimators
        )

        # params for FLAML on component models:
        self._settings["component_models"] = {}
@@ -285,6 +286,9 @@ def fit(
        estimator_list: Optional[Union[str, List[str]]] = None,
        resume: Optional[bool] = False,
        time_budget: Optional[int] = None,
        preprocess: bool = False,
        encoder_type: Optional[str] = None,
        encoder_outcome: Optional[str] = None,
    ):
        """Performs AutoML on list of causal inference estimators
        - If estimator has a search space specified in its parameters, HPO is performed on the whole model.
@@ -301,6 +305,9 @@
            estimator_list (Optional[Union[str, List[str]]]): subset of estimators to consider
            resume (Optional[bool]): set to True to continue previous fit
            time_budget (Optional[int]): change new time budget allocated to fit, useful for warm starts.
            preprocess (bool): whether to preprocess the CausalityDataset before fitting (default False).
            encoder_type (Optional[str]): categorical encoder to use for preprocessing ('onehot', 'label', 'target', or 'woe').
            encoder_outcome (Optional[str]): outcome column used as the encoding target by TargetEncoder and WOEEncoder.

        Returns:
            None
@@ -320,6 +327,16 @@
                propensity_modifiers=propensity_modifiers,
            )

        if preprocess:
            data = copy.deepcopy(data)
            self.dataset_processor = CausalityDatasetProcessor()
            self.dataset_processor.fit(
                data, encoder_type=encoder_type, outcome=encoder_outcome
            )
            data = self.dataset_processor.transform(data)
        else:
            self.dataset_processor = None

        self.data = data
        treatment_values = data.treatment_values

@@ -472,15 +489,17 @@ def fit(
            self._tune_with_config,
            search_space,
            metric=self.metric,
            points_to_evaluate=(
                init_cfg if len(self.resume_cfg) == 0 else self.resume_cfg
            ),
            evaluated_rewards=(
                [] if len(self.resume_scores) == 0 else self.resume_scores
            ),
            mode=(
                "min"
                if self.metric in ["energy_distance", "psw_energy_distance"]
                else "max"
            ),
            low_cost_partial_config={},
            **self._settings["tuner"],
        )
@@ -695,6 +714,26 @@ def effect(self, df, *args, **kwargs):
"""
return self.model.effect(df, *args, **kwargs)

    def predict(
        self, cd: CausalityDataset, preprocess: Optional[bool] = False, *args, **kwargs
    ):
        """Heterogeneous treatment effects for a CausalityDataset

        Args:
            cd (CausalityDataset): data to predict the treatment effect for
            preprocess (Optional[bool]): if True, apply the fitted CausalityDatasetProcessor to cd first

        Returns:
            (np.ndarray): predicted treatment effect for each datapoint

        """
        if preprocess:
            cd = copy.deepcopy(cd)
            if self.dataset_processor:
                cd = self.dataset_processor.transform(cd)
            else:
                raise ValueError("CausalityDatasetProcessor has not been trained")
        return self.model.effect(cd.data, *args, **kwargs)

    def effect_inference(self, df, *args, **kwargs):
        """Inference (uncertainty) results produced by best estimator
        Only implemented for EconML estimators so far
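
Putting the two pieces together, an end-to-end sketch (again not from the PR; the CausalTune constructor arguments are assumptions, and train_cd/test_cd are CausalityDataset objects as in the earlier example): fit() can now preprocess internally, and predict() reuses the fitted processor on new data:

from causaltune import CausalTune

ct = CausalTune(time_budget=60, metric="energy_distance")

# fit() deep-copies the data, fits a CausalityDatasetProcessor on it,
# and stores the processor as ct.dataset_processor
ct.fit(data=train_cd, preprocess=True, encoder_type="onehot")

# predict() reuses the stored processor, so the test set is encoded
# exactly as the training set was
effects = ct.predict(test_cd, preprocess=True)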