Commit
Merge pull request #328 from EgorKraevTransferwise/andrin_erupt_nobigfiles

Andrin's changes minus all the pickle files, plus Egor's ERUPT refactor
EgorKraevTransferwise authored Nov 29, 2024
2 parents d2c65f8 + a02d290 commit 346aa84
Showing 41 changed files with 3,084 additions and 1,673 deletions.
2 changes: 1 addition & 1 deletion causaltune/__init__.py
@@ -1,5 +1,5 @@
 from causaltune.optimiser import CausalTune
 from causaltune.visualizer import Visualizer
-from causaltune.scoring import Scorer
+from causaltune.score.scoring import Scorer
 
 __all__ = ["CausalTune", "Visualizer", "Scorer"]
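For downstream code, the visible effect of this hunk is the new direct import path for Scorer; the package-level re-export in __all__ is unchanged. A minimal sketch of both import styles after this change (assuming an installed causaltune with this diff applied):

from causaltune import Scorer  # unchanged: Scorer is still re-exported at package level
from causaltune.score.scoring import Scorer  # new direct path (previously causaltune.scoring); binds the same class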
1 change: 1 addition & 0 deletions causaltune/dataset_processor.py
@@ -16,6 +16,7 @@ class CausalityDatasetProcessor(BaseEstimator, TransformerMixin):
         outcome (str): The target variable used for encoding.
         encoder: Encoder object used during feature transformations.
     """
+
     def __init__(self):
         """
         Initializes CausalityDatasetProcessor with default attributes for encoder_type, outcome, and encoder.
121 changes: 72 additions & 49 deletions causaltune/datasets.py
@@ -1,5 +1,7 @@
 import pandas as pd
 import numpy as np
+import pickle
+import os
 from scipy import special
 
 # from scipy.stats import betabinom
@@ -12,10 +14,8 @@
 
 
 def linear_multi_dataset(
-    n_points=10000,
-    impact=None,
-    include_propensity=False,
-    include_control=False) -> CausalityDataset:
+    n_points=10000, impact=None, include_propensity=False, include_control=False
+) -> CausalityDataset:
     if impact is None:
         impact = {0: 0.0, 1: 2.0, 2: 1.0}
     df = pd.DataFrame(
@@ -80,8 +80,9 @@ def nhefs() -> CausalityDataset:
     df = df.loc[~missing]
 
     df = df[covariates + ["qsmk"] + ["wt82_71"]]
-    df.rename(columns={c: "x" + str(i + 1)
-                       for i, c in enumerate(covariates)}, inplace=True)
+    df.rename(
+        columns={c: "x" + str(i + 1) for i, c in enumerate(covariates)}, inplace=True
+    )
 
     return CausalityDataset(df, treatment="qsmk", outcomes=["wt82_71"])

@@ -172,8 +173,7 @@ def amazon_reviews(rating="pos") -> CausalityDataset:
         gdown.download(url, "amazon_" + rating + ".csv", fuzzy=True)
         df = pd.read_csv("amazon_" + rating + ".csv")
         df.drop(df.columns[[2, 3, 4]], axis=1, inplace=True)
-        df.columns = ["treatment", "y_factual"] + \
-            ["x" + str(i) for i in range(1, 301)]
+        df.columns = ["treatment", "y_factual"] + ["x" + str(i) for i in range(1, 301)]
         return CausalityDataset(df, "treatment", ["y_factual"])
     else:
         print(
@@ -226,14 +226,10 @@ def synth_ihdp(return_df=False) -> CausalityDataset:
     data.columns = col
     # drop the columns we don't care about
     ignore_patterns = ["y_cfactual", "mu"]
-    ignore_cols = [c for c in data.columns if any(
-        [s in c for s in ignore_patterns])]
+    ignore_cols = [c for c in data.columns if any([s in c for s in ignore_patterns])]
     data = data.drop(columns=ignore_cols)
 
-    return CausalityDataset(
-        data,
-        "treatment",
-        ["y_factual"]) if not return_df else data
+    return CausalityDataset(data, "treatment", ["y_factual"]) if not return_df else data
 
 
 def synth_acic(condition=1) -> CausalityDataset:
@@ -347,6 +343,7 @@ def generate_synthetic_data(
     noisy_outcomes: bool = False,
     effect_size: Union[int, None] = None,
     add_instrument: bool = False,
+    known_propensity: bool = False,
 ) -> CausalityDataset:
     """Generates synthetic dataset with conditional treatment effect (CATE) and optional instrumental variable.
     Supports RCT (unconfounded) and observational (confounded) data.
@@ -385,11 +382,15 @@ def generate_synthetic_data(
         p = np.clip(p, 0.1, 0.9)
         C = p > np.random.rand(n_samples)
         # print(min(p), max(p))
-
     else:
         p = 0.5 * np.ones(n_samples)
         C = np.random.binomial(n=1, p=0.5, size=n_samples)
 
+    if known_propensity:
+        known_p = np.random.beta(2, 5, size=n_samples)
+    else:
+        known_p = p
+
     if add_instrument:
         Z = np.random.binomial(n=1, p=0.5, size=n_samples)
         C0 = np.random.binomial(n=1, p=0.006, size=n_samples)
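The hunk above is the core of the new known_propensity flag: when it is set, the propensity the dataset reports is an independent Beta(2, 5) draw rather than the probability p actually used to assign treatment; when it is unset, known_p is simply p. The DataFrame construction in the next hunk writes known_p, not p, into the "propensity" column. A minimal sketch of the mechanism, restating only the lines shown above:

import numpy as np

n_samples = 1000
p = 0.5 * np.ones(n_samples)  # true assignment probability in the RCT branch above

# With known_propensity=True, the reported propensity is this independent draw,
# with mean 2 / (2 + 5) ≈ 0.286 and no relation to p:
known_p = np.random.beta(2, 5, size=n_samples)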
@@ -416,18 +417,11 @@ def mu(X):
     Y = tau * T + Y_base
 
     features = [f"X{i+1}" for i in range(n_covariates)]
-    df = pd.DataFrame(np.array([*X.T,
-                                T,
-                                Y,
-                                tau,
-                                p,
-                                Y_base]).T,
-                      columns=features + ["treatment",
-                                          "outcome",
-                                          "true_effect",
-                                          "propensity",
-                                          "base_outcome"],
-                      )
+    df = pd.DataFrame(
+        np.array([*X.T, T, Y, tau, known_p, Y_base]).T,
+        columns=features
+        + ["treatment", "outcome", "true_effect", "propensity", "base_outcome"],
+    )
     data = CausalityDataset(
         data=df,
         treatment="treatment",
@@ -450,6 +444,7 @@ def generate_linear_synthetic_data(
     noisy_outcomes: bool = False,
     effect_size: Union[int, None] = None,
     add_instrument: bool = False,
+    known_propensity: bool = False,
 ) -> CausalityDataset:
     """Generates synthetic dataset with linear treatment effect (CATE) and optional instrumental variable.
     Supports RCT (unconfounded) and observational (confounded) data.
@@ -494,6 +489,11 @@ def generate_linear_synthetic_data(
         p = 0.5 * np.ones(n_samples)
         C = np.random.binomial(n=1, p=0.5, size=n_samples)
 
+    if known_propensity:
+        known_p = np.random.beta(2, 5, size=n_samples)
+    else:
+        known_p = p
+
     if add_instrument:
         Z = np.random.binomial(n=1, p=0.5, size=n_samples)
         C0 = np.random.binomial(n=1, p=0.006, size=n_samples)
@@ -520,18 +520,11 @@ def mu(X):
     Y = tau * T + Y_base
 
     features = [f"X{i+1}" for i in range(n_covariates)]
-    df = pd.DataFrame(np.array([*X.T,
-                                T,
-                                Y,
-                                tau,
-                                p,
-                                Y_base]).T,
-                      columns=features + ["treatment",
-                                          "outcome",
-                                          "true_effect",
-                                          "propensity",
-                                          "base_outcome"],
-                      )
+    df = pd.DataFrame(
+        np.array([*X.T, T, Y, tau, known_p, Y_base]).T,
+        columns=features
+        + ["treatment", "outcome", "true_effect", "propensity", "base_outcome"],
+    )
     data = CausalityDataset(
         data=df,
         treatment="treatment",
@@ -641,16 +634,8 @@ def generate_non_random_dataset(num_samples=1000):
     )
     treatment = np.random.binomial(1, propensity)
-    outcome = (
-        0.2
-        * treatment
-        + 0.5
-        * x1
-        - 0.2
-        * x2
-        + np.random.normal(
-            0,
-            1,
-            num_samples))
+    outcome = (
+        0.2 * treatment + 0.5 * x1 - 0.2 * x2 + np.random.normal(0, 1, num_samples)
+    )
 
     dataset = {
         "T": treatment,
@@ -729,3 +714,41 @@ def mlrate_experiment_synth_dgp(
     cd = CausalityDataset(data=df, outcomes=["Y"], treatment="T")
 
     return cd
+
+
+def save_dataset(dataset: CausalityDataset, filename: str):
+    """
+    Save a CausalityDataset object to a file using pickle.
+
+    Args:
+        dataset (CausalityDataset): The dataset to save.
+        filename (str): The name of the file to save the dataset to.
+    """
+    with open(filename, "wb") as f:
+        pickle.dump(dataset, f)
+    print(f"Dataset saved to {filename}")
+
+
+def load_dataset(filename: str) -> CausalityDataset:
+    """
+    Load a CausalityDataset object from a file using pickle.
+
+    Args:
+        filename (str): The name of the file to load the dataset from.
+
+    Returns:
+        CausalityDataset: The loaded dataset.
+    """
+    if not os.path.exists(filename):
+        raise FileNotFoundError(f"File {filename} not found.")
+
+    with open(filename, "rb") as f:
+        dataset = pickle.load(f)
+
+    if not isinstance(dataset, CausalityDataset):
+        raise ValueError(
+            f"The file {filename} does not contain a valid CausalityDataset object."
+        )
+
+    print(f"Dataset loaded from {filename}")
+    return dataset