-
Notifications
You must be signed in to change notification settings - Fork 0
/
example.py
136 lines (113 loc) · 4.81 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNClassifier
from binwise import RegressionToClassificationEnsemble, RegressionToClassificationModel
# Simple helper function to help us evaluate model performance using RMSE and R2 score:
def evaluate_model(model, X_test, y_test, model_name: str) -> tuple[float, float]:
    """Print and return the RMSE and R2 score of ``model`` on a test set.

    Args:
        model: Fitted estimator; only its ``predict(X_test)`` method is called.
        X_test: Test features passed to ``model.predict``.
        y_test: True target values the predictions are compared against.
        model_name: Label printed in front of the metrics.

    Returns:
        The ``(rmse, r2)`` pair that was printed, so callers can compare
        models programmatically instead of reading the values off stdout.
    """
    y_pred = model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} - RMSE: {rmse:.4f}, R2: {r2:.4f}")
    # Existing callers ignore the return value, so returning the metrics is
    # backward-compatible with the previous print-only behaviour.
    return rmse, r2
# The diabetes dataset is a small, well-known public regression dataset,
# which makes it a convenient choice for this example.
dataset = load_diabetes()
X = dataset.data
y = dataset.target

# Hold out 20% of the samples for testing; the seed is fixed so the split is
# reproducible.
# Note: a single train-test split gives only a rough estimate - consider
# cross-validation for a more reliable evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
# Let's start with our experiments!
# 1. RandomForest Regressor.
# A standard RandomForestRegressor is the baseline that the
# classification-based approaches below are compared against.
# (scikit-learn estimators return themselves from fit(), so construction and
# training can be chained.)
rf_reg = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train, y_train
)
evaluate_model(rf_reg, X_test, y_test, model_name="RandomForest")
# Output:
# RandomForest - RMSE: 54.3324, R2: 0.4428
# 2. RandomForest Classifier with RegressionToClassificationModel.
# Now wrap the *classifier* variant of RandomForest in our adapter and see how
# it performs on the same regression task. The adapter receives a zero-argument
# constructor for the underlying classifier rather than an instance.
rf_class = RegressionToClassificationModel(
    binning_strategy="Uniform",
    n_bins=10,
    model_constructor=lambda: RandomForestClassifier(
        random_state=42,
        n_estimators=100,
    ),
)
rf_class.fit(X_train, y_train)
evaluate_model(
    rf_class,
    X_test,
    y_test,
    model_name="RandomForest with RegressionToClassificationModel",
)
# Output:
# RandomForest with RegressionToClassificationModel - RMSE: 53.3076, R2: 0.4636
# As the recorded output shows, simply switching to the classification version
# already improves on the regressor baseline.
# But let's explore more.
# 3. RandomForest Classifier with RegressionToClassificationEnsemble.
# Can we do even better? Here the ensemble is configured with several bin
# sizes and both binning strategies instead of a single fixed binning:
rf_ensemble = RegressionToClassificationEnsemble(
    random_state=42,
    subsample_ratio=1.0,
    binning_strategies=["Uniform", "Quantile"],
    bin_sizes=[5, 10, 15],
    base_model_constructor=lambda: RandomForestClassifier(
        random_state=42,
        n_estimators=100,
    ),
)
rf_ensemble.fit(X_train, y_train)
evaluate_model(
    rf_ensemble,
    X_test,
    y_test,
    model_name="RandomForest with RegressionToClassificationEnsemble",
)
# Output:
# RandomForest with RegressionToClassificationEnsemble - RMSE: 52.9668, R2: 0.4705
# As the recorded output shows, the ensemble approach gives another boost in
# performance.
# 4. TabPFN with RegressionToClassificationModel.
# But what about models that don't even have a regression implementation?
# Let's try TabPFN, which has shown good performance for small classification
# datasets in benchmark studies.
# Note: We set N_ensemble_configurations=1 to disable TabPFN's internal ensembling
# NOTE(review): the `N_ensemble_configurations` keyword may be specific to
# older TabPFN releases - confirm against the installed version.
tabpfn_class = RegressionToClassificationModel(
    binning_strategy="Uniform",
    n_bins=10,
    model_constructor=lambda: TabPFNClassifier(
        N_ensemble_configurations=1,
        device="cpu",
    ),
)
tabpfn_class.fit(X_train, y_train)
evaluate_model(
    tabpfn_class,
    X_test,
    y_test,
    model_name="TabPFN with RegressionToClassificationModel",
)
# Output:
# TabPFN with RegressionToClassificationModel - RMSE: 49.3873, R2: 0.5396
# As the recorded output shows, TabPFN performs well - even better than our
# previous attempts.
# 5. TabPFN with RegressionToClassificationEnsemble
# Finally, combine TabPFN with the ensemble over different bin sizes and
# binning strategies:
# NOTE(review): the `N_ensemble_configurations` keyword may be specific to
# older TabPFN releases - confirm against the installed version.
tabpfn_ensemble = RegressionToClassificationEnsemble(
    random_state=42,
    subsample_ratio=1.0,
    binning_strategies=["Uniform", "Quantile"],
    bin_sizes=[8, 10],
    base_model_constructor=lambda: TabPFNClassifier(
        N_ensemble_configurations=1,
        device="cpu",
    ),
)
tabpfn_ensemble.fit(X_train, y_train)
evaluate_model(
    tabpfn_ensemble,
    X_test,
    y_test,
    model_name="TabPFN with RegressionToClassificationEnsemble",
)
# Output:
# TabPFN with RegressionToClassificationEnsemble - RMSE: 49.1802, R2: 0.5435
# We managed to improve again - our best performance yet!
# What's next?
# - Check out the uncertainty_example.py to see how to get and evaluate uncertainty estimates
# - Try this on your own data, especially if you have a small dataset
# - Don't forget to use proper cross-validation for more reliable results
# - Experiment with the base model parameters - maybe you can achieve even better performance