main_MLR2_RF.py
## Build a multivariate linear regression (MLR) model and a bias-correction Random Forest (RF) model
## for each coarse grid cell, to downscale an evapotranspiration dataset by a factor of 9, from 12 km to 1.5 km
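# Pipeline, per coarse grid cell: (1) fit an MLR from the 14 predictors to the target,
# (2) fit a Random Forest on the in-sample MLR bias (predicted minus target),
# (3) final downscaled estimate = MLR prediction minus the RF-predicted bias.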
import time
import random
import joblib
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import functions as functions
seed = 100
# input/output directories
load_dir_dataset = "/g/data/w97/sho561/Downscale/BARRA/Training_Testing_new/"
dir_model = "/g/data/w97/sho561/Downscale/BARRA/Models_new/"
pred_eval = "/g/data/w97/sho561/Downscale/BARRA/Prediction_Evaluation_new/"
# Initialisation
all_years = list(range(1990, 2019))  # 1990-2018 inclusive, 29 years
train_grids = [642, 714, 720, 1207, 1233, 1682, 1728, 2348, 2817, 2855, 3002, 3114, 3346, 3809, 4233, 4322, 4615, 4623, 6081, 6145]
featuresList = ['av_lat_hflx', 'av_mslp', 'av_netlwsfc', 'av_netswsfc', 'av_qsair_scrn', 'av_temp_scrn',
'av_canopy_height', 'av_uwnd10m', 'av_vwnd10m', 'av_leaf_area_index', 'soil_albedo', 'soil_porosity', 'soil_bulk_density', 'topog' ]
# number of predictors
npred = len(featuresList)
# Run 'experiment 1', i.e. training on 10 years; other experiments may be tried in the future.
experiment = 'exp1'
if experiment == 'exp1':
train_years = [1990, 1991, 1992, 1995, 1996, 2001, 2003, 2004, 2016, 2018 ]
test_years = list(set(all_years) - set(train_years))
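    # exp1: 10 training years; the remaining 19 of the 29 years are held out for testing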
# number of trees in the Random Forest
ntrees = 100
#%%
# Build a new model for each coarse grid in train_grids
for coarse_grid in train_grids:
print(coarse_grid)
    # load all years of predictors and target for this coarse grid cell
all_sample_df = pd.DataFrame()
for year in all_years:
filename_dataset = load_dir_dataset +'%s_%s_predictors_target.csv' %(coarse_grid, year)
single_year_df = pd.read_csv(filename_dataset)
        # scikit-learn estimators cannot handle NaN values, so drop incomplete rows
        single_year_df = single_year_df.dropna(axis=0)
all_sample_df = pd.concat([all_sample_df, single_year_df])
    # concatenating leaves duplicate row indices, so rebuild a clean 0..n-1 index
    all_sample_df = all_sample_df.reset_index(drop=True)
    # split the data into training and testing years
    index_testing = np.where(all_sample_df['year'].isin(test_years))
    index_training = np.where(all_sample_df['year'].isin(train_years))
    in_sample_df = all_sample_df.iloc[index_training].copy()
    out_sample_df = all_sample_df.iloc[index_testing].copy()
    # keep only the predictor columns; fit the scaler on all years so the same
    # standardisation can be reproduced for the testing rows later
    X = all_sample_df[featuresList]
    X = StandardScaler().fit_transform(X)
    X = X[index_training]
print('shape of X is ', X.shape)
    # target values for the training years, as a 1-D array
    y = all_sample_df.iloc[index_training]
    y = y['target'].to_numpy()
# build a multivariate linear regression on the training data
random.seed(seed)
regr = linear_model.LinearRegression()
regr.fit(X, y)
    # Save the trained model
filename_mlr = dir_model + "MLR_%s_%spred_%s.joblib" %(coarse_grid, npred, experiment)
joblib.dump(regr, filename_mlr, compress=3) # compression is ON
#regr = joblib.load(filename_mlr)
# predict on the training data
in_sample_df['MLR'] = regr.predict(X)
print('build a bias correction model with Random Forest')
# calculate the bias of the predictions, predicted minus target
in_sample_df['bias'] = in_sample_df['MLR'] - in_sample_df['target']
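    # The RF is trained to predict this bias from the raw (unscaled) predictors;
    # the corrected estimate is then MLR minus the predicted bias ('MLR_RF' below).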
    # keep an unfiltered copy for saving the in-sample predictions later
    in_sample_df_save = in_sample_df.copy()
    # exclude extreme-bias rows (|bias| > 50) from the Random Forest training data
    in_sample_df = in_sample_df.drop(in_sample_df[(in_sample_df.bias < -50) | (in_sample_df.bias > 50)].index)
# training data for Random Forest
train_features = in_sample_df[featuresList]
    # target for the Random Forest: the MLR bias (a 1-D Series avoids a DataConversionWarning in fit)
    train_labels = in_sample_df['bias']
# initialisation, use all the available processors
rf_model = RandomForestRegressor(n_estimators = ntrees, random_state = seed, n_jobs=-1, oob_score=True)
# keep track of the time needed to train Random Forest
start_time = time.time()
# training
rf_model.fit(train_features, train_labels)
training_duration = time.time() - start_time
print("--- %s minutes ---" % round(training_duration/60, 1))
    # save the model (serialising the forest is time consuming)
    filename_rf = dir_model + "RF_MLR_%s_trees%s_%spred_%s.joblib" %(coarse_grid, ntrees, npred, experiment)
joblib.dump(rf_model, filename_rf, compress=6) # compression is ON!
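    # A minimal reload sketch for reusing the trained pair later without refitting
    # (filenames as defined above):
    # regr = joblib.load(filename_mlr)
    # rf_model = joblib.load(filename_rf)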
    ## predict and evaluate MLR and MLR+RF on the testing data
    # standardise with a scaler fitted on the full dataset (reproducing the
    # training-time transform), then keep only the testing rows
X_test = all_sample_df[featuresList]
X_test = StandardScaler().fit_transform(X_test)
X_test = X_test[index_testing]
# make prediction
print("predicting ...")
# predict using the trained MLR, add a column for the predicted values
out_sample_df['MLR'] = regr.predict(X_test)
    # predict the bias using the trained Random Forest
features_df = out_sample_df[featuresList]
out_sample_df['bias'] = rf_model.predict(features_df)
out_sample_df['MLR_RF'] = out_sample_df['MLR'] - out_sample_df['bias']
    # Combine the in-sample and out-of-sample predictions and save them in a single file
out_sample_df = out_sample_df[['ref_fine_cell', 'year', 'month', 'day','target', 'MLR_RF', 'MLR', 'bias']]
fine_grid_cells = out_sample_df['ref_fine_cell'].unique().tolist()
in_sample_df_save['bias'] = rf_model.predict(in_sample_df_save[featuresList])
in_sample_df_save['MLR_RF'] = in_sample_df_save['MLR'] - in_sample_df_save['bias']
training_testing_df = pd.concat([in_sample_df_save[['ref_fine_cell', 'year', 'month', 'day','target', 'MLR_RF', 'MLR', 'bias']], out_sample_df])
    filename_test_1990_2018 = pred_eval + "MLR_RF_%s_%strees_pred_transfer_%s_%spred_1990_2018_%s.csv" %(coarse_grid, ntrees, coarse_grid, npred, experiment)
training_testing_df.to_csv(filename_test_1990_2018, index=False)
    # Evaluation at the testing years: compare predictions with the target at each fine grid cell
    all_mean_eval_df = pd.DataFrame()   # performance of MLR followed by RF bias correction (MLR_RF)
    all_mean_eval_df2 = pd.DataFrame()  # performance of MLR alone
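    # For each testing year, score every fine grid cell separately, then average
    # the metrics across cells so each row in the output summarises one year.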
for test_year in test_years:
all_eval_df = pd.DataFrame()
all_eval_df2 = pd.DataFrame()
for fine_grid in fine_grid_cells:
subset_test_df = out_sample_df[(out_sample_df['ref_fine_cell'] == fine_grid) & (out_sample_df['year'] == test_year)]
eval_data = functions.testing_performance(subset_test_df, 'MLR_RF', test_year)
eval_df = pd.DataFrame(eval_data)
eval_df['ref_fine_cell'] = fine_grid
all_eval_df = pd.concat([all_eval_df, eval_df])
eval_data2 = functions.testing_performance(subset_test_df, 'MLR', test_year)
eval_df2 = pd.DataFrame(eval_data2)
eval_df2['ref_fine_cell'] = fine_grid
all_eval_df2 = pd.concat([all_eval_df2, eval_df2])
mean_eval_df = pd.DataFrame(all_eval_df.mean(axis=0)).T
mean_eval_df = mean_eval_df.drop(['ref_fine_cell'], axis=1)
all_mean_eval_df = pd.concat([all_mean_eval_df, mean_eval_df])
mean_eval_df2 = pd.DataFrame(all_eval_df2.mean(axis=0)).T
mean_eval_df2 = mean_eval_df2.drop(['ref_fine_cell'], axis=1)
all_mean_eval_df2 = pd.concat([all_mean_eval_df2, mean_eval_df2])
# save the results of evaluation
filename_evaluation_mean = pred_eval +'MLR_RF_%s_mean_testing_perf_trained_on_%sgrid_%spred_%s_%s.csv' %(coarse_grid, coarse_grid, npred, ntrees, experiment)
filename_evaluation_mean2 = pred_eval +'MLR_%s_mean_testing_perf_trained_on_%sgrid_%spred_%s_%s.csv' %(coarse_grid, coarse_grid, npred, ntrees, experiment)
all_mean_eval_df.to_csv(filename_evaluation_mean, index=False)
all_mean_eval_df2.to_csv(filename_evaluation_mean2, index=False)
print("############## END #################")