# predict.py
import gzip
import os.path
import pickle

import numpy as np
import torch
import torchvision.transforms as T

import util
from train import load_data, append_coords, find_and_load_dataset_description


def predict_randomforest(x, model):
    """
    Predict with the pixelwise random forest model.
    @param x: Input data, should have shape (n_timesteps, n_predictors, n_lat, n_lon)
    @param model: A trained instance of a random forest model.
    @return: Predictions of the model on the test set, shape (n_timesteps, 1, n_lat, n_lon)
    """
    # add the coordinate channels, then flatten to one predictor row per pixel and timestep
    x = append_coords(x)
    n_timesteps, n_predvars, n_lat, n_lon = x.shape
    x = x.transpose(0, 2, 3, 1).reshape(-1, n_predvars)
predict_test = model.predict(x)
return predict_test.reshape(n_timesteps, 1, n_lat, n_lon)
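

# Illustrative usage sketch (not part of the original module): fit a small scikit-learn
# RandomForestRegressor on toy data and run predict_randomforest on it. The toy shapes,
# the sklearn import and the way the forest is fitted here are assumptions for the example;
# only append_coords and predict_randomforest come from this code base.
def _example_predict_randomforest():
    from sklearn.ensemble import RandomForestRegressor
    rng = np.random.default_rng(0)
    x = rng.normal(size=(4, 3, 5, 6))                      # (n_timesteps, n_predictors, n_lat, n_lon)
    y = rng.normal(size=(4, 1, 5, 6))
    # flatten exactly like predict_randomforest does, so the fitted forest sees the same features
    x_flat = append_coords(x).transpose(0, 2, 3, 1).reshape(4 * 5 * 6, -1)
    rf = RandomForestRegressor(n_estimators=5).fit(x_flat, y.reshape(-1))
    return predict_randomforest(x, rf)                     # -> shape (4, 1, 5, 6)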


def predict_save_randomforest_pixelwise(dataset_description, model_training_description, base_folder,
                                        model, output_folder, save_model=False):
    """
    Predict on the given dataset with an already trained pixelwise random forest model.
    Then store the model, the description and the results.
    @param dataset_description: Details on the used dataset
    @param model_training_description: Details on training and model
    @param base_folder: Folder to load the dataset from
    @param model: Trained pixelwise random forest model
    @param output_folder: Folder to save output in
    @param save_model: Whether or not to save the model
    """
assert "DATASET_FOLDER" in model_training_description.keys()
dataset_description = find_and_load_dataset_description(model_training_description["DATASET_FOLDER"],
dataset_description)
if model_training_description["CREATE_VALIDATIONSET"]:
_, _, test_ds = load_data(
dataset_description, model_training_description, base_folder)
else:
_, test_ds = load_data(dataset_description,
model_training_description, base_folder)
x_te = test_ds[:][0].numpy()
predictions = predict_randomforest(x_te, model)
descriptions = {"DATASET_DESCRIPTION": dataset_description,
"MODEL_TRAINING_DESCRIPTION": model_training_description}
s1 = util.create_hash_from_description(dataset_description)
s2 = util.create_hash_from_description(model_training_description)
folder_name = os.path.join(output_folder, s1 + s2)
predictions_file = os.path.join(folder_name, "predictions.gz")
model_file = os.path.join(folder_name, "model.gz")
descriptions_file = os.path.join(folder_name, "descriptions.gz")
if util.test_if_folder_exists(folder_name):
raise FileExistsError(
"Specified configuration of data set, model and training configuration already exists.")
else:
os.makedirs(folder_name)
print("writing predictions")
with gzip.open(predictions_file, 'wb') as f:
pickle.dump(predictions, f)
if save_model:
print("writing model")
with gzip.open(model_file, 'wb') as f:
pickle.dump(model, f)
print("writing descriptions")
with gzip.open(descriptions_file, 'wb') as f:
pickle.dump(descriptions, f)
print("done")


def predict_linreg(x, models):
    """
    Predict with the pixelwise linear regression models.
    @param x: Input data, should have shape (n_timesteps, n_predictors, n_lat, n_lon)
    @param models: 2d list containing a trained linear regression model for each pixel
    @return: Predictions of the models on the test set, shape (n_timesteps, 1, n_lat, n_lon)
    """
n_timesteps, n_predvars, n_lat, n_lon = x.shape
predict_test = np.zeros((n_timesteps, 1, n_lat, n_lon))
    # each pixel (i, j) has its own model, which predicts from that pixel's (n_timesteps, n_predictors) slice
    for i in range(x.shape[-2]):
        for j in range(x.shape[-1]):
            predict_test[..., i, j] = models[i][j].predict(x[..., i, j])
return predict_test
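

# Illustrative usage sketch (assumption): build the 2d list of per-pixel regression models with
# scikit-learn and run predict_linreg on toy data. Targets are kept 2d ((n_timesteps, 1)) so each
# model's predict output matches the (n_timesteps, 1) slice it is written into.
def _example_predict_linreg():
    from sklearn.linear_model import LinearRegression
    rng = np.random.default_rng(0)
    n_timesteps, n_predictors, n_lat, n_lon = 8, 3, 4, 5
    x = rng.normal(size=(n_timesteps, n_predictors, n_lat, n_lon))
    y = rng.normal(size=(n_timesteps, 1, n_lat, n_lon))
    models = [[LinearRegression().fit(x[..., i, j], y[..., i, j])
               for j in range(n_lon)] for i in range(n_lat)]
    return predict_linreg(x, models)                       # -> shape (n_timesteps, 1, n_lat, n_lon)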


def predict_save_linreg_pixelwise(dataset_description, model_training_description, base_folder,
                                  models, output_folder, save_model=False):
    """
    Predict on the given dataset with already trained pixelwise linear regression models.
    Then store the models, the description and the results.
    @param dataset_description: Details on the used dataset
    @param model_training_description: Details on training and model
    @param base_folder: Folder to load the dataset from
    @param models: List of lists of trained pixelwise linear regression models (one per pixel)
    @param output_folder: Folder to save output in
    @param save_model: Whether or not to save the models
    """
assert "DATASET_FOLDER" in model_training_description.keys()
dataset_description = find_and_load_dataset_description(model_training_description["DATASET_FOLDER"],
dataset_description)
if model_training_description["CREATE_VALIDATIONSET"]:
_, _, test_ds = load_data(
dataset_description, model_training_description, base_folder)
else:
_, test_ds = load_data(dataset_description,
model_training_description, base_folder)
x_te = test_ds[:][0].numpy()
predictions = predict_linreg(x_te, models)
    # dataset_description has already been resolved to the full description above
    descriptions = {"DATASET_DESCRIPTION": dataset_description,
                    "MODEL_TRAINING_DESCRIPTION": model_training_description}
    s1 = util.create_hash_from_description(dataset_description)
s2 = util.create_hash_from_description(model_training_description)
folder_name = os.path.join(output_folder, s1 + s2)
predictions_file = os.path.join(folder_name, "predictions.gz")
model_file = os.path.join(folder_name, "model.gz")
descriptions_file = os.path.join(folder_name, "descriptions.gz")
if util.test_if_folder_exists(folder_name):
raise FileExistsError(
"Specified configuration of data set, model and training configuration already exists.")
else:
os.makedirs(folder_name)
print("writing predictions")
with gzip.open(predictions_file, 'wb') as f:
pickle.dump(predictions, f)
if save_model:
print("writing model")
with gzip.open(model_file, 'wb') as f:
pickle.dump(models, f)
print("writing descriptions")
with gzip.open(descriptions_file, 'wb') as f:
pickle.dump(descriptions, f)
print("done")


def predict_pca(x, pca, pca_targets, model):
    """
    Use the trained PCAs and the regression model to predict on the test set.
    @param x: Input data, should have shape (n_timesteps, n_predictors, n_lat, n_lon)
    @return: Predictions of shape (n_timesteps, 1, n_lat, n_lon)
    """
n_timesteps, n_predvars, n_lat, n_lon = x.shape
x_test = x.reshape(n_timesteps, -1)
x_test_rescaled = pca.transform(x_test)
predict_test = model.predict(x_test_rescaled)
predict_test = pca_targets.inverse_transform(predict_test)
return predict_test.reshape((n_timesteps, 1, n_lat, n_lon))
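

# Illustrative sketch (assumption): predict_pca only relies on the scikit-learn style
# transform / inverse_transform / predict interface, so a minimal toy pipeline could look
# like this. The component counts and random data are arbitrary.
def _example_predict_pca():
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LinearRegression
    rng = np.random.default_rng(0)
    n_timesteps, n_predictors, n_lat, n_lon = 20, 3, 4, 5
    x = rng.normal(size=(n_timesteps, n_predictors, n_lat, n_lon))
    y = rng.normal(size=(n_timesteps, 1, n_lat, n_lon))
    pca = PCA(n_components=5).fit(x.reshape(n_timesteps, -1))
    pca_targets = PCA(n_components=5).fit(y.reshape(n_timesteps, -1))
    model = LinearRegression().fit(pca.transform(x.reshape(n_timesteps, -1)),
                                   pca_targets.transform(y.reshape(n_timesteps, -1)))
    return predict_pca(x, pca, pca_targets, model)         # -> (n_timesteps, 1, n_lat, n_lon)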


def predict_save_pca(dataset_description, model_training_description, base_folder,
                     pca, pca_targets, model, output_folder, save_model=False):
    """
    Predict on the given dataset with an already trained PCA method.
    Then store the model, the description and the results.
    @param save_model: Whether or not to store the model
    @param dataset_description: Description of the dataset
    @param model_training_description: Description of training and model
    @param base_folder: Folder from which the dataset is loaded
    @param output_folder: Folder in which to store output
    @param pca: Previously trained PCA of the predictor variables
    @param pca_targets: Previously trained PCA of the target variables
    @param model: Regression model operating in PCA space
    """
dataset_description = find_and_load_dataset_description(model_training_description["DATASET_FOLDER"],
dataset_description)
if model_training_description["CREATE_VALIDATIONSET"]:
_, _, test_ds = load_data(
dataset_description, model_training_description, base_folder)
else:
_, test_ds = load_data(dataset_description,
model_training_description, base_folder)
x_te = test_ds[:][0].numpy()
predictions = predict_pca(x_te, pca, pca_targets, model)
descriptions = {"DATASET_DESCRIPTION": dataset_description,
"MODEL_TRAINING_DESCRIPTION": model_training_description}
s1 = util.create_hash_from_description(dataset_description)
s2 = util.create_hash_from_description(model_training_description)
folder_name = os.path.join(output_folder, s1 + s2)
predictions_file = os.path.join(folder_name, "predictions.gz")
model_file = os.path.join(folder_name, "model.gz")
pca_file = os.path.join(folder_name, "pca.gz")
pca_targets_file = os.path.join(folder_name, "pca_targets.gz")
descriptions_file = os.path.join(folder_name, "descriptions.gz")
if util.test_if_folder_exists(folder_name):
raise FileExistsError(
"Specified configuration of dataset, model and training configuration already exists.")
else:
os.makedirs(folder_name)
print("writing predictions")
with gzip.open(predictions_file, 'wb') as f:
pickle.dump(predictions, f)
if save_model:
print("writing model")
with gzip.open(model_file, 'wb') as f:
pickle.dump(model, f)
with gzip.open(pca_file, 'wb') as f:
pickle.dump(pca, f)
with gzip.open(pca_targets_file, 'wb') as f:
pickle.dump(pca_targets, f)
print("writing descriptions")
with gzip.open(descriptions_file, 'wb') as f:
pickle.dump(descriptions, f)
print("done")


def predict_unet(x, x_loader, model_training_description, model):
    """
    Predict with a trained instance of a UNet.
    @param x: Input dataset (used to infer the shape of the targets)
    @param x_loader: Loader for x data
    @param model_training_description: Description of the model and training
    @param model: Trained UNet model
    @return: Predictions of the model on the whole dataset, same shape as the targets
    """
assert model_training_description["MODEL_TYPE"] in [
"UNet_Ico", "UNet_Flat"]
    # x[...][1] returns the full target tensor of the dataset; allocate predictions of the same shape
    predictions_model = torch.zeros_like(x[...][1])
    # loop over the test loader and fill the prediction tensor batch by batch
    model.eval()
    for idx, batch in enumerate(x_loader):
        with torch.no_grad():
            predictors = batch[0]
            predictors = predictors.to(model_training_description["DEVICE"])
            outputs = model(predictors)
            predictions_model[idx * model_training_description["BATCH_SIZE"]:(idx + 1) *
                              model_training_description["BATCH_SIZE"], ...] = outputs.cpu()
return predictions_model
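

# Illustrative sketch (assumption): predict_unet only needs a dataset whose second tensor holds
# the targets, a matching DataLoader, and the MODEL_TYPE / DEVICE / BATCH_SIZE keys of the
# description. A single Conv2d stands in for a trained UNet here; it is not the real model.
def _example_predict_unet():
    from torch.utils.data import DataLoader, TensorDataset
    stand_in_model = torch.nn.Conv2d(3, 1, kernel_size=3, padding=1)    # placeholder, not a UNet
    dataset = TensorDataset(torch.randn(8, 3, 16, 16), torch.randn(8, 1, 16, 16))
    loader = DataLoader(dataset, batch_size=4, shuffle=False)
    description = {"MODEL_TYPE": "UNet_Flat", "DEVICE": "cpu", "BATCH_SIZE": 4}
    return predict_unet(dataset, loader, description, stand_in_model)   # -> shape (8, 1, 16, 16)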


def predict_save_unet(dataset_description, model_training_description, base_folder, model, output_folder,
                      save_model=False):
    """
    Predict on the given dataset with an already trained UNet model.
    Then store the model, the description and the results.
    @param save_model: Whether or not to store the model
    @param dataset_description: Description of the dataset
    @param model_training_description: Description of training and model
    @param base_folder: Folder from which the dataset is loaded
    @param output_folder: Folder in which to store output
    @param model: Trained UNet model
    """
    if model_training_description["CREATE_VALIDATIONSET"]:
_, _, testloader, _, _, testset = load_data(
dataset_description, model_training_description, base_folder)
else:
_, testloader, _, testset = load_data(
dataset_description, model_training_description, base_folder)
dataset_description = find_and_load_dataset_description(model_training_description["DATASET_FOLDER"],
dataset_description)
predictions = predict_unet(
testset, testloader, model_training_description, model)
if model_training_description["MODEL_TYPE"] == "UNet_Flat":
predictions = T.Resize(size=dataset_description["GRID_SHAPE"])(
predictions).numpy()
descriptions = {"DATASET_DESCRIPTION": dataset_description,
"MODEL_TRAINING_DESCRIPTION": model_training_description}
s1 = util.create_hash_from_description(dataset_description)
s2 = util.create_hash_from_description(model_training_description)
folder_name = os.path.join(output_folder, s1 + s2)
predictions_file = os.path.join(folder_name, "predictions.gz")
model_file = os.path.join(folder_name, "model.gz")
descriptions_file = os.path.join(folder_name, "descriptions.gz")
if util.test_if_folder_exists(folder_name):
raise FileExistsError(
"Specified configuration of dataset, model and training configuration already exists.")
else:
os.makedirs(folder_name)
print("writing predictions")
with gzip.open(predictions_file, 'wb') as f:
pickle.dump(predictions, f)
if save_model:
print("writing model")
with gzip.open(model_file, 'wb') as f:
pickle.dump(model, f)
print("writing descriptions")
with gzip.open(descriptions_file, 'wb') as f:
pickle.dump(descriptions, f)
print("done")


def interpolate_data_between_grids(data, in_description, out_description):
"""
When doing cross-prediction, data can lie on different grids.
We use interpolation to go from one grid to another.
Latitudes and Longitudes are extracted from the descriptions of the datasets.
Assume data has shape (..., n_lats, n_lons).
@param data: Data to be interpolated
@param in_description: Description of the data set we interpolate from
@param out_description: Description of the data set we interpolate to
@return: Interpolated data set.
"""
from scipy.interpolate import RegularGridInterpolator
# for latitudes, one can specify invalid latitudes by using "LATITUDES_SLICE". These are already excluded here
lat_in = in_description["LATITUDES"]
lon_in = in_description["LONGITUDES"]
lat_out = out_description["LATITUDES"]
lon_out = out_description["LONGITUDES"]
lat_mg_out, lon_mg_out = np.meshgrid(lat_out, lon_out, indexing='ij')
ds = data.shape
# flatten everything but lat and lon
data = data.reshape(-1, ds[-2], ds[-1])
res = np.zeros((data.shape[0], len(lat_out), len(lon_out)))
for i in range(len(data)):
interp = RegularGridInterpolator(
(lat_in, lon_in), data[i], bounds_error=False, fill_value=None)
res[i] = interp((lat_mg_out, lon_mg_out))
    # reshape back so the leading dimensions match the input, now on the output grid
    return res.reshape(*ds[:-2], len(lat_out), len(lon_out))
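

# Illustrative usage sketch (assumption): only the "LATITUDES" and "LONGITUDES" entries of the
# descriptions are needed here. Regridding a toy field from a coarse to a finer grid could look
# like this; all grid sizes are arbitrary.
def _example_interpolate_between_grids():
    in_description = {"LATITUDES": np.linspace(-80, 80, 9),
                      "LONGITUDES": np.linspace(0, 350, 36)}
    out_description = {"LATITUDES": np.linspace(-75, 75, 31),
                       "LONGITUDES": np.linspace(0, 355, 72)}
    data = np.random.default_rng(0).normal(size=(2, 1, 9, 36))          # (..., n_lats, n_lons)
    return interpolate_data_between_grids(data, in_description, out_description)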