# pac_cv_gcv.py
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
# Data generation process
# Fix some general setup variables
# sample size n
n = 100
# Monte Carlo iterations J
J = 10
# number of repetitions for the CV exercise J_cv
J_cv = 1
# noise variance
sig2 = 0.3
# Define colors
col_pink = 'hotpink'
col_blue = 'b'
col_red = 'indianred'
col_navy = 'navy'
## 1. Generate $X \sim U[0,1]$
np.random.seed(201053)
x = np.random.uniform(0, 1, n)
x.sort()
## 2. Generate $y_i = \sin^3(2\pi x_i^3) + \varepsilon_i, i\in [1,n]$, where $\varepsilon_i \sim N(0,\sigma_\varepsilon^2)$ and $\sigma^2_\varepsilon = 0.3$
# 2a) Generate $\varepsilon_i \sim N(0,\sigma_\varepsilon^2)$ with $\sigma^2_\varepsilon = 0.3$
# Only for illustration - regenerated in next block!
sigma = np.sqrt(sig2)
np.random.seed(201053)
eps = np.random.normal(0, sigma, n)
# 2b) Generate $y_i = \sin^3(2\pi x_i^3) + \varepsilon_i$ for $i\in 1:n$
# true regression function f(x) = sin^3(2*pi*x^3)
def f(x):
    return (np.sin(2*np.pi*(x**3)))**3
# add N(0, sigma^2) noise to the signal values f
def gendata(f, sigma, n):
    return f + np.random.normal(0, sigma, n)
f_x = f(x)
np.random.seed(201053)
y = gendata(f_x, sigma, n)
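# Quick sanity check of the signal-to-noise setting (illustrative only, not part of the
# original pipeline): the signal sin^3(2*pi*x^3) lives in [-1, 1], while the noise has
# standard deviation sqrt(0.3), roughly 0.55.
print('noise sd:', sigma, 'signal sd:', np.std(f_x))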
## Data plot
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'o', color=col_red)
ax.plot(x, f_x, '-', color=col_blue, lw=3)
# Kernel regression smoothing with NW kernel estimator
## Smoothing with Gaussian Kernel
# Nadaraya-Watson estimator at points t, a mesh of grid points
def nw_inner(t, x, y, h):
    # Gaussian kernel weights at the evaluation point t
    K_h = norm.pdf((t - x)/h)/h
    den = sum(K_h)
    w = K_h/den
    nw = np.dot(w, y)
    return nw
# f_h estimation on a mesh of points ts
def fh(ts, x, y, h):
    return [nw_inner(t, x, y, h) for t in ts]
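# fh() above implements the Nadaraya-Watson estimator
#   f_h(t) = sum_i K_h(t - x_i) y_i / sum_j K_h(t - x_j),  K_h(u) = phi(u/h)/h.
# A minimal vectorized sketch of the same estimator, assuming ts and x are 1-d arrays
# (the name fh_vectorized is ours and is not used below; it should agree with fh()
# up to floating-point error):
def fh_vectorized(ts, x, y, h):
    K = norm.pdf((np.asarray(ts)[:, None] - x[None, :])/h)/h  # kernel matrix, shape (len(ts), n)
    W = K/K.sum(axis=1, keepdims=True)                        # rows of NW weights summing to 1
    return W @ y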
# Trial bandwidth for the Gaussian kernel; note that in terms of the uniform (boxcar) kernel
# it has to be multiplied by 1.35/0.77, i.e. for h = 0.01 we are averaging over the window (t - 0.0175, t + 0.0175)
# TODO (LM): change the NW smoother!
h = 0.02
# NW estimation
f_h = fh(x, x, y, h)
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'o', color=col_red)
ax.plot(x, f_h, '-', color=col_blue, lw=3)
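# Usage note (illustrative, not part of the original pipeline): fh() can be evaluated on any
# mesh, not only at the sample points, e.g. on a finer grid for a smoother curve. The names
# ts_fine and f_fine are ours:
ts_fine = np.linspace(0, 1, 200)
f_fine = fh(ts_fine, x, y, h)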
# Bias-variance decomposition
# Estimation for a grid of bandwidths, here 25 values $h \in [0.01, 0.3]$
h = np.linspace(0.01, 0.3, 25)
# Repeat data generation J times
f_h = np.zeros( (n, len(h)) )
E_fh = np.zeros( (n, len(h)) )
bias = np.zeros( (n, len(h)) )
variance = np.zeros( (n, len(h)) )
CV = np.zeros( len(h) )
CV_all = np.zeros( (J_cv, len(h)) ) # collects the CVs over simulations
L_emp = np.zeros( (J_cv, len(h)) ) # collects the emp error over simulations
# Computation of bias
np.random.seed(201053)
for j in range(J):
    y = gendata(f_x, sigma, n)
    for i in range(len(h)):
        f_h[:,i] = fh(x, x, y, h[i])
        E_fh[:,i] = E_fh[:,i] + f_h[:,i]
    # end h loop
# end J loop
E_fh = E_fh/J
# now calculate the bias
for i in range(len(h)):
    bias[:,i] = E_fh[:,i] - f_x
# end h loop
bias2 = bias**2
Int_bias_h = np.sum(bias2, axis = 0)/n
np.random.seed(201053)
for j in range(J):
    y = gendata(f_x, sigma, n)
    for i in range(len(h)):
        f_h[:,i] = fh(x, x, y, h[i])
        variance[:,i] = variance[:,i] + (f_h[:,i] - E_fh[:,i])**2
    # end h loop
# end J loop
variance = variance/J
Int_var_h = np.sum(variance, axis = 0)/n
# Plot bias-variance-tradeoff
fig, ax1 = plt.subplots()
ax1.set_xlabel('bandwidth h')
ax1.set_ylabel('squared bias', color=col_blue)
ax1.plot(h, Int_bias_h, color=col_blue, lw = 3)
ax1.tick_params(axis='y', labelcolor=col_blue)
ax2 = ax1.twinx()
col_red = 'tab:red'
ax2.set_ylabel('variance', color=col_red)
ax2.plot(h, Int_var_h, color=col_red, lw = 3)
ax2.tick_params(axis='y', labelcolor=col_red)
fig.tight_layout()
plt.show()
plt.plot(h, Int_bias_h + Int_var_h, color = col_navy, lw=3)
plt.show()
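# The curve just plotted is the Monte Carlo estimate of the integrated squared bias plus
# integrated variance, i.e. an estimate of the (average) MISE over the design points.
# As a small illustrative addition (the name h_mise is ours), the bandwidth minimizing
# this estimate can be read off with argmin:
h_mise = h[np.argmin(Int_bias_h + Int_var_h)]
print('MISE-minimizing bandwidth (Monte Carlo estimate):', h_mise)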
## Cross-validation (leave-one-out)
np.random.seed(201053)
for j in range(J_cv):
    y = gendata(f_x, sigma, n)
    for i in range( len(h) ):
        f_l = 0
        for l in range( len(x) ):
            # leave observation l out and predict it from the remaining n-1 points
            x_l = np.delete(x, l, axis = 0)
            y_l = np.delete(y, l, axis = 0)
            u = fh([x[l]], x_l, y_l, h[i])[0]
            f_l = f_l + (y[l] - u)**2
        # end l loop over obs
        CV[i] = f_l
        CV_all[j, i] = CV[i]/n
    # end h loop
# end J loop
CV_mean = np.mean( CV_all, axis = 0 )
plt.plot(h, CV_mean, lw = 3, color = col_pink)
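# For a linear smoother y_hat = S(h) y, leave-one-out CV has the well-known shortcut
#   CV(h) = (1/n) * sum_i ((y_i - y_hat_i) / (1 - S_ii(h)))**2,
# which avoids refitting n times and holds exactly for the NW weights used here.
# A minimal sketch of that identity, assuming the Gaussian NW smoother defined above
# (the helper name cv_shortcut is ours, added for illustration only); for the same y it
# should reproduce the looped CV values up to floating-point error:
def cv_shortcut(x, y, h):
    K = norm.pdf((x[:, None] - x[None, :])/h)/h   # kernel matrix at the sample points
    S = K/K.sum(axis=1, keepdims=True)            # NW smoother matrix S(h)
    resid = y - S @ y
    return np.mean((resid/(1 - np.diag(S)))**2)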
np.random.seed(201053)
for j in range(J_cv):
    y = gendata(f_x, sigma, n)
    for i in range( len(h) ):
        f_h[:,i] = fh(x, x, y, h[i])
        L_emp[j,i] = np.mean( (y - f_h[:,i])**2 , axis = 0)
    # end h loop
# end J loop
# averaging over the J_cv simulations is done via the mean below
L_emp_mean = np.mean( L_emp, axis = 0 )
## Generalized Cross-Validation: GCV(h) = empirical loss / (1 - tr(S(h))/n)^2
inner_t = (norm.pdf(0)/h)/n   # approximates tr(S(h))/n for the Gaussian NW smoother
L_GCV = L_emp/( (1 - inner_t)**2 )
L_GCV_mean = np.mean( L_GCV, axis = 0 )
plt.plot(h, sig2 + np.zeros( len(h) ), color=col_pink, lw=3)
plt.plot(h, CV_mean,linestyle='dashdot', color=col_blue, lw=3)
plt.plot(h, L_emp_mean, color= col_red, lw=3)
plt.plot(h, L_GCV_mean, linestyle='dotted', color = 'green', lw=3)
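# In the plot above the dash-dot, solid and dotted curves are CV, the uncorrected empirical
# loss and GCV, and the horizontal pink line marks the true noise variance sigma^2 = 0.3.
# As a small illustrative addition (the names h_cv and h_gcv are ours), the data-driven
# bandwidth choices can be read off with argmin:
h_cv = h[np.argmin(CV_mean)]
h_gcv = h[np.argmin(L_GCV_mean)]
print('CV-minimizing bandwidth:', h_cv)
print('GCV-minimizing bandwidth:', h_gcv)
plt.show()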