Name of Quantlet: PAC_CV_GCV
Published in: Metis
Description: 'Fitting the bandwidth of Nadaraya-Watson estimator to simulated data on an interval using cross-validation and generalised cross-validation in R and Python'
Keywords: smoothing, Nadaraya-Watson, cross-validation, generalised cross-validation, empirical error
Author: Anna Shchekina, Lili Matic, Wolfgang Karl Härdle
See also: QID-2142-SPMsimulmase
Submitted: 2021-05-17
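For reference (standard definitions, not taken from the original scripts): both the R and the Python code below compute the Nadaraya-Watson estimator
$\hat{f}_h(x) = \sum_{i=1}^{n} K_h(x - X_i) Y_i \,/\, \sum_{j=1}^{n} K_h(x - X_j)$, with $K_h(u) = K(u/h)/h$ and Gaussian kernel $K$,
and select the bandwidth $h$ by leave-one-out cross-validation,
$CV(h) = \frac{1}{n} \sum_{i=1}^{n} \big(Y_i - \hat{f}_{h,-i}(X_i)\big)^2$,
and by generalised cross-validation,
$GCV(h) = \frac{1}{n} \sum_{i=1}^{n} \big(Y_i - \hat{f}_h(X_i)\big)^2 \,/\, \big(1 - \mathrm{tr}(S_h)/n\big)^2$, where $S_h$ is the smoother matrix.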
# clear variables
rm(list = ls(all = TRUE))
# install and load packages
libraries = c("locpol", "dplyr")
lapply(libraries, function(x) if (!(x %in% installed.packages())) {
install.packages(x) })
lapply(libraries, library, quietly = TRUE, character.only = TRUE)
# specify the model
set.seed(201053)
n = 100 # number of observations
n_MC = 10 # number of Monte-Carlo iterations
sigma2 = 0.3
sigma = sqrt(sigma2)
x = runif(n) %>% sort() # uniform sample
f = sin(2*pi*x^3)^3 # true regression function
y_sim <- function() {
eps = rnorm(n) * sigma # error
y = f + eps
return(y)
}
# bandwidth grid
n_h = 25
h = seq(0.03, 0.15, length.out = n_h)
# Nadaraya-Watson estimator: evaluate at points x given data (X, Y),
# bandwidth h and kernel K (Gaussian by default)
fNW <- function(x, X, Y, h, K = dnorm) {
  # kernel weights Kx[i, j] = K((x[i] - X[j])/h)/h
  Kx <- sapply(X, function(Xi) K((x - Xi) / h) / h)
  if (is.vector(Kx)) Kx = matrix(Kx, nrow = 1) # single evaluation point
  W <- Kx / rowSums(Kx) # normalised weights
  drop(W %*% Y)
}
# compute empirical, LOO CV and GCV errors, variance and bias^2
# for given data sample and a range of bandwidths
L_CV  = numeric(n_h)
L_GCV = numeric(n_h)
L_emp = numeric(n_h)
m1_MC = matrix(0, n, n_h) # running sum of fitted values over MC samples
m2_MC = matrix(0, n, n_h) # running sum of squared fitted values over MC samples
y = y_sim()
for (k in 1:n_h) {
  # Nadaraya-Watson fit with Gaussian kernel
  fh = fNW(x = x, X = x, Y = y, h = h[k])
  # empirical (resubstitution) error
  L_emp[k] = mean((y - fh)^2)
  # leave-one-out CV error
  fh_cv = sapply(1:n, function(i) fNW(x = x[i], X = x[-i], Y = y[-i], h = h[k]))
  L_CV[k] = mean((y - fh_cv)^2)
  # GCV error: tr(S_h) approximated by K(0)/h, valid for a uniform design on [0, 1]
  tr_est = dnorm(0)/h[k]
  L_GCV[k] = L_emp[k]/(1 - tr_est/n)^2
}
# Monte-Carlo estimates of integrated squared bias and integrated variance
for (j in 1:n_MC) {
  y_MC = y_sim()
  for (k in 1:n_h) {
    fh_MC = fNW(x = x, X = x, Y = y_MC, h = h[k])
    m1_MC[, k] = m1_MC[, k] + fh_MC
    m2_MC[, k] = m2_MC[, k] + fh_MC^2
  }
}
Efh_MC   = m1_MC/n_MC                      # pointwise Monte-Carlo mean of the fit
bias2_MC = colMeans((Efh_MC - f)^2)        # integrated squared bias
var_MC   = colMeans(m2_MC/n_MC - Efh_MC^2) # integrated variance
# plot
png("errors.png", width = 900, height = 900, bg = "transparent")
plot(h, L_CV, type = "l", lwd = 3, lty = 2, col = "black", xlab = "Bandwidth h",
ylab = "", cex.lab = 2, cex.axis = 2, ylim = c(min(L_emp), max(L_CV)))
title("Choosing optimal bandwidth", cex.main = 2)
lines(h, L_emp, lwd = 3, lty = 1, col = "blue3")
abline(h = sigma2, col = "green", lwd = 1, lty = 1)
lines(h, L_GCV, lwd = 3, lty = 2, col = "red3")
legend("bottomright", c("L_emp", "L_CV", "L_GCV", "sigma^2"), lty = c(1, 2, 2, 1),
lwd = c(3, 3, 3, 1), col = c("blue3", "black", "red3", "green"), cex = 1.5)
dev.off()
png("bv.png", width = 900, height = 900, bg = "transparent")
par(mar = c(5, 4, 4, 4) + 0.3)
plot(h, var_MC, type = "l", lwd = 3, col = "red3", xlab = "Bandwidth h", ylab = "Variance",
cex.lab = 2, cex.axis = 2, ylim = c(0, max(var_MC)))
par(new = TRUE)
plot(h, bias2_MC, type = "l", lwd = 3, axes = FALSE, col = "blue3", ylab = "", xlab = "")
axis(side = 4, at = pretty(range(bias2_MC)), cex.axis = 2)
mtext("Bias^2", side = 4, line = 3, cex = 2)
title("Bias-Variance Tradeoff", cex.main = 2)
dev.off()
# choose optimal h according to CV
h_cv = h[which.min(L_CV)]
f_cv = fNW(x = x, X = x, Y = y, h = h_cv)
png("cv.png", width = 900, height = 900, bg = "transparent")
plot(x, f, type = "l",col = "blue3", lwd = 3, ylab = "",
xlab = "x", cex.lab = 2, cex.axis = 2, ylim = range(y))
title("Simulated Data Estimated with CV", cex.main = 2)
points(x, y, pch = 19, col = "red3", cex = 0.7)
lines(x, f_cv, lwd = 3)
dev.off()
# choose optimal h according to GCV
h_gcv = h[which.min(L_GCV)]
f_gcv = fNW(x = x, X = x, Y = y, h = h_gcv)
png("gcv.png", width = 900, height = 900, bg = "transparent")
plot(x, f, type = "l", col = "blue3", lwd = 3, ylab = "",
xlab = "x", cex.lab = 2, cex.axis = 2, ylim = range(y))
title("Simulated Data Estimated with GCV", cex.main = 2)
points(x, y, pch = 19, col = "red3", cex = 0.7)
lines(x, f_gcv, lwd = 3)
dev.off()
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
# Data generation process
# Fix some general setup variables
# sample size n
n = 100
# Monte Carlo iterations J
J = 10
# number of repetitions for the CV exercise J_cv
J_cv = 1
# noise variance
sig2 = 0.3
# Define colors
col_pink = 'hotpink'
col_blue = 'b'
col_red = 'indianred'
col_navy = 'navy'
## 1. Generate $X \sim U[0, 1]$
np.random.seed(201053)
x = np.random.uniform(0, 1, n)
x.sort()
## 2. Generate $y_i = \sin^3(2\pi x_i^3) + \varepsilon_i, i\in [1,n]$, where $\varepsilon_i \sim N(0,\sigma_\varepsilon^2)$ and $\sigma^2_\varepsilon = 0.3$
# 2a) Generate $\varepsilon_i \sim N(0,\sigma_\varepsilon^2)$ with $\sigma^2_\varepsilon = 0.3$ as set above
# Only for illustration - regenerated in next block!
sigma = np.sqrt(sig2)
np.random.seed(201053)
eps = np.random.normal(0, sigma, n)
# 2b) Generate $y_i = \sin^3(2\pi x_i^3) + \varepsilon_i$ for $i\in 1:n$
def f(x):
    # true regression function
    return np.sin(2*np.pi*(x**3))**3

def gendata(f, sigma, n):
    # simulate y = f(x) + eps with eps ~ N(0, sigma^2)
    return f + np.random.normal(0, sigma, n)
f_x = f(x)
np.random.seed(201053)
y = gendata(f_x, sigma, n)
## Data plot
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'o', color=col_red)
ax.plot(x, f_x, '-', color=col_blue, lw=3)
# Kernel regression smoothing with NW kernel estimator
## Smoothing with Gaussian Kernel
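# The NW estimate at a point t is a kernel-weighted average of the y_i,
#   f_h(t) = sum_i K_h(t - x_i) * y_i / sum_j K_h(t - x_j),  with K_h(u) = K(u/h)/h.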
# Nadaraya-Watson estimate at a single evaluation point t
def nw_inner(t, x, y, h):
    K_h = norm.pdf((t - x)/h)/h
    den = sum(K_h)
    w = K_h/den
    nw = np.dot(w, y)
    return nw

# f_h estimation on a mesh of evaluation points ts
def fh(ts, x, y, h):
    return [nw_inner(t, x, y, h) for t in ts]
# trial bandwidth for the Gaussian kernel; note that in terms of the uniform (box-car)
# kernel this has to be multiplied by 1.35/0.77, i.e. for h = 0.01 we are averaging
# over the window (t - 0.0175, t + 0.0175)
h = 0.02
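# A minimal illustration (not part of the original script) of the conversion quoted
# above: the equivalent box-car (uniform) bandwidth obtained by rescaling the Gaussian
# bandwidth with the ratio 1.35/0.77.
h_boxcar = h * 1.35 / 0.77   # for h = 0.02 this gives roughly 0.035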
# NW estimation
f_h = fh(x, x, y, h)
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'o', color= col_red)
ax.plot(x, f_h, '-', color=col_blue, lw=3)
# Bias-variance decomposition
# Estimation for a grid of bandwidths: 25 equally spaced values with $h \in [0.01, 0.3]$
h = np.linspace(0.01, 0.3, 25)
# Repeat data generation J times
f_h = np.zeros( (n, len(h)) )
E_fh = np.zeros( (n, len(h)) )
bias = np.zeros( (n, len(h)) )
variance = np.zeros( (n, len(h)) )
CV = np.zeros( len(h) )
CV_all = np.zeros( (J_cv, len(h)) ) # collects the CVs over simulations
L_emp = np.zeros( (J_cv, len(h)) ) # collects the emp error over simulations
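# The two Monte-Carlo passes below approximate, for every bandwidth h,
#   bias(x; h)     = E[ f_h(x) ] - f(x)
#   variance(x; h) = E[ ( f_h(x) - E[ f_h(x) ] )^2 ]
# and then integrate both over the design points:
#   Int_bias_h = (1/n) * sum_i bias(x_i; h)^2,   Int_var_h = (1/n) * sum_i variance(x_i; h)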
# Computation of bias
np.random.seed(201053)
for j in range(J):
    y = gendata(f_x, sigma, n)
    for i in range(len(h)):
        f_h[:, i] = fh(x, x, y, h[i])
        E_fh[:, i] = E_fh[:, i] + f_h[:, i]
    # end h loop
# end J loop
E_fh = E_fh/J
# now calculate the pointwise bias
for i in range(len(h)):
    bias[:, i] = E_fh[:, i] - f_x
# end h loop
bias2 = bias**2
Int_bias_h = np.sum(bias2, axis = 0)/n
np.random.seed(201053)
for j in range(J):
    y = gendata(f_x, sigma, n)
    for i in range(len(h)):
        f_h[:, i] = fh(x, x, y, h[i])
        variance[:, i] = variance[:, i] + (f_h[:, i] - E_fh[:, i])**2
    # end h loop
# end J loop
variance = variance/J
Int_var_h = np.sum(variance, axis = 0)/n
# Plot bias-variance-tradeoff
fig, ax1 = plt.subplots()
ax1.set_xlabel('bandwidth h')
ax1.set_ylabel('squared bias', color=col_blue)
ax1.plot(h, Int_bias_h, color=col_blue, lw = 3)
ax1.tick_params(axis='y', labelcolor=col_blue)
ax2 = ax1.twinx()
col_red = 'tab:red'
ax2.set_ylabel('variance', color=col_red)
ax2.plot(h, Int_var_h, color=col_red, lw = 3)
ax2.tick_params(axis='y', labelcolor=col_red)
fig.tight_layout()
plt.show()
plt.plot(h, Int_bias_h + Int_var_h, color = col_navy, lw=3)
plt.show()
## Cross-validation
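# Leave-one-out CV: for each h the NW estimator is refitted n times, each time leaving
# out one observation l, and the squared prediction errors are averaged,
#   CV(h) = (1/n) * sum_l ( y_l - f_h^{(-l)}(x_l) )^2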
np.random.seed(201053)
for j in range(J_cv):
    y = gendata(f_x, sigma, n)
    for i in range(len(h)):
        f_l = 0
        for l in range(len(x)):
            x_l = np.delete(x, l, axis=0)  # leave observation l out
            y_l = np.delete(y, l, axis=0)
            u = nw_inner(x[l], x_l, y_l, h[i])
            f_l = f_l + (y[l] - u)**2
        # end l loop over obs
        CV[i] = f_l
        CV_all[j, i] = CV[i]/n
    # end h loop
# end J loop
CV_mean = np.mean( CV_all, axis = 0 )
plt.plot(h, CV_mean, lw = 3, color = col_pink)
np.random.seed(201053)
for j in range(J_cv):
    y = gendata(f_x, sigma, n)
    for i in range(len(h)):
        f_h[:, i] = fh(x, x, y, h[i])
        L_emp[j, i] = np.mean((y - f_h[:, i])**2, axis=0)
    # end h loop
# end J loop
L_emp_mean = np.mean( L_emp, axis = 0 )
## Generalized Cross-validation
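# GCV replaces the n refits of CV by a correction of the empirical error,
#   GCV(h) = L_emp(h) / ( 1 - tr(S_h)/n )^2,
# where S_h is the NW smoother matrix; tr(S_h) is approximated by K(0)/h,
# which is valid for a uniform design on [0, 1].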
inner_t = (norm.pdf(0)/h)/n # tr( S(h)/n )
L_GCV = L_emp/( (1-inner_t)**2 )
L_GCV_mean = np.mean( L_GCV, axis = 0 )
plt.plot(h, sig2 + np.zeros(len(h)), color=col_pink, lw=3, label=r'$\sigma^2$')
plt.plot(h, CV_mean, linestyle='dashdot', color=col_blue, lw=3, label='L_CV')
plt.plot(h, L_emp_mean, color=col_red, lw=3, label='L_emp')
plt.plot(h, L_GCV_mean, linestyle='dotted', color='green', lw=3, label='L_GCV')
plt.xlabel('bandwidth h')
plt.legend()
plt.show()