A replication of the Gaussian Process regression implementation from chapter 5 of Surrogates. Application of the same code to real data.
Create some dummy data being an independent and dependent variable, along with a grid of the independent variable values.
# Training data
<- 8
n <- matrix(seq(0, 2*pi, length=n), ncol=1) # independent variable
X <- sin(X) # dependent variable
y
# Predictive grid
<- matrix(seq(-0.5, 2*pi + 0.5, length=100), ncol=1) XX
Using inverse exponentiated squared distance.
Note that the first three lines below can be replicated with
D <- plgp::distance(X)
.
<- dist(X, diag = T, upper = T)
D <- D**2
D <- as.matrix(D) # euclidean distance
D <- sqrt(.Machine$double.eps) # nugget / jitter
eps <- exp(-D) + diag(eps, ncol(D)) # exponentiated squared euclidean distance Sigma
Covariance of testing grid data points.
<- as.matrix(dist(XX, diag = T, upper = T)**2) # Is this the same as plgp::dist?
DXX <- exp(-DXX) + diag(eps, ncol(DXX)) SXX
Covariance between testing grid and training data.
library('plgp')
<- plgp::distance(XX, X)
DX <- exp(-DX) SX
Kriging equations, mean mup
and variance
Sigmap
.
<- solve(Sigma)
Si <- SX %*% Si %*% y
mup <- SXX - SX %*% Si %*% t(SX) Sigmap
Generate Y values from the posterior/predictive distribution and plot.
<- rmvnorm(100, mup, Sigmap)
YY
# Error bars
<- mup + qnorm(0.05, 0, sqrt(diag(Sigmap)))
q1 <- mup + qnorm(0.95, 0, sqrt(diag(Sigmap)))
q2
# Plot
matplot(XX, t(YY), type="l", col="gray", lty=1, xlab="x", ylab="y")
points(X, y, pch=20, cex=2)
lines(XX, sin(XX), col="blue")
lines(XX, mup, lwd=2)
lines(XX, q1, lwd=2, lty=2, col=2)
lines(XX, q2, lwd=2, lty=2, col=2)
The procedures above are now applied to real data, that being stock valuations and fundamentals.
library(romerb)
data("stock_data")
<- stock_data
fundamental_raw rm(stock_data)
# Medical devices sector only
<- fundamental_raw[fundamental_raw$sector == 7, ]
df $log_mkt_cap <- log(df$mkt_cap)
df$log_book <- log(-df$total_equity_cln)
df<- df[df$date_stamp == as.Date('2021-06-30'), c('log_book','log_mkt_cap','log_pb','roe','leverage')]
df
# nugget / jitter
<- sqrt(.Machine$double.eps)
eps
# Training data
<- matrix(df$roe)
X <- matrix(df$log_pb)
y
# Predictive grid
<- matrix(seq(min(X), max(X), length=200), ncol=1)
XX
# Plot
plot(X, y, main = "Medical devices sector",
xlab = "Return on equity", ylab = "Price book ratio",
pch = 19, frame = FALSE)
<- plgp::distance(X)
D <- exp(-D) + diag(eps, length(X))
Sigma <- plgp::distance(XX)
DXX <- exp(-DXX) + diag(eps, length(XX))
SXX <- exp(-distance(XX, sort(X))) # note the sort, required to construct symmetric matrix at Sigmap
SX
# Kriging equations, mean mup and variance Sigmap
<- solve(Sigma)
Si <- SX %*% Si %*% y
mup <- SXX - SX %*% Si %*% t(SX)
Sigmap <- sfsmisc::posdefify(Sigmap, method = "allEVadd")
Sigmap
# Generate Y values & error bars from the posterior/predictive distribution
<- rmvnorm(100, mup, Sigmap)
YY <- mup + qnorm(0.05, 0, sqrt(diag(Sigmap)))
q1 <- mup + qnorm(0.95, 0, sqrt(diag(Sigmap))) q2
matplot(XX, t(YY), type="l", col="gray", lty=1, xlab="x", ylab="y")
#points(X, y, pch=20, cex=2)
#lines(XX, sin(XX), col="blue")
lines(XX, mup, lwd=2)
lines(XX, q1, lwd=2, lty=2, col=2)
lines(XX, q2, lwd=2, lty=2, col=2)