Implementations

Data Generation and Problem Settings

Import the necessary packages. The library profvis is a sampling profiler that we will use to profile our code.

library(profvis)
library(tidyverse)
library(MASS)
library(microbenchmark)

First, let’s decide some parameters and settings such as the amount of training and testing data.

# Number of test points (also used for plotting)
ntest <- 500
# Squared bandwidth (characteristic length-scale squared) of the RBF kernel
sigmasq <- 1.0
# Number of training points
ntrain <- 10
# Standard deviation of the additive observation noise
sigma_n <- 0.5
# Number of samples to draw from the prior and posterior GP multivariate normals
nprior_samples <- 5
npost_samples <- 5

Define some helper functions. In particular, the kernel function is a squared exponential. The bandwidth sigmasq was fixed above; a common alternative is the median heuristic, where the bandwidth is set to the median of all pairwise distances between data points. We also define a function that applies the squared exponential to the rows of two matrices, i.e. it computes the kernel matrix \(K(X, Y)\) for two matrices \(X\) and \(Y\). Finally, we have a regression function which represents the true structure of the data. This is just a quintic polynomial for simplicity, but it can be replaced by more complicated functions.

# Squared exponential (RBF) kernel between a point x and a centre c
squared_exponential <- function(x, c, sigmasq){
  return(exp(-0.5*sum((x - c)^2) / sigmasq))
}
# Kernel matrix K(X, Xstar): entry (i, j) is k(X[i, ], Xstar[j, ])
kernel_matrix <- function(X, Xstar, sigmasq){
  K <- apply(
    X=Xstar,
    MARGIN=1,
    # for each row of Xstar, evaluate the kernel against every row of X
    FUN=function(xstar_row) apply(
      X=X,
      MARGIN=1,
      FUN=squared_exponential,
      xstar_row,
      sigmasq
      )
    )
  return(K)
}
# True regression function: a quintic polynomial
regression_function <- function(x){
  val <- (x+5)*(x+2)*(x)*(x-4)*(x-3)/10 + 2
  return(val)
}
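
As a quick sanity check of the helpers (the small matrices below are made up purely for illustration), the kernel matrix has one row per row of its first argument and one column per row of its second, with entries equal to 1 where the two rows coincide:

# Toy inputs, only to check dimensions and values of the helpers
A <- matrix(c(0, 1, 2))         # 3 x 1
B <- matrix(c(0, 2))            # 2 x 1
K_AB <- kernel_matrix(A, B, sigmasq)
dim(K_AB)                       # 3 x 2: rows index A, columns index B
K_AB[1, 1]                      # exp(0) = 1, since A[1, ] == B[1, ]
regression_function(0)          # true function value at x = 0 (here 2)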

Now let’s actually generate some data.

set.seed(12345)
# training data
xtrain <- matrix(runif(ntrain, min=-5, max=5))
ytrain <- regression_function(xtrain) + matrix(rnorm(ntrain, sd=sigma_n))
# testing data
xtest <- matrix(seq(-5,5, len=ntest))

Naive Vectorized Implementation

The first implementation we look at is naive in the sense that it translates the textbook equations literally: in particular, we invert \(K + \sigma_n^2 I\) directly instead of working with its Cholesky decomposition.

source("gp_naive.R", keep.source = TRUE)
profvis(result <- gp_naive(xtrain, ytrain, xtest, sigma_n, sigmasq))
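
The contents of gp_naive.R are not reproduced here. A minimal sketch of what such a naive implementation could look like, assuming it uses the loop-based kernel_matrix() helper from above and returns the predictive mean and covariance as a list, is:

# Hypothetical sketch of gp_naive.R: direct inversion of K + sigma_n^2 * I
gp_naive <- function(xtrain, ytrain, xtest, sigma_n, sigmasq){
  n <- nrow(xtrain)
  K   <- kernel_matrix(xtrain, xtrain, sigmasq)   # K(X, X)
  Ks  <- kernel_matrix(xtrain, xtest, sigmasq)    # K(X, X*)
  Kss <- kernel_matrix(xtest, xtest, sigmasq)     # K(X*, X*)
  # Invert the noisy kernel matrix directly (the "naive" step)
  Kinv <- solve(K + sigma_n^2 * diag(n))
  gpmean <- t(Ks) %*% Kinv %*% ytrain             # predictive mean
  gpvcov <- Kss - t(Ks) %*% Kinv %*% Ks           # predictive covariance
  return(list(gpmean, gpvcov))
}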

Online Non-Vectorized Cholesky Implementation

This implementation can also be used online. It follows the Cholesky-based approach recommended in the “Gaussian Processes for Machine Learning” book by Rasmussen and Williams.

source("gp_online.R", keep.source = TRUE)
profvis(result_online <- gp_online(xtrain, ytrain, xtest, sigma_n, sigmasq))
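
Again, gp_online.R itself is not shown. A minimal sketch, assuming it follows the usual Cholesky recipe (solving triangular systems instead of inverting) and loops over the test points one at a time, might look like:

# Hypothetical sketch of gp_online.R: Cholesky-based, one test point at a time
gp_online <- function(xtrain, ytrain, xtest, sigma_n, sigmasq){
  n <- nrow(xtrain)
  K <- kernel_matrix(xtrain, xtrain, sigmasq)
  L <- t(chol(K + sigma_n^2 * diag(n)))          # lower-triangular Cholesky factor
  alpha <- backsolve(t(L), forwardsolve(L, ytrain))
  gpmean <- numeric(nrow(xtest))
  gpvar  <- numeric(nrow(xtest))
  for (i in seq_len(nrow(xtest))){
    # kernel vector between the current test point and all training points
    kstar <- apply(xtrain, 1, squared_exponential, xtest[i, ], sigmasq)
    v <- forwardsolve(L, kstar)
    gpmean[i] <- sum(kstar * alpha)              # predictive mean
    gpvar[i]  <- 1 - sum(v^2)                    # predictive variance (k(x*, x*) = 1)
  }
  return(list(gpmean, gpvar))
}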

Online Vectorized-Kernel Cholesky Implementation

We can see that most of the time is spent computing the kernel matrix, so we look for a faster way to compute it. Expanding the squared distance as \(\lVert x - y \rVert^2 = \lVert x \rVert^2 - 2\,x^\top y + \lVert y \rVert^2\), the kernel matrix can be assembled from three matrices: the squared row norms of \(X\), the squared row norms of \(Y\), and the cross-products \(X Y^\top\).

kernel_matrix_vectorized <- function(X, sigmasq, Y=NULL){
  if (is.null(Y)){
    Y <- X
  }
  n <- nrow(X)
  m <- nrow(Y)
  # The three matrices from the expansion above: squared row norms of X,
  # squared row norms of Y (filled by row), and the cross-products X Y^T
  Xnorm <- matrix(apply(X^2, 1, sum), n, m)
  Ynorm <- matrix(apply(Y^2, 1, sum), n, m, byrow=TRUE)
  XY <- tcrossprod(X, Y)
  return(exp(-(Xnorm - 2*XY + Ynorm) / (2*sigmasq)))
}
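
As a quick check (using the data generated above), the vectorized kernel should agree with the loop-based one up to floating-point error, and microbenchmark can be used to compare their speed:

# The two kernel implementations should produce the same matrix
all.equal(
  kernel_matrix(xtrain, xtest, sigmasq),
  kernel_matrix_vectorized(xtrain, sigmasq, Y=xtest)
)
# Rough timing comparison of the loop-based and vectorized versions
microbenchmark(
  loop       = kernel_matrix(xtrain, xtest, sigmasq),
  vectorized = kernel_matrix_vectorized(xtrain, sigmasq, Y=xtest),
  times = 10
)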

Using this vectorized kernel, we get

source("gp_online_vect.R", keep.source = TRUE)
profvis(gp_online_vect(xtrain, ytrain, xtest, sigma_n, sigmasq))

We can see that this implementation uses much less memory and is much faster.
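
The script gp_online_vect.R is not shown either; presumably it is the same Cholesky-based routine with the per-test-point apply() calls replaced by a single call to the vectorized kernel, along the lines of:

# Hypothetical change relative to gp_online(): compute all cross-kernel
# columns in one call and index them inside the loop
Kstar_all <- kernel_matrix_vectorized(xtrain, sigmasq, Y=xtest)  # ntrain x ntest
# inside the loop over test points:
#   kstar <- Kstar_all[, i]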

Completely Vectorized Implementation

We can combine these ideas, the vectorized kernel computation and the Cholesky-based solve, to obtain a much faster implementation.

source("gp_completely_vectorized.R", keep.source = TRUE)
profvis(gp_completely_vectorized(xtrain, ytrain, xtest, sigma_n, sigmasq), interval=0.005)
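
The script gp_completely_vectorized.R is likewise not reproduced. A minimal sketch, assuming it combines the vectorized kernel with a single Cholesky solve and returns a list of the predictive mean and full covariance matrix (the shape expected by the plotting code below), could be:

# Hypothetical sketch of gp_completely_vectorized.R
gp_completely_vectorized <- function(xtrain, ytrain, xtest, sigma_n, sigmasq){
  n <- nrow(xtrain)
  K   <- kernel_matrix_vectorized(xtrain, sigmasq)            # K(X, X)
  Ks  <- kernel_matrix_vectorized(xtrain, sigmasq, Y=xtest)   # K(X, X*)
  Kss <- kernel_matrix_vectorized(xtest, sigmasq)             # K(X*, X*)
  L <- t(chol(K + sigma_n^2 * diag(n)))                       # lower Cholesky factor
  alpha <- backsolve(t(L), forwardsolve(L, ytrain))
  V <- forwardsolve(L, Ks)                                    # L^{-1} K(X, X*)
  gpmean <- t(Ks) %*% alpha                                   # predictive mean
  gpvcov <- Kss - crossprod(V)                                # predictive covariance
  return(list(gpmean, gpvcov))
}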

Visualizations

Before seeing training data

Before seeing the training data we only have the test inputs. Samples from the Gaussian Process prior are therefore random (smooth) functions with mean zero.

Kss <- kernel_matrix_vectorized(xtest, sigmasq)
# Sample nprior_samples Multivariate Normals with mean zero and variance-covariance
# being the kernel matrix
data.frame(x=xtest, t(mvrnorm(nprior_samples, rep(0, length=ntest), Kss))) %>% 
  setNames(c("x", sprintf("Sample %s", 1:nprior_samples))) %>% 
  gather("MVN Samples", "Value", -x) %>% 
  ggplot(aes(x=x, y=Value)) + 
    # Because diag(Kss) is all 1s, the shaded band is mean +/- 2 standard deviations, i.e. [-2, 2]
    geom_rect(xmin=-Inf, xmax=Inf, ymin=-2, ymax=2, fill="grey80") + 
    geom_line(aes(color=`MVN Samples`)) +
    geom_abline(slope=0.0, intercept=0.0, lty=2) + 
    scale_y_continuous(lim=c(-3, 3)) + 
    labs(title=paste(nprior_samples, "MVN Samples before seeing the data")) + 
    theme(plot.title=element_text(hjust=0.5))

After seeing training data

We now only need the predictive mean and the predictive covariance from the completely vectorized implementation.

# Get predictions. To predict noisy data, just add sigma_n^2 * diag(ntest)
# to the covariance matrix, as implemented in the script
results <- gp_completely_vectorized(xtrain, ytrain, xtest, sigma_n, sigmasq)

gpmean <- results[[1]]
gpvcov <- results[[2]]
# for plotting
dftrain <- data.frame(xtrain=xtrain, ytrain=ytrain)
# Plot
data.frame(x=xtest, t(mvrnorm(npost_samples, gpmean, gpvcov))) %>% 
  setNames(c("x", sprintf("Sample %s", 1:npost_samples))) %>% 
  mutate(ymin=gpmean-2*sqrt(diag(gpvcov)), ymax=gpmean+2*sqrt(diag(gpvcov)),
         gpmean=gpmean, ytrue=regression_function(xtest)) %>% 
  gather("MVN Samples", "Value", -x, -ymin, -ymax, -gpmean, -ytrue) %>% 
  ggplot(aes(x=x, y=Value)) + 
    geom_ribbon(aes(ymin=ymin, ymax=ymax), fill="grey80") + 
    geom_line(aes(color=`MVN Samples`)) +
    geom_line(aes(y=gpmean), size=1) +
    geom_line(aes(y=ytrue), color="darkred", lty=2) + 
    geom_point(data=dftrain, aes(x=xtrain, y=ytrain), color="red") +
    ggtitle(paste(npost_samples, "MVN Samples after seeing", ntrain, "data points")) + 
    theme(plot.title=element_text(hjust=0.5))

Note on Vectorized Kernel Matrix

One might wonder why, in the function kernel_matrix_vectorized(), we fill the matrix Ynorm by row. After all, R uses column-major storage, so filling by row should be inefficient. We can compare filling a matrix by row against filling it by column and then transposing, in three different cases:

  • Number of rows < Number of columns
nrows <- 100
ncols <- 2000
microbenchmark(
  rowwise=matrix(0, nrows, ncols, byrow=TRUE),
  transpose=t(matrix(0, nrows, ncols))
  )
## Unit: microseconds
##       expr     min       lq     mean   median        uq      max neval
##    rowwise 321.202 343.0085 648.0742 360.0345  697.1185 5734.401   100
##  transpose 534.310 569.7350 950.3921 608.9455 1233.3545 3354.230   100
  • Number of rows = Number of columns
nrows <- 2000
ncols <- 2000
microbenchmark(
  rowwise=matrix(0, nrows, ncols, byrow=TRUE),
  transpose=t(matrix(0, nrows, ncols))
  )
## Unit: milliseconds
##       expr      min       lq     mean   median       uq      max neval
##    rowwise 16.69890 24.08596 26.52232 24.48049 28.26676 91.81093   100
##  transpose 23.32213 29.73081 38.70225 36.65087 41.26867 96.43356   100
  • Number of rows > Number of columns
nrows <- 2000
ncols <- 100
microbenchmark(
  rowwise=matrix(0, nrows, ncols, byrow=TRUE),
  transpose=t(matrix(0, nrows, ncols))
  )
## Unit: microseconds
##       expr     min      lq     mean   median       uq      max neval
##    rowwise 378.090 383.898 470.1237 392.2410 398.4715 2512.430   100
##  transpose 564.521 579.105 688.0039 593.8405 621.8380 2687.371   100

In all three cases, filling by row with byrow=TRUE is faster than filling by column and transposing, since t() has to make an extra full copy of the matrix.
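
Inside kernel_matrix_vectorized() (with n, m and Y as defined there), the transpose-based alternative would be the line below; the byrow=TRUE version used above avoids the extra copy that t() makes:

# Slower alternative for Ynorm: fill an m x n matrix by column, then transpose,
# which copies the whole matrix a second time
Ynorm <- t(matrix(apply(Y^2, 1, sum), m, n))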