Building distance matrices.


Python set-up

Sys.setenv(RETICULATE_PYTHON = "C:/Users/brent/anaconda3/envs/STOCK_MASTER/python.exe")
reticulate::use_condaenv(condaenv = 'STOCK_MASTER', required = TRUE)
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
library('reticulate')


Libraries

import numpy as np
np.set_printoptions(precision=2, linewidth=150)
from scipy.spatial import distance


Data

X1 = np.array([[6,1,7],[10,9,4],[13,9,3],[10,8,15],[14,4,1]])
X2 = np.array([6,1,7])
print(X1, "\n\n", X2)
## [[ 6  1  7]
##  [10  9  4]
##  [13  9  3]
##  [10  8 15]
##  [14  4  1]] 
## 
##  [6 1 7]


Euclidean distance

Scipy

def ed_sc(x):
  x = x[:,None] if x.ndim == 1 else x
  D_sp = distance.pdist(x, metric='euclidean')
  Dmat = distance.squareform(D_sp, force='no', checks=True)
  return Dmat

a = ed_sc(X1)
b = ed_sc(X2)
print(a, "\n\n", b)
## [[ 0.    9.43 11.36 11.36 10.44]
##  [ 9.43  0.    3.16 11.05  7.07]
##  [11.36  3.16  0.   12.41  5.48]
##  [11.36 11.05 12.41  0.   15.1 ]
##  [10.44  7.07  5.48 15.1   0.  ]] 
## 
##  [[0. 5. 1.]
##  [5. 0. 6.]
##  [1. 6. 0.]]


Numpy method 1

SO reference

def ed_np1(x):
  x = x[:,None] if x.ndim == 1 else x
  a = np.sum(x * x, 1)
  b = np.repeat(a[:, None], x.shape[0], axis=1)
  D = b + b.T -2 * np.dot(x, x.T)
  Dmat = np.sqrt(D)
  return Dmat

a = ed_np1(X1)
b = ed_np1(X2)
print(a, "\n\n", b)
## [[ 0.    9.43 11.36 11.36 10.44]
##  [ 9.43  0.    3.16 11.05  7.07]
##  [11.36  3.16  0.   12.41  5.48]
##  [11.36 11.05 12.41  0.   15.1 ]
##  [10.44  7.07  5.48 15.1   0.  ]] 
## 
##  [[0. 5. 1.]
##  [5. 0. 6.]
##  [1. 6. 0.]]


Numpy method 2

Code adapted from here

def ed_np2(x):
  x = x[:,None] if x.ndim == 1 else x
  D = np.sum(x**2, 1).reshape(-1, 1) \
        + np.sum(x**2, 1)            \
        - 2 * np.dot(x, x.T)
  Dmat = np.sqrt(D)
  return Dmat

a = ed_np2(X1)
b = ed_np2(X2)
print(a, "\n\n", b)
## [[ 0.    9.43 11.36 11.36 10.44]
##  [ 9.43  0.    3.16 11.05  7.07]
##  [11.36  3.16  0.   12.41  5.48]
##  [11.36 11.05 12.41  0.   15.1 ]
##  [10.44  7.07  5.48 15.1   0.  ]] 
## 
##  [[0. 5. 1.]
##  [5. 0. 6.]
##  [1. 6. 0.]]


Numpy method 3

SO reference

def ed_np3(x):
  x = x[:,None] if x.ndim == 1 else x
  Dmat = np.sqrt(((x[:, :, None] - x[:, :, None].T) ** 2).sum(1))
  return Dmat

a = ed_np3(X1)
b = ed_np3(X2)
print(a, "\n\n", b)
## [[ 0.    9.43 11.36 11.36 10.44]
##  [ 9.43  0.    3.16 11.05  7.07]
##  [11.36  3.16  0.   12.41  5.48]
##  [11.36 11.05 12.41  0.   15.1 ]
##  [10.44  7.07  5.48 15.1   0.  ]] 
## 
##  [[0. 5. 1.]
##  [5. 0. 6.]
##  [1. 6. 0.]]


Mahalanobis distance

Scipy

def md_sp(x):
  x = x[:,None] if x.ndim == 1 else x
  D_sp = distance.pdist(x, metric='mahalanobis')
  Dmat = distance.squareform(D_sp, force='no', checks=True)
  return Dmat

a = md_sp(X1)
b = md_sp(X2)
print(a, "\n\n", b)
## [[0.   2.39 2.63 2.76 2.6 ]
##  [2.39 0.   1.22 2.68 2.8 ]
##  [2.63 1.22 0.   2.3  1.85]
##  [2.76 2.68 2.3  0.   2.78]
##  [2.6  2.8  1.85 2.78 0.  ]] 
## 
##  [[0.   1.56 0.31]
##  [1.56 0.   1.87]
##  [0.31 1.87 0.  ]]

Numpy

def md_np(x):
  dec = np.linalg.cholesky(np.cov(x, rowvar=False))
  tmp = np.linalg.solve(dec, x.T)
  tmp1 = tmp.T
  return ed_np3(tmp1)

print(md_np(X1))
## [[0.   2.39 2.63 2.76 2.6 ]
##  [2.39 0.   1.22 2.68 2.8 ]
##  [2.63 1.22 0.   2.3  1.85]
##  [2.76 2.68 2.3  0.   2.78]
##  [2.6  2.8  1.85 2.78 0.  ]]


R

md_r <- function(x) {
  dec  <- chol( cov(x) )
  tmp0 <- forwardsolve(t(dec), t(x) )
  tmp1 <- t(tmp0)
  lower_tri <- dist(tmp1)
  m <- matrix(NA, nrow(x), nrow(x))
  m[lower.tri(m)] <- lower_tri
  m[upper.tri(m)] <- t(m)[upper.tri(m)]
  diag(m) <- 0
  m
}

d <- matrix(c(6,1,7,10,9,4,13,9,3,10,8,15,14,4,1), ncol = 3, byrow= TRUE)
md_r(d)
##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 0.000000 2.388123 2.631810 2.762811 2.603338
## [2,] 2.388123 0.000000 1.220222 2.678094 2.798580
## [3,] 2.631810 1.220222 0.000000 2.303122 1.847275
## [4,] 2.762811 2.678094 2.303122 0.000000 2.783878
## [5,] 2.603338 2.798580 1.847275 2.783878 0.000000