Cross decomposition (CD) algorithms find relations between two matrices (X, Y): specifically, the direction in X that explains the maximum variance in Y. In other words, they project both X and Y into a lower-dimensional subspace such that the covariance between transformed(X) and transformed(Y) is maximal.
Similar to Principal Component Regression (PCR), except that PCR's dimensionality reduction is unsupervised, so important information can be lost in the process. PLS is similar, but also takes the y targets into account.
PLS estimators are well suited when the predictor matrix has more variables than observations and when multicollinearity is present. (Standard linear regression would fail in this situation unless it is regularized.)
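For instance, a minimal sketch on synthetic data (the names X_wide, y_wide and the sizes are only illustrative): PLS regression fits without trouble even with more features than samples and strongly collinear columns, a setting where plain least squares is ill-posed.
import numpy as np
from sklearn.cross_decomposition import PLSRegression
rng = np.random.RandomState(0)
n_obs = 20                                    # fewer samples than features
base = rng.normal(size=(n_obs, 5))
# 10 noisy copies of the same 5 columns -> 50 nearly collinear predictors
X_wide = np.hstack([base + 0.01 * rng.normal(size=(n_obs, 5)) for _ in range(10)])
y_wide = base[:, 0] + rng.normal(scale=0.1, size=n_obs)
pls_wide = PLSRegression(n_components=2).fit(X_wide, y_wide)
print("train R^2: %.2f" % pls_wide.score(X_wide, y_wide))   # fit succeeds despite p > n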
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSCanonical as PLSC, PLSRegression as PLSR, CCA
# Dataset based on a latent variable model
n = 500
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)
latents = np.array([l1, l1, l2, l2]).T
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))
print("shapes: X:\t",X.shape,"\t Y:\t",Y.shape)
X_train = X[:n // 2]; Y_train = Y[:n // 2]
X_test = X[n // 2:]; Y_test = Y[n // 2:]
print("Corr(X):\t", np.round(np.corrcoef(X.T), 2))
print("Corr(Y):\t", np.round(np.corrcoef(Y.T), 2))
shapes: X:  (500, 4)   Y:  (500, 4)
Corr(X):
[[ 1.    0.5  -0.07 -0.02]
 [ 0.5   1.   -0.06  0.06]
 [-0.07 -0.06  1.    0.5 ]
 [-0.02  0.06  0.5   1.  ]]
Corr(Y):
[[ 1.    0.56 -0.01  0.  ]
 [ 0.56  1.   -0.03  0.02]
 [-0.01 -0.03  1.    0.49]
 [ 0.    0.02  0.49  1.  ]]
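Both correlation matrices show the intended block structure: the first two variables (driven by l1) correlate with each other at roughly 0.5, as do the last two (driven by l2), while cross-block correlations are close to zero.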
# Canonical (symmetric) PLS
# Transform data
# ~~~~~~~~~~~~~~
plsca = PLSC(n_components=2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)
# Scatter plot of scores
plt.figure(figsize=(12, 8))
plt.subplot(221)
plt.scatter(X_train_r[:, 0], Y_train_r[:, 0], label="train", marker="o", s=25)
plt.scatter(X_test_r[:, 0], Y_test_r[:, 0], label="test", marker="o", s=25)
plt.xlabel("x scores"); plt.ylabel("y scores")
plt.xticks(()); plt.yticks(())
plt.legend(loc="best")
plt.title('Comp. 1: X vs Y (test corr = %.2f)' %
np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
plt.subplot(224)
plt.scatter(X_train_r[:, 1], Y_train_r[:, 1], label="train", marker="o", s=25)
plt.scatter(X_test_r[:, 1], Y_test_r[:, 1], label="test", marker="o", s=25)
plt.xlabel("x scores"); plt.ylabel("y scores")
plt.xticks(()); plt.yticks(())
plt.legend(loc="best")
plt.title('Comp. 2: X vs Y (test corr = %.2f)' %
np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
# Off-diagonal: components 1 vs 2 within X and within Y
plt.subplot(222)
plt.scatter(X_train_r[:, 0], X_train_r[:, 1], label="train", marker="*", s=50)
plt.scatter(X_test_r[:, 0], X_test_r[:, 1], label="test", marker="*", s=50)
plt.xlabel("X comp. 1"); plt.ylabel("X comp. 2")
plt.xticks(()); plt.yticks(())
plt.legend(loc="best")
plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)'
% np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1])
plt.subplot(223)
plt.scatter(Y_train_r[:, 0], Y_train_r[:, 1], label="train", marker="*", s=50)
plt.scatter(Y_test_r[:, 0], Y_test_r[:, 1], label="test", marker="*", s=50)
plt.xlabel("Y comp. 1"); plt.ylabel("Y comp. 2")
plt.xticks(()); plt.yticks(())
plt.legend(loc="best")
plt.title('Y comp. 1 vs Y comp. 2 (test corr = %.2f)'
% np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
plt.show()
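In the resulting figure, the on-diagonal panels show that the X and Y scores of each component are strongly correlated across the two datasets, while the off-diagonal panels show that components 1 and 2 are essentially uncorrelated within X and within Y.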
PLSSVD is a simplified version of Canonical PLS: instead of iteratively deflating $X_k$ and $Y_k$, it computes the SVD of $C = X^T Y$ only once and stores the n_components singular vectors corresponding to the largest singular values (the columns of $U$ and $V$).
The transform is $\mathrm{transformed}(X) = XU$ and $\mathrm{transformed}(Y) = YV$.
If n_components == 1, PLSSVD and Canonical PLS give identical results.
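A quick sanity check, reusing X_train, Y_train and the PLSC alias from above (a sketch; the equivalence stated above implies the scores should match up to sign):
from sklearn.cross_decomposition import PLSSVD
plssvd1 = PLSSVD(n_components=1).fit(X_train, Y_train)
plsca1 = PLSC(n_components=1).fit(X_train, Y_train)
Xs_svd, Ys_svd = plssvd1.transform(X_train, Y_train)
Xs_can, Ys_can = plsca1.transform(X_train, Y_train)
# each correlation should be +/- 1 (identical scores up to sign)
print("x-score corr: %.3f" % np.corrcoef(Xs_svd[:, 0], Xs_can[:, 0])[0, 1])
print("y-score corr: %.3f" % np.corrcoef(Ys_svd[:, 0], Ys_can[:, 0])[0, 1])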
PLSRegression is similar to Canonical PLS with algorithm='nipals', but with two differences in the underlying algorithm; as a result, its predict and transform methods behave differently.
n, q, p = 1000, 3, 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5
pls2 = PLSR(n_components=3); pls2.fit(X, Y)
print("True B (such that Y = XB + Err):\t",B)
print("Estimated B:\t", np.round(pls2.coef_, 1))
pls2.predict(X)
True B (such that Y = XB + Err):
[[1 1 1]
 [2 2 2]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
Estimated B:
[[ 0.9  1.   1. ]
 [ 2.   2.   1.9]
 [ 0.   0.   0. ]
 [-0.   0.   0. ]
 [-0.   0.   0.1]
 [ 0.   0.  -0. ]
 [ 0.   0.   0. ]
 [ 0.1  0.   0. ]
 [-0.  -0.   0. ]
 [ 0.1  0.   0. ]]
array([[4.04651817, 3.93693871, 3.83864135],
       [3.75053571, 3.70600707, 3.68020728],
       [4.21630212, 4.3943549 , 4.47647869],
       ...,
       [0.56459249, 0.78448658, 0.9751541 ],
       [5.73350424, 5.46070677, 5.26380084],
       [5.92112963, 5.67410798, 5.46752561]])
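pls2.predict(X) returns an (n, q) array with one column of predictions per target; the first and last few rows are shown above. PLSRegression also handles a univariate response (the single-target case, often called PLS1):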
n,p = 1000,10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSR(n_components=3); pls1.fit(X, y)
# note: the number of components (3) exceeds the dimension of y (1)
print("Estimated betas:\t", np.round(pls1.coef_, 1))
Estimated betas:
[[ 1.]
 [ 2.]
 [-0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [-0.]
 [-0.]
 [ 0.]
 [ 0.]]
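CCA (Canonical Correlation Analysis) is another symmetric cross-decomposition estimator. Unlike Canonical PLS, which maximizes the covariance between the projected scores, CCA maximizes their correlation. Below it is fitted on the same train/test split used in the first example: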
cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
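As with Canonical PLS, the first pair of CCA scores is expected to be strongly correlated on the test set; a quick check:
print("CCA comp. 1: X vs Y (test corr = %.2f)"
      % np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])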
Goal: show how PLS can outperform PCR when the target is correlated with low-variance directions in the data.
PCR has two steps: 1) apply PCA to the training data (optionally performing dimensionality reduction by keeping only some components); 2) train a linear regression on the transformed data. The PCA step is unsupervised, so PCR may perform poorly when the target is correlated with low-variance directions.
PLS performs both the transformation and the regression. It is similar to PCR, except that the transformation is supervised.
# Create a 2-feature dataset and fit a PCA estimator to display its two
# principal components, i.e. the two directions that explain the most
# variance in the data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
rng = np.random.RandomState(0)
n_samples = 500
cov = [[3, 3],[3, 4]]
X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
pca = PCA(n_components=2).fit(X)
plt.scatter(X[:, 0], X[:, 1], alpha=.3, label='samples')
for i, (comp, var) in enumerate(zip(pca.components_,
pca.explained_variance_)):
comp = comp*var # scale component by its variance explanation power
plt.plot([0, comp[0]],
[0, comp[1]],
label=f"Component {i}",
linewidth=5,
color=f"C{i + 2}")
plt.gca().set(aspect='equal',
title="2-dimensional dataset with principal components",
xlabel='first feature', ylabel='second feature')
plt.legend()
plt.show()
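Now define a target y that is strongly correlated with the direction of the second principal component (which has low variance), plus some noise: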
y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
axes[0].scatter(X.dot(pca.components_[0]), y, alpha=.3)
axes[1].scatter(X.dot(pca.components_[1]), y, alpha=.3)
axes[0].set(xlabel='Projected data onto 1st PCA component', ylabel='y')
axes[1].set(xlabel='Projected data onto 2nd PCA component', ylabel='y')
plt.tight_layout()
plt.show()
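The two panels show that y varies mainly along the projection onto the second component (a low-variance direction of X), while it has little linear relation to the projection onto the first component.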
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
pcr = make_pipeline(StandardScaler(),
PCA(n_components=1),
LinearRegression()); pcr.fit(X_train, y_train)
pca = pcr.named_steps['pca'] # retrieve the PCA step of the pipeline
pls = PLSRegression(n_components=1); pls.fit(X_train, y_train)
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
axes[0].scatter(pca.transform(X_test), y_test, alpha=.3, label='ground truth')
axes[0].scatter(pca.transform(X_test), pcr.predict(X_test), alpha=.3, label='predictions')
axes[1].scatter(pls.transform(X_test), y_test, alpha=.3, label='ground truth')
axes[1].scatter(pls.transform(X_test), pls.predict(X_test), alpha=.3, label='predictions')
axes[0].set(xlabel='Projected data onto 1st PCA component', ylabel='y', title='PCR / PCA')
axes[1].set(xlabel='Projected data onto 1st PLS component', ylabel='y', title='PLS')
axes[0].legend(); axes[1].legend()  # legend after the labeled scatters so handles exist
plt.tight_layout(); plt.show()
print(f"PRC r^2 score:\t {pcr.score(X_test,y_test):.3f}")
print(f"PLS r^2 score:\t {pls.score(X_test,y_test):.3f}")
PCR r^2 score:   -0.026
PLS r^2 score:   0.658
pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
pca_2.fit(X_train, y_train)
print(f"PCR r^2 score (2 components):\t {pca_2.score(X_test, y_test):.3f}")
PCR r^2 score (2 components): 0.673
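With both components retained, PCR no longer discards the low-variance direction that y depends on, so its score becomes comparable to that of PLS with a single component.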