The fit method learns $n$ components from the training data. whiten=True enables projecting the data onto the singular space while scaling each component to unit variance. (A short sketch of whitening follows the example below.)
import numpy as np
from sklearn.decomposition import PCA
X = np.array([[-1, -1], [-2, -1], [-3, -2],
[ 1, 1], [ 2, 1], [ 3, 2]])
pca = PCA(n_components=2).fit(X)
pca2 = PCA(n_components=2, svd_solver='full').fit(X)
pca3 = PCA(n_components=1, svd_solver='arpack').fit(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
print(pca2.explained_variance_ratio_)
print(pca2.singular_values_)
print(pca3.explained_variance_ratio_)
print(pca3.singular_values_)
[0.99244289 0.00755711]
[6.30061232 0.54980396]
[0.99244289 0.00755711]
[6.30061232 0.54980396]
[0.99244289]
[6.30061232]
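As a hedged sketch of the whiten=True behaviour described above (reusing the same toy X; the variance check is only for illustration):
import numpy as np
from sklearn.decomposition import PCA
X = np.array([[-1, -1], [-2, -1], [-3, -2],
              [ 1, 1], [ 2, 1], [ 3, 2]])
# With whiten=True the projected data are rescaled so that each
# component has unit variance (the singular values are divided out).
pca_w = PCA(n_components=2, whiten=True).fit(X)
X_w = pca_w.transform(X)
print(X_w.std(axis=0, ddof=1))  # expected to be close to [1. 1.]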
import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from sklearn.decomposition import PCA, FactorAnalysis as FA
from sklearn.covariance import ShrunkCovariance as SC, LedoitWolf as LW
from sklearn.model_selection import cross_val_score as CVscore
from sklearn.model_selection import GridSearchCV
n_samples, n_features, rank, sigma = 1000, 50, 10, 1.0
rng = np.random.RandomState(42)
U, _, _ = linalg.svd(rng.randn(n_features,
n_features))
X = np.dot(rng.randn(n_samples, rank),
U[:, :rank].T)
# Adding homoscedastic & heteroscedastic noise
sigmas = sigma * rng.rand(n_features) + sigma / 2.
X_homo = X + sigma * rng.randn(n_samples, n_features)
X_hetero = X + sigmas * rng.randn(n_samples, n_features)
n_components = np.arange(0, n_features, 5) # options for n_components
def compute_scores(X):
pca = PCA(svd_solver='full')
fa = FA()
pca_scores, fa_scores = [], []
for n in n_components:
pca.n_components = n
fa.n_components = n
pca_scores.append(np.mean(CVscore(pca, X)))
fa_scores.append(np.mean(CVscore(fa, X)))
return pca_scores, fa_scores
def shrunk_cov_score(X):
shrinkages = np.logspace(-2, 0, 30)
cv = GridSearchCV(SC(), {'shrinkage': shrinkages})
return np.mean(CVscore(cv.fit(X).best_estimator_, X))
def lw_score(X):
return np.mean(CVscore(LW(), X))
for X, title in [(X_homo, 'Homoscedastic Noise'),
(X_hetero, 'Heteroscedastic Noise')]:
pca_scores, fa_scores = compute_scores(X)
n_components_pca = n_components[np.argmax(pca_scores)]
n_components_fa = n_components[np.argmax(fa_scores)]
pca = PCA(svd_solver='full',
n_components='mle').fit(X)
n_components_pca_mle = pca.n_components_
print("best n_components by PCA CV = %d" %
n_components_pca)
print("best n_components by FactorAnalysis CV = %d" %
n_components_fa)
print("best n_components by PCA MLE = %d" %
n_components_pca_mle)
plt.figure()
plt.plot(n_components, pca_scores, 'b', label='PCA scores')
plt.plot(n_components, fa_scores, 'r', label='FA scores')
plt.axvline(rank, color='g', label='TRUTH: %d' %
rank, linestyle='-')
plt.axvline(n_components_pca, color='b',
label='PCA CV: %d' %
n_components_pca, linestyle='--')
plt.axvline(n_components_fa, color='r',
label='FactorAnalysis CV: %d' %
n_components_fa,
linestyle='--')
plt.axvline(n_components_pca_mle, color='k',
label='PCA MLE: %d' %
n_components_pca_mle, linestyle='--')
# compare with other covariance estimators
plt.axhline(shrunk_cov_score(X), color='violet',
label='Shrunk Covariance MLE', linestyle='-.')
plt.axhline(lw_score(X), color='orange',
label='LedoitWolf MLE', linestyle='-.')
plt.xlabel('number of components')
plt.ylabel('CV scores')
plt.legend(loc='lower right')
plt.title(title)
best n_components by PCA CV = 10
best n_components by FactorAnalysis CV = 10
best n_components by PCA MLE = 10
best n_components by PCA CV = 45
best n_components by FactorAnalysis CV = 10
best n_components by PCA MLE = 39
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
iris = datasets.load_iris()
X, y = iris.data, iris.target
names = iris.target_names
pca = PCA(n_components=2)
lda = LDA(n_components=2)
X_r = pca.fit(X).transform(X)
X_r2 = lda.fit(X, y).transform(X)
print('explained variance ratio (1st two components): %s'
% str(pca.explained_variance_ratio_))
explained variance ratio (1st two components): [0.92461872 0.05306648]
colors = ['navy', 'turquoise', 'darkorange']
lw = 2
for color, i, name in zip(colors, [0, 1, 2], names):
plt.scatter(X_r[y == i, 0],
X_r[y == i, 1],
color=color, alpha=.8, lw=lw,
label=name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of IRIS dataset')
for color, i, name in zip(colors, [0, 1, 2], names):
plt.scatter(X_r2[y == i, 0],
X_r2[y == i, 1], alpha=.8, color=color,
label=name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA of IRIS dataset')
IncrementalPCA's partial_fit method uses subsets of the data fetched sequentially from disk or over a network; its fit method can also be called on a sparse matrix or a memory-mapped file created with numpy.memmap. The explained_variance_ratio_ is estimated incrementally. (A minimal partial_fit sketch follows the example below.)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA as IPCA
iris = load_iris()
X, y, n = iris.data, iris.target, 2
ipca = IPCA(n_components=n, batch_size=10)
pca = PCA(n_components=n)
X_ipca = ipca.fit_transform(X)
X_pca = pca.fit_transform(X)
colors = ['navy', 'turquoise', 'darkorange']
for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
plt.figure(figsize=(8, 8))
for color, i, name in zip(colors, [0, 1, 2],
iris.target_names):
plt.scatter(X_transformed[y == i, 0],
X_transformed[y == i, 1],
color=color, lw=2,
label=name)
if "Incremental" in title:
err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
plt.title(title + " of iris dataset\nMean absolute unsigned error "
"%.6f" % err)
else:
plt.title(title + " of iris dataset")
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.axis([-4, 4, -1.5, 1.5])
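The example above fits the whole dataset in memory. As a minimal sketch of the out-of-core workflow described earlier, assuming a synthetic array standing in for data streamed from disk or a network, partial_fit can be fed chunks one at a time:
import numpy as np
from sklearn.decomposition import IncrementalPCA
rng = np.random.RandomState(0)
X_big = rng.randn(1000, 20)  # stand-in for data streamed from disk/network
ipca = IncrementalPCA(n_components=2)
for batch in np.array_split(X_big, 10):  # feed 10 chunks sequentially
    ipca.partial_fit(batch)
X_low = ipca.transform(X_big)
print(X_low.shape)  # (1000, 2)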
Use case: projecting data to a lower-dimensional space while preserving most of the variance. This is done by dropping the singular vectors of components associated with lower singular values.
Example: a 64x64 pixel gray-level picture (for face recognition) has a dimensionality of 4096, which makes SVM training very slow. Since most human faces look somewhat alike, it makes sense to use PCA to project down to ~200 dimensions.
In other words: if we're going to drop most of the vectors, it's more efficient to limit the computation to an approximation of the vectors we will keep. This is accomplished with svd_solver="randomized".
The memory footprint is much smaller than that of exact PCA: $2 \cdot n_{\max} \cdot n_{\mathrm{components}}$ vs $n_{\max} \cdot n_{\min}$.
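A minimal sketch of the randomized solver, assuming a synthetic matrix with illustrative sizes (not the face data used later):
import numpy as np
from time import time
from sklearn.decomposition import PCA
rng = np.random.RandomState(0)
X = rng.randn(1000, 500)  # 1000 samples, 500 features (illustrative)
t0 = time()
pca_full = PCA(n_components=20, svd_solver='full').fit(X)
t_full = time() - t0
t0 = time()
pca_rand = PCA(n_components=20, svd_solver='randomized',
               random_state=0).fit(X)
t_rand = time() - t0
# The randomized solver only approximates the top components, so it is
# typically faster when n_components << min(n_samples, n_features).
print("full: %.3fs  randomized: %.3fs" % (t_full, t_rand))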
SparsePCA finds components that reconstruct the data well while staying sparse, by solving an $\ell_1$-penalized problem:
$\begin{split}\min_{U, V} \; & ||X-UV||_2^2 + \alpha||V||_1 \\
\text{subject to } & ||U_k||_2 = 1 \text{ for all }
0 \leq k < n_{\mathrm{components}}\end{split}$
The amount of sparseness is controlled by the hyperparameter alpha. (Small values give gentle regularization; large values shrink many coefficients to zero; a short alpha sweep follows the example below.)
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.decomposition import SparsePCA
X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)
transformer = SparsePCA(n_components=5,
random_state=0).fit(X)
X_transformed = transformer.transform(X)
print(X_transformed.shape)
print(np.mean(transformer.components_ == 0))
(200, 5)
0.9666666666666667
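To see the effect of alpha described above, a small hedged sweep (the alpha values here are arbitrary choices) reports the fraction of exactly-zero loadings:
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.decomposition import SparsePCA
X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)
for alpha in (0.1, 1.0, 5.0):
    spca = SparsePCA(n_components=5, alpha=alpha, random_state=0).fit(X)
    # larger alpha -> more loadings driven exactly to zero
    print(alpha, np.mean(spca.components_ == 0))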
from time import time
import logging
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as TTS
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report as CR
from sklearn.metrics import confusion_matrix as CM
from sklearn.decomposition import PCA
from sklearn.svm import SVC
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
n_samples, h, w = lfw_people.images.shape
X = lfw_people.data
n_features = X.shape[1]
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]
print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7
X_train, X_test, y_train, y_test = TTS(
X, y, test_size=0.25, random_state=42)
n_components = 150
t0 = time()
pca = PCA(n_components=n_components,
svd_solver='randomized',
whiten=True).fit(X_train)
print("PCA fit done in %0.3fs" % (time() - t0))
eigenfaces = pca.components_.reshape((n_components, h, w))
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("PCA transform done in %0.3fs" % (time() - t0))
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(
SVC(kernel='rbf', class_weight='balanced'),
param_grid).fit(X_train_pca,
y_train)
print("CV done in %0.3fs" % (time() - t0))
print("Best estimator:", clf.best_estimator_)
PCA fit done in 0.706s
PCA transform done in 0.044s
CV done in 11.058s
Best estimator: SVC(C=1000.0, class_weight='balanced', gamma=0.005)
t0 = time()
y_pred = clf.predict(X_test_pca)
print("Prediction quality on test data: done in %0.3fs" % (time() - t0))
print(CR(y_test, y_pred, target_names=target_names))
print(CM(y_test, y_pred, labels=range(n_classes)))
Prediction quality on test data: done in 0.038s
                   precision    recall  f1-score   support

     Ariel Sharon       0.80      0.62      0.70        13
     Colin Powell       0.82      0.88      0.85        60
  Donald Rumsfeld       0.86      0.67      0.75        27
    George W Bush       0.85      0.98      0.91       146
Gerhard Schroeder       0.95      0.80      0.87        25
      Hugo Chavez       1.00      0.53      0.70        15
       Tony Blair       1.00      0.81      0.89        36

         accuracy                           0.87       322
        macro avg       0.90      0.75      0.81       322
     weighted avg       0.87      0.87      0.86       322

[[  8   2   0   3   0   0   0]
 [  1  53   1   5   0   0   0]
 [  1   1  18   7   0   0   0]
 [  0   3   0 143   0   0   0]
 [  0   1   0   4  20   0   0]
 [  0   3   0   3   1   8   0]
 [  0   2   2   3   0   0  29]]
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
plt.figure(figsize=(1.8*n_col, 2.4*n_row))
plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
for i in range(n_row*n_col):
plt.subplot(n_row, n_col, i+1)
plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
plt.title(titles[i], size=12)
plt.xticks(())
plt.yticks(())
def title(y_pred, y_test, target_names, i):
pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
# plot predictions on the test set
prediction_titles = [title(y_pred, y_test, target_names, i)
for i in range(y_pred.shape[0])]
plot_gallery(X_test, prediction_titles, h, w)
# plot the most significant eigenfaces
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)