The fit method learns $n$ components. whiten=True enables projecting the data onto the singular space while scaling each component to unit variance.
import numpy as np
from sklearn.decomposition import PCA
X = np.array([[-1, -1], [-2, -1], [-3, -2],
[ 1, 1], [ 2, 1], [ 3, 2]])
pca = PCA(n_components=2).fit(X)
pca2 = PCA(n_components=2, svd_solver='full').fit(X)
pca3 = PCA(n_components=1, svd_solver='arpack').fit(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)
print(pca2.explained_variance_ratio_)
print(pca2.singular_values_)
print(pca3.explained_variance_ratio_)
print(pca3.singular_values_)
[0.99244289 0.00755711]
[6.30061232 0.54980396]
[0.99244289 0.00755711]
[6.30061232 0.54980396]
[0.99244289]
[6.30061232]
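A quick sketch of the whiten=True behavior described above (reusing the toy X and the PCA import from this snippet): the projected components come back with unit variance.
# Sketch: whiten=True rescales each projected component to unit variance.
pca_w = PCA(n_components=2, whiten=True).fit(X)
X_w = pca_w.transform(X)
print(X_w.std(axis=0, ddof=1))  # expect values close to [1. 1.]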
import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from sklearn.decomposition import PCA, FactorAnalysis as FA
from sklearn.covariance import ShrunkCovariance as SC, LedoitWolf as LW
from sklearn.model_selection import cross_val_score as CVscore
from sklearn.model_selection import GridSearchCV
n_samples, n_features, rank, sigma = 1000, 50, 10, 1.0
rng = np.random.RandomState(42)
U, _, _ = linalg.svd(rng.randn(n_features,
n_features))
X = np.dot(rng.randn(n_samples, rank),
U[:, :rank].T)
# Adding homoscedastic & heteroscedastic noise
sigmas = sigma * rng.rand(n_features) + sigma / 2.
X_homo = X + sigma * rng.randn(n_samples, n_features)
X_hetero = X + sigmas * rng.randn(n_samples, n_features)
n_components = np.arange(0, n_features, 5) # options for n_components
def compute_scores(X):
pca = PCA(svd_solver='full')
fa = FA()
pca_scores, fa_scores = [], []
for n in n_components:
pca.n_components = n
fa.n_components = n
pca_scores.append(np.mean(CVscore(pca, X)))
fa_scores.append(np.mean( CVscore(fa, X)))
return pca_scores, fa_scores
def shrunk_cov_score(X):
shrinkages = np.logspace(-2, 0, 30)
cv = GridSearchCV(SC(), {'shrinkage': shrinkages})
return np.mean(CVscore(cv.fit(X).best_estimator_, X))
def lw_score(X):
return np.mean(CVscore(LW(), X))
for X, title in [(X_homo, 'Homoscedastic Noise'),
(X_hetero, 'Heteroscedastic Noise')]:
pca_scores, fa_scores = compute_scores(X)
n_components_pca = n_components[np.argmax(pca_scores)]
n_components_fa = n_components[np.argmax(fa_scores)]
pca = PCA(svd_solver='full',
n_components='mle').fit(X)
n_components_pca_mle = pca.n_components_
print("best n_components by PCA CV = %d" %
n_components_pca)
print("best n_components by FactorAnalysis CV = %d" %
n_components_fa)
print("best n_components by PCA MLE = %d" %
n_components_pca_mle)
plt.figure()
plt.plot(n_components, pca_scores, 'b', label='PCA scores')
plt.plot(n_components, fa_scores, 'r', label='FA scores')
plt.axvline(rank, color='g', label='TRUTH: %d' %
rank, linestyle='-')
plt.axvline(n_components_pca, color='b',
label='PCA CV: %d' %
n_components_pca, linestyle='--')
plt.axvline(n_components_fa, color='r',
label='FactorAnalysis CV: %d' %
n_components_fa,
linestyle='--')
plt.axvline(n_components_pca_mle, color='k',
label='PCA MLE: %d' %
n_components_pca_mle, linestyle='--')
# compare with other covariance estimators
plt.axhline(shrunk_cov_score(X), color='violet',
label='Shrunk Covariance MLE', linestyle='-.')
plt.axhline(lw_score(X), color='orange',
label='LedoitWolf MLE', linestyle='-.')
plt.xlabel('nb of components')
plt.ylabel('CV scores')
plt.legend(loc='lower right')
plt.title(title)
best n_components by PCA CV = 10
best n_components by FactorAnalysis CV = 10
best n_components by PCA MLE = 10
best n_components by PCA CV = 45
best n_components by FactorAnalysis CV = 10
best n_components by PCA MLE = 39
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
iris = datasets.load_iris()
X,y = iris.data, iris.target
names = iris.target_names
pca = PCA(n_components=2)
lda = LDA(n_components=2)
X_r = pca.fit(X).transform(X)
X_r2 = lda.fit(X, y).transform(X)
print('explained variance ratio (1st two components): %s'
% str(pca.explained_variance_ratio_))
explained variance ratio (1st two components): [0.92461872 0.05306648]
colors = ['navy', 'turquoise', 'darkorange']
lw = 2
for color, i, name in zip(colors, [0, 1, 2], names):
plt.scatter(X_r[y == i, 0],
X_r[y == i, 1],
color=color, alpha=.8, lw=lw,
label=name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of IRIS dataset')
for color, i, name in zip(colors, [0, 1, 2], names):
plt.scatter(X_r2[y == i, 0],
X_r2[y == i, 1], alpha=.8, color=color,
label=name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA of IRIS dataset')
partial_fit uses subsets of data fetched sequentially from disk or a network. Calling the fit method on a sparse matrix or a memory-mapped file (via numpy.memmap) is also supported. explained_variance_ratio_ is computed incrementally. (A minimal partial_fit sketch follows the example below.)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA as IPCA
iris = load_iris()
X,y,n = iris.data, iris.target, 2
ipca = IPCA(n_components=n, batch_size=10)
pca = PCA(n_components=n)
X_ipca = ipca.fit_transform(X)
X_pca = pca.fit_transform(X)
colors = ['navy', 'turquoise', 'darkorange']
for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
plt.figure(figsize=(8, 8))
for color, i, name in zip(colors, [0, 1, 2],
iris.target_names):
plt.scatter(X_transformed[y == i, 0],
X_transformed[y == i, 1],
color=color, lw=2,
label=name)
if "Incremental" in title:
err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
plt.title(title + " of iris dataset\nMean absolute unsigned error "
"%.6f" % err)
else:
plt.title(title + " of iris dataset")
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.axis([-4, 4, -1.5, 1.5])
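The example above calls fit_transform on the full dataset; a minimal out-of-core sketch (the chunking is purely illustrative) feeds partial_fit one batch at a time:
# Sketch: out-of-core fitting with partial_fit; batch sizes chosen for illustration.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import IncrementalPCA
X_full = load_iris().data
ipca_stream = IncrementalPCA(n_components=2)
for batch in np.array_split(X_full, 5):           # pretend each chunk arrives separately
    ipca_stream.partial_fit(batch)
print(ipca_stream.explained_variance_ratio_)      # updated incrementally as chunks arrive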
Use case: projecting data to a low-dimensional space while preserving most of the variance. This is done by dropping the components associated with the lower singular values.
Example: a 64x64-pixel gray-level picture (for face recognition) has dimensionality 4096, which makes SVM training very slow. Since most human faces look somewhat alike, it makes sense to use PCA to project down to ~200 dimensions.
In other words: if we're going to drop most of the singular vectors, it's more efficient to limit the computation to an approximation of the vectors we will keep. This is accomplished with svd_solver="randomized".
The memory footprint is much smaller than that of exact PCA: $2 \cdot n_{\max} \cdot n_{\mathrm{components}}$ vs $n_{\max} \cdot n_{\min}$.
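A minimal sketch of requesting the randomized solver explicitly (random data; sizes chosen only for illustration):
# Sketch: approximate only the leading components with the randomized solver.
import numpy as np
from sklearn.decomposition import PCA
rng = np.random.RandomState(0)
X_big = rng.randn(1000, 500)                      # stand-in for e.g. flattened images
pca_rand = PCA(n_components=10, svd_solver='randomized', random_state=0).fit(X_big)
print(pca_rand.explained_variance_ratio_.shape)   # (10,)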
SparsePCA solves the problem
$(U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} \frac{1}{2} ||X-UV||_2^2 + \alpha||V||_1 \quad \text{subject to } ||U_k||_2 = 1 \text{ for all } 0 \leq k < n_{components}$
The degree of sparseness is controlled by alpha. (Small values = gentle regularization; large values shrink many coefficients to zero.)
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.decomposition import SparsePCA
X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)
transformer = SparsePCA(n_components=5,
random_state=0).fit(X)
X_transformed = transformer.transform(X)
print(X_transformed.shape)
print(np.mean(transformer.components_ == 0))
(200, 5) 0.9666666666666667
from time import time
import logging
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as TTS
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report as CR
from sklearn.metrics import confusion_matrix as CM
from sklearn.decomposition import PCA
from sklearn.svm import SVC
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
n_samples, h, w = lfw_people.images.shape
X = lfw_people.data
n_features = X.shape[1]
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]
print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7
X_train, X_test, y_train, y_test = TTS(
X, y, test_size=0.25, random_state=42)
n_components = 150
t0 = time()
pca = PCA(n_components=n_components,
svd_solver='randomized',
whiten=True).fit(X_train)
print("PCA fit done in %0.3fs" % (time() - t0))
eigenfaces = pca.components_.reshape((n_components, h, w))
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("PCA transform done in %0.3fs" % (time() - t0))
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(
SVC(kernel='rbf', class_weight='balanced'),
param_grid).fit(X_train_pca,
y_train)
print("CV done in %0.3fs" % (time() - t0))
print("Best estimator:", clf.best_estimator_)
PCA fit done in 0.706s
PCA transform done in 0.044s
CV done in 11.058s
Best estimator: SVC(C=1000.0, class_weight='balanced', gamma=0.005)
t0 = time()
y_pred = clf.predict(X_test_pca)
print("Prediction quality on test data: done in %0.3fs" % (time() - t0))
print(CR(y_test, y_pred, target_names=target_names))
print(CM(y_test, y_pred, labels=range(n_classes)))
Prediction quality on test data: done in 0.038s
                   precision    recall  f1-score   support

     Ariel Sharon       0.80      0.62      0.70        13
     Colin Powell       0.82      0.88      0.85        60
  Donald Rumsfeld       0.86      0.67      0.75        27
    George W Bush       0.85      0.98      0.91       146
Gerhard Schroeder       0.95      0.80      0.87        25
      Hugo Chavez       1.00      0.53      0.70        15
       Tony Blair       1.00      0.81      0.89        36

         accuracy                           0.87       322
        macro avg       0.90      0.75      0.81       322
     weighted avg       0.87      0.87      0.86       322

[[  8   2   0   3   0   0   0]
 [  1  53   1   5   0   0   0]
 [  1   1  18   7   0   0   0]
 [  0   3   0 143   0   0   0]
 [  0   1   0   4  20   0   0]
 [  0   3   0   3   1   8   0]
 [  0   2   2   3   0   0  29]]
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
plt.figure(figsize=(1.8*n_col, 2.4*n_row))
plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
for i in range(n_row*n_col):
plt.subplot(n_row, n_col, i+1)
plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
plt.title(titles[i], size=12)
plt.xticks(())
plt.yticks(())
def title(y_pred, y_test, target_names, i):
pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
# plots predictions on test set
prediction_titles = [title(y_pred, y_test, target_names, i)
for i in range(y_pred.shape[0])]
plot_gallery(X_test, prediction_titles, h, w)
# plot most significant eigenfaces
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)
import logging
from time import time
from numpy.random import RandomState as RS
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.cluster import MiniBatchKMeans
from sklearn import decomposition
n_row, n_col = 2, 3
n_components = n_row*n_col
image_shape = (64, 64)
rng = RS(0)
faces, _ = fetch_olivetti_faces(return_X_y=True, shuffle=True,
random_state=rng)
# global then local centering
n_samples, n_features = faces.shape
faces_centered = faces - faces.mean(axis=0)
faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)
print("Dataset: %d faces" % n_samples)
Dataset: 400 faces
def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
plt.figure(figsize=(2.0*n_col, 2.26*n_row))
plt.suptitle(title, size=16)
for i, comp in enumerate(images):
plt.subplot(n_row, n_col, i+1)
vmax = max(comp.max(), -comp.min())
plt.imshow(comp.reshape(image_shape), cmap=cmap,
interpolation='nearest',
vmin=-vmax, vmax=vmax)
plt.xticks(())
plt.yticks(())
plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
estimators = [
('Eigenfaces - PCA/randomized SVD',
decomposition.PCA(n_components=n_components,
svd_solver='randomized',
whiten=True),
True),
('NMF',
decomposition.NMF(n_components=n_components,
init='nndsvda',
tol=5e-3),
False),
('FastICA',
decomposition.FastICA(n_components=n_components,
whiten=True),
True),
('MiniBatchSparsePCA',
decomposition.MiniBatchSparsePCA(n_components=n_components,
alpha=0.8,
n_iter=100,
batch_size=3,
random_state=rng),
True),
('MiniBatchDictionaryLearning',
decomposition.MiniBatchDictionaryLearning(n_components=15,
alpha=0.1,
n_iter=50,
batch_size=3,
random_state=rng),
True),
('Cluster centers - MiniBatchKMeans',
MiniBatchKMeans(n_clusters=n_components,
tol=1e-3,
batch_size=20,
max_iter=50,
random_state=rng),
True),
('Factor Analysis',
decomposition.FactorAnalysis(n_components=n_components,
max_iter=20),
True),
]
plot_gallery("Olivetti faces, first centered",
faces_centered[:n_components])
Some estimators expose a pixelwise variance via the noise_variance_ attribute. PCA only provides a scalar noise_variance_ (the mean of pixelwise variance), which cannot be plotted as an image, so that case is skipped.
for name, estimator, center in estimators:
t0 = time()
data = faces
if center:
data = faces_centered
estimator.fit(data)
train_time = time()-t0
print("done in %0.3fs" % train_time)
if hasattr(estimator, 'cluster_centers_'):
components_ = estimator.cluster_centers_
else:
components_ = estimator.components_
if (hasattr(estimator, 'noise_variance_') and
estimator.noise_variance_.ndim > 0): # Skip the Eigenfaces case
plot_gallery("Pixelwise variance",
estimator.noise_variance_.reshape(1, -1),
n_col=1, n_row=1)
plot_gallery('%s - Train time %.1fs' % (name, train_time),
components_[:n_components])
done in 0.388s
done in 0.198s
done in 0.135s
done in 0.722s
done in 1.407s
done in 0.251s
done in 0.533s
from sklearn.decomposition import MiniBatchDictionaryLearning as MBDL
# dictionary learning with various positivity constraints
estimators = [
('Dictionary learning (DL)',
MBDL(n_components=15,
alpha=0.1, n_iter=50, batch_size=3,
random_state=rng), True),
('DL - positive dictionary',
MBDL(n_components=15,
alpha=0.1, n_iter=50, batch_size=3,
random_state=rng, positive_dict=True), True),
('DL - positive code',
MBDL(n_components=15,
alpha=0.1, n_iter=50, batch_size=3,
random_state=rng, positive_code=True, fit_algorithm='cd'), True),
('DL - positive dictionary & code',
MBDL(n_components=15,
alpha=0.1, n_iter=50, batch_size=3,
random_state=rng, positive_code=True, fit_algorithm='cd',
positive_dict=True), True),
]
plot_gallery("First centered Olivetti faces",
faces_centered[:n_components],
cmap=plt.cm.RdBu)
for name, estimator, center in estimators:
data = faces
if center:
data = faces_centered
estimator.fit(data)
components_ = estimator.components_
plot_gallery(name, components_[:n_components], cmap=plt.cm.RdBu)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles
np.random.seed(0)
X, y = make_circles(n_samples=400, factor=.3, noise=.05)
kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
pca = PCA()
X_pca = pca.fit_transform(X)
plt.figure()
plt.subplot(2, 2, 1, aspect='equal')
plt.title("Original space")
reds = y == 0
blues = y == 1
plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor='k')
plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor='k')
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50),
np.linspace(-1.5, 1.5, 50))
X_grid = np.array([np.ravel(X1),
np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
plt.contour(X1, X2, Z_grid,
colors='grey', linewidths=1, origin='lower')
plt.subplot(2, 2, 2, aspect='equal')
plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", s=20, edgecolor='k')
plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor='k')
plt.title("Projection by PCA")
plt.xlabel("1st principal component")
plt.ylabel("2nd component")
plt.subplot(2, 2, 3, aspect='equal')
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor='k')
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor='k')
plt.title("Projection by KPCA")
plt.xlabel(r"1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")
plt.subplot(2, 2, 4, aspect='equal')
plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", s=20, edgecolor='k')
plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor='k')
plt.title("Original space after inverse transform")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.tight_layout()
Only computes the $k$ largest singular values. $k$ is a given parameter.
Also known as latent semantic analysis (LSA) when applied to term-document matrices (usually built using CountVectorizer or TfidfVectorizer).
Very similar to PCA, but $X$ does not need to be centered. If the column-wise (per-feature) means are subtracted from $X$, truncated SVD on the resulting matrix is equivalent to PCA. This means TruncatedSVD accepts scipy.sparse data as-is.
Recommended (over raw frequency counts) when used on tf-idf matrices.
Using sublinear_tf=True & use_idf=True is recommended to bring the feature values closer to a Gaussian distribution.
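A minimal LSA sketch along these lines (toy documents; parameter values are illustrative):
# Sketch: tf-idf + TruncatedSVD (LSA) on a tiny toy corpus; works on sparse input directly.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "cats chase mice in the house",
        "dogs chase cats in the yard",
        "stars shine in the night sky",
        "the sky is dark at night"]
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True).fit_transform(docs)  # scipy.sparse
svd = TruncatedSVD(n_components=2, random_state=0)
X_lsa = svd.fit_transform(tfidf)
print(X_lsa.shape, svd.explained_variance_ratio_.sum())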
Using a bag-of-words approach.
Two feature extraction methods:
tfidf vectorization: uses an in-memory vocabulary (dict) to map frequent words to features. The word frequencies are re-weighted using the IDF vector collected across the corpus.
hashing vectorization: hashes word frequencies to a fixed dimensional space with potential collisions. The word counts are normalized such that each has an $l2$-norm equal to one, which should aid k-means.
K-means & MiniBatchKMeans are sensitive to feature scaling. IDF weighting improves cluster quality when measured against the "ground truth" class labels of the 20 newsgroups dataset.
This improvement is not visible in the Silhouette Coefficient, which is small for both variants because it suffers from the curse of dimensionality on high-dimensional data such as text. Other measures (V-Measure, Adjusted Rand Index) are based on cluster assignments rather than distances, so they are not affected by it.
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.feature_extraction.text import HashingVectorizer as HV
from sklearn.feature_extraction.text import TfidfTransformer as TT
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
dataset = fetch_20newsgroups(subset='all',
categories=categories,
shuffle=True, random_state=42)
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
3387 documents
4 categories
labels = dataset.target
true_k = np.unique(labels).shape[0]
n_features = 10000 # default
# use hashing; use IDF
t0 = time()
hasher = HV(n_features=n_features,
stop_words='english',
alternate_sign=False,
norm=None)
vectorizer_1 = make_pipeline(hasher, TT())
svd = TruncatedSVD(n_components=100)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd,normalizer)
X = vectorizer_1.fit_transform(dataset.data)
X = lsa.fit_transform(X)
print("done:\t %fs" % (time()-t0))
print("#samples: %d, #features: %d" % X.shape)
print("explained variance in SVD: {}%".format(
svd.explained_variance_ratio_.sum()))
done:	 1.533583s
#samples: 3387, #features: 100
explained variance in SVD: 23.2%
def do_clustering(data,mini):
if mini:
km = MiniBatchKMeans(n_clusters=true_k,
init='k-means++',
n_init=1,
init_size=1000,
batch_size=1000,
verbose=False)
else:
km = KMeans(n_clusters=true_k,
init='k-means++',
max_iter=100,
n_init=1,
verbose=False)
t0=time(); km.fit(data); print("time:", time()-t0)
return km   # return the fitted model so it can be evaluated below
km_mini = do_clustering(X,True)
km = do_clustering(X,False)
time: 0.1408977508544922
time: 0.1494762897491455
def do_metrics(data,labels,labels_):
print("Homogeneity:\t %0.2f" %
metrics.homogeneity_score(labels,labels_))
print("Completeness:\t %0.2f" %
metrics.completeness_score(labels,labels_))
print("V-Measure:\t %0.2f" %
metrics.v_measure_score(labels,labels_))
print("Adj Rand Index:\t %0.2f" %
metrics.adjusted_rand_score(labels,labels_))
print("Silhouette Coef:\t %0.2f" %
metrics.silhouette_score(data,labels_,sample_size=1000))
do_metrics(X,labels=labels,labels_=km.labels_)
Homogeneity:	 0.50
Completeness:	 0.50
V-Measure:	 0.50
Adj Rand Index:	 0.44
Silhouette Coef:	 0.04
def dothis(svd,km,comps,vectorizer):
if comps:
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:,::-1]
else:
order_centroids = km.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
print("Cluster:\t %d" % i)
for ind in order_centroids[i,:10]:
print(" %s" % terms[ind])
Transforms signals into a sparse linear combination of atoms from a fixed, precomputed dictionary. (So a fit method is not needed.)
Multiple transform methods are available via transform_algorithm: 'lasso_lars', 'lasso_cd', 'lars', 'omp', and 'threshold'.
split_sign=True produces a "split code" that separates the positive & negative values in the results. This allows a downstream learning algorithm to assign different weights to the negative loadings of a particular atom.
The split code for a sample has length 2*n_components. The first n_components entries are filled with the positive part of the vector; the remainder are filled with the negative entries, but with a positive sign. (The split code is thus non-negative.)
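A minimal sketch of that behavior (random dictionary and signal, for illustration only):
# Sketch: split_sign=True separates positive and negative coefficients.
import numpy as np
from sklearn.decomposition import SparseCoder
rng = np.random.RandomState(0)
D = rng.randn(8, 30)                                   # 8 atoms (rows) of length 30
D /= np.linalg.norm(D, axis=1, keepdims=True)          # unit-norm atoms
signal = rng.randn(1, 30)
coder = SparseCoder(dictionary=D, transform_algorithm='omp',
                    transform_n_nonzero_coefs=3, split_sign=True)
code = coder.transform(signal)
print(code.shape)          # (1, 16): first 8 entries = positive part, last 8 = negative part
print((code >= 0).all())   # True: the split code is non-negative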
Transform a signal as a sparse combination of Ricker wavelets.
Shows how different atom widths affect the fit quality, emphasizing the benefit of learning the dictionary.
The dictionary on the right is richer but not larger; heavier subsampling is performed to stay on the same order of magnitude.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import SparseCoder
from sklearn.utils.fixes import np_version, parse_version
# Discrete sub-sampled Ricker wavelet
def ricker_function(resolution, center, width):
x = np.linspace(0, resolution-1, resolution)
x = ((2/(np.sqrt(3*width)*np.pi**0.25))
* (1-(x-center)**2/width**2)
* np.exp(-(x-center)**2/(2*width**2)))
return x
# Dictionary of Ricker wavelets
def ricker_matrix(width, resolution, n_components):
centers = np.linspace(0, resolution-1, n_components)
D = np.empty((n_components, resolution))
for i, center in enumerate(centers):
D[i] = ricker_function(resolution, center, width)
D /= np.sqrt(np.sum(D**2, axis=1))[:, np.newaxis]
return D
resolution, subsampling, width = 1024, 3, 100
n_components = resolution // subsampling
# Compute a wavelet dictionary
D_fixed = ricker_matrix(width=width,
resolution=resolution,
n_components=n_components)
D_multi = np.r_[tuple(ricker_matrix(width=w,
resolution=resolution,
n_components=n_components//5)
for w in (10, 50, 100, 500, 1000))]
# Generate a signal
y = np.linspace(0, resolution-1, resolution)
first_quarter = y<resolution/4
y[first_quarter] = 3.
y[np.logical_not(first_quarter)] = -1.
# (title, transform_algorithm, transform_alpha, transform_n_nonzero_coefs, color)
estimators = [('OMP', 'omp', None, 15, 'navy'),
('Lasso', 'lasso_lars', 2, None, 'turquoise'), ]
lw = 2
# Avoid FutureWarning about default value change when numpy >= 1.14
lstsq_rcond = None if np_version >= parse_version('1.14') else -1
plt.figure(figsize=(13,6))
for subplot, (D, title) in enumerate(zip((D_fixed, D_multi),
('fixed width',
'multiple widths'))):
plt.subplot(1, 2, subplot + 1)
plt.title('Sparse coding against %s dictionary' % title)
plt.plot(y, lw=lw, linestyle='--', label='Original signal')
# Do a wavelet approximation
for title, algo, alpha, n_nonzero, color in estimators:
coder = SparseCoder(dictionary=D,
transform_n_nonzero_coefs=n_nonzero,
transform_alpha=alpha,
transform_algorithm=algo)
x = coder.transform(y.reshape(1, -1))
density = len(np.flatnonzero(x))
x = np.ravel(np.dot(x,D))
squared_error = np.sum((y-x)**2)
plt.plot(x, color=color, lw=lw,
label='%s: %s nonzero coefs,\n%.2f error'
% (title, density, squared_error))
# Soft thresholding debiasing
coder = SparseCoder(dictionary=D,
transform_algorithm='threshold',
transform_alpha=20)
x = coder.transform(y.reshape(1, -1))
_, idx = np.where(x != 0)
x[0, idx], _, _, _ = np.linalg.lstsq(D[idx, :].T,
y,
rcond=lstsq_rcond)
x = np.ravel(np.dot(x, D))
squared_error = np.sum((y-x)**2)
plt.plot(x,
color='darkorange', lw=lw,
label='Thresholding w/ debiasing:\n%d nonzero coefs, %.2f error'
% (len(idx), squared_error))
plt.axis('tight')
plt.legend(shadow=False, loc='best')
plt.subplots_adjust(.04, .07, .97, .90, .09, .2)
Task: reconstruct the noisy fragments of a raccoon's face.
The dictionary is fitted on the (noise-free) left half of the image & used to reconstruct the noisy right half.
Observations:
from time import time
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from sklearn.decomposition import MiniBatchDictionaryLearning as MBDL
from sklearn.feature_extraction.image import extract_patches_2d as EP2D
from sklearn.feature_extraction.image import reconstruct_from_patches_2d as RP2D
try: # SciPy >= 0.16 has face in misc
from scipy.misc import face
face = face(gray=True)
except ImportError:
face = sp.face(gray=True)
# Convert from uint8 (0-255) to fp (0.0-1.0); downsample for speed
face = face / 255.
face = face[::4, ::4] + face[1::4, ::4] + face[::4, 1::4] + face[1::4, 1::4]
face /= 4.0
height, width = face.shape
# Distort the right half of the image
distorted = face.copy()
distorted[:, width // 2:] += 0.075 * np.random.randn(height, width // 2)
# Extract reference patches from the left half of the image
t0 = time()
patch_size = (7, 7)
data = EP2D(distorted[:, :width // 2], patch_size)
data = data.reshape(data.shape[0], -1)
data -= np.mean(data, axis=0)
data /= np.std(data, axis=0)
print('done: %.3fs.' % (time() - t0))
done: 0.006s.
t0 = time()
dico = MBDL(n_components=100,
alpha=1,
n_iter=500)
V = dico.fit(data).components_
print('done:\t', time()-t0)
plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(V[:100]):
plt.subplot(10, 10, i + 1)
plt.imshow(comp.reshape(patch_size),
cmap=plt.cm.gray_r,
interpolation='nearest')
plt.xticks(())
plt.yticks(())
#plt.suptitle('Dictionary learned from face patches\n' +
# 'Train time %.1fs on %d patches' % (dt, len(data)), fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
done: 8.665220499038696
# distorted image
def show_with_diff(image, reference, title):
plt.figure(figsize=(5, 3.3))
plt.subplot(1, 2, 1)
plt.title('Image')
plt.imshow(image, vmin=0, vmax=1, cmap=plt.cm.gray,
interpolation='nearest')
plt.xticks(())
plt.yticks(())
plt.subplot(1, 2, 2)
difference = image - reference
plt.title('Difference (norm: %.2f)' % np.sqrt(np.sum(difference ** 2)))
plt.imshow(difference, vmin=-0.5, vmax=0.5, cmap=plt.cm.PuOr,
interpolation='nearest')
plt.xticks(())
plt.yticks(())
plt.suptitle(title, size=16)
plt.subplots_adjust(0.02, 0.02, 0.98, 0.79, 0.02, 0.2)
show_with_diff(distorted, face, 'Distorted image')
t0 = time()
data = EP2D(distorted[:, width // 2:], patch_size)
data = data.reshape(data.shape[0], -1)
intercept = np.mean(data, axis=0)
data -= intercept
print('Noisy patch extraction: done in %.2fs.' % (time() - t0))
transform_algorithms = [
('OMP\n1 atom', 'omp', {'transform_n_nonzero_coefs': 1}),
('OMP\n2 atoms', 'omp', {'transform_n_nonzero_coefs': 2}),
('LARS\n5 atoms', 'lars', {'transform_n_nonzero_coefs': 5}),
('Thresholding\n alpha=0.1', 'threshold', {'transform_alpha': .1})]
reconstructions = {}
for title, transform_algorithm, kwargs in transform_algorithms:
print(title + '...')
reconstructions[title] = face.copy()
t0 = time()
dico.set_params(transform_algorithm=transform_algorithm, **kwargs)
code = dico.transform(data)
patches = np.dot(code, V)
patches += intercept
patches = patches.reshape(len(data), *patch_size)
if transform_algorithm == 'threshold':
patches -= patches.min()
patches /= patches.max()
reconstructions[title][:, width // 2:] = RP2D(patches,
(height,
width//2))
dt = time() - t0
print('done in %.2fs.' % dt)
show_with_diff(reconstructions[title], face,
title + ' (time: %.1fs)' % dt)
Noisy patch extraction: done in 0.00s.
OMP 1 atom... done in 0.92s.
OMP 2 atoms... done in 1.66s.
LARS 5 atoms... done in 14.27s.
Thresholding alpha=0.1... done in 0.19s.
MiniBatchDictionaryLearning is a faster but less accurate version, better suited to large datasets.
It divides the dataset into mini-batches & cycles over them for a specified number of iterations. This implementation has no stopping condition.
partial_fit
is supported. It updates the dictionary by iterating only once on a given batch. This can be used for online learning where the complete dataset is not available from the beginning, or does not fit into main memory.
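A minimal sketch of that usage (random patches standing in for real data; batch and chunk sizes are illustrative):
# Sketch: online dictionary learning via partial_fit on data that arrives in chunks.
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning
rng = np.random.RandomState(0)
dl = MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                 batch_size=10, random_state=0)
for _ in range(20):                       # pretend 20 chunks stream in
    chunk = rng.randn(100, 49)            # e.g. 100 flattened 7x7 patches
    dl.partial_fit(chunk)
print(dl.components_.shape)               # (15, 49)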
Uses the Olivetti faces dataset.
Good example of an online (streamed) API: load one image, extract 50 random patches. Once 500 patches have been accumulated from 10 images, run partial_fit on the MiniBatchKMeans object.
Some clusters are reassigned during successive calls to partial_fit because the number of patches they represent has become too low; it's better to choose a new random cluster.
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.cluster import MiniBatchKMeans as MBKM
from sklearn.feature_extraction.image import extract_patches_2d as EP2D
faces = datasets.fetch_olivetti_faces()
# Learn image dictionary
rng = np.random.RandomState(0)
kmeans = MBKM(n_clusters=81, random_state=rng, verbose=True)
patch_size = (20, 20)
buffer = []
t0 = time.time()
# online learning: cycle through entire dataset 6 times
index = 0
for _ in range(6):
for img in faces.images:
data = EP2D(img, patch_size, max_patches=50, random_state=rng)
data = np.reshape(data, (len(data), -1))
buffer.append(data)
index += 1
if index % 10 == 0:
data = np.concatenate(buffer, axis=0)
data -= np.mean(data, axis=0)
data /= np.std(data, axis=0)
kmeans.partial_fit(data)
buffer = []
#if index % 100 == 0:
# print('Partial fit of %4i out of %i'
# % (index, 6 * len(faces.images)))
dt = time.time() - t0
print('done in %.2fs.' % dt)
#plt.figure(figsize=(4.2, 4))
plt.figure(figsize=(8,8))
for i, patch in enumerate(kmeans.cluster_centers_):
plt.subplot(9, 9, i + 1)
plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray,
interpolation='nearest')
plt.xticks(())
plt.yticks(())
plt.suptitle('Patches of faces\nTrain time %.1fs on %d patches' %
(dt, 8 * len(faces.images)), fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
plt.show()
[MiniBatchKMeans] Reassigning 11 cluster centers.
[MiniBatchKMeans] Reassigning 2 cluster centers.
done in 4.07s.
An unlabeled dataset $X = \{x_1, x_2, \dots, x_n\}$ can be described mathematically as $x_i = W h_i + \mu + \epsilon$, where $h_i$ is "latent" (unobserved).
$\epsilon$ is a Gaussian noise term with mean 0 and covariance $\Psi$.
$\mu$ is an arbitrary offset vector.
This type of model is "generative" - it describes how $x_i$ is generated from $h_i$.
FA can model variance in every direction of the input space, independently. This is an advantage over standard PCA.
FA is often followed with a factor rotation (using rotation
) to improve interpretability. For example Varimax rotation maximizes the sum of variances of the squared loadings. It tends to produce sparser factors.
Matrix decomposition can reveal latent relations between sepal length, petal length & petal width.
Applying rotations to the resulting components doesn't improve the predictability, but can help with visualization.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import FactorAnalysis as FA, PCA
from sklearn.preprocessing import StandardScaler as SS
from sklearn.datasets import load_iris
data = load_iris()
X = SS().fit_transform(data["data"])
feature_names = data["feature_names"]
ax = plt.axes()
im = ax.imshow(np.corrcoef(X.T), cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(list(feature_names), rotation=90)
ax.set_yticks([0, 1, 2, 3])
ax.set_yticklabels(list(feature_names))
plt.colorbar(im).ax.set_ylabel("$r$", rotation=0)
ax.set_title("Iris feature correlation matrix")
plt.tight_layout()
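The rotation note above isn't exercised by the correlation plot; a minimal sketch (two components chosen for illustration; rotation requires scikit-learn >= 0.24) of comparing loadings with and without varimax:
# Sketch: FactorAnalysis loadings with and without varimax rotation on scaled iris data.
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import FactorAnalysis
X_iris = StandardScaler().fit_transform(load_iris().data)
fa_plain = FactorAnalysis(n_components=2, random_state=0).fit(X_iris)
fa_varimax = FactorAnalysis(n_components=2, rotation="varimax", random_state=0).fit(X_iris)
print(fa_plain.components_)      # unrotated loadings
print(fa_varimax.components_)    # rotated loadings tend to be sparser / easier to interpret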
ICA separates multivariate signals into maximally independent subcomponents.
ICA does not include a noise term, so whitening must be applied for the model to be correct. This can be done internally (via the whiten argument) or manually with a PCA variant.
Dataset: 3 instruments playing simultaneously + 3 microphones recording the mixture. ICA is used to recover each source.
Note: PCA fails to recover the instruments because the underlying signals are non-Gaussian.
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from sklearn.decomposition import FastICA, PCA
np.random.seed(0)
n_samples = 2000
time = np.linspace(0, 8, n_samples)
# 1 = sinusoid; 2 = square; 3 = sawtooth
s1 = np.sin(2 * time)
s2 = np.sign(np.sin(3 * time))
s3 = signal.sawtooth(2 * np.pi * time)
# add noise, standardize, then mix
S = np.c_[s1, s2, s3]
S += 0.2 * np.random.normal(size=S.shape)
S /= S.std(axis=0)
A = np.array([[1.0, 1.0, 1.0],
[0.5, 2.0, 1.0],
[1.5, 1.0, 2.0]])
X = np.dot(S, A.T)
# ICA: reconstruct signals; get est. mix matrix
ica = FastICA(n_components=3)
S_ = ica.fit_transform(X)
A_ = ica.mixing_
# prove ICA is applicable by undoing the unmixing
assert np.allclose(X, np.dot(S_, A_.T) + ica.mean_)
# PCA: reconstruct signals using orthogonal components
pca = PCA(n_components=3)
H = pca.fit_transform(X)
models = [X, S, S_, H]
names = ['Observations',
'True Sources',
'Recovered (ICA)',
'Recovered (PCA)']
colors = ['red', 'steelblue', 'orange']
for ii, (model, name) in enumerate(zip(models, names), 1):
plt.subplot(4, 1, ii)
plt.title(name)
for sig, color in zip(model.T, colors):
plt.plot(sig, color=color)
#plt.tight_layout()
Assumes all data & components are non-negative. It can be used instead of PCA when the data matrix contains no negative values.
NMF decomposes $X$ into two matrices $W$ and $H$ of non-negative elements by optimizing the distance $d$ between $X$ and the matrix product $WH$.
The squared Frobenius norm is the most commonly used distance function. It is an extension of the Euclidean norm to matrices: $d_{\mathrm{Fro}}(X, Y) = \frac{1}{2} ||X - Y||_{\mathrm{Fro}}^2 = \frac{1}{2} \sum_{i,j} (X_{ij} - {Y}_{ij})^2$.
Other distance function options:
The Kullback-Leibler (KL) divergence: $d_{KL}(X, Y) = \sum_{i,j} (X_{ij} \log(\frac{X_{ij}}{Y_{ij}}) - X_{ij} + Y_{ij})$
The Itakura-Saito (IS) divergence: $d_{IS}(X, Y) = \sum_{i,j} (\frac{X_{ij}}{Y_{ij}} - \log(\frac{X_{ij}}{Y_{ij}}) - 1)$
(These three are special cases of the beta divergence family with $\beta$=2,1,0 respectively): $d_{\beta}(X, Y) = \sum_{i,j} \frac{1}{\beta(\beta - 1)}(X_{ij}^\beta + (\beta-1)Y_{ij}^\beta - \beta X_{ij} Y_{ij}^{\beta - 1})$
When carefully constrained, NMF can build a parts-based representation of a dataset - resulting in interpretable models.
init
determines the initialization method:
NNDSVD (Non-negative double singular value decomposition) is based on two SVD processes - one approximating a data matrix, another approximating positive sections of the resulting partial SVD factors. This is best suited for sparse factorization.
NNDSVDa sets all zeros equal to the mean of all data elements.
NNDSVDar sets all zeros to random perturbations less than the data mean divided by 100. These two variants are better suited for dense factorization.
"random"
L1 & L2 priors can be added to the loss function for regularization. The L2 prior uses the Frobenius norm; the L1 prior uses an element-wise L1 norm. l1_ratio
controls the L1/L2 combination; alpha
controls regularization "intensity".
NMF regularizes both $W$ and $H$ by default. The regularization parameter provides finer control (only $W$, only $H$, or both).
NMF has two solvers: Coordinate Descent (cd
) and Multiplicative Update (mu
). The cd
solver can only optimize the Frobenius norm. The mu
solver can optimize every beta-divergence and is significantly faster on KL and IS divergences.
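A minimal sketch combining these options (random non-negative data; parameter values are illustrative, and alpha matches the older regularization API used elsewhere in these notes):
# Sketch: NMF with the 'mu' solver, KL divergence, and L1/L2 regularization.
import numpy as np
from sklearn.decomposition import NMF
rng = np.random.RandomState(0)
X_nn = np.abs(rng.randn(100, 20))            # NMF requires non-negative input
nmf = NMF(n_components=5, init='nndsvda', solver='mu',
          beta_loss='kullback-leibler', alpha=0.1, l1_ratio=0.5,
          max_iter=500, random_state=0)
W = nmf.fit_transform(X_nn)                  # sample-to-component matrix
H = nmf.components_                          # component-to-feature matrix
print(W.shape, H.shape, nmf.reconstruction_err_)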
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition._nmf import _beta_divergence
x = np.linspace(0.001, 4, 1000)
y = np.zeros(x.shape)
colors = 'mbgyr'
for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):
for i, xi in enumerate(x):
y[i] = _beta_divergence(1, xi, 1, beta)
name = "beta = %1.1f" % beta
plt.plot(x, y, label=name, color=colors[j])
plt.xlabel("x")
plt.title("beta-divergence(1, x)")
plt.legend(loc=0)
plt.axis([0, 4, 0, 3])
Another generative model for collections of discrete data, such as text corpora. It can also be used to discover topics in document collections.
Latent variables model the random mix of topics in the corpus, and distribution of words in the documents.
The goal of LDA is to use the observed words to infer the hidden topic structure.
When LDA is applied to a document-term matrix, the matrix will be decomposed into topic-term (stored in components_
) and document-topic matrices. The document-topic matrix can be found using the transform
method.
LDA supports partial_fit
for handling sequential fetching.
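A minimal sketch of the two matrices involved (toy corpus; topic count chosen for illustration):
# Sketch: LDA on a tiny count matrix; components_ = topic-term, transform = document-topic.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
docs = ["apples oranges bananas fruit",
        "goals penalties referee football",
        "fruit salad with apples and bananas",
        "football fans cheering goals"]
tf_toy = CountVectorizer().fit_transform(docs)
lda_toy = LatentDirichletAllocation(n_components=2, random_state=0).fit(tf_toy)
print(lda_toy.components_.shape)        # (2, n_terms): topic-term matrix
print(lda_toy.transform(tf_toy).shape)  # (4, 2): document-topic matrix (rows sum to 1)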
from time import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
from sklearn.datasets import fetch_20newsgroups
n_samples, n_features, n_components, n_top_words = 2000, 1000, 10, 20
def plot_top_words(model, feature_names, n_top_words, title):
fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
axes = axes.flatten()
for topic_idx, topic in enumerate(model.components_):
top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
top_features = [feature_names[i] for i in top_features_ind]
weights = topic[top_features_ind]
ax = axes[topic_idx]
ax.barh(top_features, weights, height=0.7)
ax.set_title(f'Topic {topic_idx +1}',
fontdict={'fontsize': 30})
ax.invert_yaxis()
ax.tick_params(axis='both', which='major', labelsize=20)
for i in 'top right left'.split():
ax.spines[i].set_visible(False)
fig.suptitle(title, fontsize=40)
plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
t0 = time()
data, _ = fetch_20newsgroups(shuffle=True, random_state=1,
remove=('headers', 'footers', 'quotes'),
return_X_y=True)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))
done in 0.858s.
tfidf_vectorizer = TV(max_df=0.95,
min_df=2,
max_features=n_features,
stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
done in 0.216s.
# Use tf (raw term count) features for LDA.
tf_vectorizer = CV(max_df=0.95,
min_df=2,
max_features=n_features,
stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
done in 0.211s.
# Fit the NMF model
t0 = time()
nmf = NMF(n_components=n_components,
random_state=1,
alpha=.1,
l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26). warnings.warn(("The 'init' value, when 'init=None' and "
done in 0.301s.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
'Topics in NMF model (Frobenius norm)')
# Fit the NMF model
t0 = time()
nmf = NMF(n_components=n_components,
random_state=1,
beta_loss='kullback-leibler',
solver='mu',
max_iter=1000,
alpha=.1,
l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26). warnings.warn(("The 'init' value, when 'init=None' and "
done in 1.183s.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
'Topics in NMF model (generalized Kullback-Leibler divergence)')
lda = LDA(n_components=n_components,
max_iter=5,
learning_method='online',
learning_offset=50.,
random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))
done in 2.433s.
tf_feature_names = tf_vectorizer.get_feature_names()
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')