import numpy as np
from sklearn.decomposition import PCA
X = np.array([[-1, -1], [-2, -1], [-3, -2], 
              [ 1,  1], [ 2,  1], [ 3,  2]])

pca  = PCA(n_components=2).fit(X)
pca2 = PCA(n_components=2, svd_solver='full').fit(X)
pca3 = PCA(n_components=1, svd_solver='arpack').fit(X)

print(pca.explained_variance_ratio_)
print(pca.singular_values_)

print(pca2.explained_variance_ratio_)
print(pca2.singular_values_)

print(pca3.explained_variance_ratio_)
print(pca3.singular_values_)

[0.99244289 0.00755711]
[6.30061232 0.54980396]
[0.99244289 0.00755711]
[6.30061232 0.54980396]
[0.99244289]
[6.30061232]


import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from sklearn.decomposition import PCA, FactorAnalysis as FA
from sklearn.covariance import ShrunkCovariance as SC, LedoitWolf as LW
from sklearn.model_selection import cross_val_score as CVscore
from sklearn.model_selection import GridSearchCV


n_samples, n_features, rank, sigma = 1000, 50, 10, 1.0
rng = np.random.RandomState(42)

U, _, _ = linalg.svd(rng.randn(n_features, 
                               n_features))
X = np.dot(rng.randn(n_samples, rank), 
           U[:, :rank].T)

# Adding homoscedastic & heteroscedastic noise
sigmas = sigma * rng.rand(n_features) + sigma / 2.

X_homo   = X + sigma * rng.randn(n_samples, n_features)
X_hetero = X + sigmas* rng.randn(n_samples, n_features)


n_components = np.arange(0, n_features, 5)  # options for n_components

def compute_scores(X):
    pca = PCA(svd_solver='full')
    fa  = FA()
    pca_scores, fa_scores = [], []

    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(CVscore(pca, X)))
        fa_scores.append(np.mean( CVscore(fa,  X)))
    return pca_scores, fa_scores

def shrunk_cov_score(X):
    shrinkages = np.logspace(-2, 0, 30)
    cv = GridSearchCV(SC(), {'shrinkage': shrinkages})
    return np.mean(CVscore(cv.fit(X).best_estimator_, X))

def lw_score(X):
    return np.mean(CVscore(LW(), X))


for X, title in [(X_homo, 'Homoscedastic Noise'),
                 (X_hetero, 'Heteroscedastic Noise')]:

    pca_scores, fa_scores = compute_scores(X)
    n_components_pca      = n_components[np.argmax(pca_scores)]
    n_components_fa       = n_components[np.argmax(fa_scores)]
    pca                   = PCA(svd_solver='full', 
                                n_components='mle').fit(X)
    n_components_pca_mle  = pca.n_components_

    print("best n_components by PCA CV = %d" % 
          n_components_pca)
    print("best n_components by FactorAnalysis CV = %d" % 
          n_components_fa)
    print("best n_components by PCA MLE = %d" % 
          n_components_pca_mle)

    plt.figure()
    plt.plot(n_components, pca_scores, 'b', label='PCA scores')
    plt.plot(n_components,  fa_scores, 'r', label='FA scores')
    
    plt.axvline(rank, color='g', label='TRUTH: %d' % 
                rank, linestyle='-')
    plt.axvline(n_components_pca, color='b',
                label='PCA CV: %d' % 
                n_components_pca, linestyle='--')
    plt.axvline(n_components_fa, color='r',
                label='FactorAnalysis CV: %d' % 
                n_components_fa,
                linestyle='--')
    plt.axvline(n_components_pca_mle, color='k',
                label='PCA MLE: %d' % 
                n_components_pca_mle, linestyle='--')

    # compare with other covariance estimators
    plt.axhline(shrunk_cov_score(X), color='violet',
                label='Shrunk Covariance MLE', linestyle='-.')
    plt.axhline(lw_score(X), color='orange',
                label='LedoitWolf MLE' % n_components_pca_mle, linestyle='-.')

    plt.xlabel('nb of components')
    plt.ylabel('CV scores')
    plt.legend(loc='lower right')
    plt.title(title)

best n_components by PCA CV = 10
best n_components by FactorAnalysis CV = 10
best n_components by PCA MLE = 10
best n_components by PCA CV = 45
best n_components by FactorAnalysis CV = 10
best n_components by PCA MLE = 39


import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


iris  = datasets.load_iris()
X,y   = iris.data, iris.target
names = iris.target_names
pca   = PCA(n_components=2)
lda   = LDA(n_components=2)
X_r   = pca.fit(X).transform(X)
X_r2  = lda.fit(X, y).transform(X)

print('explained variance ratio (1st two components): %s'
      % str(pca.explained_variance_ratio_))

explained variance ratio (1st two components): [0.92461872 0.05306648]


colors = ['navy', 'turquoise', 'darkorange']
lw = 2

for color, i, name in zip(colors, [0, 1, 2], names):
    plt.scatter(X_r[y == i, 0], 
                X_r[y == i, 1], 
                color=color, alpha=.8, lw=lw,
                label=name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of IRIS dataset')

Text(0.5, 1.0, 'PCA of IRIS dataset')


for color, i, name in zip(colors, [0, 1, 2], names):
    plt.scatter(X_r2[y == i, 0], 
                X_r2[y == i, 1], alpha=.8, color=color,
                label=name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA of IRIS dataset')

Text(0.5, 1.0, 'LDA of IRIS dataset')


import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA as IPCA


iris   = load_iris()
X,y,n  = iris.data, iris.target, 2
ipca   = IPCA(n_components=n, batch_size=10)
pca    =  PCA(n_components=n)

X_ipca = ipca.fit_transform(X)
X_pca = pca.fit_transform(X)


colors = ['navy', 'turquoise', 'darkorange']

for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
    plt.figure(figsize=(8, 8))
    for color, i, name in zip(colors, [0, 1, 2], 
                                     iris.target_names):
        plt.scatter(X_transformed[y == i, 0], 
                    X_transformed[y == i, 1],
                    color=color, lw=2, 
                    label=name)

    if "Incremental" in title:
        err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
        plt.title(title + " of iris dataset\nMean absolute unsigned error "
                  "%.6f" % err)
    else:
        plt.title(title + " of iris dataset")
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.axis([-4, 4, -1.5, 1.5])


import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.decomposition import SparsePCA

X, _        = make_friedman1(n_samples=200, n_features=30, random_state=0)
transformer = SparsePCA(n_components=5, 
                        random_state=0).fit(X)

X_transformed = transformer.transform(X)
print(X_transformed.shape)
print(np.mean(transformer.components_ == 0))

(200, 5)
0.9666666666666667


from time import time
import logging
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as TTS
from sklearn.model_selection import GridSearchCV 
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report as CR
from sklearn.metrics import confusion_matrix as CM
from sklearn.decomposition import PCA
from sklearn.svm import SVC


lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

n_samples, h, w = lfw_people.images.shape
X               = lfw_people.data
n_features      = X.shape[1]
y               = lfw_people.target
target_names    = lfw_people.target_names
n_classes       = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7


X_train, X_test, y_train, y_test = TTS(
    X, y, test_size=0.25, random_state=42)

n_components = 150
t0 = time()
pca = PCA(n_components=n_components, 
          svd_solver='randomized',
          whiten=True).fit(X_train)
print("PCA fit done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))

t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("PCA transform done in %0.3fs" % (time() - t0))

t0 = time()
param_grid = {'C':     [1e3,    5e3,    1e4,   5e4,   1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }

clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced'), 
    param_grid).fit(X_train_pca, 
                   y_train)

print("CV done in %0.3fs" % (time() - t0))
print("Best estimator:", clf.best_estimator_)

PCA fit done in 0.706s
PCA transform done in 0.044s
CV done in 11.058s
Best estimator: SVC(C=1000.0, class_weight='balanced', gamma=0.005)


t0 = time()
y_pred = clf.predict(X_test_pca)
print("Prediction quality on test data: done in %0.3fs" % (time() - t0))
print(CR(y_test, y_pred, target_names=target_names))
print(CM(y_test, y_pred, labels=range(n_classes)))

Prediction quality on test data: done in 0.038s
                   precision    recall  f1-score   support

     Ariel Sharon       0.80      0.62      0.70        13
     Colin Powell       0.82      0.88      0.85        60
  Donald Rumsfeld       0.86      0.67      0.75        27
    George W Bush       0.85      0.98      0.91       146
Gerhard Schroeder       0.95      0.80      0.87        25
      Hugo Chavez       1.00      0.53      0.70        15
       Tony Blair       1.00      0.81      0.89        36

         accuracy                           0.87       322
        macro avg       0.90      0.75      0.81       322
     weighted avg       0.87      0.87      0.86       322

[[  8   2   0   3   0   0   0]
 [  1  53   1   5   0   0   0]
 [  1   1  18   7   0   0   0]
 [  0   3   0 143   0   0   0]
 [  0   1   0   4  20   0   0]
 [  0   3   0   3   1   8   0]
 [  0   2   2   3   0   0  29]]


def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    plt.figure(figsize=(1.8*n_col, 2.4*n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    
    for i in range(n_row*n_col):
        plt.subplot(n_row, n_col, i+1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())

def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)


# plots predictions on test set
prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]

plot_gallery(X_test, prediction_titles, h, w)


# plot most significative eigenfaces
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)


import logging
from time import time
from numpy.random import RandomState as RS
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.cluster import MiniBatchKMeans
from sklearn import decomposition


n_row, n_col = 2, 3
n_components = n_row*n_col
image_shape  = (64, 64)
rng          = RS(0)
faces, _     = fetch_olivetti_faces(return_X_y=True, shuffle=True,
                                random_state=rng)
# global then local centering
n_samples, n_features = faces.shape
faces_centered        = faces - faces.mean(axis=0)
faces_centered       -= faces_centered.mean(axis=1).reshape(n_samples, -1)
print("Dataset: %d faces" % n_samples)

Dataset: 400 faces


def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
    plt.figure(figsize=(2.0*n_col, 2.26*n_row))
    plt.suptitle(title, size=16)
    for i, comp in enumerate(images):
        plt.subplot(n_row, n_col, i+1)
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape), cmap=cmap,
                   interpolation='nearest',
                   vmin=-vmax, vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)


estimators = [
    ('Eigenfaces - PCA/randomized SVD',
     decomposition.PCA(n_components=n_components, 
                       svd_solver='randomized',
                       whiten=True),
     True),

    ('NMF',
     decomposition.NMF(n_components=n_components, 
                       init='nndsvda', 
                       tol=5e-3),
     False),

    ('FastICA',
     decomposition.FastICA(n_components=n_components, 
                           whiten=True),
     True),

    ('MiniBatchSparsePCA',
     decomposition.MiniBatchSparsePCA(n_components=n_components, 
                                      alpha=0.8,
                                      n_iter=100, 
                                      batch_size=3,
                                      random_state=rng),
     True),

    ('MiniBatchDictionaryLearning',
        decomposition.MiniBatchDictionaryLearning(n_components=15, 
                                                  alpha=0.1,
                                                  n_iter=50, 
                                                  batch_size=3,
                                                  random_state=rng),
     True),

    ('Cluster centers - MiniBatchKMeans',
        MiniBatchKMeans(n_clusters=n_components, 
                        tol=1e-3, 
                        batch_size=20,
                        max_iter=50, 
                        random_state=rng),
     True),

    ('Factor Analysis',
     decomposition.FactorAnalysis(n_components=n_components, 
                                  max_iter=20),
     True),
]


plot_gallery("Olivetti faces, first centered", 
             faces_centered[:n_components])


for name, estimator, center in estimators:
    t0 = time()
    data = faces
    if center:
        data = faces_centered
    estimator.fit(data)
    train_time = time()-t0
    print("done in %0.3fs" % train_time)
    
    if hasattr(estimator, 'cluster_centers_'):
        components_ = estimator.cluster_centers_
    else:
        components_ = estimator.components_

    if (hasattr(estimator, 'noise_variance_') and
            estimator.noise_variance_.ndim > 0):  # Skip the Eigenfaces case
        plot_gallery("Pixelwise variance",
                     estimator.noise_variance_.reshape(1, -1), 
                     n_col=1, n_row=1)
    plot_gallery('%s - Train time %.1fs' % (name, train_time),
                 components_[:n_components])

done in 0.388s
done in 0.198s
done in 0.135s
done in 0.722s
done in 1.407s
done in 0.251s
done in 0.533s


from sklearn.decomposition import MiniBatchDictionaryLearning as MBDL

# dictionary learning with various positivity constraints
estimators = [
    ('Dictionary learning (DL)',
        MBDL(n_components=15, 
             alpha=0.1, n_iter=50, batch_size=3,
             random_state=rng), True),
    
    ('DL - positive dictionary',
        MBDL(n_components=15, 
             alpha=0.1, n_iter=50, batch_size=3,
             random_state=rng, positive_dict=True), True),
    
    ('DL - positive code',
        MBDL(n_components=15, 
             alpha=0.1, n_iter=50, batch_size=3,
             random_state=rng, positive_code=True, fit_algorithm='cd'), True),
    
    ('DL - positive dictionary & code',
        MBDL(n_components=15, 
             alpha=0.1, n_iter=50, batch_size=3,
             random_state=rng, positive_code=True, fit_algorithm='cd',
             positive_dict=True), True),
]


plot_gallery("First centered Olivetti faces", 
             faces_centered[:n_components],
             cmap=plt.cm.RdBu)


for name, estimator, center in estimators:
    data = faces
    if center:
        data = faces_centered
    estimator.fit(data)
    components_ = estimator.components_
    plot_gallery(name, components_[:n_components], cmap=plt.cm.RdBu)


import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles


np.random.seed(0)
X, y   = make_circles(n_samples=400, factor=.3, noise=.05)
kpca   = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
pca    = PCA()
X_pca  = pca.fit_transform(X)


plt.figure()
plt.subplot(2, 2, 1, aspect='equal')
plt.title("Original space")
reds = y == 0
blues = y == 1

plt.scatter(X[reds, 0], X[reds, 1], c="red",    s=20, edgecolor='k')
plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor='k')
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), 
                     np.linspace(-1.5, 1.5, 50))
X_grid = np.array([np.ravel(X1), 
                   np.ravel(X2)]).T

# projection on the first principal component (in the phi space)
Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
plt.contour(X1, X2, Z_grid, 
            colors='grey', linewidths=1, origin='lower')

plt.subplot(2, 2, 2, aspect='equal')
plt.scatter(X_pca[reds,  0], X_pca[reds,  1], c="red",  s=20, edgecolor='k')
plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", s=20, edgecolor='k')
plt.title("Projection by PCA")
plt.xlabel("1st principal component")
plt.ylabel("2nd component")

plt.subplot(2, 2, 3, aspect='equal')
plt.scatter(X_kpca[reds,  0], X_kpca[reds,  1], c="red",  s=20, edgecolor='k')
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor='k')
plt.title("Projection by KPCA")
plt.xlabel(r"1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")

plt.subplot(2, 2, 4, aspect='equal')
plt.scatter(X_back[reds,  0], X_back[reds,  1], c="red",  s=20, edgecolor='k')
plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", s=20, edgecolor='k')
plt.title("Original space after inverse transform")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

plt.tight_layout()


from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.feature_extraction.text import HashingVectorizer as HV
from sklearn.feature_extraction.text import TfidfTransformer as TT
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np


categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
dataset = fetch_20newsgroups(subset='all', 
                             categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

3387 documents
4 categories


labels = dataset.target
true_k = np.unique(labels).shape[0]
n_features = 10000 # default

# use hashing; use IDF
t0 = time()
hasher = HV(n_features=n_features, 
            stop_words='english', 
            alternate_sign=False,
            norm=None)

vectorizer_1 = make_pipeline(hasher, TT()) 
svd          = TruncatedSVD(n_components=100)
normalizer   = Normalizer(copy=False)
lsa          = make_pipeline(svd,normalizer)
X            = vectorizer_1.fit_transform(dataset.data)
X            = lsa.fit_transform(X)

print("done:\t %fs" % (time()-t0))
print("#samples: %d, #features: %d" % X.shape)
print("explained variance in SVD: {}%".format(
        svd.explained_variance_ratio_.sum()))

done:	 1.533583s
#samples: 3387, #features: 100
explained variance in SVD: 0.23241585380104632%


def do_clustering(data,mini):
    if mini:
        km = MiniBatchKMeans(n_clusters=true_k, 
                       init='k-means++', 
                       n_init=1,
                       init_size=1000, 
                       batch_size=1000, 
                       verbose=False)
    else:
        km = KMeans(n_clusters=true_k, 
                init='k-means++', 
                max_iter=100, 
                n_init=1,
                verbose=False)
    t0=time(); km.fit(data); print("time:", time()-t0)

do_clustering(X,True)
do_clustering(X,False)

time: 0.1408977508544922
time: 0.1494762897491455


def do_metrics(data,labels,labels_):
    print("Homogeneity:\t %0.2f" %
             metrics.homogeneity_score(labels,labels_))
    print("Completeness:\t %0.2f" %
             metrics.completeness_score(labels,labels_))
    print("V-Measure:\t %0.2f" %
             metrics.v_measure_score(labels,labels_))
    print("Adj Rand Index:\t %0.2f" %
             metrics.adjusted_rand_score(labels,labels_))
    print("Silhouette Coef:\t %0.2f" %
             metrics.silhouette_score(data,labels_,sample_size=1000))

do_metrics(X,labels=labels,labels_=km.labels_)

Homogeneity:	 0.50
Completeness:	 0.50
V-Measure:	 0.50
Adj Rand Index:	 0.44
Silhouette Coef:	 0.04


def dothis(svd,km,comps,vectorizer):
    if comps:
        original_space_centroids = svd.inverse_transform(km.cluster_centers)
        order_centroids = original_space_centroids.argsort()[:,::-1]
    else:
        order_centroids = km.cluster_centers.argsort()[:,::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster:\t %d" % i)
        for ind in order_centroids[i,:10]:
            print(" %s" % terms[ind])


import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import SparseCoder
from sklearn.utils.fixes import np_version, parse_version


# Discrete sub-sampled Ricker wavelet
def ricker_function(resolution, center, width):
    x = np.linspace(0, resolution-1, resolution)
    x = ((2/(np.sqrt(3*width)*np.pi**0.25))
         * (1-(x-center)**2/width**2)
         * np.exp(-(x-center)**2/(2*width**2)))
    return x

# Dictionary of Ricker wavelets
def ricker_matrix(width, resolution, n_components):
    centers = np.linspace(0, resolution-1, n_components)
    D       = np.empty((n_components, resolution))
    for i, center in enumerate(centers):
        D[i] = ricker_function(resolution, center, width)
    D /= np.sqrt(np.sum(D**2, axis=1))[:, np.newaxis]
    return D


resolution, subsampling, width = 1024, 3, 100
n_components = resolution // subsampling

# Compute a wavelet dictionary
D_fixed = ricker_matrix(width=width, 
                        resolution=resolution,
                        n_components=n_components)
D_multi = np.r_[tuple(ricker_matrix(width=w, 
                                    resolution=resolution,
                                    n_components=n_components//5)
                for w in (10, 50, 100, 500, 1000))]

# Generate a signal
y                                = np.linspace(0, resolution-1, resolution)
first_quarter                    = y<resolution/4
y[first_quarter]                 = 3.
y[np.logical_not(first_quarter)] = -1.


# title, xform_algo, xform_alpha, xform_n_nozero_coefs, color)
estimators = [('OMP',   'omp',        None, 15,   'navy'),
              ('Lasso', 'lasso_lars', 2,    None, 'turquoise'), ]
lw = 2
# Avoid FutureWarning about default value change when numpy >= 1.14
lstsq_rcond = None if np_version >= parse_version('1.14') else -1


plt.figure(figsize=(13,6))

for subplot, (D, title) in enumerate(zip((D_fixed, D_multi),
                                         ('fixed width', 
                                          'multiple widths'))):
    plt.subplot(1, 2, subplot + 1)
    plt.title('Sparse coding against %s dictionary' % title)
    plt.plot(y, lw=lw, linestyle='--', label='Original signal')

    # Do a wavelet approximation
    for title, algo, alpha, n_nonzero, color in estimators:
        coder = SparseCoder(dictionary=D, 
                            transform_n_nonzero_coefs=n_nonzero,
                            transform_alpha=alpha, 
                            transform_algorithm=algo)
        
        x             = coder.transform(y.reshape(1, -1))
        density       = len(np.flatnonzero(x))
        x             = np.ravel(np.dot(x,D))
        squared_error = np.sum((y-x)**2)
        
        plt.plot(x, color=color, lw=lw,
                 label='%s: %s nonzero coefs,\n%.2f error'
                 % (title, density, squared_error))

    # Soft thresholding debiasing
    coder = SparseCoder(dictionary=D, 
                        transform_algorithm='threshold',
                        transform_alpha=20)
    
    x = coder.transform(y.reshape(1, -1))
    _, idx = np.where(x != 0)
    x[0, idx], _, _, _ = np.linalg.lstsq(D[idx, :].T, 
                                         y, 
                                         rcond=lstsq_rcond)
    x = np.ravel(np.dot(x, D))
    squared_error = np.sum((y-x)**2)
    
    plt.plot(x, 
             color='darkorange', lw=lw,
             label='Thresholding w/ debiasing:\n%d nonzero coefs, %.2f error'
             % (len(idx), squared_error))
    plt.axis('tight')
    plt.legend(shadow=False, loc='best')
plt.subplots_adjust(.04, .07, .97, .90, .09, .2)


from time import time
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from sklearn.decomposition import MiniBatchDictionaryLearning as MBDL
from sklearn.feature_extraction.image import extract_patches_2d as EP2D
from sklearn.feature_extraction.image import reconstruct_from_patches_2d as RP2D


try:  # SciPy >= 0.16 have face in misc
    from scipy.misc import face
    face = face(gray=True)
except ImportError:
    face = sp.face(gray=True)

# Convert from uint8 (0-255) to fp (0.0-1.0); downsample for speed
face = face / 255.
face = face[::4, ::4] + face[1::4, ::4] + face[::4, 1::4] + face[1::4, 1::4]
face /= 4.0
height, width = face.shape

# Distort the right half of the image
distorted = face.copy()
distorted[:, width // 2:] += 0.075 * np.random.randn(height, width // 2)

# Extract reference patches from the left half of the image
t0 = time()
patch_size = (7, 7)
data       = EP2D(distorted[:, :width // 2], patch_size)
data       = data.reshape(data.shape[0], -1)
data      -= np.mean(data, axis=0)
data      /= np.std(data, axis=0)
print('done: %.3fs.' % (time() - t0))

done: 0.006s.


t0 = time()
dico = MBDL(n_components=100, 
            alpha=1, 
            n_iter=500)
V = dico.fit(data).components_
print('done:\t', time()-t0)

plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(V[:100]):
    plt.subplot(10, 10, i + 1)
    plt.imshow(comp.reshape(patch_size), 
               cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
#plt.suptitle('Dictionary learned from face patches\n' +
#             'Train time %.1fs on %d patches' % (dt, len(data)), fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

done:	 8.665220499038696


# distorted image
def show_with_diff(image, reference, title):
    plt.figure(figsize=(5, 3.3))
    plt.subplot(1, 2, 1)
    plt.title('Image')
    plt.imshow(image, vmin=0, vmax=1, cmap=plt.cm.gray,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
    
    plt.subplot(1, 2, 2)
    difference = image - reference
    plt.title('Difference (norm: %.2f)' % np.sqrt(np.sum(difference ** 2)))
    plt.imshow(difference, vmin=-0.5, vmax=0.5, cmap=plt.cm.PuOr,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())
    
    plt.suptitle(title, size=16)
    plt.subplots_adjust(0.02, 0.02, 0.98, 0.79, 0.02, 0.2)

show_with_diff(distorted, face, 'Distorted image')


t0 = time()
data      = EP2D(distorted[:, width // 2:], patch_size)
data      = data.reshape(data.shape[0], -1)
intercept = np.mean(data, axis=0)
data     -= intercept
print('Noisy patch extraction: done in %.2fs.' % (time() - t0))

transform_algorithms = [
    ('OMP\n1 atom', 'omp',    {'transform_n_nonzero_coefs': 1}),
    ('OMP\n2 atoms', 'omp',   {'transform_n_nonzero_coefs': 2}),
    ('LARS\n5 atoms', 'lars', {'transform_n_nonzero_coefs': 5}),
    ('Thresholding\n alpha=0.1', 'threshold', {'transform_alpha': .1})]

reconstructions = {}

for title, transform_algorithm, kwargs in transform_algorithms:
    print(title + '...')
    reconstructions[title] = face.copy()

    t0 = time()
    dico.set_params(transform_algorithm=transform_algorithm, **kwargs)
    code     = dico.transform(data)
    patches  = np.dot(code, V)
    patches += intercept
    patches  = patches.reshape(len(data), *patch_size)
    
    if transform_algorithm == 'threshold':
        patches -= patches.min()
        patches /= patches.max()
    
    reconstructions[title][:, width // 2:] = RP2D(patches, 
                                                  (height, 
                                                   width//2))
    dt = time() - t0
    print('done in %.2fs.' % dt)
    show_with_diff(reconstructions[title], face,
                   title + ' (time: %.1fs)' % dt)

Noisy patch extraction: done in 0.00s.
OMP
1 atom...
done in 0.92s.
OMP
2 atoms...
done in 1.66s.
LARS
5 atoms...
done in 14.27s.
Thresholding
 alpha=0.1...
done in 0.19s.


import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.cluster import MiniBatchKMeans as MBKM
from sklearn.feature_extraction.image import extract_patches_2d as EP2D

faces = datasets.fetch_olivetti_faces()

# Learn image dictionary
rng = np.random.RandomState(0)
kmeans = MBKM(n_clusters=81, random_state=rng, verbose=True)
patch_size = (20, 20)
buffer = []
t0 = time.time()

# online learning: cycle through entire dataset 6 times
index = 0
for _ in range(6):
    for img in faces.images:
        data = EP2D(img, patch_size, max_patches=50, random_state=rng)
        data = np.reshape(data, (len(data), -1))
        buffer.append(data)
        index += 1
        if index % 10 == 0:
            data = np.concatenate(buffer, axis=0)
            data -= np.mean(data, axis=0)
            data /= np.std(data, axis=0)
            kmeans.partial_fit(data)
            buffer = []
        #if index % 100 == 0:
        #    print('Partial fit of %4i out of %i'
        #          % (index, 6 * len(faces.images)))

dt = time.time() - t0
print('done in %.2fs.' % dt)

#plt.figure(figsize=(4.2, 4))
plt.figure(figsize=(8,8))
for i, patch in enumerate(kmeans.cluster_centers_):
    plt.subplot(9, 9, i + 1)
    plt.imshow(patch.reshape(patch_size), cmap=plt.cm.gray,
               interpolation='nearest')
    plt.xticks(())
    plt.yticks(())


plt.suptitle('Patches of faces\nTrain time %.1fs on %d patches' %
             (dt, 8 * len(faces.images)), fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

plt.show()

[MiniBatchKMeans] Reassigning 11 cluster centers.
[MiniBatchKMeans] Reassigning 2 cluster centers.
done in 4.07s.


import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import FactorAnalysis as FA, PCA
from sklearn.preprocessing import StandardScaler as SS
from sklearn.datasets import load_iris


data = load_iris()
X = SS().fit_transform(data["data"])
feature_names = data["feature_names"]


ax = plt.axes()
im = ax.imshow(np.corrcoef(X.T), cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(list(feature_names), rotation=90)
ax.set_yticks([0, 1, 2, 3])
ax.set_yticklabels(list(feature_names))
plt.colorbar(im).ax.set_ylabel("$r$", rotation=0)
ax.set_title("Iris feature correlation matrix")
plt.tight_layout()


import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from sklearn.decomposition import FastICA, PCA


np.random.seed(0)
n_samples = 2000
time      = np.linspace(0, 8, n_samples)

# 1 = sinusoid; 2 = square; 3 = sawtooth
s1        = np.sin(2 * time)
s2        = np.sign(np.sin(3 * time))
s3        = signal.sawtooth(2 * np.pi * time)

# add noise, standardize, then mix
S  = np.c_[s1, s2, s3]
S += 0.2 * np.random.normal(size=S.shape)
S /= S.std(axis=0)
A  = np.array([[1.0, 1.0, 1.0], 
               [0.5, 2.0, 1.0], 
               [1.5, 1.0, 2.0]])
X  = np.dot(S, A.T)


# ICA: reconstruct signals; get est. mix matrix
ica = FastICA(n_components=3)
S_  = ica.fit_transform(X)
A_  = ica.mixing_

# prove ICA is applicable by undoing the unmixing
assert np.allclose(X, np.dot(S_, A_.T) + ica.mean_)

# PCA: reconstruct signals using orthogonal components
pca = PCA(n_components=3)
H = pca.fit_transform(X)


models = [X, S, S_, H]
names = ['Observations',
         'True Sources',
         'Recovered (ICA)',
         'Recovered (PCA)']
colors = ['red', 'steelblue', 'orange']

for ii, (model, name) in enumerate(zip(models, names), 1):
    plt.subplot(4, 1, ii)
    plt.title(name)
    for sig, color in zip(model.T, colors):
        plt.plot(sig, color=color)

#plt.tight_layout()


import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition._nmf import _beta_divergence


x = np.linspace(0.001, 4, 1000)
y = np.zeros(x.shape)

colors = 'mbgyr'
for j, beta in enumerate((0., 0.5, 1., 1.5, 2.)):
    for i, xi in enumerate(x):
        y[i] = _beta_divergence(1, xi, 1, beta)
    name = "beta = %1.1f" % beta
    plt.plot(x, y, label=name, color=colors[j])

plt.xlabel("x")
plt.title("beta-divergence(1, x)")
plt.legend(loc=0)
plt.axis([0, 4, 0, 3])

(0.0, 4.0, 0.0, 3.0)


from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer as TV 
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
from sklearn.datasets import fetch_20newsgroups


n_samples, n_features, n_components, n_top_words = 2000, 1000, 10, 20


def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features     = [feature_names[i] for i in top_features_ind]
        weights          = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)


t0 = time()
data, _ = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             return_X_y=True)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

done in 0.858s.


tfidf_vectorizer = TV(max_df=0.95, 
                      min_df=2,
                      max_features=n_features,
                      stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 0.216s.


# Use tf (raw term count) features for LDA.
tf_vectorizer = CV(max_df=0.95, 
                   min_df=2,
                   max_features=n_features,
                   stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

done in 0.211s.


# Fit the NMF model 
t0 = time()
nmf = NMF(n_components=n_components, 
          random_state=1,
          alpha=.1, 
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "

done in 0.301s.


tfidf_feature_names = tfidf_vectorizer.get_feature_names()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (Frobenius norm)')


# Fit the NMF model
t0 = time()
nmf = NMF(n_components=n_components, 
          random_state=1,
          beta_loss='kullback-leibler', 
          solver='mu', 
          max_iter=1000, 
          alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/decomposition/_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "

done in 1.183s.


tfidf_feature_names = tfidf_vectorizer.get_feature_names()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (generalized Kullback-Leibler divergence)')


lda = LDA(n_components=n_components, 
          max_iter=5,
          learning_method='online',
          learning_offset=50.,
          random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 2.433s.


tf_feature_names = tf_vectorizer.get_feature_names()
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')

Decomposing Signals / Matrix Factorization ¶

PCA (Principal Component Analysis)¶

Probabilistic PCA vs Factor Analysis ¶

Example: PCA vs LDA, Iris dataset ¶

Incremental PCA ¶

Example, Incremental PCA, Iris ¶

PCA with Randomized SVD ¶

PCA with Sparse Data ¶

Example: face recognition with eigenfaces & SVMs ¶

Example: Dimension Reduction method comparison ¶

Non-Linear Dimensionality Reduction - Kernel PCA ¶

Truncated SVD ¶

Example: Document topic clustering - feature extraction method comparison ¶

Dictionary Learning: Sparse Coding ¶

Example; Compare sparse coding methods ¶

Generic Dictionary Learning ¶

Example: Image denoising with DL ¶

Dictionary Learning (Minibatch)¶

Example: Learn Face Image Patches ¶

Factor Analysis (FA)¶

Example: FA with rotation ¶

Independent Component Analysis (ICA)¶

Example: blind source separation with ICA ¶

Non-Negative matrix factorization (NMF/NNMF)¶

Example: plot Beta-divergence loss functions using the `mu` solver ¶

Latent Dirichlet Allocation (LDA)¶

Example: Topic extraction with NMF & LDA ¶

Non-Linear Dimensionality Reduction - Kernel PCA¶

Generic Dictionary Learning¶

Non-Linear Dimensionality Reduction - Kernel PCA ¶

Generic Dictionary Learning ¶