random_projection trades a controlled amount of accuracy (as additional variance) for faster processing times & smaller model sizes.
It does this with unstructured random matrices (RMs). Two variants are implemented: Gaussian RMs and sparse RMs.
Both are constrained to preserve the pairwise distances between any two samples of the dataset.
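A quick sketch of this distance-preservation property on synthetic data (the sizes, n_components=1000, and the random seeds are arbitrary choices; the 20 newsgroups study below examines this much more thoroughly):

import numpy as np
from sklearn.random_projection import GaussianRandomProjection
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.rand(50, 5000)

# project onto 1000 random Gaussian directions
X_proj = GaussianRandomProjection(n_components=1000, random_state=0).fit_transform(X)

# the ratio of projected to original pairwise distances stays close to 1
orig = euclidean_distances(X).ravel()
proj = euclidean_distances(X_proj).ravel()
mask = orig != 0  # drop self-distances
print(np.mean(proj[mask] / orig[mask]))  # ~1.0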
The lemma: "a small set of points in a high-dimensional space can be embedded into a low-dimensional space in such a way that distances between the points are nearly preserved. The map used for the embedding is at least Lipschitz, and can even be taken to be an orthogonal projection."
For a Lipschitz continuous function, there exists a double cone whose origin can be moved along the graph so that the whole graph always stays outside the cone.
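As a worked instance of the bound that johnson_lindenstrauss_min_dim computes below (log is the natural logarithm; n_samples = 10^6 and eps = 0.5 are just example values): embedding $10^6$ points with distortion at most $eps = 0.5$ needs only $n\_components \geq 4 \log(10^6) / (0.5^2 / 2 - 0.5^3 / 3) \approx 55.26 / 0.0833 \approx 663$ dimensions, regardless of the original dimensionality.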
johnson_lindenstrauss_min_dim estimates the minimal size of the random subspace needed to guarantee a bounded distortion introduced by the random projection, knowing only the number of samples.
# eps = maximum distortion rate, in the range (0, 1), per the Johnson-Lindenstrauss lemma
from sklearn.random_projection import johnson_lindenstrauss_min_dim as JLMD
print(JLMD(n_samples=1e6, eps=0.5))
print(JLMD(n_samples=1e6, eps=[0.5, 0.1, 0.01]))
print(JLMD(n_samples=[1e4, 1e5, 1e6], eps=0.1))
663
[ 663 11841 1112658]
[ 7894 9868 11841]
import numpy as np
from sklearn import random_projection
X = np.random.rand(100, 10000)
print(X.shape)
X_new = random_projection.GaussianRandomProjection().fit_transform(X)
print(X_new.shape)
(100, 10000)
(100, 3947)
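n_components can also be set explicitly instead of being derived from eps in the default 'auto' mode; a small sketch (the value 500 is an arbitrary choice):

import numpy as np
from sklearn.random_projection import GaussianRandomProjection

rng = np.random.RandomState(42)
X = rng.rand(100, 10000)

transformer = GaussianRandomProjection(n_components=500, random_state=42)
X_new = transformer.fit_transform(X)
print(X_new.shape)                    # (100, 500)
# the fitted random matrix has shape (n_components, n_features)
print(transformer.components_.shape)  # (500, 10000)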
Reduces dimensionality by projecting the original data through a sparse random matrix.
Sparse random matrices are an alternative to dense Gaussian random projection matrices. They guarantee similar embedding quality while being much more memory efficient and allowing faster computation.
Given $s = 1 / \text{density}$, the elements of the random matrix are drawn from

$\left\{
\begin{array}{c c l}
-\sqrt{\frac{s}{n_{\text{components}}}} & \text{with probability} & 1 / 2s \\
0 & \text{with probability} & 1 - 1 / s \\
+\sqrt{\frac{s}{n_{\text{components}}}} & \text{with probability} & 1 / 2s \\
\end{array}
\right.$

where $n_{\text{components}}$ is the size of the projected subspace. The minimum density of non-zero elements is $1 / \sqrt{n_{\text{features}}}$.
import numpy as np
from sklearn import random_projection
X = np.random.rand(100, 10000)
print(X.shape)
X_new = random_projection.SparseRandomProjection().fit_transform(X)
print(X_new.shape)
(100, 10000)
(100, 3947)
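For illustration, here is a minimal NumPy sketch that draws a (dense) matrix with exactly the element distribution given above; sparse_rm_sketch is a hypothetical helper, not the sklearn implementation (SparseRandomProjection stores its components_ as a scipy.sparse matrix):

import numpy as np

def sparse_rm_sketch(n_components, n_features, density, rng):
    # entries are -scale / 0 / +scale with probabilities 1/2s, 1 - 1/s, 1/2s
    s = 1.0 / density
    scale = np.sqrt(s / n_components)
    return rng.choice([-scale, 0.0, scale],
                      size=(n_components, n_features),
                      p=[1 / (2 * s), 1 - 1 / s, 1 / (2 * s)])

rng = np.random.default_rng(0)
R = sparse_rm_sketch(300, 10000, density=1 / np.sqrt(10000), rng=rng)
print((R != 0).mean())  # ~0.01, i.e. roughly the requested density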
The distortion introduced by a random projection $p$ is bounded by the fact that $p$ defines an eps-embedding with good probability, as defined by

$(1 - eps) \|u - v\|^2 < \|p(u) - p(v)\|^2 < (1 + eps) \|u - v\|^2$

where $u$ and $v$ are any rows taken from a dataset of shape (n_samples, n_features) and $p$ is a projection by a random Gaussian $N(0, 1)$ matrix of shape (n_components, n_features) (or a sparse Achlioptas matrix).
The minimum number of components to guarantee the eps-embedding is given by $n\_components \geq 4 \log(n\_samples) / (eps^2 / 2 - eps^3 / 3)$.
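A quick sanity check of this formula against johnson_lindenstrauss_min_dim (assuming, as its output earlier suggests, that the function applies this bound directly; np.log is the natural logarithm):

import numpy as np
from sklearn.random_projection import johnson_lindenstrauss_min_dim

n_samples, eps = 1e6, 0.5
bound = 4 * np.log(n_samples) / (eps ** 2 / 2 - eps ** 3 / 3)
print(int(bound))                                                   # 663
print(johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=eps))  # 663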
First plot: as the number of samples n_samples increases, the minimal number of dimensions n_components needed to guarantee an eps-embedding grows only logarithmically.
import sys
from time import time
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.random_projection import johnson_lindenstrauss_min_dim as JLMD
from sklearn.random_projection import SparseRandomProjection as SRP
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.datasets import load_digits
from sklearn.metrics.pairwise import euclidean_distances as ED
from sklearn.utils.fixes import parse_version
# `normed` is being deprecated in favor of `density` in histograms
if parse_version(matplotlib.__version__) >= parse_version('2.1'):
    density_param = {'density': True}
else:
    density_param = {'normed': True}
# range of admissible distortions
eps_range = np.linspace(0.1, 0.99, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))
# range of number of samples (observation) to embed
n_samples_range = np.logspace(1, 9, 9)
plt.figure()
for eps, color in zip(eps_range, colors):
    min_n_components = JLMD(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)
plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
Second plot: as eps increases, the minimal number of dimensions n_components needed for a given n_samples drastically decreases.

eps_range = np.linspace(0.01, 0.99, 100)
# range of number of samples (observation) to embed
n_samples_range = np.logspace(2, 6, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))
plt.figure()
for n_samples, color in zip(n_samples_range, colors):
    min_n_components = JLMD(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)
plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
data = fetch_20newsgroups_vectorized().data[:500]
n_samples, n_features = data.shape
print("%d samples; dim %d"
% (n_samples, n_features))
n_components_range = np.array([300, 1000, 10000])
dists = ED(data, squared=True).ravel()
# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]
500 samples; dim 130107
for n_components in n_components_range:
    t0 = time()
    rp = SRP(n_components=n_components)
    projected_data = rp.fit_transform(data)
    print("Projected %d samples from %d to %d in %0.3fs"
          % (n_samples, n_features, n_components, time() - t0))
    if hasattr(rp, 'components_'):
        n_bytes = rp.components_.data.nbytes
        n_bytes += rp.components_.indices.nbytes
        print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

    projected_dists = ED(
        projected_data, squared=True).ravel()[nonzero]

    plt.figure()
    min_dist = min(projected_dists.min(), dists.min())
    max_dist = max(projected_dists.max(), dists.max())
    plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu,
               extent=[min_dist, max_dist, min_dist, max_dist])
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
    plt.title("Pairwise distances distribution for n_components=%d" %
              n_components)
    cb = plt.colorbar()
    cb.set_label('Sample pairs counts')

    rates = projected_dists / dists
    print("Mean distances rate: %0.2f (%0.2f)"
          % (np.mean(rates), np.std(rates)))

    plt.figure()
    plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)
    plt.xlabel("Squared distances rate: projected / original")
    plt.ylabel("Distribution of samples pairs")
    plt.title("Histogram of pairwise distance rates for n_components=%d" %
              n_components)

    # TODO: compute the expected value of eps and add them to the previous plot
    # as vertical lines / region
Projected 500 samples from 130107 to 300 in 0.236s
Random matrix with size: 1.298MB
Mean distances rate: 0.97 (0.18)
Projected 500 samples from 130107 to 1000 in 0.733s
Random matrix with size: 4.335MB
Mean distances rate: 0.93 (0.10)
Projected 500 samples from 130107 to 10000 in 7.479s
Random matrix with size: 43.273MB
Mean distances rate: 0.99 (0.03)
For low n_components: the distribution has many distorted pairs and is skewed (due to the hard limit of zero ratio on the left, since distances are always positive).
For larger n_components: the distortion is controlled and the distances are well preserved by the random projection.
According to the JL lemma, projecting 500 samples without too much distortion will require several thousand dimensions, irrespective of the number of features of the original dataset.
Using random projections on the digits dataset (only 64 features) does not make sense: it does not allow for dimensionality reduction in this case.
On the 20 newsgroups dataset, on the other hand, the dimensionality can be decreased from 130107 down to 10000 while reasonably preserving pairwise distances.
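A quick numeric check of the "several thousand dimensions" claim above, using eps=0.1 as an (arbitrary) moderate distortion level:

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# minimal safe dimensionality for the 500 projected samples at eps=0.1
print(johnson_lindenstrauss_min_dim(n_samples=500, eps=0.1))  # roughly 5300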