from time import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble,
discriminant_analysis, random_projection, neighbors)
digits = datasets.load_digits(n_class=6)
X, y = digits.data, digits.target
n_samples, n_features = X.shape
n_neighbors = 30
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(X.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)
# Plot images of the digits
n_img_per_row = 20
img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
for i in range(n_img_per_row):
    ix = 10 * i + 1
    for j in range(n_img_per_row):
        iy = 10 * j + 1
        img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8))

plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('From the 64-D digits dataset')
t0 = time()
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_projected = rp.fit_transform(X)
plot_embedding(X_projected, "Sparse Random Projection (time %.3fs)" % (time()-t0))
t0 = time()
X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
plot_embedding(X_pca,
"Principal Components (Truncated SVD) (time %.3fs)" %
(time() - t0))
X2 = X.copy(); X2.flat[::X.shape[1] + 1] += 0.01 # Make X invertible
t0 = time()
X_lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=2).fit_transform(X2, y)
plot_embedding(X_lda, "LDA (time %.3fs)" % (time() - t0))
Isomap has two parameters worth knowing about. path_method selects the shortest-path algorithm (Dijkstra's or Floyd-Warshall); if not specified, a method is chosen automatically based on the dataset. eigen_solver controls the eigenvalue solver and is likewise chosen automatically when not specified; using ARPACK can reduce the computational cost. (A sketch with these parameters set explicitly follows the default run below.)
t0 = time()
X_iso = manifold.Isomap(n_neighbors=n_neighbors, n_components=2).fit_transform(X)
plot_embedding(X_iso, "Isomap (time %.3fs)" % (time() - t0))
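As a hedged sketch (the names iso_explicit and X_iso_explicit, and the parameter choices, are illustrative rather than tuned for this dataset), the solver and path method can also be requested explicitly:
t0 = time()
# illustrative: force the ARPACK eigensolver and Dijkstra's shortest-path algorithm
iso_explicit = manifold.Isomap(n_neighbors=n_neighbors, n_components=2,
                               eigen_solver='arpack', path_method='D')
X_iso_explicit = iso_explicit.fit_transform(X)
plot_embedding(X_iso_explicit,
               "Isomap, ARPACK + Dijkstra (time %.3fs)" % (time() - t0))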
clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2,
method='standard')
t0 = time()
X_lle = clf.fit_transform(X)
plot_embedding(X_lle, "LLE (time %.3fs)" % (time() - t0))
print("Reconstruction error: %g" % clf.reconstruction_error_)
Reconstruction error: 2.11987e-06
method="modified"
controls this feature. It requires n_neighbors
> n_components
.clf = manifold.LocallyLinearEmbedding(
n_neighbors=n_neighbors, n_components=2, method='modified')
t0 = time()
X_mlle = clf.fit_transform(X)
plot_embedding(X_mlle, "LLE (modified) (time %.2fs)" % (time() - t0))
print("Reconstruction error: %g" % clf.reconstruction_error_)
Reconstruction error: 0.360724
method="hessian"
controls this feature. It requires n_neighbors
> n_components*(n_components+3)/2
.clf = manifold.LocallyLinearEmbedding(
n_neighbors=n_neighbors, n_components=2, method='hessian')
t0 = time()
X_hlle = clf.fit_transform(X)
plot_embedding(X_hlle, "LLE (Hessian) (time %.3fs)" %
(time() - t0))
print("Reconstruction error: %g" % clf.reconstruction_error_)
Reconstruction error: 0.212673
clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=2, method='ltsa')
t0 = time()
X_ltsa = clf.fit_transform(X)
plot_embedding(X_ltsa, "LTSA (time %.3fs)" % (time() - t0))
print("Reconstruction error: %g" % clf.reconstruction_error_)
Reconstruction error: 0.212677
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X)
plot_embedding(X_mds, "MDS (time %.2fs)" % (time() - t0))
print("Stress: %f" % clf.stress_)
Stress: 140185087.993700
# metric vs non-metric MDS - reconstructed points on noisy data
# (plots slightly shifted to avoid complete overlap)
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
EPSILON = np.finfo(np.float32).eps
n_samples = 20
seed = np.random.RandomState(seed=3)
X_true = seed.randint(0, 20, 2 * n_samples).astype(float)
X_true = X_true.reshape((n_samples, 2))
X_true -= X_true.mean() # Center the data
similarities = euclidean_distances(X_true)
noise = np.random.rand(n_samples, n_samples)
noise = noise + noise.T
noise[np.arange(noise.shape[0]), np.arange(noise.shape[0])] = 0
similarities += noise
mds = manifold.MDS(n_components=2,
max_iter=3000, eps=1e-9, random_state=seed,
dissimilarity="precomputed", n_jobs=1)
nmds = manifold.MDS(n_components=2, metric=False,
max_iter=3000, eps=1e-12,
dissimilarity="precomputed",
random_state=seed, n_jobs=1, n_init=1)
pos = mds.fit(similarities).embedding_
npos = nmds.fit_transform(similarities, init=pos)
# Rescale the data
pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
clf = PCA(n_components=2) # Rotate the data
X_true = clf.fit_transform(X_true)
pos = clf.fit_transform(pos)
npos = clf.fit_transform(npos)
fig = plt.figure(1)
ax = plt.axes([0., 0., 1., 1.])
s = 100
plt.scatter(X_true[:, 0],
X_true[:, 1], color='navy', s=s, lw=0,
label='True Position')
plt.scatter(pos[:, 0],
pos[:, 1], color='turquoise', s=s, lw=0, label='MDS')
plt.scatter(npos[:, 0],
npos[:, 1], color='darkorange', s=s, lw=0, label='NMDS')
plt.legend(scatterpoints=1, loc='best', shadow=False)
similarities = similarities.max() / (similarities + EPSILON) * 100
np.fill_diagonal(similarities, 0)
# Plot the edges
start_idx, end_idx = np.where(pos)
# a sequence of (*line0*, *line1*, *line2*), where::
# linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [[X_true[i, :], X_true[j, :]]
for i in range(len(pos)) for j in range(len(pos))]
values = np.abs(similarities)
lc = LineCollection(segments,
zorder=0, cmap=plt.cm.Blues,
norm=plt.Normalize(0, values.max()))
lc.set_array(similarities.flatten())
lc.set_linewidths(np.full(len(segments), 0.5))
ax.add_collection(lc)
plt.show()
hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
max_depth=5)
t0 = time()
X_transformed = hasher.fit_transform(X)
pca = decomposition.TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_transformed)
plot_embedding(X_reduced, "Random Trees embedding (time %.3fs)" % (time() - t0))
embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
eigen_solver="arpack")
t0 = time()
X_se = embedder.fit_transform(X)
plot_embedding(X_se, "Spectral embedding (time %.3fs)" % (time() - t0))
Global structure is not preserved by t-SNE; this problem can be mitigated by initializing the embedding with PCA (init="pca").
Key parameters (a sketch with these set explicitly follows the default run below):
perplexity: roughly the effective number of nearest neighbors t-SNE considers when generating the conditional probabilities (the perplexity of a fair k-sided die is k). Larger perplexities take more neighbors into account and are less sensitive to small-scale structure; smaller perplexities consider fewer neighbors and thus ignore more global information in favour of the local neighborhood.
As datasets grow, more points are needed to get a reasonable sample of the local neighborhood, so larger perplexities may be required. Noisier datasets also need larger perplexity values to encompass enough local neighbors to see beyond the background noise.
early exaggeration factor: during early exaggeration the original joint probabilities are artificially increased by multiplication with this factor. Larger factors result in larger gaps between natural clusters. If the factor is too high, the KL divergence can increase during this phase. It usually does not have to be tuned.
learning rate: if it is too low, gradient descent gets stuck in a bad local minimum; if it is too high, the KL divergence increases during optimization.
maximum number of iterations: usually high enough by default and rarely needs tuning.
angle: a tradeoff between performance and accuracy. Larger angles allow larger regions to be approximated by a single point, giving better speed but less accurate results.
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
t0 = time()
X_tsne = tsne.fit_transform(X)
plot_embedding(X_tsne, "t-SNE (time %.3fs)" % (time() - t0))
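As a hedged sketch of the parameters discussed above (the values and the names tsne_explicit and X_tsne_explicit are illustrative, not tuned for this dataset), they can also be set explicitly:
t0 = time()
# illustrative values; perplexity, early_exaggeration, learning_rate and angle
# correspond to the parameters described above
tsne_explicit = manifold.TSNE(n_components=2, init='pca', random_state=0,
                              perplexity=30, early_exaggeration=12.0,
                              learning_rate=200.0, angle=0.5)
X_tsne_explicit = tsne_explicit.fit_transform(X)
plot_embedding(X_tsne_explicit,
               "t-SNE, explicit parameters (time %.3fs)" % (time() - t0))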
nca = neighbors.NeighborhoodComponentsAnalysis(init='random',
n_components=2, random_state=0)
t0 = time()
X_nca = nca.fit_transform(X, y)
plot_embedding(X_nca, "NCA (time %.3fs)" % (time() - t0))