The search algorithm is selected with algorithm=<keyword>: 'ball_tree' uses a BallTree, 'kd_tree' uses a KDTree, 'brute' uses brute-force search, and 'auto' lets the method determine the best approach.
from sklearn.neighbors import NearestNeighbors as NN
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NN(n_neighbors=2,
algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(X)
print(distances,indices)
[[0.         1.        ]
 [0.         1.        ]
 [0.         1.41421356]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.41421356]] [[0 1]
 [1 0]
 [2 1]
 [3 4]
 [4 3]
 [5 4]]
nbrs.kneighbors_graph(X).toarray()
array([[1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 1., 1.]])
from sklearn.neighbors import KDTree
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
kdt = KDTree(X, leaf_size=30, metric='euclidean')
kdt.query(X, k=2, return_distance=False)
array([[0, 1],
       [1, 0],
       [2, 1],
       [3, 4],
       [4, 3],
       [5, 4]])
Radius-based nearest neighbors are more useful when the data is not uniformly sampled. Classification depends on a radius $r$, so points in sparser neighborhoods use fewer neighbors. Radius-based methods suffer from the curse of dimensionality on high-dimensional datasets.
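A minimal sketch (toy data assumed, radius chosen arbitrarily) of radius-based classification with RadiusNeighborsClassifier: every training point within radius $r$ of the query gets a vote.
from sklearn.neighbors import RadiusNeighborsClassifier
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
# all training points within radius 1.5 of a query vote on its label
clf = RadiusNeighborsClassifier(radius=1.5).fit(X, y)
print(clf.predict([[-1.5, -1.0], [2.5, 1.5]]))   # expected: [1 2]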
Basic classification uses weights='uniform' to compute a simple majority vote of the nearest neighbors. weights='distance' instead weights each neighbor by the inverse of its distance from the query point. (A user-defined function of the distance can also be supplied as weights; see the sketch below.)
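A minimal sketch of such a callable (the toy data and the Gaussian bandwidth are arbitrary assumptions): the function receives an array of distances and must return an array of weights of the same shape.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def gaussian_weights(distances):
    # closer neighbors get exponentially larger weights (bandwidth 0.5 is arbitrary)
    return np.exp(-(distances ** 2) / (2 * 0.5 ** 2))

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([0, 0, 1, 1])
clf = KNeighborsClassifier(n_neighbors=3, weights=gaussian_weights).fit(X, y)
print(clf.predict([[0.8, 0.9]]))   # expected: [1]
The iris example below compares the two built-in weighting schemes.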
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
n_neighbors = 15
iris = datasets.load_iris()
# Use only 1st two features. We could avoid this ugly
# slicing by using a 2D dataset
X,y = iris.data[:, :2], iris.target
h = .02 # mesh step size
cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
cmap_bold = ['darkorange', 'c', 'darkblue']
for weights in ['uniform', 'distance']:
clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(X, y)
# Plot the decision boundary.
x_min, x_max = X[:, 0].min()-1, X[:, 0].max()+1
y_min, y_max = X[:, 1].min()-1, X[:, 1].max()+1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
sns.scatterplot(x=X[:, 0],
y=X[:, 1],
hue=iris.target_names[y],
palette=cmap_bold,
alpha=1.0, edgecolor="black")
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = %i, weights = '%s')"
% (n_neighbors, weights))
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.show()
# sample data - noisy sinusoid
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors
np.random.seed(0)
X = np.sort(5 * np.random.rand(40, 1), axis=0)
T = np.linspace(0, 5, 500)[:, np.newaxis]
y = np.sin(X).ravel()
y[::5] += 1 * (0.5 - np.random.rand(8))
nn = 5
for i, weights in enumerate(['uniform', 'distance']):
knn = neighbors.KNeighborsRegressor(nn, weights=weights)
y_ = knn.fit(X, y).predict(T)
plt.subplot(2, 1, i + 1)
plt.scatter(X, y, color='darkorange', label='data')
plt.plot(T, y_, color='navy', label='prediction')
plt.axis('tight')
plt.legend()
plt.title("KNN regression (k = %i, weights = '%s')" \
% (nn, weights))
plt.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces as Fetch
from sklearn.utils.validation import check_random_state as Check
from sklearn.ensemble import ExtraTreesRegressor as ETR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import RidgeCV
# dataset
data, targets = Fetch(return_X_y=True)
train = data[targets < 30]
test = data[targets >= 30]
# Test on a subset of people
n_faces = 5
rng = Check(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]
n_pixels = data.shape[1]
X_train = train[:, :(n_pixels + 1) // 2] # Upper half of the faces
y_train = train[:, n_pixels // 2:] # Lower half of the faces
X_test = test[:, :(n_pixels + 1) // 2]
y_test = test[:, n_pixels // 2:]
ESTIMATORS = {
"Extra trees": ETR(n_estimators=10,
max_features=32,
random_state=0),
"K-nn": KNR(),
"Linear regression": LR(),
"Ridge": RidgeCV(),
}
y_test_predict = dict()
for name, estimator in ESTIMATORS.items():
estimator.fit(X_train, y_train)
y_test_predict[name] = estimator.predict(X_test)
image_shape, n_cols = (64, 64), 1+len(ESTIMATORS)
plt.figure(figsize=(2.0* n_cols, 2.26*n_faces))
plt.suptitle("Face completion, multi-output estimators", size=16)
for i in range(n_faces):
true_face = np.hstack((X_test[i], y_test[i]))
if i:
sub = plt.subplot(n_faces, n_cols, i*n_cols+1)
else:
sub = plt.subplot(n_faces, n_cols, i*n_cols+1, title="true faces")
sub.axis("off")
sub.imshow(true_face.reshape(image_shape),
cmap=plt.cm.gray,
interpolation="nearest")
for j, est in enumerate(sorted(ESTIMATORS)):
completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
if i:
sub = plt.subplot(n_faces, n_cols, i*n_cols+2+j)
else:
sub = plt.subplot(n_faces, n_cols, i*n_cols+2+j, title=est)
sub.axis("off")
sub.imshow(completed_face.reshape(image_shape),
cmap=plt.cm.gray,
interpolation="nearest")
plt.show()
Brute Force (algorithm='brute'): computational complexity scales with $N$ samples in $D$ dimensions as $O[D N^2]$, which quickly becomes infeasible for anything beyond small datasets.
KD Tree ("K dimensional tree") ('algorithm=kdtree'):
algorithm=ball_tree
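All four settings return the same neighbors; they differ only in how the search is organized. A minimal sketch on random data (shapes chosen arbitrarily for illustration):
import numpy as np
from sklearn.neighbors import NearestNeighbors
X = np.random.RandomState(0).rand(100, 3)
for algorithm in ['brute', 'kd_tree', 'ball_tree', 'auto']:
    # every algorithm yields identical neighbor indices; only the speed differs
    nbrs = NearestNeighbors(n_neighbors=3, algorithm=algorithm).fit(X)
    _, idx = nbrs.kneighbors(X[:1])
    print(algorithm, idx)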
Nearest Centroid represents each class by the centroid of its training members and assigns queries to the class with the closest centroid.
from sklearn.neighbors import NearestCentroid
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid(); clf.fit(X, y); print(clf.predict([[-0.8, -1]]))
[1]
The shrink_threshold parameter shrinks each centroid: the value of each feature is divided by its within-class variance, then reduced by shrink_threshold (values that cross zero are set to zero), which can remove noisy features from the classification.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.neighbors import NearestCentroid as NC
nn,h = 15, 0.02
iris = datasets.load_iris()
X,y = iris.data[:, :2], iris.target
cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'c', 'darkblue'])
for shrinkage in [None, .2]:
clf = NC(shrink_threshold=shrinkage)
clf.fit(X, y)
y_pred = clf.predict(X)
print("shrinkage:\t",shrinkage, np.mean(y == y_pred))
x_min, x_max = X[:, 0].min()-1, X[:, 0].max()+1
y_min, y_max = X[:, 1].min()-1, X[:, 1].max()+1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading="auto")
plt.scatter(X[:, 0],
X[:, 1],
c=y, cmap=cmap_bold,
edgecolor='k', s=20)
plt.title("3-class classification (shrink_threshold=%r)"
% shrinkage)
plt.axis('tight')
plt.show()
shrinkage:	 None 0.8133333333333334
shrinkage:	 0.2 0.82
mode="connectivity"
.mode="distance"
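A small sketch (toy data assumed) contrasting the two modes with the kneighbors_graph helper: 'connectivity' stores 0/1 edges, 'distance' stores the same sparsity pattern with the actual distances.
import numpy as np
from sklearn.neighbors import kneighbors_graph
X = np.array([[0, 0], [1, 0], [3, 0]])
conn = kneighbors_graph(X, n_neighbors=1, mode='connectivity')
dist = kneighbors_graph(X, n_neighbors=1, mode='distance')
print(conn.toarray())   # binary adjacency matrix
print(dist.toarray())   # entries are the distances to each neighbor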
Such a precomputed graph can be chained with estimators that accept precomputed neighbors, for example Isomap:
from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsTransformer as KNT
from sklearn.pipeline import make_pipeline
estimator = make_pipeline(
KNT(n_neighbors=5,
mode='distance'),
Isomap(neighbors_algorithm='precomputed'),
#memory='/path/to/cache'
)
Precomputing the graph also allows parallelizing the search with n_jobs (not available on all estimators). Generate the graph with mode="distance" for compatibility with other estimators, and since each sample is counted as its own neighbor, the safe choice is to always include one extra neighbor.
**** TODO: solve AttributeError bug (traceback below) - ticket submitted to the scikit-learn GitHub account ****
import time
import sys
# conda install -c conda-forge python-annoy
# conda install -c conda-forge nmslib
try:
import annoy
except ImportError:
print("The package 'annoy' is required to run this example.")
sys.exit()
try:
import nmslib
except ImportError:
print("The package 'nmslib' is required to run this example.")
sys.exit()
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsTransformer
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.datasets import fetch_openml
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
class NMSlibTransformer(TransformerMixin, BaseEstimator):
"""Wrapper for using nmslib as sklearn's KNeighborsTransformer"""
def __init__(self, n_neighbors=5,
metric='euclidean',
method='sw-graph',
n_jobs=1):
self.n_neighbors = n_neighbors
self.method = method
self.metric = metric
self.n_jobs = n_jobs
def fit(self, X):
self.n_samples_fit_ = X.shape[0]
# see https://github.com/nmslib/nmslib/tree/master/manual
space = {
'sqeuclidean': 'l2',
'euclidean': 'l2',
'cosine': 'cosinesimil',
'l1': 'l1',
'l2': 'l2',
}[self.metric]
self.nmslib_ = nmslib.init(method=self.method, space=space)
self.nmslib_.addDataPointBatch(X)
self.nmslib_.createIndex()
return self
def transform(self, X):
n_samples_transform = X.shape[0]
# For compatibility reasons
# each sample considered as its own neighbor, one extra neighbor will be computed.
n_neighbors = self.n_neighbors + 1
results = self.nmslib_.knnQueryBatch(X, k=n_neighbors,
num_threads=self.n_jobs)
indices, distances = zip(*results)
indices, distances = np.vstack(indices), np.vstack(distances)
if self.metric == 'sqeuclidean':
distances **= 2
indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
n_neighbors)
kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
indptr), shape=(n_samples_transform,
self.n_samples_fit_))
return kneighbors_graph
class AnnoyTransformer(TransformerMixin, BaseEstimator):
"""Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer"""
def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10,
search_k=-1):
self.n_neighbors = n_neighbors
self.n_trees = n_trees
self.search_k = search_k
self.metric = metric
def fit(self, X):
self.n_samples_fit_ = X.shape[0]
metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean'
self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric)
for i, x in enumerate(X):
self.annoy_.add_item(i, x.tolist())
self.annoy_.build(self.n_trees)
return self
def transform(self, X):
return self._transform(X)
def fit_transform(self, X, y=None):
return self.fit(X)._transform(X=None)
def _transform(self, X):
"""As `transform`, but handles X is None for faster `fit_transform`."""
n_samples_transform = self.n_samples_fit_ if X is None else X.shape[0]
# For compatibility reasons, as each sample is considered as its own
# neighbor, one extra neighbor will be computed.
n_neighbors = self.n_neighbors + 1
indices = np.empty((n_samples_transform, n_neighbors),
dtype=int)
distances = np.empty((n_samples_transform, n_neighbors))
if X is None:
for i in range(self.annoy_.get_n_items()):
ind, dist = self.annoy_.get_nns_by_item(
i, n_neighbors, self.search_k, include_distances=True)
indices[i], distances[i] = ind, dist
else:
for i, x in enumerate(X):
indices[i], distances[i] = self.annoy_.get_nns_by_vector(
x.tolist(), n_neighbors, self.search_k,
include_distances=True)
if self.metric == 'sqeuclidean':
distances **= 2
indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
n_neighbors)
kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
indptr), shape=(n_samples_transform,
self.n_samples_fit_))
return kneighbors_graph
def test_transformers():
# AnnoyTransformer and KNeighborsTransformer give same results?
X = np.random.RandomState(42).randn(10, 2)
knn = KNeighborsTransformer(); Xt0 = knn.fit_transform(X)
ann = AnnoyTransformer(); Xt1 = ann.fit_transform(X)
nms = NMSlibTransformer(); Xt2 = nms.fit_transform(X)
assert_array_almost_equal(Xt0.toarray(), Xt1.toarray(), decimal=5)
assert_array_almost_equal(Xt0.toarray(), Xt2.toarray(), decimal=5)
# Load MNIST, shuffle data, return only n_samples
def load_mnist(n_samples):
# as_frame=False returns numpy arrays; the pandas DataFrame returned by default in
# newer scikit-learn versions is the suspected cause of the AttributeError recorded below
mnist = fetch_openml("mnist_784", as_frame=False)
X, y = shuffle(mnist.data, mnist.target, random_state=2)
return X[:n_samples] / 255, y[:n_samples]
import pandas as pd
def run_benchmark():
datasets = [
('MNIST_2000', load_mnist(n_samples=2000)),
('MNIST_10000', load_mnist(n_samples=10000)),
]
n_iter = 500
perplexity = 30
# TSNE requires a certain number of neighbors which depends on the
# perplexity parameter.
# Add one since we include each sample as its own neighbor.
n_neighbors = int(3. * perplexity + 1) + 1
transformers = [
('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors,
metric='sqeuclidean')),
('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors,
metric='sqeuclidean')),
('KNeighborsTransformer', KNeighborsTransformer(
n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')),
('TSNE with AnnoyTransformer', make_pipeline(
AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
TSNE(metric='precomputed', perplexity=perplexity,
method="barnes_hut", random_state=42, n_iter=n_iter), )),
('TSNE with NMSlibTransformer', make_pipeline(
NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
TSNE(metric='precomputed', perplexity=perplexity,
method="barnes_hut", random_state=42, n_iter=n_iter), )),
('TSNE with KNeighborsTransformer', make_pipeline(
KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
metric='sqeuclidean'),
TSNE(metric='precomputed', perplexity=perplexity,
method="barnes_hut", random_state=42, n_iter=n_iter), )),
('TSNE with internal NearestNeighbors',
TSNE(metric='sqeuclidean', perplexity=perplexity, method="barnes_hut",
random_state=42, n_iter=n_iter)),
]
# init the plot
nrows = len(datasets)
ncols = np.sum([1 for name, model in transformers if 'TSNE' in name])
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False,
figsize=(5 * ncols, 4 * nrows))
axes = axes.ravel()
i_ax = 0
for dataset_name, (X, y) in datasets:
msg = 'Benchmarking on %s:' % dataset_name
print('\n%s\n%s' % (msg, '-' * len(msg)))
for transformer_name, transformer in transformers:
start = time.time()
Xt = transformer.fit_transform(X)
duration = time.time() - start
# print the duration report
longest = np.max([len(name) for name, model in transformers])
whitespaces = ' ' * (longest - len(transformer_name))
print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration))
# plot TSNE embedding which should be very similar across methods
if 'TSNE' in transformer_name:
axes[i_ax].set_title(transformer_name + '\non ' + dataset_name)
axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y.astype(np.int32),
alpha=0.2, cmap=plt.cm.viridis)
axes[i_ax].xaxis.set_major_formatter(NullFormatter())
axes[i_ax].yaxis.set_major_formatter(NullFormatter())
axes[i_ax].axis('tight')
i_ax += 1
fig.tight_layout()
plt.show()
if __name__ == '__main__':
test_transformers()
run_benchmark()
Benchmarking on MNIST_2000:
---------------------------
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-10-853c6a71b56e> in <module>
     83 if __name__ == '__main__':
     84     test_transformers()
---> 85     run_benchmark()

<ipython-input-10-853c6a71b56e> in run_benchmark()
     59     for transformer_name, transformer in transformers:
     60         start = time.time()
---> 61         Xt = transformer.fit_transform(X)
     62         duration = time.time() - start
     63

<ipython-input-6-ce66225c39c5> in fit_transform(self, X, y)
     22
     23     def fit_transform(self, X, y=None):
---> 24         return self.fit(X)._transform(X=None)
     25
     26     def _transform(self, X):

<ipython-input-6-ce66225c39c5> in fit(self, X)
     14         self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric)
     15         for i, x in enumerate(X):
---> 16             self.annoy_.add_item(i, x.tolist())
     17         self.annoy_.build(self.n_trees)
     18         return self

AttributeError: 'str' object has no attribute 'tolist'
from tempfile import TemporaryDirectory
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsTransformer as KNT, KNeighborsClassifier as KNC
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline
X, y = load_digits(return_X_y=True)
n_neighbors_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]
# The transformer computes the nearest neighbors graph using the maximum number
# of neighbors necessary in the grid search. The classifier model filters the
# nearest neighbors graph as required by its own n_neighbors parameter.
graph_model = KNT(n_neighbors=max(n_neighbors_list), mode='distance')
classifier_model = KNC(metric='precomputed')
# Give `memory` a directory to cache the graph computation
with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
full_model = Pipeline(
steps=[('graph', graph_model),
('classifier', classifier_model)],
memory=tmpdir)
param_grid = {'classifier__n_neighbors': n_neighbors_list}
grid_model = GridSearchCV(full_model, param_grid)
grid_model.fit(X, y)
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].errorbar(x=n_neighbors_list,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'])
axes[0].set(xlabel='n_neighbors',
title='Classification accuracy')
axes[1].errorbar(x=n_neighbors_list,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'],
color='r')
axes[1].set(xlabel='n_neighbors', title='Fit time (with caching)')
fig.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.neighbors import NeighborhoodComponentsAnalysis as NCA
from matplotlib import cm
from scipy.special import logsumexp
# create 9 samples from 3 classes
# focus on point #3: the thickness of the link between #3 and any other point reflects their proximity (closer points get thicker links)
X, y = make_classification(n_samples=9, n_features=2, n_informative=2,
n_redundant=0, n_classes=3, n_clusters_per_class=1,
class_sep=1.0, random_state=0)
def link_thickness_i(X, i):
diff_embedded = X[i] - X
dist_embedded = np.einsum('ij,ij->i', diff_embedded,
diff_embedded)
dist_embedded[i] = np.inf
# compute exponentiated distances (use the log-sum-exp trick to
# avoid numerical instabilities...
exp_dist_embedded = np.exp(-dist_embedded -
logsumexp(-dist_embedded))
return exp_dist_embedded
def relate_point(X, i, ax):
pt_i = X[i]
for j, pt_j in enumerate(X):
thickness = link_thickness_i(X, i)*3
if i != j:
line = ([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]])
ax.plot(*line, c=cm.Set1(y[j]),
linewidth=5*thickness[j])
plt.figure(1)
ax = plt.gca()
for i in range(X.shape[0]):
ax.text( X[i, 0], X[i, 1], str(i), va='center', ha='center')
ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4)
ax.set_title("Original points")
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
ax.axis('equal') # so that boundaries are displayed correctly as circles
i = 3
relate_point(X, i, ax)
plt.show()
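The link thicknesses plotted above follow the softmax probabilities that NCA optimizes. As a sketch of the standard formulation (with $x_i$ the samples in the current embedding):

$$p_{ij} = \frac{\exp(-\lVert x_i - x_j \rVert^2)}{\sum_{k \neq i} \exp(-\lVert x_i - x_k \rVert^2)}, \qquad p_{ii} = 0,$$

and the objective maximizes $\sum_i \sum_{j \in C_i} p_{ij}$, the expected number of points correctly classified under this stochastic neighbor rule.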
nca = NCA(max_iter=30, random_state=0)
nca = nca.fit(X, y)
plt.figure(2)
ax2 = plt.gca()
X_embedded = nca.transform(X)
relate_point(X_embedded, i, ax2)
for i in range(len(X)):
ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i),
va='center', ha='center')
ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[[i]]),
alpha=0.4)
ax2.set_title("NCA embedding")
ax2.axes.get_xaxis().set_visible(False)
ax2.axes.get_yaxis().set_visible(False)
ax2.axis('equal')
plt.show()
from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
nca_pipe.fit(X_train, y_train)
print("score:\t",nca_pipe.score(X_test, y_test))
score: 0.9619047619047619
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import (KNeighborsClassifier as KNC,
NeighborhoodComponentsAnalysis as NCA)
from sklearn.pipeline import Pipeline
n_neighbors = 1
dataset = datasets.load_iris()
X, y = dataset.data, dataset.target
X = X[:, [0, 2]]
X_train, X_test, y_train, y_test = \
train_test_split(X, y, stratify=y, test_size=0.7, random_state=42)
h = .01 # step size in the mesh
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
names = ['KNN', 'NCA, KNN']
classifiers = [Pipeline([('scaler', StandardScaler()),
('knn', KNC(n_neighbors=n_neighbors))
]),
Pipeline([('scaler', StandardScaler()),
('nca', NCA()),
('knn', KNC(n_neighbors=n_neighbors))
])]
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
for name, clf in zip(names, classifiers):
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8, shading='auto')
# Plot also the training and testing points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("{} (k = {})".format(name, n_neighbors))
plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,
ha='center', va='center', transform=plt.gca().transAxes)
plt.show()
NCA can also perform supervised dimensionality reduction by setting n_components to the desired output dimension; below it is compared with PCA and LDA on the digits dataset.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors import (KNeighborsClassifier as KNC,
NeighborhoodComponentsAnalysis as NCA)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
n_neighbors = 3
random_state = 0
# Load Digits dataset
X, y = datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.5, stratify=y, random_state=random_state)
dim = len(X[0])
nn = len(np.unique(y))
# Reduce dimension to 2 with PCA, LDA, NCA
pca = make_pipeline(StandardScaler(), PCA(n_components=2, random_state=random_state))
lda = make_pipeline(StandardScaler(), LDA(n_components=2))
nca = make_pipeline(StandardScaler(), NCA(n_components=2, random_state=random_state))
# Use a nearest neighbor classifier to evaluate the methods
knn = KNC(n_neighbors=n_neighbors)
methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]
for i, (name, model) in enumerate(methods):
plt.figure()
# plt.subplot(1, 3, i + 1, aspect=1)
model.fit(X_train, y_train)
knn.fit(model.transform(X_train), y_train)
acc_knn = knn.score(model.transform(X_test), y_test)
X_embedded = model.transform(X)
plt.scatter(X_embedded[:, 0],
X_embedded[:, 1],
c=y, s=30, cmap='Set1')
plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name,
n_neighbors,
acc_knn))
plt.show()