### Nearest Neighbors¶

• unsupervised NN = basis for manifold learning, spectral clustering & kernel density estimation.
• supervised NN = used for classification (discrete labels) and regression (continuous labels)
• basics: find the training samples closest to a new point and predict its label from them (see the sketch after this list). The number of samples can be a fixed count (kNN) or determined by the local density of points (radius-based NN).
• Any distance metric can be used - Euclidean is typical.
• NN is described as "non-generalizing" - it simply remembers all of its training data rather than building a model.
• Can use dense (NumPy) or sparse (SciPy.sparse) input matrices.
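
A minimal sketch of the unsupervised interface (the toy array and n_neighbors=2 are illustrative, not from these notes):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Dense NumPy input; a SciPy sparse matrix would also be accepted.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

nn = NearestNeighbors(n_neighbors=2, metric="euclidean")
nn.fit(X)

# Distances and indices of the 2 training points closest to the query [0, 0].
distances, indices = nn.kneighbors([[0, 0]])
print(indices, distances)
```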

### Example: Return a Graph (sparse) w/ connections between points¶

• Data is structured so that points that are nearby in index order are also nearby in parameter space, yielding an approximately block-diagonal matrix of k-nearest-neighbor connections (see the sketch below).
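
A rough sketch of producing such a sparse connectivity graph (the 1-D points and n_neighbors value are placeholders, chosen so that index order matches spatial order):

```python
import numpy as np
from sklearn.neighbors import kneighbors_graph

# Points whose index order matches their spatial order.
X = np.linspace(0, 1, 8).reshape(-1, 1)

# Sparse binary adjacency matrix; mode="distance" would store distances instead.
A = kneighbors_graph(X, n_neighbors=2, mode="connectivity", include_self=True)
print(A.toarray())  # nonzero entries cluster near the diagonal
```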

### Example: KDTree implementation¶

• The Ball Tree variant looks very similar.
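
A small sketch of direct KDTree use (data and leaf_size are arbitrary); swapping in BallTree gives the same interface:

```python
import numpy as np
from sklearn.neighbors import KDTree  # from sklearn.neighbors import BallTree works the same way

rng = np.random.RandomState(0)
X = rng.random_sample((30, 3))

tree = KDTree(X, leaf_size=30)
# Distances and indices of the 3 nearest neighbors of the first point.
dist, ind = tree.query(X[:1], k=3)
print(ind, dist)
```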

### KNN and Radius-based NN Classification¶

• Computed from a simple majority vote of NNs of each point.
• KNN is most commonly used; optimal $k$ is very data-dependent.
• Radius-based NN is more useful when the data is not uniformly sampled. It depends on a radius $r$: points in sparser neighborhoods use fewer NNs for classification. Radius-based NN suffers from the curse of dimensionality on high-dimensional datasets.

• Basic classification uses weights="uniform" to compute a simple majority vote of the NNs. weights="distance" weights each neighbor by the inverse of its distance from the query point. (A user-defined function of the distances can also be supplied as weights.)
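
A hedged sketch of both classifiers on synthetic data (the dataset, k, and radius values are made up for illustration):

```python
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

X, y = make_classification(n_samples=100, n_features=4, random_state=0)

# k fixed in advance; neighbors weighted by inverse distance.
knn = KNeighborsClassifier(n_neighbors=5, weights="distance").fit(X, y)

# Neighborhood defined by a radius; simple majority vote among points inside it.
rnn = RadiusNeighborsClassifier(radius=3.0, weights="uniform").fit(X, y)

print(knn.predict(X[:3]), rnn.predict(X[:3]))
```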

### KNN and Radius-based NN Regression¶

• Same principles and options as in NN-based classification.
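
A minimal regression sketch (the sine-curve data and n_neighbors=5 are illustrative):

```python
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X = np.sort(5 * rng.rand(40, 1), axis=0)
y = np.sin(X).ravel()

# Prediction is the inverse-distance-weighted mean of the 5 nearest targets.
reg = KNeighborsRegressor(n_neighbors=5, weights="distance").fit(X, y)
print(reg.predict([[2.5]]))
```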

### Example: upper/lower half face matching¶

• Goal: predict lower half of each face image, given the upper half.
• 1st column = true faces
• 2nd-5th columns = attempted completions using extra trees, KNN, linear regression, ridge regression.
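
A stripped-down sketch of the setup using only KNN (the train/test split and the use of fetch_olivetti_faces follow the general idea, not the exact example code):

```python
from sklearn.datasets import fetch_olivetti_faces
from sklearn.neighbors import KNeighborsRegressor

faces = fetch_olivetti_faces()            # 400 images, 64x64 pixels flattened to 4096 features
X = faces.data[:, : 4096 // 2]            # upper half of each image
y = faces.data[:, 4096 // 2 :]            # lower half = multi-output regression target

knn = KNeighborsRegressor()
knn.fit(X[:300], y[:300])                 # fit on the first 300 faces
completed_lower = knn.predict(X[300:])    # predicted lower halves for the rest
print(completed_lower.shape)
```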

### Algorithm Comparison¶

• Brute force (algorithm='brute'): computational complexity scales with $N$ samples in $D$ dimensions as $O[D N^2]$. Quickly becomes infeasible for anything beyond small datasets.

• KD tree ("k-dimensional tree", algorithm='kd_tree'):

• An example of reducing distance calculations by encoding aggregate distance information. (If A is distant from B, and B is close to C, then A & C are assumed to be distant from each other.)
• Computational complexity scales as $O[D N \log(N)]$.
• Binary tree structure - recursively divides a parameter space into nested orthotropic regions. Very fast construction.
• Once built, NN calculation for a single point is $O[\log(N)]$.
• Very fast for $D$<20 datasets, but bogs down for high-D datasets.
• Ball tree (algorithm='ball_tree'):
• Designed to address KD tree weaknesses in high-D problems.
• Partitions data into nested hyperspheres rather than Cartesian (axis-aligned) regions.
• Slower construction, but efficient NN calculations.
• Nodes are defined by a centroid $C$ and radius $r$.
• The number of candidate points for a NN search is reduced using the triangle inequality: $|x+y| \leq |x| + |y|$.
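
All three algorithms sit behind the same estimator API; a quick sketch (data size and k are arbitrary):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(1000, 5)

# algorithm="auto" lets scikit-learn pick; the explicit choices are compared here.
for algo in ("brute", "kd_tree", "ball_tree"):
    nn = NearestNeighbors(n_neighbors=5, algorithm=algo).fit(X)
    dist, ind = nn.kneighbors(X[:1])
    print(algo, ind)
```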

### Nearest Centroid Classifier¶

• Represents each class with the centroid of its members.
• No parameters to tune, so it can be a decent baseline classifier.
• Suffers on non-convex classes and on data whose classes have significantly different variances (equal variance in all dimensions is assumed).
• LDA & QDA do not make this assumption.
• Nearest Centroid's shrink_threshold parameter divides the value of each feature for each centroid by the within-class variance of that feature, then reduces the result by shrink_threshold (a feature value that crosses zero is set to zero).
• This is useful for overly noisy datasets.
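
A small sketch comparing the plain and shrunken variants (dataset and shrink_threshold=0.2 are placeholder choices):

```python
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestCentroid

X, y = make_classification(n_samples=200, n_features=6, random_state=0)

plain = NearestCentroid().fit(X, y)                       # nothing to tune
shrunk = NearestCentroid(shrink_threshold=0.2).fit(X, y)  # shrunken centroids for noisy features

print(plain.score(X, y), shrunk.score(X, y))
```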

### KNN and Radius-based NN Transformers¶

• Many scikit-learn methods rely on NNs. Most can compute the NNs on their own, and most can also accept a precomputed sparse neighbors graph supplied by the user.
• Example: SpectralClustering can use a sparse binary adjacency graph via mode="connectivity".
• Example: DBSCAN can use a sparse distance graph via mode="distance".
• Use NN transformers (KNeighborsTransformer, RadiusNeighborsTransformer) in a pipeline to supply such precomputed graphs.
• Why use a precomputed graph?
• They can be reused while varying the parameters.
• They can provide fine-tuned control, for example parallel processing via n_jobs (not available on all estimators).
• They can be specialized for custom jobs (approximate NNs, special data types, ...)
• When the number of neighbors is specified, the definition is ambiguous - it may or may not count each training point as its own neighbor.
• KNeighborsTransformer counts each training point as its own neighbor. For compatibility with estimators that use the other definition, one extra neighbor is computed when mode="distance". The safe choice is to always include one extra neighbor.
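
A sketch of supplying a precomputed sparse distance graph to DBSCAN through a pipeline (the radius, eps, and toy data are illustrative; the radius should be at least as large as eps so no needed edges are missing):

```python
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import RadiusNeighborsTransformer
from sklearn.pipeline import make_pipeline

X = np.random.RandomState(0).rand(100, 2)

# The transformer emits a sparse distance graph; DBSCAN consumes it as "precomputed".
est = make_pipeline(
    RadiusNeighborsTransformer(radius=0.5, mode="distance"),
    DBSCAN(eps=0.3, metric="precomputed"),
)
labels = est.fit_predict(X)
print(labels[:10])
```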

### Example: Approximate NNs pipelined to TSNE¶

• **TODO:** resolve the AttributeError bug (ticket submitted to the scikit-learn GitHub account).

• How to chain KNeighborsTransformer and TSNE.
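
A sketch of the chaining idea, with the exact KNeighborsTransformer standing in for an approximate-NN transformer (n_neighbors, perplexity, and the toy data are illustrative; with metric="precomputed", TSNE needs init="random"):

```python
import numpy as np
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline

X = np.random.RandomState(0).rand(200, 10)

# n_neighbors is set to roughly 3 * perplexity + 1 so TSNE has enough neighbors.
pipe = make_pipeline(
    KNeighborsTransformer(n_neighbors=31, mode="distance"),
    TSNE(metric="precomputed", perplexity=10, init="random", random_state=0),
)
embedding = pipe.fit_transform(X)
print(embedding.shape)  # (200, 2)
```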

### Example: Nearest Neighbors Caching¶

• How to precompute a KNN structure before using it in a Classifier.
• Use a pipeline to cache the NN graph between multiple KNeighborsClassifier runs.
• The first call is slower because it computes the graph.
• Subsequent calls are much faster.
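
A hedged sketch of the caching pattern (dataset, neighbor counts, and the grid are placeholder values; the transformer's n_neighbors must cover the largest value searched by the classifier):

```python
from tempfile import TemporaryDirectory

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsTransformer
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=200, random_state=0)

with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    # The graph step runs once, is cached via `memory`, and is reused while
    # the grid search varies only the classifier's n_neighbors.
    pipe = Pipeline(
        steps=[
            ("graph", KNeighborsTransformer(n_neighbors=10, mode="distance")),
            ("clf", KNeighborsClassifier(metric="precomputed")),
        ],
        memory=tmpdir,
    )
    grid = GridSearchCV(pipe, param_grid={"clf__n_neighbors": [1, 3, 5, 7, 9]}, cv=3)
    grid.fit(X, y)
    print(grid.best_params_)
```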