from sklearn.neighbors import NearestNeighbors as NN
import numpy as np
# Six 2-D sample points: three with negative coordinates, three with positive.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

# Build a ball-tree index over the points; every query asks for 2 neighbors.
nbrs = NN(n_neighbors=2, algorithm='ball_tree')
nbrs.fit(X)

# Query with the training points themselves, so each point's first neighbor
# is the point itself at distance 0.
distances, indices = nbrs.kneighbors(X)

print(distances,indices)

[[0.         1.        ]
 [0.         1.        ]
 [0.         1.41421356]
 [0.         1.        ]
 [0.         1.        ]
 [0.         1.41421356]] [[0 1]
 [1 0]
 [2 1]
 [3 4]
 [4 3]
 [5 4]]


# Neighbor graph of the fitted model, converted from sparse to a dense array
# for display: entry (i, j) is 1 when point j is one of the 2 neighbors
# returned for point i (each point is its own first neighbor).
nbrs.kneighbors_graph(X).toarray()

array([[1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 1., 1.]])


from sklearn.neighbors import KDTree
import numpy as np

# Same six 2-D sample points as in the NearestNeighbors example.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

# Build the KD-tree directly on the data (no separate fit step).
kdt = KDTree(X, leaf_size=30, metric='euclidean')

# Indices only (no distances) of the 2 nearest points for every row of X.
kdt.query(X, k=2, return_distance=False)

array([[0, 1],
       [1, 0],
       [2, 1],
       [3, 4],
       [4, 3],
       [5, 4]])


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets


n_neighbors = 15
iris = datasets.load_iris()

# Keep only the first two features so the decision regions can be drawn
# in a 2-D plane.
X, y = iris.data[:, :2], iris.target
h = .02  # mesh step size

cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
cmap_bold = ['darkorange', 'c', 'darkblue']

# The evaluation grid depends only on the data, not on the classifier, so it
# is built once here instead of once per weighting scheme.
x_min, x_max = X[:, 0].min()-1, X[:, 0].max()+1
y_min, y_max = X[:, 1].min()-1, X[:, 1].max()+1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

for weights in ['uniform', 'distance']:
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)

    # Predict a class for every grid point and fold the flat predictions
    # back into the grid shape to draw the decision regions.
    Z = clf.predict(grid_points).reshape(xx.shape)
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, Z, cmap=cmap_light)

    # Overlay the training points, colored by species name.
    sns.scatterplot(x=X[:, 0],
                    y=X[:, 1],
                    hue=iris.target_names[y],
                    palette=cmap_bold,
                    alpha=1.0, edgecolor="black")

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])

plt.show()


# sample data - noisy sinusoid
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors

# Fixed seed so the sample and its noise are reproducible.
np.random.seed(0)

# 40 sorted training inputs in [0, 5) and 500 evenly spaced query inputs,
# both as column vectors.
X = np.sort(5 * np.random.rand(40, 1), axis=0)
T = np.linspace(0, 5, 500)[:, np.newaxis]

# Sine targets; every 5th target (8 in total) gets noise in (-0.5, 0.5].
y = np.sin(X).ravel()
y[::5] += 1 * (0.5 - np.random.rand(8))

nn = 5

# One stacked subplot per weighting scheme.
for i, weights in enumerate(['uniform', 'distance']):
    knn = neighbors.KNeighborsRegressor(nn, weights=weights)
    knn.fit(X, y)
    y_ = knn.predict(T)

    plt.subplot(2, 1, i + 1)
    plt.scatter(X, y, color='darkorange', label='data')
    plt.plot(T, y_, color='navy', label='prediction')
    plt.axis('tight')
    plt.legend()
    plt.title("KNN regression (k = %i, weights = '%s')" % (nn, weights))

plt.tight_layout()
plt.show()


import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_olivetti_faces as Fetch
from sklearn.utils.validation import check_random_state as Check

from sklearn.ensemble import ExtraTreesRegressor as ETR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import RidgeCV


# dataset
# Load the faces as (flattened images, person ids) and split by person id:
# ids below 30 are used for training, the rest for testing.
data, targets = Fetch(return_X_y=True)
is_train = targets < 30
train = data[is_train]
test = data[targets >= 30]

downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to /home/bjpcjp/scikit_learn_data


# Test on a subset of people
# Evaluate on a small random subset of the test faces (seeded draw).
n_faces = 5
rng = Check(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

# Split every flattened image at its midpoint: the first half of the pixels
# (upper half of the face) is the input, the second half (lower half of the
# face) is the multi-output target.
n_pixels = data.shape[1]
upper_end = (n_pixels + 1) // 2
lower_start = n_pixels // 2

X_train, y_train = train[:, :upper_end], train[:, lower_start:]
X_test, y_test = test[:, :upper_end], test[:, lower_start:]


# Multi-output regressors to compare, keyed by the label used as plot title.
ESTIMATORS = {
    "Extra trees": ETR(n_estimators=10, max_features=32, random_state=0),
    "K-nn": KNR(),
    "Linear regression": LR(),
    "Ridge": RidgeCV(),
}

# Fit each estimator on the upper halves and keep its predicted lower halves
# for the held-out faces.
y_test_predict = {}
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    predicted_lower = estimator.predict(X_test)
    y_test_predict[name] = predicted_lower


# Grid layout: one row per test face; first column is the true face, then
# one column per estimator (in alphabetical order of its label).
image_shape = (64, 64)
n_cols = 1 + len(ESTIMATORS)

plt.figure(figsize=(2.0* n_cols, 2.26*n_faces))
plt.suptitle("Face completion, multi-output estimators", size=16)

for i in range(n_faces):
    # Subplot slots are 1-based; this is the first slot of row i.
    row_start = i*n_cols + 1

    # Only the first row carries column titles.
    title_kw = {} if i else {"title": "true faces"}
    sub = plt.subplot(n_faces, n_cols, row_start, **title_kw)
    sub.axis("off")

    # Ground truth: the real upper half glued to the real lower half.
    true_face = np.hstack((X_test[i], y_test[i]))
    sub.imshow(true_face.reshape(image_shape),
               cmap=plt.cm.gray,
               interpolation="nearest")

    for j, est in enumerate(sorted(ESTIMATORS)):
        title_kw = {} if i else {"title": est}
        sub = plt.subplot(n_faces, n_cols, row_start + 1 + j, **title_kw)
        sub.axis("off")

        # Completion: the real upper half glued to the predicted lower half.
        completed_face = np.hstack((X_test[i], y_test_predict[est][i]))
        sub.imshow(completed_face.reshape(image_shape),
                   cmap=plt.cm.gray,
                   interpolation="nearest")
plt.show()


from sklearn.neighbors import NearestCentroid
import numpy as np
# Two labelled groups of three 2-D points each.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Fit the classifier, then classify one new point.
clf = NearestCentroid()
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))

[1]


import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.neighbors import NearestCentroid as NC


# `h` is the mesh step size; `nn` is not read in this cell and is kept only
# so the module-level name stays defined.
nn, h = 15, 0.02

iris = datasets.load_iris()
# Keep only the first two features so the decision regions can be drawn
# in a 2-D plane.
X, y = iris.data[:, :2], iris.target

cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'c', 'darkblue'])

# The evaluation grid depends only on the data, not on the shrink threshold,
# so it is built once here instead of once per loop iteration.
x_min, x_max = X[:, 0].min()-1, X[:, 0].max()+1
y_min, y_max = X[:, 1].min()-1, X[:, 1].max()+1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]

for shrinkage in [None, .2]:
    clf = NC(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    # Fraction of training points that are classified correctly.
    print("shrinkage:\t",shrinkage, np.mean(y == y_pred))

    # Predict a class for every grid point and fold the flat predictions
    # back into the grid shape to draw the decision regions.
    Z = clf.predict(grid_points).reshape(xx.shape)

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading="auto")
    plt.scatter(X[:, 0],
                X[:, 1],
                c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.title("3-class classification (shrink_threshold=%r)"
              % shrinkage)
    plt.axis('tight')
plt.show()

shrinkage:	 None 0.8133333333333334
shrinkage:	 0.2 0.82


from sklearn.manifold import Isomap
from sklearn.neighbors import KNeighborsTransformer as KNT
from sklearn.pipeline import make_pipeline

# Two-step pipeline: the transformer computes a sparse neighbor-distance
# graph and Isomap consumes it instead of searching for neighbors itself.
# Isomap is told that its input is precomputed through `metric`;
# `neighbors_algorithm` only selects the search structure and does not
# accept 'precomputed'.
estimator = make_pipeline(
    KNT(n_neighbors=5,
        mode='distance'),
    Isomap(metric='precomputed'),
    #memory='/path/to/cache'
)


import time
import sys

#  conda install -c conda-forge python-annoy 
#  conda install -c conda-forge nmslib

# Both approximate-nearest-neighbor libraries are needed by the wrappers
# defined below; if either is missing, print a hint and stop the script.
try:
    import annoy
except ImportError:
    print("The package 'annoy' is required to run this example.")
    sys.exit()

try:
    import nmslib
except ImportError:
    print("The package 'nmslib' is required to run this example.")
    sys.exit()


import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from scipy.sparse import csr_matrix

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsTransformer
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.datasets import fetch_openml
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.utils import shuffle


class NMSlibTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using nmslib as sklearn's KNeighborsTransformer.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors per sample, not counting the sample itself
        (``transform`` queries one extra neighbor).
    metric : str, default='euclidean'
        One of 'sqeuclidean', 'euclidean', 'cosine', 'l1', 'l2'. Any other
        value raises ``KeyError`` in ``fit``.
    method : str, default='sw-graph'
        Index method, forwarded to ``nmslib.init``.
    n_jobs : int, default=1
        Forwarded to ``knnQueryBatch`` as ``num_threads``.
    """

    def __init__(self, n_neighbors=5,
                 metric='euclidean',
                 method='sw-graph',
                 n_jobs=1):
        self.n_neighbors = n_neighbors
        self.method = method
        self.metric = metric
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Build the nmslib index over the rows of X.

        ``y`` is accepted and ignored so the transformer follows the usual
        ``fit(X, y)`` estimator signature; without it,
        ``fit_transform(X, y)`` and supervised pipelines fail with a
        ``TypeError``.

        Returns
        -------
        self
        """
        self.n_samples_fit_ = X.shape[0]

        # Map the sklearn-style metric name to an nmslib space name, see
        # https://github.com/nmslib/nmslib/tree/master/manual
        # 'sqeuclidean' is indexed in the 'l2' space and squared later in
        # `transform`.
        space = {
            'sqeuclidean': 'l2',
            'euclidean': 'l2',
            'cosine': 'cosinesimil',
            'l1': 'l1',
            'l2': 'l2',
        }[self.metric]

        self.nmslib_ = nmslib.init(method=self.method, space=space)
        self.nmslib_.addDataPointBatch(X)
        self.nmslib_.createIndex()
        return self

    def transform(self, X):
        """Return the sparse neighbor-distance graph of X against the index.

        Returns
        -------
        scipy.sparse.csr_matrix of shape (X.shape[0], n_samples_fit_)
            Each row stores ``n_neighbors + 1`` distances.
        """
        n_samples_transform = X.shape[0]

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        n_neighbors = self.n_neighbors + 1

        results = self.nmslib_.knnQueryBatch(X, k=n_neighbors,
                                             num_threads=self.n_jobs)
        # Each result is an (indices, distances) pair for one query row.
        indices, distances = zip(*results)
        indices, distances = np.vstack(indices), np.vstack(distances)

        if self.metric == 'sqeuclidean':
            distances **= 2

        # Every row holds exactly n_neighbors entries, so the CSR row
        # pointer is an arithmetic progression.
        indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
                           n_neighbors)
        kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
                                       indptr), shape=(n_samples_transform,
                                                       self.n_samples_fit_))

        return kneighbors_graph


class AnnoyTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors per sample, not counting the sample itself
        (one extra neighbor is queried).
    metric : str, default='euclidean'
        Forwarded to ``annoy.AnnoyIndex``; 'sqeuclidean' is indexed as
        'euclidean' and the distances are squared afterwards.
    n_trees : int, default=10
        Forwarded to ``AnnoyIndex.build``.
    search_k : int, default=-1
        Forwarded to the annoy neighbor queries.
    """

    def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10,
                 search_k=-1):
        self.n_neighbors = n_neighbors
        self.n_trees = n_trees
        self.search_k = search_k
        self.metric = metric

    def fit(self, X, y=None):
        """Build the annoy index over the rows of X.

        ``y`` is accepted and ignored so the transformer follows the usual
        ``fit(X, y)`` estimator signature.

        Returns
        -------
        self
        """
        # Coerce to an ndarray so that iteration yields rows. Iterating a
        # pandas DataFrame yields column labels instead, which fails with
        # "'str' object has no attribute 'tolist'".
        X = np.asarray(X)
        self.n_samples_fit_ = X.shape[0]
        metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean'
        self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric)
        for i, x in enumerate(X):
            self.annoy_.add_item(i, x.tolist())
        self.annoy_.build(self.n_trees)
        return self

    def transform(self, X):
        """Return the sparse neighbor-distance graph of X against the index."""
        return self._transform(X)

    def fit_transform(self, X, y=None):
        """Fit on X, then return the graph of the indexed items themselves."""
        return self.fit(X)._transform(X=None)

    def _transform(self, X):
        """As `transform`, but handles X is None for faster `fit_transform`.

        With ``X is None`` the neighbors of the already indexed items are
        looked up by item id; otherwise each row of X is queried as a vector.
        """
        if X is not None:
            # Same row-iteration guarantee as in `fit`.
            X = np.asarray(X)

        n_samples_transform = self.n_samples_fit_ if X is None else X.shape[0]

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        n_neighbors = self.n_neighbors + 1

        indices = np.empty((n_samples_transform, n_neighbors),
                           dtype=int)
        distances = np.empty((n_samples_transform, n_neighbors))

        if X is None:
            for i in range(self.annoy_.get_n_items()):
                ind, dist = self.annoy_.get_nns_by_item(
                    i, n_neighbors, self.search_k, include_distances=True)

                indices[i], distances[i] = ind, dist
        else:
            for i, x in enumerate(X):
                indices[i], distances[i] = self.annoy_.get_nns_by_vector(
                    x.tolist(), n_neighbors, self.search_k,
                    include_distances=True)

        if self.metric == 'sqeuclidean':
            distances **= 2

        # Every row holds exactly n_neighbors entries, so the CSR row
        # pointer is an arithmetic progression.
        indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
                           n_neighbors)
        kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
                                       indptr), shape=(n_samples_transform,
                                                       self.n_samples_fit_))

        return kneighbors_graph


def test_transformers():
    """Check that both wrappers reproduce KNeighborsTransformer's graph.

    All three transformers are run with default parameters on the same
    10 x 2 seeded random sample; the dense graphs must agree to 5 decimals.
    """
    X = np.random.RandomState(42).randn(10, 2)

    knn = KNeighborsTransformer()
    ann = AnnoyTransformer()
    nms = NMSlibTransformer()

    Xt0 = knn.fit_transform(X)
    Xt1 = ann.fit_transform(X)
    Xt2 = nms.fit_transform(X)

    reference = Xt0.toarray()
    assert_array_almost_equal(reference, Xt1.toarray(), decimal=5)
    assert_array_almost_equal(Xt0.toarray(), Xt2.toarray(), decimal=5)


# Load MNIST, shuffle data, return only n_samples
def load_mnist(n_samples):
    """Return the first `n_samples` shuffled MNIST samples and their labels.

    The data is requested as NumPy arrays (``as_frame=False``). With a
    pandas DataFrame, iterating the data yields column labels rather than
    rows, which breaks the row-wise loops of the approximate-neighbor
    wrappers ("'str' object has no attribute 'tolist'").

    Returns
    -------
    (X, y) : the first `n_samples` rows divided by 255, and the matching
        labels. The shuffle uses a fixed seed.
    """
    mnist = fetch_openml("mnist_784", as_frame=False)
    X, y = shuffle(mnist.data, mnist.target, random_state=2)
    return X[:n_samples] / 255, y[:n_samples]

import pandas as pd

def run_benchmark():
    """Time neighbor transformers and TSNE pipelines on MNIST subsets.

    For every dataset and every transformer/pipeline, `fit_transform` is
    timed and the duration printed. Entries whose name contains 'TSNE' are
    also scatter-plotted, one axis per (dataset, entry) pair.
    """
    benchmark_sets = [
        ('MNIST_2000', load_mnist(n_samples=2000)),
        ('MNIST_10000', load_mnist(n_samples=10000)),
    ]
    n_iter = 500
    perplexity = 30
    # TSNE requires a certain number of neighbors which depends on the
    # perplexity parameter.
    # Add one since we include each sample as its own neighbor.
    n_neighbors = int(3. * perplexity + 1) + 1

    def make_tsne(metric):
        # Single place for the shared TSNE settings. `init` is pinned to
        # "random": PCA initialization cannot be used with
        # metric='precomputed', and pinning it keeps all four embeddings
        # configured alike.
        # NOTE(review): newer scikit-learn releases rename `n_iter` to
        # `max_iter` - confirm against the installed version.
        return TSNE(metric=metric, perplexity=perplexity, init="random",
                    method="barnes_hut", random_state=42, n_iter=n_iter)

    transformers = [
        ('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors,
                                              metric='sqeuclidean')),
        ('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors,
                                                metric='sqeuclidean')),
        ('KNeighborsTransformer', KNeighborsTransformer(
            n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')),
        ('TSNE with AnnoyTransformer', make_pipeline(
            AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
            make_tsne('precomputed'), )),
        ('TSNE with NMSlibTransformer', make_pipeline(
            NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
            make_tsne('precomputed'), )),
        ('TSNE with KNeighborsTransformer', make_pipeline(
            KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
                                  metric='sqeuclidean'),
            make_tsne('precomputed'), )),
        ('TSNE with internal NearestNeighbors', make_tsne('sqeuclidean')),
    ]

    # init the plot: one row per dataset, one column per TSNE entry
    nrows = len(benchmark_sets)
    ncols = sum(1 for name, _ in transformers if 'TSNE' in name)
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False,
                             figsize=(5 * ncols, 4 * nrows))
    axes = axes.ravel()
    i_ax = 0

    # Column width of the duration report; it depends only on the names, so
    # it is computed once rather than on every timing.
    longest = max(len(name) for name, _ in transformers)

    for dataset_name, (X, y) in benchmark_sets:

        msg = 'Benchmarking on %s:' % dataset_name
        print('\n%s\n%s' % (msg, '-' * len(msg)))

        for transformer_name, transformer in transformers:
            start = time.time()
            Xt = transformer.fit_transform(X)
            duration = time.time() - start

            # print the duration report
            whitespaces = ' ' * (longest - len(transformer_name))
            print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration))

            # plot TSNE embedding which should be very similar across methods
            if 'TSNE' in transformer_name:
                axes[i_ax].set_title(transformer_name + '\non ' + dataset_name)
                axes[i_ax].scatter(Xt[:, 0], Xt[:, 1], c=y.astype(np.int32),
                                   alpha=0.2, cmap=plt.cm.viridis)
                axes[i_ax].xaxis.set_major_formatter(NullFormatter())
                axes[i_ax].yaxis.set_major_formatter(NullFormatter())
                axes[i_ax].axis('tight')
                i_ax += 1

    fig.tight_layout()
    plt.show()


# Entry point: first check the wrappers against KNeighborsTransformer, then
# run the timing benchmark.
if __name__ == '__main__':
    test_transformers()
    run_benchmark()

Benchmarking on MNIST_2000:
---------------------------

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-10-853c6a71b56e> in <module>
     83 if __name__ == '__main__':
     84     test_transformers()
---> 85     run_benchmark()

<ipython-input-10-853c6a71b56e> in run_benchmark()
     59         for transformer_name, transformer in transformers:
     60             start = time.time()
---> 61             Xt = transformer.fit_transform(X)
     62             duration = time.time() - start
     63 

<ipython-input-6-ce66225c39c5> in fit_transform(self, X, y)
     22 
     23     def fit_transform(self, X, y=None):
---> 24         return self.fit(X)._transform(X=None)
     25 
     26     def _transform(self, X):

<ipython-input-6-ce66225c39c5> in fit(self, X)
     14         self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric)
     15         for i, x in enumerate(X):
---> 16             self.annoy_.add_item(i, x.tolist())
     17         self.annoy_.build(self.n_trees)
     18         return self

AttributeError: 'str' object has no attribute 'tolist'


from tempfile import TemporaryDirectory
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsTransformer as KNT, KNeighborsClassifier as KNC
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline


X, y = load_digits(return_X_y=True)
n_neighbors_list = list(range(1, 10))

# The graph step is built once with the largest neighbor count in the grid;
# the classifier then works on that precomputed graph with whatever
# n_neighbors value the search assigns to it.
graph_model = KNT(n_neighbors=max(n_neighbors_list), mode='distance')
classifier_model = KNC(metric='precomputed')

# Only the classifier's neighbor count is searched.
param_grid = {'classifier__n_neighbors': n_neighbors_list}

# `memory` points the pipeline at a throwaway directory in which the graph
# computation is cached; the directory is removed when the block exits.
with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    full_model = Pipeline(
        steps=[('graph', graph_model), ('classifier', classifier_model)],
        memory=tmpdir,
    )
    grid_model = GridSearchCV(full_model, param_grid)
    grid_model.fit(X, y)


# Two panels side by side: mean test score (left) and mean fit time (right),
# each with its standard deviation as error bars.
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
score_ax, time_ax = axes[0], axes[1]

score_ax.errorbar(x=n_neighbors_list,
                  y=grid_model.cv_results_['mean_test_score'],
                  yerr=grid_model.cv_results_['std_test_score'])
score_ax.set(xlabel='n_neighbors', title='Classification accuracy')

time_ax.errorbar(x=n_neighbors_list,
                 y=grid_model.cv_results_['mean_fit_time'],
                 yerr=grid_model.cv_results_['std_fit_time'],
                 color='r')
time_ax.set(xlabel='n_neighbors', title='Fit time (with caching)')

fig.tight_layout()
plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.neighbors import NeighborhoodComponentsAnalysis as NCA
from matplotlib import cm
from scipy.special import logsumexp


# Create 9 two-feature samples from 3 classes (seeded, one cluster per class).
# The plots below focus on point #3: the link drawn from #3 to another point
# is thicker the closer that point is (the weight is a softmax over negative
# squared distances, so it shrinks as the distance grows).

X, y = make_classification(n_samples=9, n_features=2, n_informative=2,
                           n_redundant=0, n_classes=3, n_clusters_per_class=1,
                           class_sep=1.0, random_state=0)


def link_thickness_i(X, i):
    """Return softmax weights of point `i` towards every row of `X`.

    The weight of row j is exp(-||X[i] - X[j]||^2), normalized so that all
    weights sum to 1. Point `i` itself gets weight 0.
    """
    offsets = X[i] - X
    # Row-wise squared Euclidean norm of the offsets.
    sq_dist = np.einsum('ij,ij->i', offsets, offsets)
    # An infinite self-distance removes point i from the softmax.
    sq_dist[i] = np.inf

    # Normalize in log space (log-sum-exp trick) to avoid numerical
    # instabilities, then exponentiate.
    log_norm = logsumexp(-sq_dist)
    return np.exp(-sq_dist - log_norm)


def relate_point(X, i, ax):
    """Draw a line on `ax` from point `i` to every other row of `X`.

    Line width scales with the weight returned by `link_thickness_i`; the
    color is taken from the `Set1` colormap using the module-level labels
    `y` of the other point.
    """
    pt_i = X[i]
    # The weights depend only on X and i, so they are computed once rather
    # than once per target point.
    thickness = link_thickness_i(X, i)*3
    for j, pt_j in enumerate(X):
        if i == j:
            continue
        line = ([pt_i[0], pt_j[0]], [pt_i[1], pt_j[1]])
        ax.plot(*line, c=cm.Set1(y[j]),
                linewidth=5*thickness[j])


# Figure 1: the raw samples, each drawn as a numbered, class-colored disc.
plt.figure(1)
ax = plt.gca()
for i in range(X.shape[0]):
    px, py = X[i, 0], X[i, 1]
    ax.text(px, py, str(i), va='center', ha='center')
    ax.scatter(px, py, s=300, c=cm.Set1(y[[i]]), alpha=0.4)

ax.set_title("Original points")
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
ax.axis('equal')  # so that boundaries are displayed correctly as circles

# Draw the links from point #3 to all other points.
i = 3
relate_point(X, i, ax)
plt.show()


# Learn the NCA transformation from the labelled samples (seeded, at most
# 30 iterations).
nca = NCA(max_iter=30, random_state=0).fit(X, y)

# Figure 2: the same samples after the learned transformation.
plt.figure(2)
ax2 = plt.gca()
X_embedded = nca.transform(X)

# `i` still holds the focus point chosen for the first figure, so the links
# are drawn from the same sample.
relate_point(X_embedded, i, ax2)

for i in range(len(X)):
    ex, ey = X_embedded[i, 0], X_embedded[i, 1]
    ax2.text(ex, ey, str(i), va='center', ha='center')
    ax2.scatter(ex, ey, s=300, c=cm.Set1(y[[i]]), alpha=0.4)

ax2.set_title("NCA embedding")
ax2.axes.get_xaxis().set_visible(False)
ax2.axes.get_yaxis().set_visible(False)
ax2.axis('equal')
plt.show()


from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Stratified split of iris with 70% of the samples held out for testing.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.7, random_state=42)

# NCA learns a transformation of the features; a 3-nearest-neighbor
# classifier is then fitted on the transformed data.
nca = NeighborhoodComponentsAnalysis(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
nca_pipe = Pipeline(steps=[('nca', nca), ('knn', knn)])

nca_pipe.fit(X_train, y_train)
print("score:\t",nca_pipe.score(X_test, y_test))

score:	 0.9619047619047619


import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import (KNeighborsClassifier as KNC,
                               NeighborhoodComponentsAnalysis as NCA)
from sklearn.pipeline import Pipeline


n_neighbors = 1

# Iris restricted to features 0 and 2 so the decision regions can be drawn
# in a 2-D plane.
dataset = datasets.load_iris()
X = dataset.data[:, [0, 2]]
y = dataset.target

# Stratified split with 70% of the samples held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.7, random_state=42)

h = .01  # step size in the mesh
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# The two pipelines differ only in the NCA step between scaling and KNN.
names = ['KNN', 'NCA, KNN']
knn_only = Pipeline([('scaler', StandardScaler()),
                     ('knn', KNC(n_neighbors=n_neighbors))])
nca_then_knn = Pipeline([('scaler', StandardScaler()),
                         ('nca', NCA()),
                         ('knn', KNC(n_neighbors=n_neighbors))])
classifiers = [knn_only, nca_then_knn]

# Evaluation grid covering the data range plus a margin of 1 on each side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

for name, clf in zip(names, classifiers):

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    # Decision regions: predict a class for every grid point and fold the
    # flat predictions back into the grid shape.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, alpha=.8, shading='auto')

    # Overlay all samples (training and testing alike).
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("{} (k = {})".format(name, n_neighbors))

    # Test score printed in the lower-right corner (axes coordinates).
    plt.text(0.9, 0.1, '{:.2f}'.format(score), size=15,
             ha='center', va='center', transform=plt.gca().transAxes)

plt.show()


import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors import (KNeighborsClassifier as KNC,
                               NeighborhoodComponentsAnalysis as NCA)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


n_neighbors = 3
random_state = 0

# Digits dataset, split in half with stratification.
X, y = datasets.load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=random_state)

dim = X.shape[1]
nn = np.unique(y).size

# Three ways of reducing the standardized data to 2 dimensions.
pca = make_pipeline(StandardScaler(),
                    PCA(n_components=2, random_state=random_state))
lda = make_pipeline(StandardScaler(),
                    LDA(n_components=2))
nca = make_pipeline(StandardScaler(),
                    NCA(n_components=2, random_state=random_state))

# A nearest-neighbor classifier on the 2-D embedding scores each method.
knn = KNC(n_neighbors=n_neighbors)
methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]

for i, (name, model) in enumerate(methods):
    plt.figure()
    # plt.subplot(1, 3, i + 1, aspect=1)

    # Learn the embedding on the training half only.
    model.fit(X_train, y_train)

    # Fit KNN on the embedded training half, score on the embedded test half.
    train_embedded = model.transform(X_train)
    knn.fit(train_embedded, y_train)
    acc_knn = knn.score(model.transform(X_test), y_test)

    # Embed the whole dataset for the scatter plot.
    X_embedded = model.transform(X)
    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1')

    plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name,
                                                              n_neighbors,
                                                              acc_knn))
plt.show()

Nearest Neighbors¶

Base Algorithms¶

Example: find NNs between 2 sets of data¶

Example: Return a Graph (sparse) w/ connections between points¶

Example: KDTree implementation¶

KNN and Radius-based NN Classification¶

Example: 3-class NN analysis - uniform vs distance weights¶

KNN and Radius-based NN Regression¶

Example: NN regression - uniform vs distance weights¶

Example: upper/lower half face matching¶

Algorithm Comparison¶

Nearest Centroid Classifier¶

Example: 3-class classification with different shrink thresholds¶

KNN and Radius-based NN Transformers¶

Example: Approximate NNs pipelined to TSNE¶

Example: Nearest Neighbors Caching¶

Neighborhood Components Analysis¶

Example: Using NCA to learn an embedding¶

NCA Classification¶

Example: NN classification with/without NCA¶

NCA and Dimensionality Reduction¶

Example: Dimensionality Reduction - NCA vs LDA vs PCA¶

Nearest Neighbors¶

Base Algorithms¶

Example: find NNs between 2 sets of data¶

Example: Return a Graph (sparse) w/ connections between points¶

Example: KDTree implementation¶

KNN and Radius-based NN Classification¶

Example: 3-class NN analysis - uniform vs distance weights¶

KNN and Radius-based NN Regression¶

Example: NN regression - uniform vs distance weights¶

Example: upper/lower half face matching¶

Algorithm Comparison¶

Nearest Centroid Classifier¶

Example: 3-class classification with different shrink thresholds¶

KNN and Radius-based NN Transformers¶

Example: Approximate NNs pipelined to TSNE¶

Example: Nearest Neighbors Caching¶

Neighborhood Components Analysis¶

Example: Using NCA to learn an embedding¶

NCA Classification¶

Example: NN classification with/without NCA¶

NCA and Dimensionality Reduction¶

Example: Dimensionality Reduction - NCA vs LDA vs PCA¶

Nearest Neighbors ¶

Neighborhood Components Analysis ¶