Fitted cluster assignments are stored in the labels_ attribute after learning the data. Most algorithms take data matrices of shape (#samples, #features) as inputs - they can be obtained from feature_extraction classes. Some also accept precomputed similarity or distance matrices of shape (#samples, #samples) as inputs - they can be obtained from pairwise functions. (A short sketch of both input conventions follows the example below.)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Incorrect number of clusters
n, rndm = 1500, 170
X, y = make_blobs(n_samples=n, random_state=rndm)
y_pred1 = KMeans(n_clusters=2, random_state=rndm).fit_predict(X)
# Anisotropicly distributed data
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X_aniso = np.dot(X, transformation)
y_pred2 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_aniso)
# Different variance
X_varied, y_varied = make_blobs(n_samples=n,
cluster_std=[1.0, 2.5, 0.5],
random_state=rndm)
y_pred3 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_varied)
# Unevenly sized blobs
X_filtered = np.vstack((X[y == 0][:500],
X[y == 1][:100],
X[y == 2][:10]))
y_pred4 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_filtered)
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=y_pred1)
plt.title("Incorrect Number of Blobs")
plt.subplot(222)
plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred2)
plt.title("Anisotropicly Distributed Blobs")
plt.subplot(223)
plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred3)
plt.title("Unequal Variance")
plt.subplot(224)
plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred4)
plt.title("Unevenly Sized Blobs")
plt.show()
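A minimal sketch of the input/label conventions noted above (toy data; parameter values are purely illustrative): a fitted estimator exposes its cluster assignments in labels_, and an algorithm such as DBSCAN can also consume a precomputed (#samples, #samples) distance matrix built with a pairwise function.
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
X_demo, _ = make_blobs(n_samples=300, random_state=0)
# (#samples, #features) feature matrix as input; labels_ is filled after fit()
km = KMeans(n_clusters=3, random_state=0).fit(X_demo)
print(km.labels_[:10])
# (#samples, #samples) precomputed distance matrix as input
D = pairwise_distances(X_demo)
db = DBSCAN(eps=1.0, min_samples=10, metric="precomputed").fit(D)
print(np.unique(db.labels_))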
init="k-means++" initializes the centroids to be (generally) distant from each other, which usually leads to better results than random initialization. sample_weight enables sample weighting: a sample with weight=2 is equivalent to adding a duplicate of that sample to the dataset. (A small sample_weight sketch follows the example below.)
import numpy as np
from sklearn.datasets import load_digits
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
data, labels = load_digits(return_X_y=True)
(samples, features), digits = data.shape, np.unique(labels).size
# PCA reduces #dims from 64 to 2
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=digits, n_init=4)
kmeans.fit(reduced_data)
# Mesh step size
h = .02
# Plot the decision boundary by assigning a color to each point in the mesh.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape); plt.figure(1); plt.clf()
plt.imshow(Z, interpolation="nearest",
extent=(xx.min(), xx.max(), yy.min(), yy.max()),
cmap=plt.cm.Paired, aspect="auto", origin="lower")
plt.plot(reduced_data[:, 0],
reduced_data[:, 1], 'k.', markersize=2)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0],
centroids[:, 1],
marker="x", s=169, linewidths=3, color="w", zorder=10)
plt.title("K-means on digits (PCA-reduced)\n" "Centroids = white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(()); plt.yticks(()); plt.show()
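The digits example above exercises init="k-means++"; sample_weight is not shown there. A minimal, hedged sketch of its effect (synthetic blobs and weights chosen purely for illustration):
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
Xw, _ = make_blobs(n_samples=200, centers=3, random_state=0)
w = np.ones(len(Xw)); w[:50] = 2.0        # weight=2 ~ duplicating those 50 rows
km_plain    = KMeans(n_clusters=3, random_state=0).fit(Xw)
km_weighted = KMeans(n_clusters=3, random_state=0).fit(Xw, sample_weight=w)
print(km_plain.cluster_centers_)
print(km_weighted.cluster_centers_)       # centroids shift toward the up-weighted samples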
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans as KMMB, KMeans as KM
from sklearn.metrics.pairwise import pairwise_distances_argmin as PDA
from sklearn.datasets import make_blobs
np.random.seed(0)
batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(
n_samples=3000,
centers=centers,
cluster_std=0.7)
k_means = KM( init='k-means++', n_clusters=3, n_init=10)
mbk = KMMB(init='k-means++', n_clusters=3, batch_size=batch_size,
n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time(); k_means.fit(X); t_batch = time.time() - t0
t0 = time.time(); mbk.fit(X); t_mini_batch = time.time() - t0
fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
k_means_cluster_centers = k_means.cluster_centers_
order = PDA(k_means.cluster_centers_,
mbk.cluster_centers_)
mbk_means_cluster_centers = mbk.cluster_centers_[order]
k_means_labels = PDA(X, k_means_cluster_centers)
mbk_means_labels = PDA(X, mbk_means_cluster_centers)
# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
my_members = k_means_labels == k
cluster_center = k_means_cluster_centers[k]
ax.plot(X[my_members, 0],
X[my_members, 1], 'w',
markerfacecolor=col,
marker='.')
ax.plot(cluster_center[0],
cluster_center[1], 'o',
markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('KMeans'); ax.set_xticks(()); ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (
t_batch, k_means.inertia_))
# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
for k, col in zip(range(n_clusters), colors):
my_members = mbk_means_labels == k
cluster_center = mbk_means_cluster_centers[k]
ax.plot(X[my_members, 0],
X[my_members, 1], 'w',
markerfacecolor=col, marker='.')
ax.plot(cluster_center[0],
cluster_center[1], 'o',
markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans'); ax.set_xticks(()); ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %
(t_mini_batch, mbk.inertia_))
# Initialise the 'different' array to all False (no point has label 4)
different = (mbk_means_labels == 4)
ax = fig.add_subplot(1, 3, 3)
for k in range(n_clusters):
different += ((k_means_labels == k) != (mbk_means_labels == k))
identic = np.logical_not(different)
ax.plot(X[identic, 0], X[identic, 1], 'w',
markerfacecolor='#bbbbbb', marker='.')
ax.plot(X[different, 0], X[different, 1], 'w',
markerfacecolor='m', marker='.')
ax.set_title('Difference'); ax.set_xticks(()); ax.set_yticks(())
plt.show()
Affinity propagation's key parameters are preference (which controls how many exemplars are used) and the damping factor (which damps the messages to help avoid oscillation). (A short damping sketch follows this example.)
from sklearn.cluster import AffinityPropagation as AP
from sklearn import metrics
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from itertools import cycle
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300,
centers=centers,
cluster_std=0.5,
random_state=None)
af = AP(preference=-50).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
(A FutureWarning may appear here: AffinityPropagation's random_state parameter was introduced in scikit-learn 0.23 and its default changes in 1.0; set it explicitly to silence the warning.)
# metrics
print('Estimated #clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" %
metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" %
metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" %
metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels, metric='sqeuclidean'))
Estimated #clusters: 3
Homogeneity: 0.896
Completeness: 0.897
V-measure: 0.897
Adjusted Rand Index: 0.922
Adjusted Mutual Information: 0.896
Silhouette Coefficient: 0.766
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
class_members = labels == k
cluster_center = X[cluster_centers_indices[k]]
plt.plot(X[class_members, 0],
X[class_members, 1], col + '.')
plt.plot(cluster_center[0],
cluster_center[1],
'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
for x in X[class_members]:
plt.plot([cluster_center[0], x[0]],
[cluster_center[1], x[1]], col)
plt.title('Estimated #clusters: %d' % n_clusters_)
plt.show()
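The example above sets preference only; damping is left at its default of 0.5. A quick sketch of varying it (blob data and values are illustrative; higher damping damps the message updates more strongly):
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
Xd, _ = make_blobs(n_samples=300, centers=[[1, 1], [-1, -1], [1, -1]],
                   cluster_std=0.5, random_state=0)
for damping in (0.5, 0.7, 0.9):
    af_d = AffinityPropagation(damping=damping, preference=-50, random_state=0).fit(Xd)
    print("damping=%.1f -> %d clusters" % (damping, len(af_d.cluster_centers_indices_)))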
import sys
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import pandas as pd
from sklearn import cluster, covariance, manifold
symbol_dict = {
'TOT': 'Total', 'XOM': 'Exxon', 'CVX': 'Chevron', 'COP': 'ConocoPhillips',
'VLO': 'Valero Energy', 'MSFT': 'Microsoft', 'IBM': 'IBM',
'TWX': 'Time Warner', 'CMCSA': 'Comcast', 'CVC': 'Cablevision',
'YHOO': 'Yahoo', 'DELL': 'Dell', 'HPQ': 'HP', 'AMZN': 'Amazon',
'TM': 'Toyota', 'CAJ': 'Canon', 'SNE': 'Sony', 'F': 'Ford',
'HMC': 'Honda', 'NAV': 'Navistar', 'NOC': 'Northrop Grumman', 'BA': 'Boeing',
'KO': 'Coca Cola', 'MMM': '3M', 'MCD': 'McDonald\'s', 'PEP': 'Pepsi',
'K': 'Kellogg', 'UN': 'Unilever', 'MAR': 'Marriott', 'PG': 'Procter Gamble',
'CL': 'Colgate-Palmolive', 'GE': 'General Electrics', 'WFC': 'Wells Fargo', 'JPM': 'JPMorgan Chase',
'AIG': 'AIG', 'AXP': 'American express', 'BAC': 'Bank of America', 'GS': 'Goldman Sachs',
'AAPL': 'Apple', 'SAP': 'SAP', 'CSCO': 'Cisco', 'TXN': 'Texas Instruments',
'XRX': 'Xerox', 'WMT': 'Wal-Mart', 'HD': 'Home Depot', 'GSK': 'GlaxoSmithKline',
'PFE': 'Pfizer', 'SNY': 'Sanofi-Aventis', 'NVS': 'Novartis', 'KMB': 'Kimberly-Clark',
'R': 'Ryder', 'GD': 'General Dynamics', 'RTN': 'Raytheon', 'CVS': 'CVS',
'CAT': 'Caterpillar', 'DD': 'DuPont de Nemours'}
symbols, names = np.array(sorted(symbol_dict.items())).T
quotes = []
for symbol in symbols:
print('Fetching quote history for %r' % symbol, file=sys.stderr)
url = ('https://raw.githubusercontent.com/scikit-learn/examples-data/'
'master/financial-data/{}.csv')
quotes.append(pd.read_csv(url.format(symbol)))
close_prices = np.vstack([q['close'] for q in quotes])
open_prices = np.vstack([q['open'] for q in quotes])
variation = close_prices - open_prices
edge_model = covariance.GraphicalLassoCV()
# standardize time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)
(console output: one "Fetching quote history for ..." line per ticker symbol, followed by a NumPy RuntimeWarning about an invalid value encountered in subtract.)
GraphicalLassoCV()
_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
n_labels = labels.max()
for i in range(n_labels + 1):
print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
# #############################################################################
# Find low-D embedding for visualization
# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
n_components=2, eigen_solver='dense', n_neighbors=6)
embedding = node_position_model.fit_transform(X.T).T
Cluster 1: Apple, Amazon, Yahoo
Cluster 2: Comcast, Cablevision, Time Warner
Cluster 3: ConocoPhillips, Chevron, Total, Valero Energy, Exxon
Cluster 4: Cisco, Dell, HP, IBM, Microsoft, SAP, Texas Instruments
Cluster 5: Boeing, General Dynamics, Northrop Grumman, Raytheon
Cluster 6: AIG, American express, Bank of America, Caterpillar, CVS, DuPont de Nemours, Ford, General Electrics, Goldman Sachs, Home Depot, JPMorgan Chase, Marriott, 3M, Ryder, Wells Fargo, Wal-Mart
Cluster 7: McDonald's
Cluster 8: GlaxoSmithKline, Novartis, Pfizer, Sanofi-Aventis, Unilever
Cluster 9: Kellogg, Coca Cola, Pepsi
Cluster 10: Colgate-Palmolive, Kimberly-Clark, Procter Gamble
Cluster 11: Canon, Honda, Navistar, Sony, Toyota, Xerox
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')
# Display a graph of the partial correlations
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the nodes using the coordinates of our embedding
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
cmap=plt.cm.nipy_spectral)
# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
# linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [[embedding[:, start], embedding[:, stop]]
for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(segments,
zorder=0, cmap=plt.cm.hot_r,
norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)
# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(
zip(names, labels, embedding.T)):
dx = x - embedding[0]
dx[index] = 1
dy = y - embedding[1]
dy[index] = 1
this_dx = dx[np.argmin(np.abs(dy))]
this_dy = dy[np.argmin(np.abs(dx))]
if this_dx > 0:
horizontalalignment = 'left'
x = x + .002
else:
horizontalalignment = 'right'
x = x - .002
if this_dy > 0:
verticalalignment = 'bottom'
y = y + .002
else:
verticalalignment = 'top'
y = y - .002
plt.text(x, y, name, size=10,
horizontalalignment=horizontalalignment,
verticalalignment=verticalalignment,
bbox=dict(facecolor='w',
edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
alpha=.6))
plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
embedding[0].max() + .10 * embedding[0].ptp(),)
plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
embedding[1].max() + .03 * embedding[1].ptp())
plt.show()
bandwidth sets the size of the region to search through. It can be set manually or estimated with the estimate_bandwidth function. (A manual-bandwidth sketch follows this example.)
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs
centers = [[1, 1], [-1, -1], [1, -1]]
X, _ = make_blobs(n_samples=10000,
centers=centers,
cluster_std=0.6)
bandwidth = estimate_bandwidth(X,
quantile=0.2,
n_samples=500)
ms = MeanShift(bandwidth=bandwidth,
bin_seeding=True).fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("#estimated clusters : %d" % n_clusters_)
#estimated clusters : 3
import matplotlib.pyplot as plt
from itertools import cycle
plt.figure(1); plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
my_members = labels == k
cluster_center = cluster_centers[k]
plt.plot(X[my_members, 0],
X[my_members, 1], col + '.')
plt.plot(cluster_center[0],
cluster_center[1],
'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
plt.title('#estimated clusters: %d' % n_clusters_)
Text(0.5, 1.0, '#estimated clusters: 3')
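For comparison with the estimated value above, bandwidth can also be set manually; smaller bandwidths generally yield more clusters (a short sketch, values illustrative):
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs
Xb, _ = make_blobs(n_samples=3000, centers=[[1, 1], [-1, -1], [1, -1]], cluster_std=0.6)
for bw in (0.5, 1.0, 2.0):
    ms_bw = MeanShift(bandwidth=bw, bin_seeding=True).fit(Xb)
    print("bandwidth=%.1f -> %d clusters" % (bw, len(ms_bw.cluster_centers_)))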
The amg solver can be used for the eigenvalue computation (it requires the pyamg module). This example forces arpack instead, since amg is numerically unstable here; a short amg sketch follows it.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import image
from sklearn.cluster import spectral_clustering as SC
l = 100; x, y = np.indices((l, l))
center1 = (28, 24); center2 = (40, 50)
center3 = (67, 58); center4 = (24, 70)
radius1, radius2, radius3, radius4 = 16, 14, 15, 14
circle1 = (x-center1[0]) ** 2+(y-center1[1]) ** 2 < radius1 ** 2
circle2 = (x-center2[0]) ** 2+(y-center2[1]) ** 2 < radius2 ** 2
circle3 = (x-center3[0]) ** 2+(y-center3[1]) ** 2 < radius3 ** 2
circle4 = (x-center4[0]) ** 2+(y-center4[1]) ** 2 < radius4 ** 2
img = circle1 + circle2 + circle3 + circle4
# Use a mask that limits the graph to the foreground:
# we are not interested in separating the objects from the background,
# but separating them one from the other.
mask = img.astype(bool)
img = img.astype(float)
img += 1 + 0.2 * np.random.randn(*img.shape)
# Convert image to graph with value of gradient on the edges.
graph = image.img_to_graph(img, mask=mask)
# Take a decreasing function of the gradient: when it depends only weakly
# on the gradient, the segmentation is close to a Voronoi partition.
graph.data = np.exp(-graph.data / graph.data.std())
# Force the solver to be arpack, since amg is numerically
# unstable on this example
labels = SC(graph, n_clusters=4, eigen_solver='arpack')
label_im = np.full(mask.shape, -1.)
label_im[mask] = labels
plt.matshow(img)
plt.matshow(label_im)
<matplotlib.image.AxesImage at 0x7f6565876970>
img = circle1 + circle2
mask = img.astype(bool)
img = img.astype(float)
img += 1 + 0.2 * np.random.randn(*img.shape)
graph = image.img_to_graph(img, mask=mask)
graph.data = np.exp(-graph.data / graph.data.std())
labels = SC(graph,
n_clusters=2, eigen_solver='arpack')
label_im = np.full(mask.shape, -1.)
label_im[mask] = labels
plt.matshow(img)
plt.matshow(label_im)
plt.show()
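The example above forces eigen_solver='arpack'. When pyamg is installed, the amg solver can be requested instead; a hedged sketch of the call on a throwaway image graph (the random image is only there to exercise the API):
import numpy as np
from sklearn.feature_extraction import image
from sklearn.cluster import spectral_clustering
rng = np.random.RandomState(0)
demo_img = rng.rand(30, 30)
demo_graph = image.img_to_graph(demo_img)
demo_graph.data = np.exp(-demo_graph.data / demo_graph.data.std())
try:
    import pyamg                      # noqa: F401 -- only checking availability
    solver = 'amg'
except ImportError:
    solver = 'arpack'                 # fall back if pyamg is missing
demo_labels = spectral_clustering(demo_graph, n_clusters=4,
                                  eigen_solver=solver, random_state=0)
print(solver, np.bincount(demo_labels))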
assign_labels="kmeans" can match fine-level details, but can be unstable and is not reproducible unless you control random_state.
assign_labels="discretize" is fully reproducible, but tends to produce parcels of fairly even & geometric shape.
import time
import numpy as np
from scipy.ndimage import gaussian_filter as GF
import matplotlib.pyplot as plt
import skimage
from skimage.data import coins
from skimage.transform import rescale
from sklearn.feature_extraction import image
from sklearn.cluster import spectral_clustering as SC
from sklearn.utils.fixes import parse_version as PV
# these were introduced in skimage-0.14
if PV(skimage.__version__) >= PV('0.14'):
rescale_params = {'anti_aliasing': False,
'multichannel': False}
else:
rescale_params = {}
# Load coins as NumPy array
# Resize it to 20% of the original size to speed up the processing
# Apply Gaussian filter for smoothing prior to down-scaling
# reduces aliasing artifacts.
# Convert image to graph with gradient value on the edges.
orig_coins = coins()
smoothened_coins = GF(orig_coins, sigma=2)
rescaled_coins = rescale(smoothened_coins,
0.2,
mode="reflect",
**rescale_params)
graph = image.img_to_graph(rescaled_coins)
# Take a decreasing function of the gradient: an exponential
# The smaller beta is, the more independent the segmentation is from the actual image.
# For beta=1, the segmentation is close to a Voronoi partition.
beta,eps = 10, 1e-6
graph.data = np.exp(-beta*graph.data/graph.data.std()) + eps
# Apply spectral clustering (much faster if pyamg installed)
N_REGIONS = 25
for strategy in ('kmeans', 'discretize'):
t0 = time.time()
labels = SC(graph,
n_clusters=N_REGIONS,
assign_labels=strategy, random_state=42)
t1 = time.time()
labels = labels.reshape(rescaled_coins.shape)
plt.figure(figsize=(5, 5))
plt.imshow(rescaled_coins, cmap=plt.cm.gray)
for l in range(N_REGIONS):
plt.contour(labels == l,
colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))])
plt.xticks(()); plt.yticks(())
title = 'Spectral clustering: %s, %.2fs' % (strategy, (t1 - t0))
print(title)
plt.title(title)
plt.show()
Spectral clustering: kmeans, 6.00s
Spectral clustering: discretize, 3.79s
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler as SS
from itertools import cycle, islice
np.random.seed(0)
#from sklearn.cluster import AgglomerativeClustering as AC
n = 1500
noisy_circles = datasets.make_circles(n_samples=n, factor=.5, noise=.05)
noisy_moons = datasets.make_moons( n_samples=n, noise=.05)
blobs = datasets.make_blobs( n_samples=n, random_state=8)
no_structure = np.random.rand(n,2), None
# anisotropicly distributed data
X, y = datasets.make_blobs(n_samples=n, random_state=170)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n,
cluster_std=[1.0, 2.5, 0.5],
random_state=170)
plt.figure(figsize=(9 * 1.3 + 2, 14.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
wspace=.05, hspace=.01)
plot_num = 1
default_base = {'n_neighbors': 10,
'n_clusters': 3}
datasets = [
(noisy_circles, {'n_clusters': 2}),
(noisy_moons, {'n_clusters': 2}),
(varied, {'n_neighbors': 2}),
(aniso, {'n_neighbors': 2}),
(blobs, {}),
(no_structure, {})]
for i_dataset, (dataset, algo_params) in enumerate(datasets):
params = default_base.copy() # update params w/ dataset specifics
params.update(algo_params)
X, y = dataset
X = SS().fit_transform(X) #normalize dataset
ward = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='ward')
complete = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='complete')
average = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='average')
single = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='single')
clustering_algorithms = (
('Single Linkage', single),
('Average Linkage', average),
('Complete Linkage', complete),
('Ward Linkage', ward),
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="#connected components of the " +
"connectivity matrix is [0-9]{1,2}" +
" > 1. Completing it to avoid stopping the tree early.",
category=UserWarning)
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, 'labels_'):
y_pred = algorithm.labels_.astype(int)
else:
y_pred = algorithm.predict(X)
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
'#f781bf', '#a65628', '#984ea3',
'#999999', '#e41a1c', '#dede00']),
int(max(y_pred) + 1))))
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
plt.xlim(-2.5, 2.5); plt.ylim(-2.5, 2.5)
plt.xticks(()); plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering as AC
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_,
model.distances_,
counts]).astype(float)
dendrogram(linkage_matrix, **kwargs)
iris = load_iris(); X = iris.data
# distance_threshold=0 ensures we compute the full tree.
model = AC(distance_threshold=0,
n_clusters=None).fit(X)
plt.title('Hierarchical Clustering Dendrogram')
plot_dendrogram(model, truncate_mode='level', p=3)
plt.xlabel("#points in node (or index of point if no parenthesis).")
plt.show()
import time as time
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.datasets import make_swiss_roll as SR
# make a "thin" swiss roll
n, noise = 1500, 0.05
X, _ = SR(n, noise=noise); X[:, 1] *= .5
t0 = time.time()
ward = AC(n_clusters=6, linkage='ward').fit(X)
t1 = time.time() - t0
label = ward.labels_
print("Elapsed time: %.2fs" % t1)
print("#points: %i" % label.size)
fig = plt.figure()
ax = p3.Axes3D(fig)
ax.view_init(7, -80)
for l in np.unique(label):
ax.scatter(X[label == l, 0],
X[label == l, 1],
X[label == l, 2],
color=plt.cm.jet(float(l) / np.max(label + 1)), s=20, edgecolor='k')
plt.title('Without connectivity constraints (time %.2fs)' % t1)
Elapsed time: 0.04s
#points: 1500
Text(0.5, 0.92, 'Without connectivity constraints (time 0.04s)')
# define data structure - 10 NNs
from sklearn.neighbors import kneighbors_graph as KNG
connectivity = KNG(X, n_neighbors=10, include_self=False)
t0 = time.time()
ward = AC(n_clusters=6, connectivity=connectivity, linkage='ward').fit(X)
t1 = time.time() - t0
label = ward.labels_
print("Elapsed time: %.2fs" % t1)
print("#points: %i" % label.size)
fig = plt.figure()
ax = p3.Axes3D(fig)
ax.view_init(7, -80)
for l in np.unique(label):
ax.scatter(X[label == l, 0],
X[label == l, 1],
X[label == l, 2],
color=plt.cm.jet(float(l) / np.max(label + 1)), s=20, edgecolor='k')
plt.title('WITH connectivity constraints (time %.2fs)' % t1)
plt.show()
Elapsed time: 0.06s
#points: 1500
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.neighbors import kneighbors_graph as KNG
np.random.seed(0)
n=1500; t=1.5*np.pi*(1+3*np.random.rand(1,n))
x,y = t*np.cos(t), t*np.sin(t)
X = np.concatenate((x,y));
X += 0.7*np.random.randn(2,n); X=X.T
# create local connectivity graph
# larger #neighbors = more homogeneous clusters, but more computation time
knn_graph = KNG(X,30,include_self=False)
for connectivity in (None, knn_graph):
for n_clusters in (30, 3):
plt.figure(figsize=(10, 4))
for index, linkage in enumerate(('average',
'complete',
'ward',
'single')):
plt.subplot(1, 4, index+1)
model = AC(linkage=linkage,
connectivity=connectivity,
n_clusters=n_clusters)
t0 = time.time(); model.fit(X); t1 = time.time()-t0
plt.scatter(X[:, 0],
X[:, 1], c=model.labels_, cmap=plt.cm.nipy_spectral)
plt.title('linkage=%s\n(time %.2fs)' % (linkage, t1),
fontdict=dict(verticalalignment='top'))
plt.axis('equal')
plt.axis('off')
plt.subplots_adjust(bottom=0, top=.83, wspace=0, left=0, right=1)
plt.suptitle('n_cluster=%i, connectivity=%r' %
(n_clusters, connectivity is not None), size=17)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.metrics import pairwise_distances as PD
np.random.seed(0)
n=2000; t = np.pi*np.linspace(0,1,n)
X,y=[],[]
def sqr(x):
return np.sign(np.cos(x))
for i, (phi,a) in enumerate([(0.5,0.15), (0.5,0.6), (0.3,0.2)]):
for _ in range(30):
phase_noise = 0.01*np.random.normal()
amplitude_noise = 0.04*np.random.normal()
more_noise = 1-2*np.random.rand(n)
# make the noise sparse
more_noise[np.abs(more_noise)<0.997]=0
X.append(12*((a+amplitude_noise)*
(sqr(6*(t+phi+phase_noise)))
+more_noise))
y.append(i)
X,y = np.array(X), np.array(y)
n_clusters=3; labels=("waveform 1","waveform 2","waveform 3")
plt.figure()
plt.axes([0, 0, 1, 1])
for l, c, n in zip(range(n_clusters), 'rgb',
labels):
lines = plt.plot(X[y == l].T, c=c, alpha=.5)
lines[0].set_label(n)
plt.legend(loc='best')
plt.axis('tight'); plt.axis('off'); plt.suptitle("Ground truth", size=20)
# Plot the distances
for index, metric in enumerate(["cosine", "euclidean", "cityblock"]):
avg_dist = np.zeros((n_clusters, n_clusters))
plt.figure(figsize=(5, 4.5))
for i in range(n_clusters):
for j in range(n_clusters):
avg_dist[i, j] = PD(X[y == i],
X[y == j], metric=metric).mean()
avg_dist /= avg_dist.max()
for i in range(n_clusters):
for j in range(n_clusters):
plt.text(i, j, '%5.3f' % avg_dist[i, j],
verticalalignment='center',
horizontalalignment='center')
plt.imshow(avg_dist, interpolation='nearest', cmap=plt.cm.gnuplot2,
vmin=0)
plt.xticks(range(n_clusters), labels, rotation=45)
plt.yticks(range(n_clusters), labels)
plt.colorbar()
plt.suptitle("Interclass %s distances" % metric, size=18)
plt.tight_layout()
# Plot clustering results
for index, metric in enumerate(["cosine", "euclidean", "cityblock"]):
model = AC(n_clusters=n_clusters,
linkage="average",
affinity=metric)
model.fit(X)
plt.figure()
plt.axes([0, 0, 1, 1])
for l, c in zip(np.arange(model.n_clusters), 'rgbk'):
plt.plot(X[model.labels_ == l].T, c=c, alpha=.5)
plt.axis('tight')
plt.axis('off')
plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20)
plt.show()
Clusters found by DBSCAN can be of any shape (unlike KMeans, which assumes convex clusters). min_samples and eps define density for this algorithm: min_samples controls the algorithm's tolerance for noise, while eps controls the local neighborhood and usually cannot be left at its default value. Core samples are samples that have at least min_samples neighbors within a distance of eps. A cluster is a set of core samples built by recursively taking a core sample, finding all of its neighbors that are also core samples, and so on.
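A minimal sketch of those definitions (toy blobs; eps and min_samples are illustrative): a fitted DBSCAN exposes core_sample_indices_, so points split into core, border (clustered but not core), and noise (label -1).
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
X_demo, _ = make_blobs(n_samples=300, cluster_std=0.6, random_state=0)
db_demo = DBSCAN(eps=0.5, min_samples=10).fit(X_demo)
core = np.zeros(len(X_demo), dtype=bool)
core[db_demo.core_sample_indices_] = True
noise = db_demo.labels_ == -1
border = ~core & ~noise
print("core: %d, border: %d, noise: %d" % (core.sum(), border.sum(), noise.sum()))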
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler as SS
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750,
centers=centers,
cluster_std=0.4,
random_state=0)
X = SS().fit_transform(X)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Est. #clusters: %d' % n_clusters_)
print('Est. #noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adj Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adj Mutual Info: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coef: %0.3f" % metrics.silhouette_score(X, labels))
Est. #clusters: 3
Est. #noise points: 18
Homogeneity: 0.953
Completeness: 0.883
V-measure: 0.917
Adj Rand Index: 0.952
Adj Mutual Info: 0.916
Silhouette Coef: 0.626
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = X[class_member_mask &
core_samples_mask]
plt.plot(xy[:, 0],
xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = X[class_member_mask &
~core_samples_mask]
plt.plot(xy[:, 0],
xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.title('Est. #clusters: %d' % n_clusters_)
plt.show()
OPTICS generalizes DBSCAN over a range of eps values. Each sample is assigned a reachability_ distance and an ordering_ value (its spot within the cluster ordering). With the default max_eps=np.inf, clusters can then be extracted repeatedly, in linear time, for any eps via the cluster_optics_dbscan method; smaller max_eps values result in shorter run times. reachability_ enables variable-density extractions; combining it with ordering_ can be used to build a reachability plot (Y-axis = reachability distance, X-axis = point ordering).
from sklearn.cluster import OPTICS, cluster_optics_dbscan as CODB
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
n_points_per_cluster = 250
C1 = [-5, -2] + 0.8 * np.random.randn(n_points_per_cluster, 2)
C2 = [ 4, -1] + 0.1 * np.random.randn(n_points_per_cluster, 2)
C3 = [ 1, -2] + 0.2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + 0.3 * np.random.randn(n_points_per_cluster, 2)
C5 = [ 3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [ 5, 6] + 2.0 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))
clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05).fit(X)
labels_050 = CODB(reachability=clust.reachability_,
core_distances=clust.core_distances_,
ordering=clust.ordering_,
eps=0.5)
labels_200 = CODB(reachability=clust.reachability_,
core_distances=clust.core_distances_,
ordering=clust.ordering_,
eps=2.0)
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])
# Reachability plot
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
Xk = space[labels == klass]
Rk = reachability[labels == klass]
ax1.plot(Xk, Rk, color, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1],
'k.', alpha=0.3)
ax1.plot(space, np.full_like(space, 2., dtype=float),
'k-', alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float),
'k-.', alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')
# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
Xk = X[clust.labels_ == klass]
ax2.plot(Xk[:, 0],
Xk[:, 1], color, alpha=0.3)
ax2.plot(X[clust.labels_ == -1, 0],
X[clust.labels_ == -1, 1], 'k+', alpha=0.1)
ax2.set_title('Automatic Clustering\nOPTICS')
# DBSCAN at 0.5
colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c']
for klass, color in zip(range(0, 6), colors):
Xk = X[labels_050 == klass]
ax3.plot(Xk[:, 0],
Xk[:, 1], color, alpha=0.3, marker='.')
ax3.plot(X[labels_050 == -1, 0],
X[labels_050 == -1, 1], 'k+', alpha=0.1)
ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')
# DBSCAN at 2.
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
Xk = X[labels_200 == klass]
ax4.plot(Xk[:, 0],
Xk[:, 1], color, alpha=0.3)
ax4.plot(X[labels_200 == -1, 0],
X[labels_200 == -1, 1], 'k+', alpha=0.1)
ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')
plt.tight_layout()
plt.show()
threshold limits the distance between an entering sample and the existing subclusters. branching_factor limits the number of subclusters in a node. If n_features > 20, consider using MiniBatchKMeans instead. Here, n_clusters=None reduces the dataset from 100K samples to 158 subclusters; this acts as a preprocessing step before a global clustering step that further reduces them to 100 clusters. (A short branching_factor sketch follows this example.)
from itertools import cycle
from time import time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.cluster import Birch, MiniBatchKMeans as MBKM
from sklearn.datasets import make_blobs
xx = np.linspace(-22, 22, 10)
yy = np.linspace(-22, 22, 10)
xx, yy = np.meshgrid(xx, yy)
n_centres = np.hstack((np.ravel(xx)[:, np.newaxis],
np.ravel(yy)[:, np.newaxis]))
X, y = make_blobs(n_samples=100000,
centers=n_centres,
random_state=0)
colors_ = cycle(colors.cnames.keys())
fig = plt.figure(figsize=(12, 4))
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)
birch_models = [Birch(threshold=1.7, n_clusters=None),
Birch(threshold=1.7, n_clusters=100)]
final_step = ['without global clustering', 'with global clustering']
for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)):
t0 = time(); birch_model.fit(X); t1 = time() - t0
print("Birch %s as the final step took %0.2f seconds" % (info, t1))
# Plot result
labels = birch_model.labels_
centroids = birch_model.subcluster_centers_
n_clusters = np.unique(labels).size
print("n_clusters : %d" % n_clusters)
ax = fig.add_subplot(1, 3, ind + 1)
for this_centroid, k, col in zip(centroids, range(n_clusters), colors_):
mask = labels == k
ax.scatter(X[mask, 0], X[mask, 1],
c='w', edgecolor=col, marker='.', alpha=0.5)
if birch_model.n_clusters is None:
ax.scatter(this_centroid[0], this_centroid[1], marker='+',
c='k', s=25)
ax.set_ylim([-25, 25])
ax.set_xlim([-25, 25])
ax.set_autoscaley_on(False)
ax.set_title('Birch %s' % info)
mbk = MBKM(init='k-means++',
n_clusters=100,
batch_size=100,
n_init=10,
max_no_improvement=10,
verbose=0,
random_state=0)
t0 = time(); mbk.fit(X); t1 = time() - t0
print("MiniBatchKMeans runtime: %0.2f seconds" % t1)
mbk_means_labels_unique = np.unique(mbk.labels_)
ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_,
range(n_clusters), colors_):
mask = mbk.labels_ == k
ax.scatter(X[mask, 0], X[mask, 1], marker='.',
c='w', edgecolor=col, alpha=0.5)
ax.scatter(this_centroid[0], this_centroid[1], marker='+',
c='k', s=25)
ax.set_xlim([-25, 25])
ax.set_ylim([-25, 25])
ax.set_title("MiniBatchKMeans")
ax.set_autoscaley_on(False)
plt.show()
Birch without global clustering as the final step took 2.44 seconds
n_clusters : 158
Birch with global clustering as the final step took 2.36 seconds
n_clusters : 100
MiniBatchKMeans runtime: 2.50 seconds
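branching_factor is not varied above; a short, hedged sketch of its effect (if any) on the number of subclusters found without global clustering (blob data and values are illustrative):
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs
X_bf, _ = make_blobs(n_samples=10000, centers=10, random_state=0)
for bf in (20, 50, 100):
    model = Birch(threshold=1.7, branching_factor=bf, n_clusters=None).fit(X_bf)
    print("branching_factor=%d -> %d subclusters" % (bf, len(model.subcluster_centers_)))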