Fitted cluster assignments are stored in the labels_ attribute after learning the data. Most algorithms take data matrices of shape (#samples, #features) as inputs - they can be obtained from feature_extraction classes. Some also accept precomputed similarity or distance matrices of shape (#samples, #samples) as inputs - they can be obtained from pairwise functions. (A short sketch of both input conventions follows the example below.)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Incorrect number of clusters
n, rndm = 1500, 170
X, y = make_blobs(n_samples=n, random_state=rndm)
y_pred1 = KMeans(n_clusters=2, random_state=rndm).fit_predict(X)
# Anisotropicly distributed data
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X_aniso = np.dot(X, transformation)
y_pred2 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_aniso)
# Different variance
X_varied, y_varied = make_blobs(n_samples=n,
cluster_std=[1.0, 2.5, 0.5],
random_state=rndm)
y_pred3 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_varied)
# Unevenly sized blobs
X_filtered = np.vstack((X[y == 0][:500],
X[y == 1][:100],
X[y == 2][:10]))
y_pred4 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_filtered)
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=y_pred1)
plt.title("Incorrect Number of Blobs")
plt.subplot(222)
plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred2)
plt.title("Anisotropicly Distributed Blobs")
plt.subplot(223)
plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred3)
plt.title("Unequal Variance")
plt.subplot(224)
plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred4)
plt.title("Unevenly Sized Blobs")
plt.show()
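A minimal sketch of the input/label conventions noted above (toy data; parameter values are purely illustrative): a fitted estimator exposes its cluster assignments in labels_, and an algorithm such as DBSCAN can also consume a precomputed (#samples, #samples) distance matrix built with a pairwise function.
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances
X_demo, _ = make_blobs(n_samples=300, random_state=0)
# (#samples, #features) feature matrix as input; labels_ is filled after fit()
km = KMeans(n_clusters=3, random_state=0).fit(X_demo)
print(km.labels_[:10])
# (#samples, #samples) precomputed distance matrix as input
D = pairwise_distances(X_demo)
db = DBSCAN(eps=1.0, min_samples=10, metric="precomputed").fit(D)
print(np.unique(db.labels_))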
init="k-means++" initializes the centroids to be (generally) distant from each other, which usually leads to better results than random initialization. sample_weight enables sample weighting: a sample with weight=2 is equivalent to adding a duplicate of that sample to the dataset. (A small sample_weight sketch follows the example below.)
import numpy as np
from sklearn.datasets import load_digits
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
data, labels = load_digits(return_X_y=True)
(samples, features), digits = data.shape, np.unique(labels).size
# PCA reduces #dims from 64 to 2
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=digits, n_init=4)
kmeans.fit(reduced_data)
# Mesh step size
h = .02
# Plot the decision boundary by assigning a color to each point in the mesh.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape); plt.figure(1); plt.clf()
plt.imshow(Z, interpolation="nearest",
extent=(xx.min(), xx.max(), yy.min(), yy.max()),
cmap=plt.cm.Paired, aspect="auto", origin="lower")
plt.plot(reduced_data[:, 0],
reduced_data[:, 1], 'k.', markersize=2)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0],
centroids[:, 1],
marker="x", s=169, linewidths=3, color="w", zorder=10)
plt.title("K-means on digits (PCA-reduced)\n" "Centroids = white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(()); plt.yticks(()); plt.show()
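The digits example above exercises init="k-means++"; sample_weight is not shown there. A minimal, hedged sketch of its effect (synthetic blobs and weights chosen purely for illustration):
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
Xw, _ = make_blobs(n_samples=200, centers=3, random_state=0)
w = np.ones(len(Xw)); w[:50] = 2.0        # weight=2 ~ duplicating those 50 rows
km_plain    = KMeans(n_clusters=3, random_state=0).fit(Xw)
km_weighted = KMeans(n_clusters=3, random_state=0).fit(Xw, sample_weight=w)
print(km_plain.cluster_centers_)
print(km_weighted.cluster_centers_)       # centroids shift toward the up-weighted samples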
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans as KMMB, KMeans as KM
from sklearn.metrics.pairwise import pairwise_distances_argmin as PDA
from sklearn.datasets import make_blobs
np.random.seed(0)
batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(
n_samples=3000,
centers=centers,
cluster_std=0.7)
k_means = KM( init='k-means++', n_clusters=3, n_init=10)
mbk = KMMB(init='k-means++', n_clusters=3, batch_size=batch_size,
n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time(); k_means.fit(X); t_batch = time.time() - t0
t0 = time.time(); mbk.fit(X); t_mini_batch = time.time() - t0
fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
k_means_cluster_centers = k_means.cluster_centers_
order = PDA(k_means.cluster_centers_,
mbk.cluster_centers_)
mbk_means_cluster_centers = mbk.cluster_centers_[order]
k_means_labels = PDA(X, k_means_cluster_centers)
mbk_means_labels = PDA(X, mbk_means_cluster_centers)
# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
my_members = k_means_labels == k
cluster_center = k_means_cluster_centers[k]
ax.plot(X[my_members, 0],
X[my_members, 1], 'w',
markerfacecolor=col,
marker='.')
ax.plot(cluster_center[0],
cluster_center[1], 'o',
markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('KMeans'); ax.set_xticks(()); ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (
t_batch, k_means.inertia_))
# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
for k, col in zip(range(n_clusters), colors):
my_members = mbk_means_labels == k
cluster_center = mbk_means_cluster_centers[k]
ax.plot(X[my_members, 0],
X[my_members, 1], 'w',
markerfacecolor=col, marker='.')
ax.plot(cluster_center[0],
cluster_center[1], 'o',
markerfacecolor=col,
markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans'); ax.set_xticks(()); ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %
(t_mini_batch, mbk.inertia_))
# Initialise the 'different' array to all False (no point has label 4)
different = (mbk_means_labels == 4)
ax = fig.add_subplot(1, 3, 3)
for k in range(n_clusters):
different += ((k_means_labels == k) != (mbk_means_labels == k))
identic = np.logical_not(different)
ax.plot(X[identic, 0], X[identic, 1], 'w',
markerfacecolor='#bbbbbb', marker='.')
ax.plot(X[different, 0], X[different, 1], 'w',
markerfacecolor='m', marker='.')
ax.set_title('Difference'); ax.set_xticks(()); ax.set_yticks(())
plt.show()
Affinity propagation's key parameters are preference (which controls how many exemplars are used) and the damping factor (which damps the messages to help avoid oscillation). (A short damping sketch follows this example.)
from sklearn.cluster import AffinityPropagation as AP
from sklearn import metrics
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from itertools import cycle
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300,
centers=centers,
cluster_std=0.5,
random_state=None)
af = AP(preference=-50).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
(A FutureWarning may appear here: AffinityPropagation's random_state parameter was introduced in scikit-learn 0.23 and its default changes in 1.0; set it explicitly to silence the warning.)
# metrics
print('Estimated #clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" %
metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" %
metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" %
metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels, metric='sqeuclidean'))
Estimated #clusters: 3
Homogeneity: 0.896
Completeness: 0.897
V-measure: 0.897
Adjusted Rand Index: 0.922
Adjusted Mutual Information: 0.896
Silhouette Coefficient: 0.766
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
class_members = labels == k
cluster_center = X[cluster_centers_indices[k]]
plt.plot(X[class_members, 0],
X[class_members, 1], col + '.')
plt.plot(cluster_center[0],
cluster_center[1],
'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
for x in X[class_members]:
plt.plot([cluster_center[0], x[0]],
[cluster_center[1], x[1]], col)
plt.title('Estimated #clusters: %d' % n_clusters_)
plt.show()
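The example above sets preference only; damping is left at its default of 0.5. A quick sketch of varying it (blob data and values are illustrative; higher damping damps the message updates more strongly):
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
Xd, _ = make_blobs(n_samples=300, centers=[[1, 1], [-1, -1], [1, -1]],
                   cluster_std=0.5, random_state=0)
for damping in (0.5, 0.7, 0.9):
    af_d = AffinityPropagation(damping=damping, preference=-50, random_state=0).fit(Xd)
    print("damping=%.1f -> %d clusters" % (damping, len(af_d.cluster_centers_indices_)))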
import sys
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import pandas as pd
from sklearn import cluster, covariance, manifold
symbol_dict = {
'TOT': 'Total', 'XOM': 'Exxon', 'CVX': 'Chevron', 'COP': 'ConocoPhillips',
'VLO': 'Valero Energy', 'MSFT': 'Microsoft', 'IBM': 'IBM',
'TWX': 'Time Warner', 'CMCSA': 'Comcast', 'CVC': 'Cablevision',
'YHOO': 'Yahoo', 'DELL': 'Dell', 'HPQ': 'HP', 'AMZN': 'Amazon',
'TM': 'Toyota', 'CAJ': 'Canon', 'SNE': 'Sony', 'F': 'Ford',
'HMC': 'Honda', 'NAV': 'Navistar', 'NOC': 'Northrop Grumman', 'BA': 'Boeing',
'KO': 'Coca Cola', 'MMM': '3M', 'MCD': 'McDonald\'s', 'PEP': 'Pepsi',
'K': 'Kellogg', 'UN': 'Unilever', 'MAR': 'Marriott', 'PG': 'Procter Gamble',
'CL': 'Colgate-Palmolive', 'GE': 'General Electrics', 'WFC': 'Wells Fargo', 'JPM': 'JPMorgan Chase',
'AIG': 'AIG', 'AXP': 'American express', 'BAC': 'Bank of America', 'GS': 'Goldman Sachs',
'AAPL': 'Apple', 'SAP': 'SAP', 'CSCO': 'Cisco', 'TXN': 'Texas Instruments',
'XRX': 'Xerox', 'WMT': 'Wal-Mart', 'HD': 'Home Depot', 'GSK': 'GlaxoSmithKline',
'PFE': 'Pfizer', 'SNY': 'Sanofi-Aventis', 'NVS': 'Novartis', 'KMB': 'Kimberly-Clark',
'R': 'Ryder', 'GD': 'General Dynamics', 'RTN': 'Raytheon', 'CVS': 'CVS',
'CAT': 'Caterpillar', 'DD': 'DuPont de Nemours'}
symbols, names = np.array(sorted(symbol_dict.items())).T
quotes = []
for symbol in symbols:
print('Fetching quote history for %r' % symbol, file=sys.stderr)
url = ('https://raw.githubusercontent.com/scikit-learn/examples-data/'
'master/financial-data/{}.csv')
quotes.append(pd.read_csv(url.format(symbol)))
close_prices = np.vstack([q['close'] for q in quotes])
open_prices = np.vstack([q['open'] for q in quotes])
variation = close_prices - open_prices
edge_model = covariance.GraphicalLassoCV()
# standardize time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)
(console output: one "Fetching quote history for ..." line per ticker symbol, followed by a NumPy RuntimeWarning about an invalid value encountered in subtract.)
GraphicalLassoCV()
_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
n_labels = labels.max()
for i in range(n_labels + 1):
print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
# #############################################################################
# Find low-D embedding for visualization
# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
n_components=2, eigen_solver='dense', n_neighbors=6)
embedding = node_position_model.fit_transform(X.T).T
Cluster 1: Apple, Amazon, Yahoo
Cluster 2: Comcast, Cablevision, Time Warner
Cluster 3: ConocoPhillips, Chevron, Total, Valero Energy, Exxon
Cluster 4: Cisco, Dell, HP, IBM, Microsoft, SAP, Texas Instruments
Cluster 5: Boeing, General Dynamics, Northrop Grumman, Raytheon
Cluster 6: AIG, American express, Bank of America, Caterpillar, CVS, DuPont de Nemours, Ford, General Electrics, Goldman Sachs, Home Depot, JPMorgan Chase, Marriott, 3M, Ryder, Wells Fargo, Wal-Mart
Cluster 7: McDonald's
Cluster 8: GlaxoSmithKline, Novartis, Pfizer, Sanofi-Aventis, Unilever
Cluster 9: Kellogg, Coca Cola, Pepsi
Cluster 10: Colgate-Palmolive, Kimberly-Clark, Procter Gamble
Cluster 11: Canon, Honda, Navistar, Sony, Toyota, Xerox
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')
# Display a graph of the partial correlations
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the nodes using the coordinates of our embedding
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
cmap=plt.cm.nipy_spectral)
# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
# linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [[embedding[:, start], embedding[:, stop]]
for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(segments,
zorder=0, cmap=plt.cm.hot_r,
norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)
# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(
zip(names, labels, embedding.T)):
dx = x - embedding[0]
dx[index] = 1
dy = y - embedding[1]
dy[index] = 1
this_dx = dx[np.argmin(np.abs(dy))]
this_dy = dy[np.argmin(np.abs(dx))]
if this_dx > 0:
horizontalalignment = 'left'
x = x + .002
else:
horizontalalignment = 'right'
x = x - .002
if this_dy > 0:
verticalalignment = 'bottom'
y = y + .002
else:
verticalalignment = 'top'
y = y - .002
plt.text(x, y, name, size=10,
horizontalalignment=horizontalalignment,
verticalalignment=verticalalignment,
bbox=dict(facecolor='w',
edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
alpha=.6))
plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
embedding[0].max() + .10 * embedding[0].ptp(),)
plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
embedding[1].max() + .03 * embedding[1].ptp())
plt.show()
bandwidth sets the size of the region to search through. It can be set manually or estimated with the estimate_bandwidth function. (A manual-bandwidth sketch follows this example.)
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs
centers = [[1, 1], [-1, -1], [1, -1]]
X, _ = make_blobs(n_samples=10000,
centers=centers,
cluster_std=0.6)
bandwidth = estimate_bandwidth(X,
quantile=0.2,
n_samples=500)
ms = MeanShift(bandwidth=bandwidth,
bin_seeding=True).fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("#estimated clusters : %d" % n_clusters_)
#estimated clusters : 3
import matplotlib.pyplot as plt
from itertools import cycle
plt.figure(1); plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
my_members = labels == k
cluster_center = cluster_centers[k]
plt.plot(X[my_members, 0],
X[my_members, 1], col + '.')
plt.plot(cluster_center[0],
cluster_center[1],
'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
plt.title('#estimated clusters: %d' % n_clusters_)
Text(0.5, 1.0, '#estimated clusters: 3')
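For comparison with the estimated value above, bandwidth can also be set manually; smaller bandwidths generally yield more clusters (a short sketch, values illustrative):
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs
Xb, _ = make_blobs(n_samples=3000, centers=[[1, 1], [-1, -1], [1, -1]], cluster_std=0.6)
for bw in (0.5, 1.0, 2.0):
    ms_bw = MeanShift(bandwidth=bw, bin_seeding=True).fit(Xb)
    print("bandwidth=%.1f -> %d clusters" % (bw, len(ms_bw.cluster_centers_)))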
The amg solver can be used for the eigenvalue computation (it requires the pyamg module). This example forces arpack instead, since amg is numerically unstable here; a short amg sketch follows it.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import image
from sklearn.cluster import spectral_clustering as SC
l = 100; x, y = np.indices((l, l))
center1 = (28, 24); center2 = (40, 50)
center3 = (67, 58); center4 = (24, 70)
radius1, radius2, radius3, radius4 = 16, 14, 15, 14
circle1 = (x-center1[0]) ** 2+(y-center1[1]) ** 2 < radius1 ** 2
circle2 = (x-center2[0]) ** 2+(y-center2[1]) ** 2 < radius2 ** 2
circle3 = (x-center3[0]) ** 2+(y-center3[1]) ** 2 < radius3 ** 2
circle4 = (x-center4[0]) ** 2+(y-center4[1]) ** 2 < radius4 ** 2
img = circle1 + circle2 + circle3 + circle4
# Use a mask that limits the graph to the foreground:
# we are not interested in separating the objects from the background,
# but separating them one from the other.
mask = img.astype(bool)
img = img.astype(float)
img += 1 + 0.2 * np.random.randn(*img.shape)
# Convert image to graph with value of gradient on the edges.
graph = image.img_to_graph(img, mask=mask)
# Take a decreasing function of the gradient: when it depends only weakly
# on the gradient, the segmentation is close to a Voronoi partition.
graph.data = np.exp(-graph.data / graph.data.std())
# Force the solver to be arpack, since amg is numerically
# unstable on this example
labels = SC(graph, n_clusters=4, eigen_solver='arpack')
label_im = np.full(mask.shape, -1.)
label_im[mask] = labels
plt.matshow(img)
plt.matshow(label_im)
<matplotlib.image.AxesImage at 0x7f6565876970>
img = circle1 + circle2
mask = img.astype(bool)
img = img.astype(float)
img += 1 + 0.2 * np.random.randn(*img.shape)
graph = image.img_to_graph(img, mask=mask)
graph.data = np.exp(-graph.data / graph.data.std())
labels = SC(graph,
n_clusters=2, eigen_solver='arpack')
label_im = np.full(mask.shape, -1.)
label_im[mask] = labels
plt.matshow(img)
plt.matshow(label_im)
plt.show()
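The example above forces eigen_solver='arpack'. When pyamg is installed, the amg solver can be requested instead; a hedged sketch of the call on a throwaway image graph (the random image is only there to exercise the API):
import numpy as np
from sklearn.feature_extraction import image
from sklearn.cluster import spectral_clustering
rng = np.random.RandomState(0)
demo_img = rng.rand(30, 30)
demo_graph = image.img_to_graph(demo_img)
demo_graph.data = np.exp(-demo_graph.data / demo_graph.data.std())
try:
    import pyamg                      # noqa: F401 -- only checking availability
    solver = 'amg'
except ImportError:
    solver = 'arpack'                 # fall back if pyamg is missing
demo_labels = spectral_clustering(demo_graph, n_clusters=4,
                                  eigen_solver=solver, random_state=0)
print(solver, np.bincount(demo_labels))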
assign_labels="kmeans" can match fine-level details, but can be unstable and is not reproducible unless you control random_state.
assign_labels="discretize" is fully reproducible, but tends to produce parcels of fairly even & geometric shape.
import time
import numpy as np
from scipy.ndimage import gaussian_filter as GF
import matplotlib.pyplot as plt
import skimage
from skimage.data import coins
from skimage.transform import rescale
from sklearn.feature_extraction import image
from sklearn.cluster import spectral_clustering as SC
from sklearn.utils.fixes import parse_version as PV
# these were introduced in skimage-0.14
if PV(skimage.__version__) >= PV('0.14'):
rescale_params = {'anti_aliasing': False,
'multichannel': False}
else:
rescale_params = {}
# Load coins as NumPy array
# Resize it to 20% of the original size to speed up the processing
# Apply Gaussian filter for smoothing prior to down-scaling
# reduces aliasing artifacts.
# Convert image to graph with gradient value on the edges.
orig_coins = coins()
smoothened_coins = GF(orig_coins, sigma=2)
rescaled_coins = rescale(smoothened_coins,
0.2,
mode="reflect",
**rescale_params)
graph = image.img_to_graph(rescaled_coins)
# Take a decreasing function of the gradient: an exponential
# The smaller beta is, the more independent the segmentation is from the actual image.
# For beta=1, the segmentation is close to a Voronoi partition.
beta,eps = 10, 1e-6
graph.data = np.exp(-beta*graph.data/graph.data.std()) + eps
# Apply spectral clustering (much faster if pyamg installed)
N_REGIONS = 25
for strategy in ('kmeans', 'discretize'):
t0 = time.time()
labels = SC(graph,
n_clusters=N_REGIONS,
assign_labels=strategy, random_state=42)
t1 = time.time()
labels = labels.reshape(rescaled_coins.shape)
plt.figure(figsize=(5, 5))
plt.imshow(rescaled_coins, cmap=plt.cm.gray)
for l in range(N_REGIONS):
plt.contour(labels == l,
colors=[plt.cm.nipy_spectral(l / float(N_REGIONS))])
plt.xticks(()); plt.yticks(())
title = 'Spectral clustering: %s, %.2fs' % (strategy, (t1 - t0))
print(title)
plt.title(title)
plt.show()
Spectral clustering: kmeans, 6.00s
Spectral clustering: discretize, 3.79s
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler as SS
from itertools import cycle, islice
np.random.seed(0)
#from sklearn.cluster import AgglomerativeClustering as AC
n = 1500
noisy_circles = datasets.make_circles(n_samples=n, factor=.5, noise=.05)
noisy_moons = datasets.make_moons( n_samples=n, noise=.05)
blobs = datasets.make_blobs( n_samples=n, random_state=8)
no_structure = np.random.rand(n,2), None
# anisotropicly distributed data
X, y = datasets.make_blobs(n_samples=n, random_state=170)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n,
cluster_std=[1.0, 2.5, 0.5],
random_state=170)
plt.figure(figsize=(9 * 1.3 + 2, 14.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
wspace=.05, hspace=.01)
plot_num = 1
default_base = {'n_neighbors': 10,
'n_clusters': 3}
datasets = [
(noisy_circles, {'n_clusters': 2}),
(noisy_moons, {'n_clusters': 2}),
(varied, {'n_neighbors': 2}),
(aniso, {'n_neighbors': 2}),
(blobs, {}),
(no_structure, {})]
for i_dataset, (dataset, algo_params) in enumerate(datasets):
params = default_base.copy() # update params w/ dataset specifics
params.update(algo_params)
X, y = dataset
X = SS().fit_transform(X) #normalize dataset
ward = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='ward')
complete = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='complete')
average = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='average')
single = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='single')
clustering_algorithms = (
('Single Linkage', single),
('Average Linkage', average),
('Complete Linkage', complete),
('Ward Linkage', ward),
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="#connected components of the " +
"connectivity matrix is [0-9]{1,2}" +
" > 1. Completing it to avoid stopping the tree early.",
category=UserWarning)
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, 'labels_'):
y_pred = algorithm.labels_.astype(int)
else:
y_pred = algorithm.predict(X)
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
'#f781bf', '#a65628', '#984ea3',
'#999999', '#e41a1c', '#dede00']),
int(max(y_pred) + 1))))
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
plt.xlim(-2.5, 2.5); plt.ylim(-2.5, 2.5)
plt.xticks(()); plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering as AC
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_,
model.distances_,
counts]).astype(float)
dendrogram(linkage_matrix, **kwargs)
iris = load_iris(); X = iris.data
# distance_threshold=0 ensures we compute the full tree.
model = AC(distance_threshold=0,
n_clusters=None).fit(X)
plt.title('Hierarchical Clustering Dendrogram')
plot_dendrogram(model, truncate_mode='level', p=3)
plt.xlabel("#points in node (or index of point if no parenthesis).")
plt.show()
import time as time
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.datasets import make_swiss_roll as SR
# make a "thin" swiss roll
n, noise = 1500, 0.05
X, _ = SR(n, noise=noise); X[:, 1] *= .5
t0 = time.time()
ward = AC(n_clusters=6, linkage='ward').fit(X)
t1 = time.time() - t0
label = ward.labels_
print("Elapsed time: %.2fs" % t1)
print("#points: %i" % label.size)
fig = plt.figure()
ax = p3.Axes3D(fig)
ax.view_init(7, -80)
for l in np.unique(label):
ax.scatter(X[label == l, 0],
X[label == l, 1],
X[label == l, 2],
color=plt.cm.jet(float(l) / np.max(label + 1)), s=20, edgecolor='k')
plt.title('Without connectivity constraints (time %.2fs)' % t1)
Elapsed time: 0.04s
#points: 1500
Text(0.5, 0.92, 'Without connectivity constraints (time 0.04s)')
# define data structure - 10 NNs
from sklearn.neighbors import kneighbors_graph as KNG
connectivity = KNG(X, n_neighbors=10, include_self=False)
t0 = time.time()
ward = AC(n_clusters=6, connectivity=connectivity, linkage='ward').fit(X)
t1 = time.time() - t0
label = ward.labels_
print("Elapsed time: %.2fs" % t1)
print("#points: %i" % label.size)
fig = plt.figure()
ax = p3.Axes3D(fig)
ax.view_init(7, -80)
for l in np.unique(label):
ax.scatter(X[label == l, 0],
X[label == l, 1],
X[label == l, 2],
color=plt.cm.jet(float(l) / np.max(label + 1)), s=20, edgecolor='k')
plt.title('WITH connectivity constraints (time %.2fs)' % t1)
plt.show()
Elapsed time: 0.06s
#points: 1500
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.neighbors import kneighbors_graph as KNG
np.random.seed(0)
n=1500; t=1.5*np.pi*(1+3*np.random.rand(1,n))
x,y = t*np.cos(t), t*np.sin(t)
X = np.concatenate((x,y));
X += 0.7*np.random.randn(2,n); X=X.T
# create local connectivity graph
# larger #neighbors = more homogeneous clusters, but more computation time
knn_graph = KNG(X,30,include_self=False)
for connectivity in (None, knn_graph):
for n_clusters in (30, 3):
plt.figure(figsize=(10, 4))
for index, linkage in enumerate(('average',
'complete',
'ward',
'single')):
plt.subplot(1, 4, index+1)
model = AC(linkage=linkage,
connectivity=connectivity,
n_clusters=n_clusters)
t0 = time.time(); model.fit(X); t1 = time.time()-t0
plt.scatter(X[:, 0],
X[:, 1], c=model.labels_, cmap=plt.cm.nipy_spectral)
plt.title('linkage=%s\n(time %.2fs)' % (linkage, t1),
fontdict=dict(verticalalignment='top'))
plt.axis('equal')
plt.axis('off')
plt.subplots_adjust(bottom=0, top=.83, wspace=0, left=0, right=1)
plt.suptitle('n_cluster=%i, connectivity=%r' %
(n_clusters, connectivity is not None), size=17)
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.metrics import pairwise_distances as PD
np.random.seed(0)
n=2000; t = np.pi*np.linspace(0,1,n)
X,y=[],[]
def sqr(x):
return np.sign(np.cos(x))
for i, (phi,a) in enumerate([(0.5,0.15), (0.5,0.6), (0.3,0.2)]):
for _ in range(30):
phase_noise = 0.01*np.random.normal()
amplitude_noise = 0.04*np.random.normal()
more_noise = 1-2*np.random.rand(n)
# make the noise sparse
more_noise[np.abs(more_noise)<0.997]=0
X.append(12*((a+amplitude_noise)*
(sqr(6*(t+phi+phase_noise)))
+more_noise))
y.append(i)
X,y = np.array(X), np.array(y)
n_clusters=3; labels=("waveform 1","waveform 2","waveform 3")
plt.figure()
plt.axes([0, 0, 1, 1])
for l, c, n in zip(range(n_clusters), 'rgb',
labels):
lines = plt.plot(X[y == l].T, c=c, alpha=.5)
lines[0].set_label(n)
plt.legend(loc='best')
plt.axis('tight'); plt.axis('off'); plt.suptitle("Ground truth", size=20)
# Plot the distances
for index, metric in enumerate(["cosine", "euclidean", "cityblock"]):
avg_dist = np.zeros((n_clusters, n_clusters))
plt.figure(figsize=(5, 4.5))
for i in range(n_clusters):
for j in range(n_clusters):
avg_dist[i, j] = PD(X[y == i],
X[y == j], metric=metric).mean()
avg_dist /= avg_dist.max()
for i in range(n_clusters):
for j in range(n_clusters):
plt.text(i, j, '%5.3f' % avg_dist[i, j],
verticalalignment='center',
horizontalalignment='center')
plt.imshow(avg_dist, interpolation='nearest', cmap=plt.cm.gnuplot2,
vmin=0)
plt.xticks(range(n_clusters), labels, rotation=45)
plt.yticks(range(n_clusters), labels)
plt.colorbar()
plt.suptitle("Interclass %s distances" % metric, size=18)
plt.tight_layout()
# Plot clustering results
for index, metric in enumerate(["cosine", "euclidean", "cityblock"]):
model = AC(n_clusters=n_clusters,
linkage="average",
affinity=metric)
model.fit(X)
plt.figure()
plt.axes([0, 0, 1, 1])
for l, c in zip(np.arange(model.n_clusters), 'rgbk'):
plt.plot(X[model.labels_ == l].T, c=c, alpha=.5)
plt.axis('tight')
plt.axis('off')
plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20)
plt.show()
Clusters found by DBSCAN can be of any shape (unlike KMeans, which assumes convex clusters). min_samples and eps define density for this algorithm: min_samples controls the algorithm's tolerance for noise, while eps controls the local neighborhood and usually cannot be left at its default value. Core samples are samples that have at least min_samples neighbors within a distance of eps. A cluster is a set of core samples built by recursively taking a core sample, finding all of its neighbors that are also core samples, and so on.
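A minimal sketch of those definitions (toy blobs; eps and min_samples are illustrative): a fitted DBSCAN exposes core_sample_indices_, so points split into core, border (clustered but not core), and noise (label -1).
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
X_demo, _ = make_blobs(n_samples=300, cluster_std=0.6, random_state=0)
db_demo = DBSCAN(eps=0.5, min_samples=10).fit(X_demo)
core = np.zeros(len(X_demo), dtype=bool)
core[db_demo.core_sample_indices_] = True
noise = db_demo.labels_ == -1
border = ~core & ~noise
print("core: %d, border: %d, noise: %d" % (core.sum(), border.sum(), noise.sum()))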
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler as SS
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750,
centers=centers,
cluster_std=0.4,
random_state=0)
X = SS().fit_transform(X)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Est. #clusters: %d' % n_clusters_)
print('Est. #noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adj Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adj Mutual Info: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coef: %0.3f" % metrics.silhouette_score(X, labels))
Est. #clusters: 3
Est. #noise points: 18
Homogeneity: 0.953
Completeness: 0.883
V-measure: 0.917
Adj Rand Index: 0.952
Adj Mutual Info: 0.916
Silhouette Coef: 0.626
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
if k == -1:
col = [0, 0, 0, 1]
class_member_mask = (labels == k)
xy = X[class_member_mask &
core_samples_mask]
plt.plot(xy[:, 0],
xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=14)
xy = X[class_member_mask &
~core_samples_mask]
plt.plot(xy[:, 0],
xy[:, 1], 'o', markerfacecolor=tuple(col),
markeredgecolor='k', markersize=6)
plt.title('Est. #clusters: %d' % n_clusters_)
plt.show()
OPTICS generalizes DBSCAN over a range of eps values. Each sample is assigned a reachability_ distance and an ordering_ value (its spot within the cluster ordering). With the default max_eps=np.inf, clusters can then be extracted repeatedly, in linear time, for any eps via the cluster_optics_dbscan method; smaller max_eps values result in shorter run times. reachability_ enables variable-density extractions; combining it with ordering_ can be used to build a reachability plot (Y-axis = reachability distance, X-axis = point ordering).
from sklearn.cluster import OPTICS, cluster_optics_dbscan as CODB
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
n_points_per_cluster = 250
C1 = [-5, -2] + 0.8 * np.random.randn(n_points_per_cluster, 2)
C2 = [ 4, -1] + 0.1 * np.random.randn(n_points_per_cluster, 2)
C3 = [ 1, -2] + 0.2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + 0.3 * np.random.randn(n_points_per_cluster, 2)
C5 = [ 3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [ 5, 6] + 2.0 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))
clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05).fit(X)
labels_050 = CODB(reachability=clust.reachability_,
core_distances=clust.core_distances_,
ordering=clust.ordering_,
eps=0.5)
labels_200 = CODB(reachability=clust.reachability_,
core_distances=clust.core_distances_,
ordering=clust.ordering_,
eps=2.0)
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])
# Reachability plot
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
Xk = space[labels == klass]
Rk = reachability[labels == klass]
ax1.plot(Xk, Rk, color, alpha=0.3)
ax1.plot(space[labels == -1], reachability[labels == -1],
'k.', alpha=0.3)
ax1.plot(space, np.full_like(space, 2., dtype=float),
'k-', alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float),
'k-.', alpha=0.5)
ax1.set_ylabel('Reachability (epsilon distance)')
ax1.set_title('Reachability Plot')
# OPTICS
colors = ['g.', 'r.', 'b.', 'y.', 'c.']
for klass, color in zip(range(0, 5), colors):
Xk = X[clust.labels_ == klass]
ax2.plot(Xk[:, 0],
Xk[:, 1], color, alpha=0.3)
ax2.plot(X[clust.labels_ == -1, 0],
X[clust.labels_ == -1, 1], 'k+', alpha=0.1)
ax2.set_title('Automatic Clustering\nOPTICS')
# DBSCAN at 0.5
colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c']
for klass, color in zip(range(0, 6), colors):
Xk = X[labels_050 == klass]
ax3.plot(Xk[:, 0],
Xk[:, 1], color, alpha=0.3, marker='.')
ax3.plot(X[labels_050 == -1, 0],
X[labels_050 == -1, 1], 'k+', alpha=0.1)
ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')
# DBSCAN at 2.
colors = ['g.', 'm.', 'y.', 'c.']
for klass, color in zip(range(0, 4), colors):
Xk = X[labels_200 == klass]
ax4.plot(Xk[:, 0],
Xk[:, 1], color, alpha=0.3)
ax4.plot(X[labels_200 == -1, 0],
X[labels_200 == -1, 1], 'k+', alpha=0.1)
ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')
plt.tight_layout()
plt.show()
threshold limits the distance between an entering sample and the existing subclusters. branching_factor limits the number of subclusters in a node. If n_features > 20, consider using MiniBatchKMeans instead. Here, n_clusters=None reduces the dataset from 100K samples to 158 subclusters; this acts as a preprocessing step before a global clustering step that further reduces them to 100 clusters. (A short branching_factor sketch follows this example.)
from itertools import cycle
from time import time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.cluster import Birch, MiniBatchKMeans as MBKM
from sklearn.datasets import make_blobs
xx = np.linspace(-22, 22, 10)
yy = np.linspace(-22, 22, 10)
xx, yy = np.meshgrid(xx, yy)
n_centres = np.hstack((np.ravel(xx)[:, np.newaxis],
np.ravel(yy)[:, np.newaxis]))
X, y = make_blobs(n_samples=100000,
centers=n_centres,
random_state=0)
colors_ = cycle(colors.cnames.keys())
fig = plt.figure(figsize=(12, 4))
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)
birch_models = [Birch(threshold=1.7, n_clusters=None),
Birch(threshold=1.7, n_clusters=100)]
final_step = ['without global clustering', 'with global clustering']
for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)):
t0 = time(); birch_model.fit(X); t1 = time() - t0
print("Birch %s as the final step took %0.2f seconds" % (info, t1))
# Plot result
labels = birch_model.labels_
centroids = birch_model.subcluster_centers_
n_clusters = np.unique(labels).size
print("n_clusters : %d" % n_clusters)
ax = fig.add_subplot(1, 3, ind + 1)
for this_centroid, k, col in zip(centroids, range(n_clusters), colors_):
mask = labels == k
ax.scatter(X[mask, 0], X[mask, 1],
c='w', edgecolor=col, marker='.', alpha=0.5)
if birch_model.n_clusters is None:
ax.scatter(this_centroid[0], this_centroid[1], marker='+',
c='k', s=25)
ax.set_ylim([-25, 25])
ax.set_xlim([-25, 25])
ax.set_autoscaley_on(False)
ax.set_title('Birch %s' % info)
mbk = MBKM(init='k-means++',
n_clusters=100,
batch_size=100,
n_init=10,
max_no_improvement=10,
verbose=0,
random_state=0)
t0 = time(); mbk.fit(X); t1 = time() - t0
print("MiniBatchKMeans runtime: %0.2f seconds" % t1)
mbk_means_labels_unique = np.unique(mbk.labels_)
ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_,
range(n_clusters), colors_):
mask = mbk.labels_ == k
ax.scatter(X[mask, 0], X[mask, 1], marker='.',
c='w', edgecolor=col, alpha=0.5)
ax.scatter(this_centroid[0], this_centroid[1], marker='+',
c='k', s=25)
ax.set_xlim([-25, 25])
ax.set_ylim([-25, 25])
ax.set_title("MiniBatchKMeans")
ax.set_autoscaley_on(False)
plt.show()
Birch without global clustering as the final step took 2.44 seconds
n_clusters : 158
Birch with global clustering as the final step took 2.36 seconds
n_clusters : 100
MiniBatchKMeans runtime: 2.50 seconds
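branching_factor is not varied above; a short, hedged sketch of its effect (if any) on the number of subclusters found without global clustering (blob data and values are illustrative):
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs
X_bf, _ = make_blobs(n_samples=10000, centers=10, random_state=0)
for bf in (20, 50, 100):
    model = Birch(threshold=1.7, branching_factor=bf, n_clusters=None).fit(X_bf)
    print("branching_factor=%d -> %d subclusters" % (bf, len(model.subcluster_centers_)))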