labels_ — the cluster assignments for the training data are stored in this attribute after fitting.
(#samples, #features) feature matrices are accepted as inputs — they can be obtained from feature_extraction classes.
(#samples, #samples) similarity/distance matrices are also accepted as inputs by some algorithms — they can be obtained from pairwise functions.
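A minimal sketch of both input conventions (the toy data, eps, and min_samples values below are illustrative assumptions):

import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import pairwise_distances

X_toy = np.array([[0, 0], [0, 1], [10, 10], [10, 11]], dtype=float)

# (#samples, #features) feature matrix; labels_ is populated after fitting.
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_toy)
print(km.labels_)

# (#samples, #samples) precomputed distance matrix from a pairwise function.
D = pairwise_distances(X_toy)
db = DBSCAN(eps=2.0, min_samples=2, metric="precomputed").fit(D)
print(db.labels_)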
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
# Incorrect number of clusters
n, rndm = 1500, 170
X, y = make_blobs(n_samples=n, random_state=rndm)
y_pred1 = KMeans(n_clusters=2, random_state=rndm).fit_predict(X)
# Anisotropically distributed data
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X_aniso = np.dot(X, transformation)
y_pred2 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_aniso)
# Different variance
X_varied, y_varied = make_blobs(n_samples=n,
                                cluster_std=[1.0, 2.5, 0.5],
                                random_state=rndm)
y_pred3 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_varied)
# Unevenly sized blobs
X_filtered = np.vstack((X[y == 0][:500],
                        X[y == 1][:100],
                        X[y == 2][:10]))
y_pred4 = KMeans(n_clusters=3, random_state=rndm).fit_predict(X_filtered)
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=y_pred1)
plt.title("Incorrect Number of Blobs")
plt.subplot(222)
plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred2)
plt.title("Anisotropicly Distributed Blobs")
plt.subplot(223)
plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred3)
plt.title("Unequal Variance")
plt.subplot(224)
plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred4)
plt.title("Unevenly Sized Blobs")
plt.show()
init="k-means++"
initializes the centroids to more distant from each other, leading to better results than by random initialization.sample_weight
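A minimal sketch of that equivalence (the toy 1-D data below is an illustrative assumption):

import numpy as np
from sklearn.cluster import KMeans

X_toy = np.array([[0.0], [0.2], [10.0]])

# Weighting the last sample by 2 behaves like appending a duplicate of it.
km_weighted = KMeans(n_clusters=2, n_init=10, random_state=0).fit(
    X_toy, sample_weight=[1, 1, 2])
km_duplicated = KMeans(n_clusters=2, n_init=10, random_state=0).fit(
    np.vstack([X_toy, [[10.0]]]))
print(np.sort(km_weighted.cluster_centers_.ravel()))
print(np.sort(km_duplicated.cluster_centers_.ravel()))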
import numpy as np
from sklearn.datasets import load_digits
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
data, labels = load_digits(return_X_y=True)
(samples, features), digits = data.shape, np.unique(labels).size
# PCA reduces #dims from 64 to 2
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=digits, n_init=4)
kmeans.fit(reduced_data)
# Mesh step size
h = .02
# Plot the decision boundary by assigning a color to each point in a mesh
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape); plt.figure(1); plt.clf()
plt.imshow(Z, interpolation="nearest",
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect="auto", origin="lower")
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker="x", s=169, linewidths=3, color="w", zorder=10)
plt.title("K-means on digits (PCA-reduced)\nCentroids = white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(()); plt.yticks(()); plt.show()
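The metrics, make_pipeline, and StandardScaler imports above are used in the fuller scikit-learn digits demo to benchmark k-means against the known digit labels; a minimal, self-contained sketch of that kind of evaluation (parameter choices are illustrative, not the demo's exact benchmark):

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

data, labels = load_digits(return_X_y=True)
n_digits = len(set(labels))

# Scale the features, fit one k-means run, then score it against the labels.
pipe = make_pipeline(StandardScaler(),
                     KMeans(init="k-means++", n_clusters=n_digits,
                            n_init=4, random_state=0)).fit(data)
km = pipe[-1]
print("inertia:     %.0f" % km.inertia_)
print("homogeneity: %.3f" % metrics.homogeneity_score(labels, km.labels_))
print("V-measure:   %.3f" % metrics.v_measure_score(labels, km.labels_))
print("silhouette:  %.3f" % metrics.silhouette_score(data, km.labels_))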
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans as KMMB, KMeans as KM
from sklearn.metrics.pairwise import pairwise_distances_argmin as PDA
from sklearn.datasets import make_blobs
np.random.seed(0)
batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000,
                            centers=centers,
                            cluster_std=0.7)
k_means = KM(init='k-means++', n_clusters=3, n_init=10)
mbk = KMMB(init='k-means++', n_clusters=3, batch_size=batch_size,
           n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time(); k_means.fit(X); t_batch = time.time() - t0
t0 = time.time(); mbk.fit(X); t_mini_batch = time.time() - t0
fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
k_means_cluster_centers = k_means.cluster_centers_
order = PDA(k_means.cluster_centers_, mbk.cluster_centers_)
mbk_means_cluster_centers = mbk.cluster_centers_[order]
k_means_labels = PDA(X, k_means_cluster_centers)
mbk_means_labels = PDA(X, mbk_means_cluster_centers)
# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o',
            markerfacecolor=col, markeredgecolor='k', markersize=6)
ax.set_title('KMeans'); ax.set_xticks(()); ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % (
    t_batch, k_means.inertia_))
# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
for k, col in zip(range(n_clusters), colors):
    my_members = mbk_means_labels == k
    cluster_center = mbk_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.')
    ax.plot(cluster_center[0], cluster_center[1], 'o',
            markerfacecolor=col, markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans'); ax.set_xticks(()); ax.set_yticks(())
plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' %
         (t_mini_batch, mbk.inertia_))
# Initialise the different array to all False
different = (mbk_means_labels == 4)
ax = fig.add_subplot(1, 3, 3)
for k in range(n_clusters):
    different += ((k_means_labels == k) != (mbk_means_labels == k))
identic = np.logical_not(different)
ax.plot(X[identic, 0], X[identic, 1], 'w',
        markerfacecolor='#bbbbbb', marker='.')
ax.plot(X[different, 0], X[different, 1], 'w',
        markerfacecolor='m', marker='.')
ax.set_title('Difference'); ax.set_xticks(()); ax.set_yticks(())
plt.show()
The key AffinityPropagation parameters are preference (controls how many exemplars, and therefore clusters, are found) and the damping factor (damps the responsibility and availability messages to help avoid oscillation).
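A minimal sketch of setting both parameters (the toy data and the preference/damping values are illustrative assumptions):

import numpy as np
from sklearn.cluster import AffinityPropagation

rng = np.random.RandomState(0)
X_demo = rng.rand(60, 2)

# More negative preference -> fewer exemplars/clusters;
# damping in [0.5, 1.0) smooths message updates to avoid oscillation.
ap = AffinityPropagation(preference=-5, damping=0.9, random_state=0).fit(X_demo)
print(len(ap.cluster_centers_indices_), "clusters")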
from sklearn.cluster import AffinityPropagation as AP
from sklearn import metrics
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from itertools import cycle
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300,
                            centers=centers,
                            cluster_std=0.5,
                            random_state=None)
af = AP(preference=-50).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
# metrics
print('Estimated #clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" %
metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" %
metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" %
metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels, metric='sqeuclidean'))
Estimated #clusters: 3
Homogeneity: 0.896
Completeness: 0.897
V-measure: 0.897
Adjusted Rand Index: 0.922
Adjusted Mutual Information: 0.896
Silhouette Coefficient: 0.766
plt.close('all')
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markerfacecolor=col, markeredgecolor='k', markersize=14)
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]],
                 [cluster_center[1], x[1]], col)
plt.title('Estimated #clusters: %d' % n_clusters_)
plt.show()
import sys
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import pandas as pd
from sklearn import cluster, covariance, manifold
symbol_dict = {
'TOT': 'Total', 'XOM': 'Exxon', 'CVX': 'Chevron', 'COP': 'ConocoPhillips',
'VLO': 'Valero Energy', 'MSFT': 'Microsoft', 'IBM': 'IBM',
'TWX': 'Time Warner', 'CMCSA': 'Comcast', 'CVC': 'Cablevision',
'YHOO': 'Yahoo', 'DELL': 'Dell', 'HPQ': 'HP', 'AMZN': 'Amazon',
'TM': 'Toyota', 'CAJ': 'Canon', 'SNE': 'Sony', 'F': 'Ford',
'HMC': 'Honda', 'NAV': 'Navistar', 'NOC': 'Northrop Grumman', 'BA': 'Boeing',
'KO': 'Coca Cola', 'MMM': '3M', 'MCD': 'McDonald\'s', 'PEP': 'Pepsi',
'K': 'Kellogg', 'UN': 'Unilever', 'MAR': 'Marriott', 'PG': 'Procter Gamble',
'CL': 'Colgate-Palmolive', 'GE': 'General Electrics', 'WFC': 'Wells Fargo', 'JPM': 'JPMorgan Chase',
'AIG': 'AIG', 'AXP': 'American express', 'BAC': 'Bank of America', 'GS': 'Goldman Sachs',
'AAPL': 'Apple', 'SAP': 'SAP', 'CSCO': 'Cisco', 'TXN': 'Texas Instruments',
'XRX': 'Xerox', 'WMT': 'Wal-Mart', 'HD': 'Home Depot', 'GSK': 'GlaxoSmithKline',
'PFE': 'Pfizer', 'SNY': 'Sanofi-Aventis', 'NVS': 'Novartis', 'KMB': 'Kimberly-Clark',
'R': 'Ryder', 'GD': 'General Dynamics', 'RTN': 'Raytheon', 'CVS': 'CVS',
'CAT': 'Caterpillar', 'DD': 'DuPont de Nemours'}
symbols, names = np.array(sorted(symbol_dict.items())).T
quotes = []
for symbol in symbols:
    print('Fetching quote history for %r' % symbol, file=sys.stderr)
    url = ('https://raw.githubusercontent.com/scikit-learn/examples-data/'
           'master/financial-data/{}.csv')
    quotes.append(pd.read_csv(url.format(symbol)))
close_prices = np.vstack([q['close'] for q in quotes])
open_prices = np.vstack([q['open'] for q in quotes])
variation = close_prices - open_prices
edge_model = covariance.GraphicalLassoCV()
# standardize time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)
Fetching quote history for 'AAPL'
Fetching quote history for 'AIG'
... (one line per symbol) ...
Fetching quote history for 'YHOO'
GraphicalLassoCV()
_, labels = cluster.affinity_propagation(edge_model.covariance_, random_state=0)
n_labels = labels.max()
for i in range(n_labels + 1):
    print('Cluster %i: %s' % ((i + 1), ', '.join(names[labels == i])))
# #############################################################################
# Find low-D embedding for visualization
# We use a dense eigen_solver to achieve reproducibility (arpack is
# initiated with random vectors that we don't control). In addition, we
# use a large number of neighbors to capture the large-scale structure.
node_position_model = manifold.LocallyLinearEmbedding(
n_components=2, eigen_solver='dense', n_neighbors=6)
embedding = node_position_model.fit_transform(X.T).T
Cluster 1: Apple, Amazon, Yahoo
Cluster 2: Comcast, Cablevision, Time Warner
Cluster 3: ConocoPhillips, Chevron, Total, Valero Energy, Exxon
Cluster 4: Cisco, Dell, HP, IBM, Microsoft, SAP, Texas Instruments
Cluster 5: Boeing, General Dynamics, Northrop Grumman, Raytheon
Cluster 6: AIG, American express, Bank of America, Caterpillar, CVS, DuPont de Nemours, Ford, General Electrics, Goldman Sachs, Home Depot, JPMorgan Chase, Marriott, 3M, Ryder, Wells Fargo, Wal-Mart
Cluster 7: McDonald's
Cluster 8: GlaxoSmithKline, Novartis, Pfizer, Sanofi-Aventis, Unilever
Cluster 9: Kellogg, Coca Cola, Pepsi
Cluster 10: Colgate-Palmolive, Kimberly-Clark, Procter Gamble
Cluster 11: Canon, Honda, Navistar, Sony, Toyota, Xerox
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')
# Display a graph of the partial correlations
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the nodes using the coordinates of our embedding
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
cmap=plt.cm.nipy_spectral)
# Plot the edges
start_idx, end_idx = np.where(non_zero)
# a sequence of (*line0*, *line1*, *line2*), where::
# linen = (x0, y0), (x1, y1), ... (xm, ym)
segments = [[embedding[:, start], embedding[:, stop]]
            for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(segments,
zorder=0, cmap=plt.cm.hot_r,
norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)
# Add a label to each node. The challenge here is that we want to
# position the labels to avoid overlap with other labels
for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    plt.text(x, y, name, size=10,
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                       alpha=.6))
plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
embedding[0].max() + .10 * embedding[0].ptp(),)
plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
embedding[1].max() + .03 * embedding[1].ptp())
plt.show()