conda install python-graphviz
from sklearn import tree
X,y = [[0,0],[1,1],[2,2]], [0,1,2]
# predict class of samples
clf = tree.DecisionTreeClassifier(); clf.fit(X,y)
print(clf.predict([[0.6,2.0]]))
# predict probability of prediction
print(clf.predict_proba([[0.6,2.0]]))
# plot the tree
tree.plot_tree(clf, rounded=True, filled=True)
[1]
[[0. 1. 0.]]
[Text(133.92000000000002, 181.2, 'X[0] <= 0.5\ngini = 0.667\nsamples = 3\nvalue = [1, 1, 1]'),
 Text(66.96000000000001, 108.72, 'gini = 0.0\nsamples = 1\nvalue = [1, 0, 0]'),
 Text(200.88000000000002, 108.72, 'X[0] <= 1.5\ngini = 0.5\nsamples = 2\nvalue = [0, 1, 1]'),
 Text(133.92000000000002, 36.23999999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]'),
 Text(267.84000000000003, 36.23999999999998, 'gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]')]
import graphviz
data = tree.export_graphviz(clf,
                            out_file=None,
                            filled=True,
                            rounded=True,
                            special_characters=True)
file = graphviz.Source(data)
file
file.render("example-tree")
# Exporting to text format
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import export_text
iris = load_iris()
dt = DTC(random_state=0, max_depth=3)
dt = dt.fit(iris.data, iris.target)
r = export_text(dt, feature_names=iris['feature_names'])
print(r)
|--- petal width (cm) <= 0.80
|   |--- class: 0
|--- petal width (cm) > 0.80
|   |--- petal width (cm) <= 1.75
|   |   |--- petal length (cm) <= 4.95
|   |   |   |--- class: 1
|   |   |--- petal length (cm) > 4.95
|   |   |   |--- class: 2
|   |--- petal width (cm) > 1.75
|   |   |--- petal length (cm) <= 4.85
|   |   |   |--- class: 2
|   |   |--- petal length (cm) > 4.85
|   |   |   |--- class: 2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier as DTC, plot_tree
# Params
n_classes, plot_colors, plot_step = 3, "ryb", 0.02
iris = load_iris()
for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    X = iris.data[:, pair]
    y = iris.target
    clf = DTC().fit(X, y)
    plt.subplot(2, 3, pairidx + 1)  # decision boundary for this feature pair
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)
    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0],
                    X[idx, 1],
                    c=color, label=iris.target_names[i],
                    cmap=plt.cm.RdYlBu, edgecolor='black', s=15)
plt.suptitle("Decision surface: paired features")
plt.legend(loc='lower right')
plt.axis("tight")
plt.figure()
clf = DTC().fit(iris.data, iris.target)
plot_tree(clf, filled=True)
plt.show()
import numpy as np
from sklearn.tree import DecisionTreeRegressor as DTR
import matplotlib.pyplot as plt
# Create a noisy sinusoidal dataset
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
print("shapes:\t X:\t",X.shape,"\ty:\t",y.shape)
y[::5] += 3 * (0.5 - rng.rand(16))
# Fit regression model
regr_1 = DTR(max_depth=2); regr_1.fit(X, y)
regr_2 = DTR(max_depth=5); regr_2.fit(X, y)
# Predict
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)
# Plot the results
plt.figure()
plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data")
plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.xlabel("data"); plt.ylabel("target"); plt.title("Decision Tree Regression")
plt.legend()
plt.show()
shapes: X: (80, 1) y: (80,)
For multi-output problems:
- predict returns a list of n output values per sample.
- predict_proba returns a list of n arrays of class probabilities.
As before, the tree will overfit if max_depth is set too high.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor as DTR
# Noisy circle dataset
rng = np.random.RandomState(1)
X = np.sort(200 * rng.rand(100, 1) - 100, axis=0)
y = np.array([np.pi * np.sin(X).ravel(),
              np.pi * np.cos(X).ravel()]).T
y[::5, :] += (0.5 - rng.rand(20, 2))
# Fit regression model
rgr_1, rgr_2, rgr_3 = DTR(max_depth=2), DTR(max_depth=5), DTR(max_depth=8)
rgr_1.fit(X, y); rgr_2.fit(X, y); rgr_3.fit(X, y)
# Predict
X_test = np.arange(-100.0, 100.0, 0.01)[:, np.newaxis]
y_1 = rgr_1.predict(X_test)
y_2 = rgr_2.predict(X_test)
y_3 = rgr_3.predict(X_test)
plt.figure()
s = 25
plt.scatter(y[:, 0], y[:, 1], c="navy", s=s, label="data")
plt.scatter(y_1[:, 0], y_1[:, 1], c="cornflowerblue", s=s, label="max_depth=2")
plt.scatter(y_2[:, 0], y_2[:, 1], c="red", s=s, label="max_depth=5")
plt.scatter(y_3[:, 0], y_3[:, 1], c="orange", s=s, label="max_depth=8")
plt.xlim([-6, 6]); plt.ylim([-6, 6])
plt.xlabel("target 1"); plt.ylabel("target 2")
plt.title("Multi-output DTR"); plt.legend(loc="best"); plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.utils.validation import check_random_state
from sklearn.ensemble import ExtraTreesRegressor as ETR
# Load the faces datasets
data, targets = fetch_olivetti_faces(return_X_y=True)
train = data[targets < 30]
test = data[targets >= 30] # Test on independent people
# Test on a subset of people
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]
n_pixels = data.shape[1]
X_train = train[:, :(n_pixels + 1) // 2]  # Upper half of the faces
y_train = train[:, n_pixels // 2:]        # Lower half of the faces
X_test = test[:, :(n_pixels + 1) // 2]
y_test = test[:, n_pixels // 2:]
estimator = ETR(n_estimators=10, max_features=32, random_state=0)
estimator.fit(X_train,y_train)
y_test_prediction = estimator.predict(X_test)
image_dims=(64,64)
plt.figure(figsize=(20,20))
for i in range(n_faces):
    # True face: upper half + true lower half (left column)
    sub = plt.subplot(n_faces, 2, i * 2 + 1)
    true_face = np.hstack((X_test[i], y_test[i]))
    sub.imshow(true_face.reshape(image_dims),
               cmap=plt.cm.gray,
               interpolation="nearest")
    # Completed face: upper half + predicted lower half (right column)
    sub = plt.subplot(n_faces, 2, i * 2 + 2)
    completed_face = np.hstack((X_test[i], y_test_prediction[i]))
    sub.imshow(completed_face.reshape(image_dims),
               cmap=plt.cm.gray,
               interpolation="nearest")
ID3 creates a multiway tree, finding at each node the categorical feature that yields the largest information gain for categorical targets. Trees are grown to their maximum size and then pruned to improve the tree's ability to generalize to unseen data.
C4.5 succeeded ID3 and removed the restriction that features be categorical. It converts trained trees into sets of if-then rules, which are evaluated to determine the order in which they should be applied. Pruning removes a rule's precondition whenever accuracy improves without it.
C5.0 succeeds C4.5. It is available under a proprietary license.
CART is similar to C4.5 but also supports numerical (regression) targets. It builds binary trees using, at each node, the feature and threshold that yield the largest information gain. scikit-learn uses an optimized version of CART.
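As a rough illustration of that split search (a toy NumPy sketch, not scikit-learn's implementation), scanning candidate thresholds of a single feature and scoring them with the weighted Gini impurity of the two children:
import numpy as np

def gini(labels):
    # Gini impurity of a set of class labels
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def best_threshold(x, y):
    # Try midpoints between consecutive sorted values; keep the one with the
    # lowest weighted impurity of the two children.
    order = np.argsort(x)
    x, y = x[order], y[order]
    best_t, best_score = None, np.inf
    for t in (x[:-1] + x[1:]) / 2:
        left, right = y[x <= t], y[x > t]
        score = (len(left) * gini(left) + len(right) * gini(right)) / len(y)
        if score < best_score:
            best_t, best_score = t, score
    return best_t, best_score

print(best_threshold(np.array([0.0, 1.0, 2.0]), np.array([0, 1, 2])))  # ~ (0.5, 0.333)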
The quality of a candidate split of a node is found with an impurity or loss function $H()$. How $H()$ is computed depends on the task being solved.
If the target is a classification outcome and node $m$ holds the data $Q_m$ with $N_m$ samples, let $p_{mk}$ be the proportion of class $k$ observations in node $m$: $p_{mk} = \frac{1}{N_m} \sum_{y \in Q_m} I(y = k)$. The most common criteria for classification-based splits are:
- Gini: $H(Q_m) = \sum_k p_{mk}(1 - p_{mk})$
- Entropy (log loss): $H(Q_m) = -\sum_k p_{mk} \log(p_{mk})$
- Misclassification: $H(Q_m) = 1 - \max_k p_{mk}$
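As a worked check against the plot_tree output above: the root node has value = [1, 1, 1], so each $p_{mk} = 1/3$ and the Gini impurity is $\sum_k \frac{1}{3}(1 - \frac{1}{3}) = 3 \cdot \frac{2}{9} \approx 0.667$, matching the reported gini = 0.667.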
If the target is a continuous (regression) value, here are the most common criteria for regression-based splits:
- Mean squared error: $H(Q_m) = \frac{1}{N_m} \sum_{y \in Q_m} (y - \bar{y}_m)^2$, with $\bar{y}_m = \frac{1}{N_m} \sum_{y \in Q_m} y$
- Half Poisson deviance: $H(Q_m) = \frac{1}{N_m} \sum_{y \in Q_m} \left(y \log\frac{y}{\bar{y}_m} - y + \bar{y}_m\right)$
- Mean absolute error: $H(Q_m) = \frac{1}{N_m} \sum_{y \in Q_m} |y - \mathrm{median}(y)_m|$
- Using `criterion="poisson"` is recommended when the target is a count or frequency. It requires $y \ge 0$ and fits much more slowly than the MSE criterion.
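A minimal sketch of the Poisson criterion on a synthetic count target (the data below is made up; assumes a scikit-learn version that supports criterion="poisson", i.e. 0.24+):
import numpy as np
from sklearn.tree import DecisionTreeRegressor
rng = np.random.RandomState(0)
X = np.sort(5 * rng.rand(200, 1), axis=0)
y = rng.poisson(lam=np.exp(X.ravel() / 2))  # non-negative counts, as required
reg_mse = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X, y)
reg_poi = DecisionTreeRegressor(criterion="poisson", max_depth=3, random_state=0).fit(X, y)
print(reg_mse.score(X, y), reg_poi.score(X, y))  # R^2 of each fit on the training data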
Cost-complexity pruning provides another way to control the size of a tree. In DecisionTreeClassifier it is parameterized by ccp_alpha: greater values of ccp_alpha increase the number of nodes pruned. Below we look at the effect of ccp_alpha on tree regularization and at choosing ccp_alpha with validation scores.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as TTS
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier as DTC
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = TTS(X, y, random_state=0)
clf = DTC(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# The maximum effective alpha is excluded from the plot: it corresponds to the trivial single-node tree.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
Text(0.5, 1.0, 'Total Impurity vs effective alpha for training set')
Next, train one tree per effective alpha. The maximum value in ccp_alphas prunes the entire tree, leaving the last classifier (clfs[-1]) with a single node.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DTC(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("#nodes in the last tree: {} with ccp_alpha: {}".format(
clfs[-1].tree_.node_count,
ccp_alphas[-1]))
#nodes in the last tree: 1 with ccp_alpha: 0.3272984419327777
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
fig, ax = plt.subplots(2, 1)
ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("#nodes")
ax[0].set_title("#nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
With ccp_alpha=0 and all other parameters set to their defaults, the tree will overfit. In this example, ccp_alpha=0.015 should provide the maximum testing accuracy.
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
fig, ax = plt.subplots()
ax.set_xlabel("alpha"); ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, label="test", drawstyle="steps-post")
ax.legend(); plt.show()
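Instead of reading ccp_alpha off the test curve by eye, it can also be chosen by cross-validation; a sketch using GridSearchCV over the ccp_alphas computed above (reuses X_train, y_train, and DTC from the previous cells):
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(DTC(random_state=0),
                    param_grid={"ccp_alpha": ccp_alphas},
                    cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)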