from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
X, y = load_iris(return_X_y=True)
clf = AdaBoostClassifier(n_estimators=100) # AdaBoost on 100 weak classifiers
scores = cross_val_score(clf, X, y, cv=5)
scores.mean()
0.9466666666666665
n_estimators controls the number of weak learners. learning_rate controls the contribution of each weak learner in the final combination. base_estimator controls the type of weak learner (a decision tree stump by default).
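A minimal sketch of setting all three parameters together (values are illustrative, not tuned; assumes the iris X, y loaded above):
from sklearn.tree import DecisionTreeClassifier as DTC
clf2 = AdaBoostClassifier(base_estimator=DTC(max_depth=2), n_estimators=200, learning_rate=0.5)
cross_val_score(clf2, X, y, cv=5).mean()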
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import zero_one_loss
from sklearn.ensemble import AdaBoostClassifier as ABC
n, lr = 400, 1.0
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X_test, y_test = X[2000:], y[2000:]
X_train, y_train = X[:2000], y[:2000]
dt_stump = DTC(max_depth=1, min_samples_leaf=1)
dt = DTC(max_depth=9, min_samples_leaf=1)
dt.fit( X_train, y_train)
dt_stump.fit(X_train, y_train)
dt_err = 1.0 - dt.score( X_test, y_test)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)
ada_discrete = ABC(base_estimator=dt_stump, learning_rate=lr, n_estimators=n, algorithm="SAMME")
ada_real = ABC(base_estimator=dt_stump, learning_rate=lr, n_estimators=n, algorithm="SAMME.R")
ada_discrete.fit(X_train, y_train)
ada_real.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=400)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot([1, n], [dt_stump_err] * 2, 'k-',
label='Decision Stump Error')
ax.plot([1, n], [dt_err] * 2, 'k--',
label='Decision Tree Error')
ada_discrete_err = np.zeros( (n,))
ada_discrete_err_train = np.zeros((n,))
ada_real_err = np.zeros( (n,))
ada_real_err_train = np.zeros( (n,))
for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
ada_discrete_err[i] = zero_one_loss(y_pred, y_test)
for i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):
ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)
for i, y_pred in enumerate(ada_real.staged_predict(X_test)):
ada_real_err[i] = zero_one_loss(y_pred, y_test)
for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
ada_real_err_train[i] = zero_one_loss(y_pred, y_train)
ax.plot(np.arange(n) + 1, ada_discrete_err,
label='Discrete AdaBoost Test Error',
color='red')
ax.plot(np.arange(n) + 1, ada_discrete_err_train,
label='Discrete AdaBoost Train Error',
color='blue')
ax.plot(np.arange(n) + 1, ada_real_err,
label='Real AdaBoost Test Error',
color='orange')
ax.plot(np.arange(n) + 1, ada_real_err_train,
label='Real AdaBoost Train Error',
color='green')
ax.set_ylim((0.0, 0.5))
ax.set_xlabel('n_estimators')
ax.set_ylabel('error rate')
leg = ax.legend(loc='upper right', fancybox=True)
leg.get_frame().set_alpha(0.7)
plt.show()
import matplotlib.pyplot as plt
from sklearn.datasets import make_gaussian_quantiles
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.tree import DecisionTreeClassifier as DTC
X, y = make_gaussian_quantiles(n_samples=13000, n_features=10, n_classes=3, random_state=1)
n_split = 3000
X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]
bdt_real = ABC(DTC(max_depth=2), n_estimators=600, learning_rate=1)
bdt_discrete = ABC(DTC(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME")
bdt_real.fit( X_train, y_train)
bdt_discrete.fit(X_train, y_train)
real_test_errors, discrete_test_errors = [],[]
for real_test_predict, discrete_test_predict in zip(
bdt_real.staged_predict(X_test),
bdt_discrete.staged_predict(X_test)):
real_test_errors.append( 1. - accuracy_score( real_test_predict, y_test))
discrete_test_errors.append(1. - accuracy_score(discrete_test_predict, y_test))
n_trees_discrete = len(bdt_discrete)
n_trees_real = len(bdt_real)
# Boosting might terminate early, but the arrays are always n_estimators long.
# Crop them to the actual number of trees here:
discrete_estimator_errors = bdt_discrete.estimator_errors_[:n_trees_discrete]
real_estimator_errors = bdt_real.estimator_errors_[:n_trees_real]
discrete_estimator_weights = bdt_discrete.estimator_weights_[:n_trees_discrete]
plt.figure(figsize=(15, 5))
plt.subplot(131)
plt.plot(range(1, n_trees_discrete + 1), discrete_test_errors, c='black', label='SAMME')
plt.plot(range(1, n_trees_real + 1), real_test_errors, c='black', linestyle='dashed', label='SAMME.R')
plt.legend()
plt.ylim(0.18, 0.62)
plt.ylabel('Test Error'); plt.xlabel('Number of Trees')
plt.subplot(132)
plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_errors, "b", label='SAMME', alpha=.5)
plt.plot(range(1, n_trees_real + 1), real_estimator_errors, "r", label='SAMME.R', alpha=.5)
plt.legend()
plt.ylabel('Error'); plt.xlabel('Number of Trees')
plt.ylim((.2,
max(real_estimator_errors.max(),
discrete_estimator_errors.max()) * 1.2))
plt.xlim((-20, len(bdt_discrete) + 20))
plt.subplot(133)
plt.plot(range(1, n_trees_discrete + 1), discrete_estimator_weights, "b", label='SAMME')
plt.legend()
plt.ylabel('Weight')
plt.xlabel('Number of Trees')
plt.ylim((0, discrete_estimator_weights.max() * 1.2))
plt.xlim((-20, n_trees_discrete + 20))
# prevent overlapping y-axis labels
plt.subplots_adjust(wspace=0.25)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.datasets import make_gaussian_quantiles
# Construct dataset
X1, y1 = make_gaussian_quantiles(cov=2.,
n_samples=200, n_features=2,
n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
n_samples=300, n_features=2,
n_classes=2, random_state=1)
X = np.concatenate((X1, X2))
y = np.concatenate((y1, - y2 + 1))
# Create and fit an AdaBoosted decision tree
bdt = ABC(DTC(max_depth=1), algorithm="SAMME", n_estimators=200)
bdt.fit(X, y)
AdaBoostClassifier(algorithm='SAMME', base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=200)
plot_colors, plot_step, class_names = "br", 0.02, "AB"
plt.figure(figsize=(10, 5))
# decision boundaries
plt.subplot(121)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
np.arange(y_min, y_max, plot_step))
Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.axis("tight")
# training points
for i, n, c in zip(range(2), class_names, plot_colors):
idx = np.where(y == i)
plt.scatter(X[idx, 0], X[idx, 1],
c=c, cmap=plt.cm.Paired,
s=20, edgecolor='k',
label="Class %s" % n)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right')
plt.xlabel('x'); plt.ylabel('y'); plt.title('Decision Boundary')
# two-class decision scores
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())
plt.subplot(122)
for i, n, c in zip(range(2), class_names, plot_colors):
plt.hist(twoclass_output[y == i],
bins=10,
range=plot_range,
facecolor=c,
label='Class %s' % n,
alpha=.5,
edgecolor='k')
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, y1, y2 * 1.2))
plt.legend(loc='upper right')
plt.ylabel('Samples'); plt.xlabel('Score'); plt.title('Decision Scores')
plt.tight_layout()
plt.subplots_adjust(wspace=0.35)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import AdaBoostRegressor as ABR
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
# Fit regression model
regr_1 = DTR(max_depth=4)
regr_2 = ABR(DTR(max_depth=4), n_estimators=300, random_state=rng)
regr_1.fit(X,y); regr_2.fit(X,y)
# Predict
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)
# Plot the results
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()
The number of weak learners is controlled with n_estimators. Tree size is controlled with max_depth and max_leaf_nodes. learning_rate controls overfitting via shrinkage; it takes values in [0.0, 1.0]. Multiclass classification builds n_classes regression trees at each iteration, so you should use HistGradientBoostingClassifier instead for datasets with large numbers of classes.
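A minimal multiclass sketch with HistGradientBoostingClassifier (iris, 3 classes; settings are illustrative):
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.datasets import load_iris
X_iris, y_iris = load_iris(return_X_y=True)
HistGradientBoostingClassifier(max_iter=100).fit(X_iris, y_iris).score(X_iris, y_iris)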
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)
0.913
The loss function is set with loss. The default for regression is least squares (ls).
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor as GBR
X, y = make_friedman1(n_samples=1200,
random_state=0,
noise=1.0)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]
est = GBR(n_estimators=100,
learning_rate=0.1,
max_depth=1,
random_state=0,
loss='ls').fit(X_train, y_train)
mean_squared_error(y_test, est.predict(X_test))
5.009154859960321
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.1, random_state=13)
params = {'n_estimators': 500,
'max_depth': 4,
'min_samples_split': 5,
'learning_rate': 0.01,
'loss': 'ls'}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
The mean squared error (MSE) on test set: 3030.9181
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
test_score[i] = reg.loss_(y_test, y_pred)
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-',
label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
fig.tight_layout()
The size of the regression tree base learners defines the level of variable interactions that can be captured by gradient boosting. In general, a tree of depth h captures interactions of order h.
max_depth=h allows binary trees of depth h. They will have at most 2^h leaf nodes and 2^h - 1 split nodes.
max_leaf_nodes controls the number of leaf nodes. In this case trees grow in a best-first fashion: the nodes with the highest improvement in impurity are expanded first.
max_leaf_nodes=k gives results comparable to max_depth=k-1, but trains significantly faster at the cost of a slightly higher training error.
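A quick sketch contrasting the two ways of limiting tree size (Friedman #1 data as above; settings are illustrative):
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor as GBR
X_f, y_f = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
est_depth = GBR(max_depth=3, random_state=0).fit(X_f[:200], y_f[:200]) # depth-limited trees
est_leaves = GBR(max_leaf_nodes=4, random_state=0).fit(X_f[:200], y_f[:200]) # best-first trees; rule of thumb: max_leaf_nodes=k ~ max_depth=k-1
print(est_depth.score(X_f[200:], y_f[200:]), est_leaves.score(X_f[200:], y_f[200:]))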
Least squares (ls): the default choice for regression. Initial model given by the mean of the target values.
Least absolute deviation (lad): Initial model given by the median of the target values.
Huber (huber): Combines LS & LAD; uses alpha to control outlier sensitivity.
Quantile (quantile): For quantile regression, aka prediction intervals. Uses 0 < alpha < 1 to specify the quantile.
Binomial deviance (deviance): Uses a negative binomial log-likelihood loss function for binary classification.
Multinomial deviance (deviance): Uses a negative multinomial log-likelihood loss function for multiclass classification. Builds n_classes regression trees per iteration, which makes GBRT inefficient for large numbers of classes.
Exponential (exponential): The loss also used by AdaBoost. Less robust to mislabeled samples than deviance.
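A sketch of the quantile loss for rough prediction intervals (Friedman #1 data again; the alpha values are illustrative):
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor as GBR
X_q, y_q = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
lower = GBR(loss='quantile', alpha=0.1, n_estimators=100, random_state=0).fit(X_q[:200], y_q[:200]) # ~10th percentile
upper = GBR(loss='quantile', alpha=0.9, n_estimators=100, random_state=0).fit(X_q[:200], y_q[:200]) # ~90th percentile
print(lower.predict(X_q[200:205]))
print(upper.predict(X_q[200:205]))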
Regularization via shrinkage scales the contribution of each weak learner by the learning rate $\nu$: $F_m(x) = F_{m-1}(x) + \nu h_m(x)$.
The learning rate (learning_rate) scales the step length of the gradient descent procedure. It interacts strongly with n_estimators (the number of weak learners): smaller learning rates require more weak learners to maintain a constant training error.
Empirical evidence suggests that smaller learning rates yield better test error. It is recommended to start with a learning rate <= 0.1.
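A sketch of the learning_rate / n_estimators trade-off on the Hastie data (values are illustrative, not a benchmark):
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier as GBC
X_h, y_h = make_hastie_10_2(random_state=0)
fast = GBC(learning_rate=1.0, n_estimators=100, max_depth=1, random_state=0).fit(X_h[:2000], y_h[:2000]) # big steps, few trees
slow = GBC(learning_rate=0.1, n_estimators=1000, max_depth=1, random_state=0).fit(X_h[:2000], y_h[:2000]) # small steps, many trees
print(fast.score(X_h[2000:], y_h[2000:]), slow.score(X_h[2000:], y_h[2000:]))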
Stochastic gradient boosting (SGB) combines gradient boosting with bootstrap averaging (bagging). At each iteration, the base learner is trained on a fraction subsample of the training data, drawn without replacement (a typical value is 0.5).
The regularization example further below illustrates the effect of shrinkage and subsampling.
Subsampling features also reduces variance; the number of subsampled features is controlled with max_features. Small values significantly reduce runtime.
SGB allows out-of-bag (OOB) estimates of the test deviance by computing the improvement in deviance on the examples not included in the subsample. The per-iteration improvements are stored in oob_improvement_. They can be used, for example, for model selection (e.g. choosing the number of boosting iterations), as sketched below.
OOB estimates are usually very pessimistic. Use cross-validation instead - resort to OOB only if CV is too time-consuming.
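A sketch of reading oob_improvement_ to pick a stopping iteration (requires subsample < 1.0; the cumulative-sum heuristic here is illustrative):
import numpy as np
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier as GBC
X_o, y_o = make_hastie_10_2(n_samples=4000, random_state=0)
clf_oob = GBC(n_estimators=200, subsample=0.5, random_state=0).fit(X_o[:2000], y_o[:2000])
cum_oob = np.cumsum(clf_oob.oob_improvement_) # cumulative OOB improvement per stage
print(int(np.argmax(cum_oob)) + 1) # iteration with the largest cumulative improvement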
Individual decision trees can be evaluated by visualizing the tree structure. Gradient boosting models include hundreds of regression trees & cannot be easily interpreted by visual inspection. Fortunately, a number of techniques have been proposed to summarize and interpret gradient boosting models.
Features usually do not contribute equally to the target response; in many situations the majority of features are irrelevant.
Individual decision trees perform feature selection by selecting appropriate split points. This can be used to measure the importance of each feature - the more often a feature is used in the split points, the more important that feature is.
This notion can be extended to ensembles by averaging the impurity-based feature importance of each tree.
The feature importance scores of a fitted GB model can be accessed via feature_importances_.
Note: this evaluation is impurity-based. It is distinct from permutation_importance, which is based on feature permutations (see the sketch after the code below).
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier as GBC
X, y = make_hastie_10_2(random_state=0)
clf = GBC(n_estimators=100,
learning_rate=1.0,
max_depth=1,
random_state=0).fit(X, y)
clf.feature_importances_
array([0.10684213, 0.10461707, 0.11265447, 0.09863589, 0.09469133, 0.10729306, 0.09163753, 0.09718194, 0.09581415, 0.09063242])
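For contrast, a sketch of permutation_importance on the same fitted clf (scored on the training data here purely for illustration):
from sklearn.inspection import permutation_importance
result = permutation_importance(clf, X, y, n_repeats=5, random_state=0)
result.importances_mean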
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import datasets
X, y = datasets.make_hastie_10_2(n_samples=12000,
random_state=1)
# map labels from {-1, 1} to {0, 1}
X = X.astype(np.float32)
labels, y = np.unique(y, return_inverse=True)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
original_params = {'n_estimators': 1000,
'max_leaf_nodes': 4,
'max_depth': None,
'random_state': 2,
'min_samples_split': 5}
plt.figure()
for label, color, setting in [('No shrinkage', 'orange',
{'learning_rate': 1.0, 'subsample': 1.0}),
('learning_rate=0.1', 'turquoise',
{'learning_rate': 0.1, 'subsample': 1.0}),
('subsample=0.5', 'blue',
{'learning_rate': 1.0, 'subsample': 0.5}),
('learning_rate=0.1, subsample=0.5', 'gray',
{'learning_rate': 0.1, 'subsample': 0.5}),
('learning_rate=0.1, max_features=2', 'magenta',
{'learning_rate': 0.1, 'max_features': 2})]:
params = dict(original_params)
params.update(setting)
clf = ensemble.GradientBoostingClassifier(**params).fit(X_train, y_train)
test_deviance = np.zeros((params['n_estimators'],),
dtype=np.float64)
for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
# clf.loss_ assumes that y_test[i] in {0, 1}
test_deviance[i] = clf.loss_(y_test, y_pred)
plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5],
test_deviance[::5],
'-', color=color, label=label)
plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')
Classification and regression are supported.
The number of bins is controlled with max_bins. Fewer bins act as extra regularization; the general rule is to use as many bins as possible (the default).
L2 regularization of the leaf values is controlled with l2_regularization.
The loss function is selected with loss.
Early stopping is enabled by default when there are more than 10,000 samples; it is controlled with early_stopping, scoring, validation_fraction, n_iter_no_change and tol.
Missing values are supported natively - no standalone imputation step is needed.
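A sketch pulling these knobs together (Hastie data; all values are illustrative, not tuned):
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier as HGBC
from sklearn.datasets import make_hastie_10_2
X_h, y_h = make_hastie_10_2(random_state=0)
clf_h = HGBC(max_bins=128, l2_regularization=1.0, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10, random_state=0).fit(X_h[:2000], y_h[:2000])
clf_h.score(X_h[2000:], y_h[2000:])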
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier as HGBC
import numpy as np
X = np.array([0, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 0, 1, 1]
gbdt = HGBC(min_samples_leaf=1).fit(X, y); gbdt.predict(X)
array([0, 0, 1, 1])
# explicitly require the feature before using
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier as HGBC
from sklearn.datasets import make_hastie_10_2
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
clf = HGBC(max_iter=100).fit(X_train, y_train); clf.score(X_test, y_test)
0.8965
X = np.array([0, np.nan, 1, 2, np.nan]).reshape(-1, 1)
y = [0, 1, 0, 0, 1]
gbdt = HGBC(min_samples_leaf=1, max_depth=2, learning_rate=1, max_iter=1).fit(X, y)
gbdt.predict(X)
array([0, 1, 0, 0, 1])
X = [[1, 0], [1, 0], [1, 0], [0, 1]]
y = [ 0, 0, 1, 0]
w = [ 0, 0, 1, 1] # ignore 1st 2 samples by setting their weight to 0
gb = HGBC(min_samples_leaf=1)
gb.fit(X, y, sample_weight=w)
print(gb.predict([[1, 0]]))
print(gb.predict_proba([[1, 0]])[0, 1])
[1]
0.9990209190235209
Use categorical_features to indicate which features are categorical, either as a boolean mask or as a list of feature indices:
gbdt = HGBC(categorical_features=[True, False])
gbdt = HGBC(categorical_features=[0])
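A minimal runnable sketch of native categorical support on toy data (column 0 is an ordinally-encoded category; the data here is made up for illustration):
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR
rng = np.random.RandomState(0)
X_cat = np.c_[rng.randint(0, 3, size=100), rng.rand(100)] # column 0: category codes 0..2, column 1: numeric
y_cat = X_cat[:, 0] + X_cat[:, 1]
gbdt = HGBR(categorical_features=[0]).fit(X_cat, y_cat)
gbdt.predict(X_cat[:3])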
# load Ames housing dataset
from sklearn.datasets import fetch_openml
X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)
n_categorical_features = (X.dtypes == 'category').sum()
n_numerical_features = (X.dtypes == 'float').sum()
print(f"#samples: {X.shape[0]}")
print(f"#features: {X.shape[1]}")
print(f"#categorical features: {n_categorical_features}")
print(f"#numerical features: {n_numerical_features}")
#samples: 2930
#features: 80
#categorical features: 46
#numerical features: 34
# create estimator
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
dropper = make_column_transformer(
('drop', make_column_selector(dtype_include='category')),
remainder='passthrough')
hist_dropped = make_pipeline(dropper, HGBR(random_state=42))
# one-hot encoding
from sklearn.preprocessing import OneHotEncoder as OHE
one_hot_encoder = make_column_transformer(
(OHE(sparse=False, handle_unknown='ignore'),
make_column_selector(dtype_include='category')),
remainder='passthrough')
hist_one_hot = make_pipeline(one_hot_encoder, HGBR(random_state=42))
# ordinal encoding
from sklearn.preprocessing import OrdinalEncoder as OE
import numpy as np
ordinal_encoder = make_column_transformer(
(OE(handle_unknown='use_encoded_value', unknown_value=np.nan),
make_column_selector(dtype_include='category')),
remainder='passthrough')
hist_ordinal = make_pipeline(ordinal_encoder, HGBR(random_state=42))
# native category support
categorical_mask = ([True]*n_categorical_features + [False]*n_numerical_features)
hist_native = make_pipeline(
ordinal_encoder, HGBR(random_state=42, categorical_features=categorical_mask))
# compare with cross validation
from sklearn.model_selection import cross_validate as CV
import matplotlib.pyplot as plt
scoring = "neg_mean_absolute_percentage_error"
dropped_result = CV(hist_dropped, X, y, cv=3, scoring=scoring)
one_hot_result = CV(hist_one_hot, X, y, cv=3, scoring=scoring)
ordinal_result = CV(hist_ordinal, X, y, cv=3, scoring=scoring)
native_result = CV(hist_native, X, y, cv=3, scoring=scoring)
def plot_results(figure_title):
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
plot_info = [('fit_time', 'Fit times (s)', ax1, None),
('test_score', 'Mean Absolute Percentage Error', ax2,
(0, 0.20))]
x, width = np.arange(4), 0.9
for key, title, ax, y_limit in plot_info:
items = [dropped_result[key], one_hot_result[key], ordinal_result[key],
native_result[key]]
ax.bar(x, [np.mean(np.abs(item)) for item in items],
width, yerr=[np.std(item) for item in items],
color=['C0', 'C1', 'C2', 'C3'])
ax.set(xlabel='Model', title=title, xticks=x,
xticklabels=["Dropped", "One Hot", "Ordinal", "Native"],
ylim=y_limit)
fig.suptitle(figure_title)
plot_results("Gradient Boosting on Adult Census")
# limiting the #of splits
# rerun analysis with artificially low split count by limiting #trees & tree depth
for pipe in (hist_dropped, hist_one_hot, hist_ordinal, hist_native):
pipe.set_params(histgradientboostingregressor__max_depth=3,
histgradientboostingregressor__max_iter=15)
dropped_result = CV(hist_dropped, X, y, cv=3, scoring=scoring)
one_hot_result = CV(hist_one_hot, X, y, cv=3, scoring=scoring)
ordinal_result = CV(hist_ordinal, X, y, cv=3, scoring=scoring)
native_result = CV(hist_native, X, y, cv=3, scoring=scoring)
plot_results("Gradient Boosting on Adult Census (few and small trees)")
plt.show()
monotonic_cst controls the constraint per feature: 0 = no constraint, -1 = monotonically decreasing (negative) constraint, +1 = monotonically increasing (positive) constraint.
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR
from sklearn.inspection import plot_partial_dependence as PPD
import numpy as np
import matplotlib.pyplot as plt
rng = np.random.RandomState(0)
n = 5000
f_0 = rng.rand(n) # positive correlation with y
f_1 = rng.rand(n) # negative correlation with y
X = np.c_[f_0, f_1]
noise = rng.normal(loc=0.0, scale=0.01, size=n)
y = (5*f_0 + np.sin(10*np.pi*f_0) -
5*f_1 - np.cos(10*np.pi*f_1) + noise)
fig, ax = plt.subplots()
# Without any constraint
gbdt = HGBR()
gbdt.fit(X, y)
disp = PPD(gbdt, X, features=[0, 1],
line_kw={"linewidth": 4, "label": "unconstrained", "color": "tab:blue"},
ax=ax)
# With positive and negative constraints
gbdt = HGBR(monotonic_cst=[1, -1])
gbdt.fit(X, y)
PPD(gbdt, X, features=[0, 1],
feature_names=(
"First feature\nPositive constraint",
"Second feature\nNegtive constraint",
),
line_kw={"linewidth": 4, "label": "constrained", "color": "tab:orange"},
ax=disp.axes_)
for f_idx in (0, 1):
disp.axes_[0, f_idx].plot(
X[:, f_idx], y, "o", alpha=0.3, zorder=-1, color="tab:green"
)
disp.axes_[0, f_idx].set_ylim(-6, 6)
plt.legend()
fig.suptitle("Monotonic constraints illustration")
plt.show()
estimators contains the list of stacked (parallel) estimators. final_estimator uses their combined predictions to produce the final output.
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor as KNR
estimators = [('ridge', RidgeCV()),
('lasso', LassoCV(random_state=42)),
('knr', KNR(n_neighbors=20, metric='euclidean'))]
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import StackingRegressor as SR
final_estimator = GBR(n_estimators=25,
subsample=0.5,
min_samples_leaf=25,
max_features=1,
random_state=42)
reg = SR(estimators=estimators,
final_estimator=final_estimator)
from sklearn.datasets import load_diabetes
X, y = load_diabetes(return_X_y=True)
from sklearn.model_selection import train_test_split as TTS
X_train, X_test, y_train, y_test = TTS(X, y, random_state=42)
reg.fit(X_train, y_train)
StackingRegressor(estimators=[('ridge', RidgeCV(alphas=array([ 0.1, 1. , 10. ]))), ('lasso', LassoCV(random_state=42)), ('knr', KNeighborsRegressor(metric='euclidean', n_neighbors=20))], final_estimator=GradientBoostingRegressor(max_features=1, min_samples_leaf=25, n_estimators=25, random_state=42, subsample=0.5))
y_pred = reg.predict(X_test)
from sklearn.metrics import r2_score
print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred)))
R2 score: 0.53
# getting output of stacked estimators:
reg.transform(X_test[:5])
array([[142.36214074, 138.30765507, 146.1 ], [179.70207217, 182.90046333, 151.75 ], [139.89924327, 132.47007083, 158.25 ], [286.94742491, 292.65164781, 225.4 ], [126.88190192, 124.11964797, 164.65 ]])
from sklearn.ensemble import RandomForestRegressor as RFR
final_layer_rfr = RFR(n_estimators=10, max_features=1, max_leaf_nodes=5, random_state=42)
final_layer_gbr = GBR(n_estimators=10, max_features=1, max_leaf_nodes=5,random_state=42)
final_layer = SR(
estimators=[('rf', final_layer_rfr),
('gbrt', final_layer_gbr)],
final_estimator=RidgeCV()
)
multi_layer_regressor = SR(
estimators=[('ridge', RidgeCV()),
('lasso', LassoCV(random_state=42)),
('knr', KNR(n_neighbors=20, metric='euclidean'))],
final_estimator=final_layer
)
multi_layer_regressor.fit(X_train, y_train)
print('R2 score: {:.2f}'.format(multi_layer_regressor.score(X_test, y_test)))
R2 score: 0.53