SGDClassifier(loss='log') is equivalent to Logistic Regression fitted via SGD. The classifier is fitted on training data X of shape (#samples, #features) and targets (labels) y of shape (#samples,).
from sklearn.linear_model import SGDClassifier
X,y = [[0.0, 0.0], [1.0, 1.0]], [0.0, 1.0]
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=50)
clf.fit(X, y)
print(clf.coef_) # model params
print(clf.intercept_) # offset/bias
print(clf.predict([[2.0,2.0]])) # predicting new values
[[9.85221675 9.85221675]]
[-9.97004991]
[1.]
fit_intercept tells the model whether to use an intercept (a biased hyperplane).
decision_function (a method) returns the signed distance to the hyperplane (the dot product between the coefficients and the input sample, plus the intercept):
clf.decision_function([[2.0, 2.0]])
array([29.43881708])
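The same value can be recomputed by hand from coef_ and intercept_, which makes the definition above concrete (a minimal sketch reusing the clf fitted earlier):
import numpy as np
x_new = np.array([2.0, 2.0])
print(np.dot(clf.coef_[0], x_new) + clf.intercept_[0])  # dot product of coefficients and sample, plus intercept
# matches the decision_function output above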
loss sets the loss function for the model. Options:
loss="hinge" - linear support vector machine
loss="modified_huber" - smoothed hinge loss
loss="log" - logistic regression
Using the log and modified_huber loss functions enables the predict_proba method, which returns a vector of probability estimates $P(y|x)$ per sample x:
clf = SGDClassifier(loss="log", max_iter=25).fit(X, y)
clf.predict_proba([[1., 1.]])
array([[6.54276816e-07, 9.99999346e-01]])
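The modified_huber loss exposes the same probability interface; a minimal sketch (the fitted values, and hence the probabilities, will differ):
clf_mh = SGDClassifier(loss="modified_huber", max_iter=25).fit(X, y)
clf_mh.predict_proba([[1., 1.]])  # one probability column per class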
L1 and L2 norm penalties are set using penalty:
penalty="l2" - L2 norm penalty on coef_
penalty="l1" - L1 norm penalty on coef_
penalty="elasticnet" - convex combination of the L1 and L2 norm penalties; l1_ratio controls the convex combination
The default is penalty="l2".
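A minimal sketch of the Elastic-Net setting, combining both norms on the toy data above (the l1_ratio value is illustrative):
clf_en = SGDClassifier(loss="hinge", penalty="elasticnet", l1_ratio=0.15, max_iter=50)
clf_en.fit(X, y)  # coef_ is regularized with 15% L1 and 85% L2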
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
iris = datasets.load_iris()
X,y,colors = iris.data[:,:2], iris.target, "bry"
# shuffle
idx = np.arange(X.shape[0])
np.random.seed(13); np.random.shuffle(idx)
X,y = X[idx],y[idx]
# standardize
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X-mean)/std
h = .02 # step size in the mesh
clf = SGDClassifier(alpha=0.001, max_iter=100).fit(X, y)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(),
yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
for i, color in zip(clf.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1],
                c=color, label=iris.target_names[i],
                cmap=plt.cm.Paired, edgecolor='black', s=20)
plt.title("Decision surface of multi-class SGD")
# Plot the three one-against-all classifiers
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = clf.coef_
intercept = clf.intercept_
def plot_hyperplane(c, color):
    def line(x0):
        return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
    plt.plot([xmin, xmax], [line(xmin), line(xmax)],
             ls="--", color=color)

for i, color in zip(clf.classes_, colors):
    plot_hyperplane(i, color)
plt.legend(); plt.show()
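Because the multi-class problem is handled as several binary one-versus-all problems, the fitted attributes carry one entry per class; a quick check on the clf fitted above:
print(clf.coef_.shape)       # (3, 2): one weight vector per class
print(clf.intercept_.shape)  # (3,): one offset per class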
Weights can be assigned to classes (via class_weight) and to individual instances (via sample_weight).
from sklearn import linear_model
np.random.seed(0)
X = np.r_[np.random.randn(10, 2) + [1, 1],
np.random.randn(10, 2)]
y = [1] * 10 + [-1] * 10
sample_weight = 100 * np.abs(np.random.randn(20))
sample_weight[:10] *= 10 # assign a bigger weight to the first 10 samples
xx, yy = np.meshgrid(np.linspace(-4, 5, 500),
np.linspace(-4, 5, 500))
plt.figure()
plt.scatter(X[:, 0],
X[:, 1],
c=y, s=sample_weight, alpha=0.9,
cmap=plt.cm.bone, edgecolor='black')
# fit the unweighted model
clf = linear_model.SGDClassifier(alpha=0.01, max_iter=100)
clf.fit(X, y)
Z = clf.decision_function(np.c_[xx.ravel(),
yy.ravel()])
Z = Z.reshape(xx.shape)
no_weights = plt.contour(xx, yy, Z,
levels=[0], linestyles=['solid'])
# fit the weighted model
clf = linear_model.SGDClassifier(alpha=0.01, max_iter=100)
clf.fit(X, y, sample_weight=sample_weight)
Z = clf.decision_function(np.c_[xx.ravel(),
yy.ravel()])
Z = Z.reshape(xx.shape)
samples_weights = plt.contour(xx, yy, Z,
levels=[0], linestyles=['dashed'])
plt.legend([no_weights.collections[0],
samples_weights.collections[0]],
["no weights", "with weights"], loc="lower left")
plt.xticks(()); plt.yticks(()); plt.show()
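Class-level weighting works analogously via class_weight; a minimal sketch on the same data (the weight values are illustrative):
# give class 1 ten times the weight of class -1; class_weight="balanced" would
# instead weight classes inversely proportional to their frequencies
clf_cw = linear_model.SGDClassifier(alpha=0.01, max_iter=100,
                                    class_weight={1: 10, -1: 1})
clf_cw.fit(X, y)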
Averaged SGD is enabled by setting average=True. coef_ is then set to the average of the coefficient values across all updates (the same applies to intercept_).
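average can also be an integer, in which case averaging only starts once that many samples have been seen; a minimal sketch (the value is illustrative):
clf_avg = SGDClassifier(average=10, max_iter=100).fit(X, y)  # averaging begins after 10 samples
print(clf_avg.coef_)  # averaged coefficients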
from sklearn.model_selection import train_test_split as TTS
from sklearn.linear_model import SGDClassifier as SGC
from sklearn.linear_model import Perceptron as PERCEPT
from sklearn.linear_model import PassiveAggressiveClassifier as PAC
from sklearn.linear_model import LogisticRegression as LR
heldout = [0.95, 0.90, 0.75, 0.50, 0.01]
rounds = 20
X, y = datasets.load_digits(return_X_y=True)
classifiers = [
("SGD", SGC(max_iter=100)),
("ASGD", SGC(average=True)),
("Perceptron", PERCEPT()),
("Pasv-Agrsv I", PAC(loss='hinge', C=1.0, tol=1e-4)),
("Pasv-Agrsv II", PAC(loss='squared_hinge', C=1.0, tol=1e-4)),
("SAG", LR(solver='sag', tol=1e-1, C=1.e4 / X.shape[0]))
]
xx = 1. - np.array(heldout)
for name, clf in classifiers:
print("training %s" % name)
rng = np.random.RandomState(42)
yy = []
for i in heldout:
yy_ = []
for r in range(rounds):
X_train, X_test, y_train, y_test = \
TTS(X, y, test_size=i, random_state=rng)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
yy_.append(1 - np.mean(y_pred == y_test))
yy.append(np.mean(yy_))
plt.plot(xx, yy, label=name)
plt.legend(loc="upper right")
plt.xlabel("Proportion train")
plt.ylabel("Test Error Rate")
plt.show()
training SGD
training ASGD
training Perceptron
training Pasv-Agrsv I
training Pasv-Agrsv II
training SAG
loss="squared_loss"
: ordinary least squares regressionloss="huber"
: huber loss for robust regressionloss="epsilon_insensitive
: linear support vector regressionpenalty
controls regularization (same options as in classification)from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
n_samples, n_features = 10, 5
rng = np.random.RandomState(0)
y = rng.randn(n_samples)
X = rng.randn(n_samples, n_features)
# Always scale inputs. The most convenient way is to use a pipeline.
reg = make_pipeline(StandardScaler(),
SGDRegressor(max_iter=1000, tol=1e-3))
reg.fit(X, y)
Pipeline(steps=[('standardscaler', StandardScaler()), ('sgdregressor', SGDRegressor())])
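The other regression losses are selected the same way; a minimal sketch of an SVR-style fit on the same data (the epsilon value is illustrative):
reg_svr = make_pipeline(StandardScaler(),
                        SGDRegressor(loss="epsilon_insensitive", epsilon=0.1,
                                     max_iter=1000, tol=1e-3))
reg_svr.fit(X, y)  # linear support vector regression fitted with SGD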
Early stopping is controlled by early_stopping:
early_stopping=True - part of the training data is held out as a validation set; the stopping criterion is based on the prediction score (score) computed on that validation set.
early_stopping=False - the model is fitted on the entire input dataset; the stopping criterion is based on the objective function computed on the training data.
In both cases, training stops once the criterion fails to improve n_iter_no_change consecutive times, and the number of iterations is always bounded by max_iter.
SGD is sensitive to feature scaling, so scale the input data, e.g. with StandardScaler. For features extracted with something like PCA, it is often wise to rescale them by a constant c such that the average L2 norm of the training data equals one.
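A minimal sketch of validation-based early stopping on the regression data above (the parameter values are illustrative):
es_reg = make_pipeline(StandardScaler(),
                       SGDRegressor(early_stopping=True,      # hold out part of the training data
                                    validation_fraction=0.2,  # 20% used as the validation set
                                    n_iter_no_change=5,       # stop after 5 epochs without improvement
                                    max_iter=1000, tol=1e-3))
es_reg.fit(X, y)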