import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model  import LogisticRegression as LR
from sklearn               import datasets
from sklearn.preprocessing import StandardScaler as SS


X, y     = datasets.load_digits(return_X_y=True)
X        = SS().fit_transform(X)
y        = (y > 4).astype(int) # classify smaller vs larger digits
l1_ratio = 0.5  # L1 weight in Elastic-Net regularization


fig, axes = plt.subplots(3, 3)

# Set regularization
for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)):
    # turn down tolerance for short training time
    clf_l1_LR = LR(C=C, penalty='l1',         tol=0.01, solver='saga')
    clf_l2_LR = LR(C=C, penalty='l2',         tol=0.01, solver='saga')
    clf_en_LR = LR(C=C, penalty='elasticnet', tol=0.01, solver='saga', l1_ratio=l1_ratio)
    clf_l1_LR.fit(X, y)
    clf_l2_LR.fit(X, y)
    clf_en_LR.fit(X, y)

    coef_l1_LR = clf_l1_LR.coef_.ravel()
    coef_l2_LR = clf_l2_LR.coef_.ravel()
    coef_en_LR = clf_en_LR.coef_.ravel()

    # coef_l1_LR contains zeros due to L1 sparsity inducing norm

    sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100
    sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100
    sparsity_en_LR = np.mean(coef_en_LR == 0) * 100

    print("C=%.2f" % C)
    print("{:<40} {:.2f}%".format("Sparsity with L1 penalty:", sparsity_l1_LR))
    print("{:<40} {:.2f}%".format("Sparsity with Elastic-Net penalty:",
                                  sparsity_en_LR))
    print("{:<40} {:.2f}%".format("Sparsity with L2 penalty:", sparsity_l2_LR))
    print("{:<40} {:.2f}".format("Score with L1 penalty:",
                                 clf_l1_LR.score(X, y)))
    print("{:<40} {:.2f}".format("Score with Elastic-Net penalty:",
                                 clf_en_LR.score(X, y)))
    print("{:<40} {:.2f}".format("Score with L2 penalty:",
                                 clf_l2_LR.score(X, y)))

    if i == 0:
        axes_row[0].set_title("L1 penalty")
        axes_row[1].set_title("Elastic-Net\nl1_ratio = %s" % l1_ratio)
        axes_row[2].set_title("L2 penalty")

    for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]):
        ax.imshow(np.abs(coefs.reshape(8, 8)), interpolation='nearest',
                  cmap='binary', vmax=1, vmin=0)
        ax.set_xticks(())
        ax.set_yticks(())

    axes_row[0].set_ylabel('C = %s' % C)

plt.show()

C=1.00
Sparsity with L1 penalty:                4.69%
Sparsity with Elastic-Net penalty:       4.69%
Sparsity with L2 penalty:                4.69%
Score with L1 penalty:                   0.90
Score with Elastic-Net penalty:          0.90
Score with L2 penalty:                   0.90
C=0.10
Sparsity with L1 penalty:                29.69%
Sparsity with Elastic-Net penalty:       10.94%
Sparsity with L2 penalty:                4.69%
Score with L1 penalty:                   0.90
Score with Elastic-Net penalty:          0.90
Score with L2 penalty:                   0.90
C=0.01
Sparsity with L1 penalty:                82.81%
Sparsity with Elastic-Net penalty:       67.19%
Sparsity with L2 penalty:                4.69%
Score with L1 penalty:                   0.86
Score with Elastic-Net penalty:          0.88
Score with L2 penalty:                   0.89


iris = datasets.load_iris()
X,y  = iris.data, iris.target
X    = X[y != 2]
y    = y[y != 2]
X   /= X.max()  # Normalize X to speed-up convergence


from sklearn.svm import l1_min_c
cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 16)

from time import time
start = time()

clf = LR(penalty='l1', 
         solver='liblinear',
         tol=1e-6, 
         max_iter=int(1e6),
         warm_start=True,
         intercept_scaling=10000.)

coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())

print("This took %0.3fs" % (time() - start))

This took 0.041s


coefs_ = np.array(coefs_)
plt.plot(np.log10(cs), coefs_, marker='o')
ymin, ymax = plt.ylim()

plt.xlabel('log(C)'); plt.ylabel('Coefficients'); plt.title('Logistic Regression Path')
plt.axis('tight')
plt.show()


from sklearn.datasets import make_blobs

centers        = [[-5, 0], [0, 1.5], [5, -1]]
X, y           = make_blobs(n_samples=1000, centers=centers, random_state=40)
transformation = [[0.4, 0.2], [-0.4, 1.2]]
X              = np.dot(X, transformation)


for multi_class in ('multinomial', 'ovr'):
    clf = LR(solver='sag', max_iter=100, random_state=42,
                             multi_class=multi_class).fit(X, y)

    print("training score : %.3f (%s)" % (clf.score(X, y), multi_class))

    # create a mesh to plot in
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Plot decision boundary. Assign a color to each point in the mesh 
    # [x_min, x_max]x[y_min, y_max].
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure()
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.title("Decision surface (%s)" % multi_class)
    plt.axis('tight')

    # Plot also the training points
    colors = "bry"
    for i, color in zip(clf.classes_, colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired,
                    edgecolor='black', s=20)

    # Plot the three one-against-all classifiers
    xmin, xmax = plt.xlim()
    ymin, ymax = plt.ylim()
    coef = clf.coef_
    intercept = clf.intercept_

    def plot_hyperplane(c, color):
        def line(x0):
            return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
        plt.plot([xmin, xmax], [line(xmin), line(xmax)],
                 ls="--", color=color)

    for i, color in zip(clf.classes_, colors):
        plot_hyperplane(i, color)

plt.show()

training score : 0.995 (multinomial)
training score : 0.976 (ovr)


import warnings, timeit
from sklearn.datasets        import fetch_20newsgroups_vectorized as Fetch20
from sklearn.linear_model    import LogisticRegression as LR
from sklearn.model_selection import train_test_split   as TTS
from sklearn.exceptions      import ConvergenceWarning as CW


warnings.filterwarnings("ignore", category=CW,module="sklearn")
t0        = timeit.default_timer()
solver    = 'saga'
n_samples = 10000


X, y = Fetch20(subset='all', return_X_y=True)
X    = X[:n_samples]
y    = y[:n_samples]

X_train, X_test, y_train, y_test = TTS(X, y, random_state=42, stratify=y, test_size=0.1)
train_samples, n_features        = X_train.shape
n_classes                        = np.unique(y).shape[0]


models = {'ovr':         {'name': 'One versus Rest', 'iters': [1, 2, 4]},
          'multinomial': {'name': 'Multinomial',     'iters': [1, 3, 7]}}


for model in models:
    accuracies   = [1 / n_classes]
    times        = [0]
    densities    = [1]
    model_params = models[model]

    # Small #epochs = faster runtime
    for this_max_iter in model_params['iters']:
        print('[model=%s, solver=%s] Number of epochs: %s' %
              (model_params['name'], solver, this_max_iter))
        lr = LR(solver=solver,
                multi_class=model,
                penalty='l1',
                max_iter=this_max_iter,
                random_state=42)
        
        t1 = timeit.default_timer()
        lr.fit(X_train, y_train)
        train_time = timeit.default_timer() - t1

        y_pred     = lr.predict(X_test)
        accuracy   = np.sum(y_pred == y_test) / y_test.shape[0]
        density    = np.mean(lr.coef_ != 0, axis=1) * 100
        
        accuracies.append(accuracy)
        densities.append(density)
        times.append(train_time)

    models[model]['times'] = times
    models[model]['densities'] = densities
    models[model]['accuracies'] = accuracies

    print('Test accuracy for %s: %.4f' % (model, accuracies[-1]))
    print('%% non-zero coeffs for %s, '
    'per class:\n %s' % (model, densities[-1]))
    print('Run time (%i epochs) for %s:'
          '%.2f' % (model_params['iters'][-1], model, times[-1]))

fig = plt.figure()
ax = fig.add_subplot(111)

for model in models:
    name = models[model]['name']
    times = models[model]['times']
    accuracies = models[model]['accuracies']
    ax.plot(times, accuracies, marker='o',
            label='Model: %s' % name)
    ax.set_xlabel('Train time (s)')
    ax.set_ylabel('Test accuracy')
ax.legend()
fig.suptitle('Multinomial vs One-vs-Rest Logistic L1\n'
             'Dataset %s' % '20newsgroups')
fig.tight_layout()
fig.subplots_adjust(top=0.85)
run_time = timeit.default_timer() - t0
print('Example run in %.3f s' % run_time)
plt.show()

[model=One versus Rest, solver=saga] Number of epochs: 1
[model=One versus Rest, solver=saga] Number of epochs: 2
[model=One versus Rest, solver=saga] Number of epochs: 4
Test accuracy for ovr: 0.7490
% non-zero coeffs for ovr, per class:
 [0.31743104 0.36815852 0.4181174  0.46115889 0.24595141 0.41350581
 0.31281945 0.27054655 0.58720899 0.32972861 0.4158116  0.3312658
 0.41888599 0.41120001 0.59643217 0.31666244 0.34279478 0.28130692
 0.35278655 0.24748861]
Run time (4 epochs) for ovr:2.37
[model=Multinomial, solver=saga] Number of epochs: 1
[model=Multinomial, solver=saga] Number of epochs: 3
[model=Multinomial, solver=saga] Number of epochs: 7
Test accuracy for multinomial: 0.7450
% non-zero coeffs for multinomial, per class:
 [0.13219888 0.11452112 0.13066169 0.13681047 0.12066991 0.15909982
 0.13450468 0.09146318 0.07916561 0.12143851 0.13911627 0.10760374
 0.18984374 0.12143851 0.17524038 0.22289346 0.11605832 0.07916561
 0.07301682 0.15141384]
Run time (7 epochs) for multinomial:2.15
Example run in 4132.328 s


from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split as TTS
from sklearn.preprocessing import StandardScaler as SS
from sklearn.utils import check_random_state as CRS


train_samples = 5000

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

# Turn seed into a np.random.RandomState instance
random_state = CRS(0); permutation  = random_state.permutation(X.shape[0])
print('permutation: ', permutation)

X = X[permutation]
y = y[permutation]
X = X.reshape((X.shape[0], -1))

X_train, X_test, y_train, y_test = TTS(
    X, y, train_size=train_samples, test_size=10000)

scaler = SS()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

permutation:  [10840 56267 14849 ... 42613 43567 68268]


# Turn up tolerance for faster convergence
clf = LR(C      =50.0/train_samples, 
         penalty='l1', 
         solver ='saga', 
         tol=0.1)
clf.fit(X_train, y_train)

sparsity = np.mean(clf.coef_ == 0) * 100
score    = clf.score(X_test, y_test)

print("Sparsity with L1 penalty:   %.2f%%" % sparsity)
print("Test score with L1 penalty: %.4f"   % score)

coef = clf.coef_.copy()
plt.figure(figsize=(10, 5))
scale = np.abs(coef).max()

for i in range(10):
    l1_plot = plt.subplot(2, 5, i + 1)
    l1_plot.imshow(coef[i].reshape(28, 28), interpolation='nearest',
                   cmap=plt.cm.RdBu, vmin=-scale, vmax=scale)
    l1_plot.set_xticks(())
    l1_plot.set_yticks(())
    l1_plot.set_xlabel('Class %i' % i)
plt.suptitle('Classification vector for...')
plt.show()

Sparsity with L1 penalty:   79.49%
Test score with L1 penalty: 0.8362

Logistic Regression ¶

Logistic Regression (Cross Validated)¶

Example: comparison of sparsity (% of zero coefficients) vs L1, L2 & ElasticNet penalties in Logistic Regression¶

Example: Regularization path (coefficients vs log(C), Logistic Regression)¶

Example: Multinomial vs One-vs-Rest (OVR) Logistic Regression¶

Example: Multiclass (multinomial vs one-vs-rest) document classification using logistic regression¶

Example: Multinomial LR with l1 penalty on MNIST¶