For a matrix of shape (n_samples, n_features), OLS has a cost of $O(n_{\text{samples}} n_{\text{features}}^2)$.
from sklearn import linear_model as LM
reg = LM.LinearRegression()
reg.fit([[0,0],[1,1],[2,2]],
[0,1,2])
LinearRegression()
reg.coef_
array([0.5, 0.5])
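As a quick sanity check (a sketch added here, not part of the original example), the same coefficients can be recovered with numpy's minimum-norm least-squares solver on centered data; the names X0/y0 are introduced only for this illustration.
import numpy as np
X0 = np.array([[0,0],[1,1],[2,2]], dtype=float)
y0 = np.array([0.,1.,2.])
# center X and y so the intercept drops out, then solve min ||Xc w - yc||_2
Xc, yc = X0 - X0.mean(axis=0), y0 - y0.mean()
w, *_ = np.linalg.lstsq(Xc, yc, rcond=None)
print(w)   # minimum-norm solution [0.5 0.5], matching reg.coef_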
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model as LM
from sklearn.metrics import mean_squared_error, r2_score
# diabetes dataset
X,y = datasets.load_diabetes(return_X_y=True)
# use only one feature
X = X[:, np.newaxis, 2]
# split into training & test sets
Xtrain,Xtest = X[:-20], X[-20:]
# split target data
ytrain,ytest = y[:-20], y[-20:]
# create & train regression model
regr = LM.LinearRegression()
regr.fit(Xtrain,ytrain)
LinearRegression()
# predict from test data & return results
ypred = regr.predict(Xtest)
print('Coefficients:\n', regr.coef_)
print('Mean sqd error:\n', mean_squared_error(ytest,ypred))
print('r2 score:\n', r2_score( ytest,ypred))
Coefficients: [938.23786125]
Mean sqd error: 2548.0723987259694
r2 score: 0.47257544798227147
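For reference, r2_score is the coefficient of determination $R^2 = 1 - \sum_i (y_i - \hat{y}_i)^2 / \sum_i (y_i - \bar{y})^2$; a quick hand computation (added here as a check) using the arrays above:
ss_res = np.sum((ytest - ypred)**2)         # residual sum of squares
ss_tot = np.sum((ytest - ytest.mean())**2)  # total sum of squares
print(1 - ss_res/ss_tot)                    # should match r2_score(ytest, ypred)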
# plot
plt.scatter(Xtest,ytest,color='black')
plt.plot( Xtest,ypred,color='blue')
plt.xticks(()); plt.yticks(())
plt.show()
# generate random data
np.random.seed(42)
nsamps,nfeats = 200,50
X = np.random.randn(nsamps,nfeats)
# coefficients
true_coefs = 3*np.random.randn(nfeats)
true_coefs[true_coefs<0] = 0
# set y to dot-product; add normally distributed noise
y = np.dot(X,true_coefs)
y += np.random.normal(size=(nsamps,))
# split data; test_size = fraction of the data to include in the test split.
from sklearn.model_selection import train_test_split as TTS
Xtrain,Xtest,ytrain,ytest = TTS(X,y,test_size=0.5)
# fit model and return results
from sklearn.linear_model import LinearRegression as LR
regr_nn = LR(positive=True)
ypred_nn = regr_nn.fit(Xtrain,ytrain).predict(Xtest)
print('r2 score:\n', r2_score(ytest,ypred_nn))
r2 score: 0.9898521625129549
# fit using OLS for comparison
regr_ols = LR()
ypred_ols = regr_ols.fit(Xtrain,ytrain).predict(Xtest)
print('r2 score:\n', r2_score(ytest,ypred_ols))
r2 score: 0.9876061239769228
# plot regression coeffs between OLS & NNLS
# note how NNLS constraint shrinks some coeffs to zero
# NNLS inherently yields sparse results.
fig,ax = plt.subplots()
ax.plot(regr_ols.coef_, regr_nn.coef_, marker='.', linewidth=0)
minx,maxx = ax.get_xlim()
miny,maxy = ax.get_ylim()
low, high = max(minx,miny), min(maxx,maxy)
ax.plot([low,high],[low,high], ls='--', c='0.3', alpha=0.5)
ax.set_xlabel('OLS coeffs'); ax.set_ylabel('NNLS coeffs')
Text(0, 0.5, 'NNLS coeffs')
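To back the sparsity claim with numbers, a small check (added here for illustration) counts the exact zeros in each fitted model against the ground truth:
print('zero coeffs (true):', np.sum(true_coefs == 0))
print('zero coeffs (NNLS):', np.sum(regr_nn.coef_ == 0))
print('zero coeffs (OLS): ', np.sum(regr_ols.coef_ == 0))   # typically none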
from sklearn import linear_model as LM
regr = LM.Ridge(alpha=0.5)
regr.fit([[0,0],[0,0],[1,1]], [0.0, 0.1, 1.0])
print("Coefficients:\n", regr.coef_)
print('Intercept:\n', regr.intercept_)
Coefficients: [0.34545455 0.34545455]
Intercept: 0.13636363636363638
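These values also follow from the closed-form ridge solution on centered data, $\hat{w} = (X_c^{\top}X_c + \alpha I)^{-1} X_c^{\top} y_c$, with the intercept recovered as $\bar{y} - \hat{w}^{\top}\bar{x}$; a hand verification added here (Xt/yt are throwaway names for this sketch):
import numpy as np
Xt = np.array([[0,0],[0,0],[1,1]], dtype=float)
yt = np.array([0.0, 0.1, 1.0])
Xc, yc = Xt - Xt.mean(axis=0), yt - yt.mean()
w = np.linalg.solve(Xc.T @ Xc + 0.5*np.eye(2), Xc.T @ yc)
b = yt.mean() - Xt.mean(axis=0) @ w
print(w, b)   # [0.34545455 0.34545455] 0.13636..., matching regr.coef_ and regr.intercept_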
# example: plot coefficients vs regularization
# when alpha is very large, the regularization term dominates and the coefficients tend to zero.
# when alpha ~ 0, the squared-loss term dominates and the coefficients oscillate wildly
# (the Hilbert matrix is badly ill-conditioned).
# build a 10x10 Hilbert matrix (square matrix with unit-fraction entries H[i,j] = 1/(i+j+1))
X = 1.0/(np.arange(1,11) + np.arange(0,10)[:, np.newaxis])
y = np.ones(10)
# paths
alphas = np.logspace(-10,2,200)
coefs = []
for a in alphas:
    ridge = LM.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X,y)
    coefs.append(ridge.coef_)
# plot
ax = plt.gca()
ax.plot(alphas,coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('coefficients vs regularization')
plt.axis('tight')
plt.show()
The RidgeClassifier first converts binary targets to {-1, +1}, then treats the problem as a regression task (a small sketch follows the RidgeCV example below).
import numpy as np
from sklearn import linear_model as LM
reg = LM.RidgeCV(alphas=np.logspace(-6, 6, 13))
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
reg.alpha_
0.01
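For the RidgeClassifier mentioned above, a minimal sketch (toy data chosen here purely for illustration):
from sklearn.linear_model import RidgeClassifier
Xb = [[0, 0], [0, 1], [1, 0], [1, 1]]
yb = [0, 0, 1, 1]                       # binary labels, internally mapped to {-1, +1}
clf_ridge = RidgeClassifier(alpha=1.0).fit(Xb, yb)
print(clf_ridge.predict([[1, 0.5]]))    # returns a class label, not a continuous value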
regr = LM.Lasso(alpha=0.1)
regr.fit([[0,0], [1,1]],
[0,1])
regr.predict([[1,1]])
array([0.8])
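The 0.8 can be checked by hand from the Lasso objective $\frac{1}{2 n_{\text{samples}}}\lVert y - Xw\rVert_2^2 + \alpha\lVert w\rVert_1$: on centered data the two identical columns share a total weight $s$ minimizing $0.125\,(1-s)^2 + 0.1\,|s|$, giving $s = 0.6$; the intercept is $0.5 - 0.6 \cdot 0.5 = 0.2$, so the prediction for [1, 1] is $0.6 + 0.2 = 0.8$.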
# Lasso vs Elastic Net - sparse signal fit comparison
np.random.seed(42)
nsamps, nfeats = 50,100
X = np.random.randn(nsamps,nfeats)
# create decreasing coeffs, with alternating signs, for viz
idx = np.arange(nfeats)
coef = (-1)**idx*np.exp(-idx/10)
# sparsify it
coef[10:]=0
y = np.dot(X,coef)
# add normally-distributed noise
y += 0.01*np.random.normal(size=nsamps)
# split into training, test data
nsamps = X.shape[0]
Xtrain,ytrain = X[:nsamps//2], y[:nsamps//2]
Xtest, ytest  = X[nsamps//2:], y[nsamps//2:]
# fit & predict
from sklearn.linear_model import Lasso as LA
from sklearn.linear_model import ElasticNet as EN
alpha = 0.1
lasso = LA(alpha=alpha)
enet = EN(alpha=alpha, l1_ratio=0.7)
ypred_lasso = lasso.fit(Xtrain,ytrain).predict(Xtest)
ypred_enet = enet.fit(Xtrain,ytrain).predict(Xtest)
r2_lasso = r2_score(ytest,ypred_lasso)
r2_enet = r2_score(ytest,ypred_enet)
print('r2 score (lasso):\t', r2_lasso)
print('r2 score (enet): \t', r2_enet)
r2 score (lasso):  -0.2977169214889326
r2 score (enet):   -0.321518995138536
# display using a stem plot
m,s,_ = plt.stem(
np.where(enet.coef_)[0],
enet.coef_[enet.coef_ != 0],
markerfmt='x',
label='enet coefficients',
use_line_collection=True)
plt.setp([m,s], color='#2ca02c')
m, s, _ = plt.stem(
np.where(lasso.coef_)[0],
lasso.coef_[lasso.coef_ != 0],
markerfmt='x',
label='Lasso coefficients',
use_line_collection=True)
plt.setp([m,s], color='#ff7f0e')
plt.stem(
np.where(coef)[0],
coef[coef != 0],
label='true coefficients',
markerfmt='bx',
use_line_collection=True)
plt.legend(loc='best')
plt.title("Lasso $R^2$: %.3f, Elastic Net $R^2$: %.3f" %
(r2_lasso, r2_enet))
plt.show()
import time
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
from sklearn import datasets
# example: numpy c_ (translate slice objs to concatenate on 2nd axis)
np.c_[np.array([1,2,3]), np.array([4,5,6])]
array([[1, 4],
       [2, 5],
       [3, 6]])
# small offset added to alphas to avoid taking log of zero on the log-scale axis
EPSILON=1e-4
X,y = datasets.load_diabetes(return_X_y=True)
print(X.shape)
rng = np.random.RandomState(42)
X = np.c_[X,rng.randn(X.shape[0],14)]
(442, 10)
# normalize data, as done by Lars, to allow comparisons
X /= np.sqrt(np.sum(X**2,axis=0))
print(X.shape)
(442, 24)
# LassoLarsIC: least angle regression with BIC/AIC criterion
bic = LassoLarsIC(criterion='bic')
aic = LassoLarsIC(criterion='aic')
bic.fit(X,y); bic_alpha = bic.alpha_
aic.fit(X,y); aic_alpha = aic.alpha_
def plot_criterion(model,name,color):
    # curve of the information criterion vs alpha
    plt.semilogx(
        model.alphas_ + EPSILON,
        model.criterion_,
        '--', color=color, linewidth=3, label='%s criterion' % name
    )
    # vertical line at the alpha selected by the criterion
    plt.axvline(
        model.alpha_ + EPSILON,
        color=color, linewidth=3, label='alpha: %s estimate' % name
    )
    plt.xlabel(r'$\alpha$')
    plt.ylabel('criterion')
plt.figure()
plot_criterion(aic,'AIC','b')
plot_criterion(bic,'BIC','r')
plt.legend()
plt.title('info criteria for model selection')
Text(0.5, 1.0, 'info criteria for model selection')
# LassoCV: coordinate descent
t1 = time.time()
model = LassoCV(cv=20).fit(X, y)
t_lasso_cv = time.time() - t1
plt.figure()
ymin, ymax = 2300, 3800
plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':')
plt.plot( model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', label='avg across folds', linewidth=2)
plt.axvline( model.alpha_ + EPSILON, linestyle='--', color='k', label='alpha: CV estimate')
plt.legend()
plt.xlabel(r'$\alpha$')
plt.ylabel('Mean square error')
plt.title('MSE on each fold: coordinate descent '
'(train time: %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)
(2300.0, 3800.0)
# LassoLarsCV: least angle regression
t1 = time.time()
model = LassoLarsCV(cv=20).fit(X, y)
t_lasso_lars_cv = time.time() - t1
plt.figure()
plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':')
plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', label='avg across folds', linewidth=2)
plt.axvline(model.alpha_, linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel(r'$\alpha$')
plt.ylabel('Mean square error')
plt.title('MSE on each fold: Lars (train time: %.2fs)'
% t_lasso_lars_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)
plt.show()
from sklearn.linear_model import MultiTaskLasso as MTL
from sklearn.linear_model import Lasso as L
# generate sine-based coefficients - random frequencies & phases
nsamps, nfeats, ntasks = 100,30,40
nrelevant_feats = 5
coefs = np.zeros((ntasks,nfeats))
times = np.linspace(0,2*np.pi,ntasks)
rng = np.random.RandomState(42)
for k in range(nrelevant_feats):
    coefs[:,k] = np.sin((1.0+rng.randn(1))*times+3*rng.randn(1))
X = rng.randn(nsamps,nfeats)
Y = np.dot(X,coefs.T) + rng.randn(nsamps,ntasks)
Lcoefs = np.array([L(alpha=0.5).fit(X,y).coef_ for y in Y.T])
MTLcoefs = MTL(alpha=1.0).fit(X,Y).coef_
fig = plt.figure()
plt.subplot(1,2,1); plt.spy(Lcoefs)
plt.xlabel('feature'); plt.ylabel('time or task')
plt.text(10,5,'lasso')
plt.subplot(1,2,2); plt.spy(MTLcoefs)
plt.xlabel('feature'); plt.ylabel('time or task')
plt.text(10,5,'MT lasso')
Text(10, 5, 'MT lasso')
featurenum=0
plt.figure()
plt.plot(coefs[:, featurenum],
linewidth=2, color='green', label='ground truth')
plt.plot(Lcoefs[:, featurenum],
linewidth=2, color='blue', label='lasso')
plt.plot(MTLcoefs[:, featurenum],
linewidth=2, color='gold', label='MT lasso')
plt.legend(loc='best')
plt.show()
The mix of L1 and L2 penalties in the Elastic-Net is controlled by the l1_ratio parameter.
from sklearn.linear_model import lasso_path as LP
from sklearn.linear_model import enet_path as EP
from sklearn import datasets
X,y = datasets.load_diabetes(return_X_y=True)
X /= X.std(axis=0) # standardize data (makes l1_ratio easier to set)
# compute Lasso & Enet paths via coordinate descent
alphas_l, coefs_l, _ = LP(X,y,eps=5e-3,fit_intercept=False)
alphas_posl, coefs_posl, _ = LP(X,y,eps=5e-3,fit_intercept=False,positive=True)
alphas_e, coefs_e, _ = EP(X,y,eps=5e-3,fit_intercept=False,l1_ratio=0.8)
alphas_pose, coefs_pose, _ = EP(X,y,eps=5e-3,fit_intercept=False,l1_ratio=0.8, positive=True)
# plot
import matplotlib.pyplot as plt
import numpy as np
from itertools import cycle
plt.figure()
colors = cycle(['b', 'r', 'g', 'c', 'k'])
neg_log_alphas_lasso = -np.log10(alphas_l)
neg_log_alphas_enet = -np.log10(alphas_e)
for coef_l, coef_e, c in zip(coefs_l, coefs_e, colors):
    l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
    l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)
plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Lasso and Elastic-Net Paths')
plt.legend((l1[-1], l2[-1]), ('Lasso', 'Elastic-Net'), loc='lower left')
<matplotlib.legend.Legend at 0x7f6b54925f10>
plt.figure()
neg_log_alphas_posl = -np.log10(alphas_posl)
for coef_l, coef_pl, c in zip(coefs_l, coefs_posl, colors):
    l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
    l2 = plt.plot(neg_log_alphas_posl, coef_pl, linestyle='--', c=c)
plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Lasso and positive Lasso')
plt.legend((l1[-1], l2[-1]), ('Lasso', 'positive Lasso'), loc='lower left')
The strength and mix of the penalties in MultiTaskElasticNet are controlled by the alpha and l1_ratio parameters.
# best practice: pass X & y as Fortran-contiguous numpy arrays
# to avoid unnecessary memory duplication
from sklearn import linear_model as LM
clf = LM.MultiTaskElasticNet(alpha=0.1)
clfcv = LM.MultiTaskElasticNetCV(cv=3)
X,y = [[0, 0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]
clf.fit(X,y); clfcv.fit(X,y)
print(clf.coef_, clf.intercept_)
print(clfcv.coef_, clfcv.intercept_)
[[0.45663524 0.45612256]
 [0.45663524 0.45612256]] [0.0872422 0.0872422]
[[0.52875032 0.46958558]
 [0.52875032 0.46958558]] [0.00166409 0.00166409]
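A minimal sketch of that best practice (np.asfortranarray is plain numpy; the toy X and y above are reused):
import numpy as np
Xf = np.asfortranarray(np.asarray(X, dtype=np.float64))
yf = np.asfortranarray(np.asarray(y, dtype=np.float64))
print(Xf.flags['F_CONTIGUOUS'])   # True: the coordinate-descent solver can use it without an internal copy
clf.fit(Xf, yf)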
reg = LM.Lars(n_nonzero_coefs=1)
reg.fit([[-1,1], [0, 0], [1, 1]],
[-1.1111, 0, -1.1111])
print(reg.coef_)
[ 0. -1.1111]
The coef_path_ attribute, of shape (n_features, max_features + 1), contains the coefficient path; its first column is always zero.
import numpy as np; import matplotlib.pyplot as plt
from sklearn import linear_model as LM
from sklearn import datasets as DATA
X,y = DATA.load_diabetes(return_X_y=True)
_,_,coefs = LM.lars_path(X,y,method='lasso',verbose=True)
xx = np.sum(np.abs(coefs.T),axis=1); xx /= xx[-1]
print(coefs.shape); print(xx.shape)
(10, 13)
(13,)
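The same path is also exposed as an attribute of a fitted estimator; a small sketch (using Lars here for illustration; the number of path points can differ from the lars_path call above):
path_est = LM.Lars().fit(X, y)
print(path_est.coef_path_.shape)   # (n_features, n_steps + 1); the first column is all zeros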
plt.plot(xx,coefs.T)
ymin,ymax = plt.ylim()
plt.vlines(xx,ymin,ymax,linestyle="dotted")
plt.xlabel('|coef| / max|coef|')
plt.ylabel('coeffs')
plt.title('LASSO path')
plt.show()