VarianceThreshold removes all features whose variance does not meet a threshold. Boolean features are Bernoulli variables with variance p(1 - p), so threshold = .8 * (1 - .8) removes any feature that has the same value in more than 80% of the samples:

from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)
array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])
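The fitted selector can report which columns survived; a quick check using its fitted attributes (variances_, get_support):

print(sel.variances_)    # [0.139 0.222 0.25] - the first column falls below the 0.16 cutoff
print(sel.get_support()) # [False  True  True]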
SelectKBest selects the best features based on a univariate statistical test. The scoring functions return:

- Regression tests (f_regression): F (array of F-scores for each feature) and pval (array of p-values for each feature)
- mutual_info_regression: mi (array of estimated mutual information between each feature and the target)
- Classification tests (chi2): chi2 (array of chi2 stats for each feature) and pval (array of p-values for each feature); a short chi2 sketch follows the example below

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split as TTS
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
# iris dataset with 20 uninformative noise features appended
X, y = load_iris(return_X_y=True)
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
X = np.hstack((X, E))
X_train, X_test, y_train, y_test = TTS(X, y, stratify=y, random_state=0)
X_indices = np.arange(X.shape[-1])
# Univariate feature selection with F-test for feature scoring
# Select the 4 most significant features
selector = SelectKBest(f_classif, k=4)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
clf_selected.fit(X_train, y_train)
svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()
svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()
print('Classification accuracy:')
print('Without selecting features: {:.3f}'.format(clf.score(X_test, y_test)))
print('After univariate feature selection: {:.3f}'.format(clf_selected.score(X_test, y_test)))
plt.figure(figsize=(8, 4))
plt.bar(X_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)')
plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight')
plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2, label='SVM weights after selection')
plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()
Classification accuracy:
Without selecting features: 0.789
After univariate feature selection: 0.868
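The chi2 scorer mentioned earlier works the same way for classification with non-negative features; a minimal sketch on the raw iris data:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2
X_iris, y_iris = load_iris(return_X_y=True)
X_kbest = SelectKBest(chi2, k=2).fit_transform(X_iris, y_iris)
print(X_kbest.shape)  # (150, 2) - the two best features by chi-squared score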
This example compares univariate F-test statistics and mutual information: the F-test captures only linear dependency, while mutual information can capture any kind of dependency between variables.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, mutual_info_regression
np.random.seed(0)
X = np.random.rand(1000, 3)
y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000)
f_test, _ = f_regression(X, y)
f_test /= np.max(f_test)
mi = mutual_info_regression(X, y)
mi /= np.max(mi)
plt.figure(figsize=(15, 5))
for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.scatter(X[:, i], y, edgecolor='black', s=20)
    plt.xlabel("$x_{}$".format(i + 1), fontsize=14)
    if i == 0:
        plt.ylabel("$y$", fontsize=14)
    plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), fontsize=16)
plt.show()
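Here y depends on x1 linearly and on x2 nonlinearly, while x3 is irrelevant: the F-test rates x1 as the most discriminative feature, mutual information rates x2 highest, and both assign x3 a score near zero.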
# example - show pixel relevance in digit classification
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
digits = load_digits()
X = digits.images.reshape((len(digits.images), -1))
y = digits.target
# Create the RFE object and rank each pixel
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_.reshape(digits.images[0].shape)
# Plot pixel ranking
plt.matshow(ranking, cmap=plt.cm.Blues)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
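Beyond the ranking, the fitted RFE object exposes support_ (a boolean mask of the retained features) and transform() to reduce the data; a short sketch using the objects above:

print(rfe.support_.sum())  # 1, since n_features_to_select=1
X_top_pixel = rfe.transform(X)
print(X_top_pixel.shape)   # (1797, 1) - only the top-ranked pixel remains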
SelectFromModel removes features whose importance (coef_ or feature_importances_) falls below a given threshold value. Use max_features to set a limit on the number of selected features. With L1-regularized models, sparsity is controlled by the regularization strength: alpha for Lasso (higher alpha = fewer features) and C for LinearSVC/LogisticRegression (smaller C = fewer features).

from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
X, y = load_iris(return_X_y=True)
print(X.shape)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
print(model.transform(X).shape)
(150, 4)
(150, 3)
(Liblinear may emit a ConvergenceWarning here; increase max_iter if it does.)
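For regression models, alpha plays the role that C plays above; a minimal sketch on the diabetes data (alpha=0.1 is an arbitrary illustrative value):

from sklearn.linear_model import Lasso
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
X_d, y_d = load_diabetes(return_X_y=True)
lasso = Lasso(alpha=0.1).fit(X_d, y_d)   # higher alpha -> more coefficients driven to zero
model = SelectFromModel(lasso, prefit=True)
print(model.transform(X_d).shape)        # fewer columns survive as alpha grows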
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
X, y = load_iris(return_X_y=True)
print(X.shape)
clf = ETC(n_estimators=50)
clf = clf.fit(X, y)
print(clf.feature_importances_)
model = SelectFromModel(clf, prefit=True)
print(model.transform(X).shape)
(150, 4)
[0.08051372 0.05626768 0.41688445 0.44633415]
(150, 2)
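max_features (mentioned above) caps the number of selected features directly; pairing it with threshold=-np.inf keeps exactly the top-k features by importance. A short sketch reusing the forest fitted above:

import numpy as np
model = SelectFromModel(clf, prefit=True, max_features=2, threshold=-np.inf)
print(model.transform(X).shape)  # (150, 2) - the two highest-importance features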
SequentialFeatureSelector (SFS) greedily adds or removes features one at a time: n_features_to_select sets how many features to keep, and direction ('forward' or 'backward') controls the algorithm flow.

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
To select the two most important features with SelectFromModel, set threshold just above the coefficient of the 3rd most important feature. First fit LassoCV and plot the coefficient magnitudes:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LassoCV
lasso = LassoCV().fit(X, y)
importance = np.abs(lasso.coef_)
feature_names = np.array(diabetes.feature_names)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.show()
from sklearn.feature_selection import SelectFromModel
from time import time
threshold = np.sort(importance)[-3] + 0.01
tic = time()
sfm = SelectFromModel(lasso, threshold=threshold).fit(X, y)
toc = time()
print("Features selected by SelectFromModel: "
f"{feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")
Features selected by SelectFromModel: ['s1' 's5']
Done in 0.075s
from sklearn.feature_selection import SequentialFeatureSelector as SFS
tic_fwd = time()
sfs_forward = SFS(lasso, n_features_to_select=2, direction='forward').fit(X, y)
toc_fwd = time()
tic_bwd = time()
sfs_backward = SFS(lasso, n_features_to_select=2, direction='backward').fit(X, y)
toc_bwd = time()
print("Features (forward SS): " f"{feature_names[sfs_forward.get_support()]}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print("Features (backward SS): " f"{feature_names[sfs_backward.get_support()]}")
print(f"Done in {toc_bwd - tic_bwd:.3f}s")
Features (forward SS): ['bmi' 's5']
Done in 3.092s
Features (backward SS): ['bmi' 's5']
Done in 8.810s
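Both directions agree here ('bmi', 's5') but differ from SelectFromModel's pick ('s1', 's5'), and both are far slower: SFS evaluates many candidate models with internal cross-validation at each step, while SelectFromModel needs a single fit. Forward selection is faster in this case because it stops after growing to 2 features; backward selection must shrink down from all 10.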
Feature selection is usually a pre-processing step before the actual learning; the recommended way to chain it with a classifier is a Pipeline:

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC())),
    ('classification', RandomForestClassifier())
])
clf.fit(X_train, y_train)
Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LinearSVC())), ('classification', RandomForestClassifier())])
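With the pipeline fit on the training split, it can be scored on the held-out data (the exact value is not reproduced here):

print(clf.score(X_test, y_test))  # held-out accuracy of selection + classification together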