VarianceThreshold removes all features whose variance does not meet a threshold. Boolean features are Bernoulli variables with variance p(1 - p), so threshold = .8 * (1 - .8) removes any feature that has the same value in more than 80% of the samples:

from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)
array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])
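The fitted selector can report which columns survived; a quick check using its fitted attributes (variances_, get_support):

print(sel.variances_)    # [0.139 0.222 0.25] - the first column falls below the 0.16 cutoff
print(sel.get_support()) # [False  True  True]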
SelectKBest selects the best features based on a univariate statistical test. The scoring functions return:

- Regression tests (f_regression): F (array of F-scores for each feature) and pval (array of p-values for each feature)
- mutual_info_regression: mi (array of estimated mutual information between each feature and the target)
- Classification tests (chi2): chi2 (array of chi2 stats for each feature) and pval (array of p-values for each feature); a short chi2 sketch follows the example below

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split as TTS
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
# iris dataset with 20 uninformative noise features appended
X, y = load_iris(return_X_y=True)
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))
X = np.hstack((X, E))
X_train, X_test, y_train, y_test = TTS(X, y, stratify=y, random_state=0)
X_indices = np.arange(X.shape[-1])
# Univariate feature selection with F-test for feature scoring
# Select the 4 most significant features
selector = SelectKBest(f_classif, k=4)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf_selected = make_pipeline(SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
clf_selected.fit(X_train, y_train)
svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()
svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
svm_weights_selected /= svm_weights_selected.sum()
print('Classification accuracy:')
print('Without selecting features: {:.3f}'.format(clf.score(X_test, y_test)))
print('After univariate feature selection: {:.3f}'.format(clf_selected.score(X_test, y_test)))
plt.figure(figsize=(8, 4))
plt.bar(X_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)')
plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight')
plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2, label='SVM weights after selection')
plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()
Classification accuracy:
Without selecting features: 0.789
After univariate feature selection: 0.868
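The chi2 scorer mentioned earlier works the same way for classification with non-negative features; a minimal sketch on the raw iris data:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2
X_iris, y_iris = load_iris(return_X_y=True)
X_kbest = SelectKBest(chi2, k=2).fit_transform(X_iris, y_iris)
print(X_kbest.shape)  # (150, 2) - the two best features by chi-squared score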
This example compares univariate F-test statistics and mutual information: the F-test captures only linear dependency, while mutual information can capture any kind of dependency between variables.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_regression, mutual_info_regression
np.random.seed(0)
X = np.random.rand(1000, 3)
y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000)
f_test, _ = f_regression(X, y)
f_test /= np.max(f_test)
mi = mutual_info_regression(X, y)
mi /= np.max(mi)
plt.figure(figsize=(15, 5))
for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.scatter(X[:, i], y, edgecolor='black', s=20)
    plt.xlabel("$x_{}$".format(i + 1), fontsize=14)
    if i == 0:
        plt.ylabel("$y$", fontsize=14)
    plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), fontsize=16)
plt.show()
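Here y depends on x1 linearly and on x2 nonlinearly, while x3 is irrelevant: the F-test rates x1 as the most discriminative feature, mutual information rates x2 highest, and both assign x3 a score near zero.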
# example - show pixel relevance in digit classification
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
digits = load_digits()
X = digits.images.reshape((len(digits.images), -1))
y = digits.target
# Create the RFE object and rank each pixel
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_.reshape(digits.images[0].shape)
# Plot pixel ranking
plt.matshow(ranking, cmap=plt.cm.Blues)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
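Beyond the ranking, the fitted RFE object exposes support_ (a boolean mask of the retained features) and transform() to reduce the data; a short sketch using the objects above:

print(rfe.support_.sum())  # 1, since n_features_to_select=1
X_top_pixel = rfe.transform(X)
print(X_top_pixel.shape)   # (1797, 1) - only the top-ranked pixel remains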
SelectFromModel removes features whose importance (coef_ or feature_importances_) falls below a given threshold value. Use max_features to set a limit on the number of selected features. With L1-regularized models, sparsity is controlled by the regularization strength: alpha for Lasso (higher alpha = fewer features) and C for LinearSVC/LogisticRegression (smaller C = fewer features).

from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
X, y = load_iris(return_X_y=True)
print(X.shape)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
print(model.transform(X).shape)
(150, 4)
(150, 3)
(Liblinear may emit a ConvergenceWarning here; increase max_iter if it does.)
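For regression models, alpha plays the role that C plays above; a minimal sketch on the diabetes data (alpha=0.1 is an arbitrary illustrative value):

from sklearn.linear_model import Lasso
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
X_d, y_d = load_diabetes(return_X_y=True)
lasso = Lasso(alpha=0.1).fit(X_d, y_d)   # higher alpha -> more coefficients driven to zero
model = SelectFromModel(lasso, prefit=True)
print(model.transform(X_d).shape)        # fewer columns survive as alpha grows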
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
X, y = load_iris(return_X_y=True)
print(X.shape)
clf = ETC(n_estimators=50)
clf = clf.fit(X, y)
print(clf.feature_importances_)
model = SelectFromModel(clf, prefit=True)
print(model.transform(X).shape)
(150, 4)
[0.08051372 0.05626768 0.41688445 0.44633415]
(150, 2)
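max_features (mentioned above) caps the number of selected features directly; pairing it with threshold=-np.inf keeps exactly the top-k features by importance. A short sketch reusing the forest fitted above:

import numpy as np
model = SelectFromModel(clf, prefit=True, max_features=2, threshold=-np.inf)
print(model.transform(X).shape)  # (150, 2) - the two highest-importance features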
SequentialFeatureSelector (SFS) greedily adds or removes features one at a time: n_features_to_select sets how many features to keep, and direction ('forward' or 'backward') controls the algorithm flow.

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
To select the two most important features with SelectFromModel, set threshold just above the coefficient of the 3rd most important feature. First fit LassoCV and plot the coefficient magnitudes:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LassoCV
lasso = LassoCV().fit(X, y)
importance = np.abs(lasso.coef_)
feature_names = np.array(diabetes.feature_names)
plt.bar(height=importance, x=feature_names)
plt.title("Feature importances via coefficients")
plt.show()
from sklearn.feature_selection import SelectFromModel
from time import time
threshold = np.sort(importance)[-3] + 0.01
tic = time()
sfm = SelectFromModel(lasso, threshold=threshold).fit(X, y)
toc = time()
print("Features selected by SelectFromModel: "
f"{feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")
Features selected by SelectFromModel: ['s1' 's5']
Done in 0.075s
from sklearn.feature_selection import SequentialFeatureSelector as SFS
tic_fwd = time()
sfs_forward = SFS(lasso, n_features_to_select=2, direction='forward').fit(X, y)
toc_fwd = time()
tic_bwd = time()
sfs_backward = SFS(lasso, n_features_to_select=2, direction='backward').fit(X, y)
toc_bwd = time()
print("Features (forward SS): " f"{feature_names[sfs_forward.get_support()]}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print("Features (backward SS): " f"{feature_names[sfs_backward.get_support()]}")
print(f"Done in {toc_bwd - tic_bwd:.3f}s")
Features (forward SS): ['bmi' 's5']
Done in 3.092s
Features (backward SS): ['bmi' 's5']
Done in 8.810s
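Both directions agree here ('bmi', 's5') but differ from SelectFromModel's pick ('s1', 's5'), and both are far slower: SFS evaluates many candidate models with internal cross-validation at each step, while SelectFromModel needs a single fit. Forward selection is faster in this case because it stops after growing to 2 features; backward selection must shrink down from all 10.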
Feature selection is usually a pre-processing step before the actual learning; the recommended way to chain it with a classifier is a Pipeline:

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC())),
    ('classification', RandomForestClassifier())
])
clf.fit(X_train, y_train)
Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LinearSVC())), ('classification', RandomForestClassifier())])
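With the pipeline fit on the training split, it can be scored on the held-out data (the exact value is not reproduced here):

print(clf.score(X_test, y_test))  # held-out accuracy of selection + classification together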