Many real-world datasets contain missing values, often encoded as blanks, NaNs, or other placeholders. Such values are incompatible with scikit-learn estimators, which assume all values in an array are numerical and each has a meaning.
You could simply discard rows or columns containing missing values, but that comes at the price of losing possibly valuable data. A better strategy is to impute the missing values, i.e., infer them from the known part of the data.
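A minimal sketch contrasting the two strategies (the tiny array here is made up for illustration):

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])

# Option 1: discard every row containing a missing value (loses a sample)
X_dropped = X[~np.isnan(X).any(axis=1)]

# Option 2: infer the missing entry from the known values instead
X_imputed = SimpleImputer(strategy='mean').fit_transform(X)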
import numpy as np
from sklearn.impute import SimpleImputer as SI

imp = SI(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])

X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))
[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]
import scipy.sparse as sp

X = sp.csc_matrix([[1, 2], [0, -1], [8, 4]])
imp = SI(missing_values=-1, strategy='mean').fit(X)

X_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]])
print(imp.transform(X_test).toarray())
[[3. 2.]
 [6. 3.]
 [7. 6.]]
import pandas as pd

df = pd.DataFrame([["a", "x"],
                   [np.nan, "y"],
                   ["a", np.nan],
                   ["b", "y"]], dtype="category")

imp = SI(strategy="most_frequent")
print(imp.fit_transform(df))
[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]
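SimpleImputer also supports a 'constant' strategy; a minimal sketch that replaces every missing entry with a fixed fill_value:

import numpy as np
from sklearn.impute import SimpleImputer as SI

X = [[np.nan, 2], [6, np.nan], [7, 6]]
# every missing entry is replaced by fill_value
imp = SI(strategy='constant', fill_value=0)
print(imp.fit_transform(X))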
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer as II

imp = II(max_iter=10, random_state=0).fit(
    [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])

X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
# the model learns that the second feature is double the first
print(np.round(imp.transform(X_test)))
[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]
Goal: compare imputation estimators on the California Housing dataset, with a single value randomly removed from each row, to see which works best.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
N_SPLITS = 5

rng = np.random.RandomState(0)

X_full, y_full = fetch_california_housing(return_X_y=True)
# ~2k samples is enough for the purpose of the example.
# Remove the following two lines for a slower run with different error bars.
X_full = X_full[::10]
y_full = y_full[::10]
n_samples, n_features = X_full.shape

# Estimate score on entire dataset (no missing values)
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, X_full, y_full,
        scoring='neg_mean_squared_error', cv=N_SPLITS
    ),
    columns=['Full Data']
)

# Add a single missing value to each row
X_missing = X_full.copy()
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan

# Estimate score after imputation (mean and median strategies)
score_simple_imputer = pd.DataFrame()
for strategy in ('mean', 'median'):
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy),
        br_estimator
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator, X_missing, y_missing,
        scoring='neg_mean_squared_error', cv=N_SPLITS
    )

# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]
score_iterative_imputer = pd.DataFrame()
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator,
                         max_iter=1000),
        br_estimator
    )
    score_iterative_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(
            estimator, X_missing, y_missing,
            scoring='neg_mean_squared_error', cv=N_SPLITS
        )

scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1
)
/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/impute/_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  warnings.warn("[IterativeImputer] Early stopping criterion not"
(warning repeated once per cross-validation fold)
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
errors = scores.std()
means.plot.barh(xerr=errors, ax=ax)
ax.set_title('California Housing Regression with Different Imputation Methods')
ax.set_xlabel('MSE (smaller is better)')
ax.set_yticks(np.arange(means.shape[0]))
ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
plt.tight_layout(pad=1)
A best practice in the statistics literature is to generate $m$ separate imputations for a single feature matrix, run each imputed dataset through the full analysis pipeline, and use the $m$ results to understand how much the conclusions vary due to the uncertainty caused by the missing values. This is called multiple imputation.
IterativeImputer was inspired by the R MICE package (Multivariate Imputation by Chained Equations), but differs from it by returning a single imputation instead of multiple imputations.
IterativeImputer can also be used for multiple imputations by applying it repeatedly to the same dataset with different random seeds when sample_posterior=True. Note that the transform method is not allowed to change the number of samples, so multiple imputations cannot be produced by a single call to transform.
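A minimal sketch of that recipe, assuming the default BayesianRidge estimator (which supports posterior sampling):

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

X = np.array([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])

# sample_posterior=True draws each imputation from the predictive
# posterior rather than using the mean prediction, so re-running with
# different seeds yields m distinct completed datasets.
m = 5
imputations = [
    IterativeImputer(sample_posterior=True, random_state=seed).fit_transform(X)
    for seed in range(m)
]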
KNNImputer fills in missing values using the k-nearest-neighbors approach. nan_euclidean_distances (a Euclidean distance metric that supports missing values) is used to find the nearest neighbors. Each missing feature is imputed using values from the n_neighbors nearest neighbors that have a value for that feature. The features of the neighbors are averaged uniformly or weighted by distance to each neighbor.
If a sample has more than one missing feature, the neighbors for that sample can be different depending on the feature being imputed. When the number of available neighbors is less than n_neighbors and there are no defined distances to the training set, the training set average for that feature is used during imputation.
If there is at least one neighbor with a defined distance, the weighted or unweighted average of the remaining neighbors will be used during imputation. If a feature is always missing in training, it is removed during transform.
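The distance metric can also be inspected directly; a short sketch using sklearn.metrics.pairwise.nan_euclidean_distances:

import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
# distances are computed over the coordinates present in both samples,
# then rescaled to compensate for the missing ones
print(nan_euclidean_distances(X, X))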
import numpy as np
from sklearn.impute import KNNImputer

nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(X)
array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])
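To weight neighbors by distance instead of averaging them uniformly, a variant of the same sketch:

import numpy as np
from sklearn.impute import KNNImputer

nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
# closer neighbors now contribute more to each imputed value
KNNImputer(n_neighbors=2, weights="distance").fit_transform(X)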
MissingIndicator transforms a dataset into a binary matrix which indicates the presence of missing values.
SimpleImputer and IterativeImputer have an add_indicator option (False by default) which, when set to True, stacks the missing-value indicator matrix onto the imputer's output.
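A minimal sketch of add_indicator on SimpleImputer:

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])
# output = imputed features, followed by one boolean indicator column
# for each feature that had missing values at fit time
imp = SimpleImputer(strategy='mean', add_indicator=True)
print(imp.fit_transform(X))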
NaN is the usual placeholder for missing values; however, missing_values accepts other placeholders as well, such as an integer.
from sklearn.impute import MissingIndicator as MI

X = np.array([[-1, -1, 1, 3],
              [4, -1, 0, -1],
              [8, -1, 1, 0]])
indicator = MI(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only
array([[ True,  True, False],
       [False,  True,  True],
       [False,  True, False]])
The features parameter chooses the features for building the mask. missing-only, the default, builds the mask only for features that contain missing values at fit time:

indicator.features_
array([0, 1, 3])
indicator = MI(missing_values=-1, features="all")
mask_all = indicator.fit_transform(X)
print(mask_all)
indicator.features_
[[ True  True False False]
 [False  True False  True]
 [False  True False False]]
array([0, 1, 2, 3])
When using MissingIndicator in a pipeline, be sure to use FeatureUnion or ColumnTransformer to add the indicator features to the regular features.
Below: first load the iris dataset and add missing values to it.
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
mask = np.random.randint(0, 2, size=X.shape).astype(bool)
X[mask] = np.nan
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
                                               random_state=0)
Next, build a FeatureUnion that imputes all features with SimpleImputer and adds the indicator variables from MissingIndicator.
transformer = FeatureUnion(
    transformer_list=[
        ('features', SimpleImputer(strategy='mean')),
        ('indicators', MissingIndicator())])
transformer = transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
results.shape
clf = make_pipeline(transformer, DecisionTreeClassifier())
clf = clf.fit(X_train, y_train)
results = clf.predict(X_test)
results.shape