Many real-world datasets contain missing values, often encoded as blanks, NaNs, or other placeholders. These are incompatible with scikit-learn estimators, which assume all values in an array are numerical and meaningful.
You could discard rows or columns with missing values, but that comes at the price of losing possibly valuable data. A better strategy is to impute the missing values, i.e. infer them from the known part of the data.
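A minimal sketch of that trade-off (not from the original text), assuming the data lives in a pandas DataFrame with NaN placeholders:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.DataFrame({"a": [1.0, np.nan, 7.0], "b": [2.0, 3.0, np.nan]})
# Discarding loses data: only one complete row survives dropna() here.
print(df.dropna())
# Imputing keeps every row; the column mean fills the gaps.
print(SimpleImputer(strategy="mean").fit_transform(df))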
SimpleImputer
does univariate imputation (it imputes values in a given dimension using only the non-missing values in that dimension).
Below: replace np.nan with the mean of the columns (axis 0).
import numpy as np
from sklearn.impute import SimpleImputer as SI
imp = SI(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))
[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]
import scipy.sparse as sp
X = sp.csc_matrix([[1, 2], [0, -1], [8, 4]])
imp = SI(missing_values=-1, strategy='mean').fit(X)
X_test = sp.csc_matrix([[-1, 2],
                        [6, -1],
                        [7, 6]])
print(imp.transform(X_test).toarray())
[[3. 2.]
 [6. 3.]
 [7. 6.]]
SimpleImputer also supports categorical data (strings or pandas categoricals) when using the most_frequent or constant strategies.
import pandas as pd
df = pd.DataFrame([["a", "x"],
[np.nan, "y"],
["a", np.nan],
["b", "y"]], dtype="category")
imp = SI(strategy="most_frequent")
print(imp.fit_transform(df))
[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]
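The constant strategy mentioned above fills every missing entry with a fixed fill_value instead of a statistic; a minimal sketch (illustrative values, not from the original text):
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.DataFrame([["a", "x"],
                   [np.nan, "y"],
                   ["b", np.nan]], dtype=object)
# Every np.nan becomes the string "missing".
print(SimpleImputer(strategy="constant", fill_value="missing").fit_transform(df))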
IterativeImputer
models each feature's missing values as a function of the other features, repeating the fits in round-robin fashion for max_iter
iterations.
This estimator is considered experimental; it must be enabled explicitly by importing enable_iterative_imputer.
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer as II
imp = II(max_iter=10, random_state=0).fit([[1, 2], [3, 6], [4, 8],
                                           [np.nan, 3], [7, np.nan]])
X_test = [[np.nan, 2],
          [6, np.nan],
          [np.nan, 6]]
# the model learns that the second feature is double the first
print(np.round(imp.transform(X_test)))
[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]
Goal: compare imputation strategies on the California Housing dataset, with a single value randomly removed from each row, and see which yields the best downstream regression score.
Estimator options for IterativeImputer: BayesianRidge, DecisionTreeRegressor, ExtraTreesRegressor, KNeighborsRegressor.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
N_SPLITS = 5
rng = np.random.RandomState(0)
X_full, y_full = fetch_california_housing(return_X_y=True)
# ~2k samples is enough for the purpose of the example.
# Remove the following two lines for a slower run with different error bars.
X_full = X_full[::10]
y_full = y_full[::10]
n_samples, n_features = X_full.shape
# Estimate score on entire dataset (no missing values)
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(br_estimator, X_full, y_full,
                    scoring='neg_mean_squared_error', cv=N_SPLITS),
    columns=['Full Data']
)
# Add a single missing value to each row
X_missing = X_full.copy()
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan
# Estimate score after imputation (mean and median strategies)
score_simple_imputer = pd.DataFrame()
for strategy in ('mean', 'median'):
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy),
        br_estimator
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator, X_missing, y_missing,
        scoring='neg_mean_squared_error', cv=N_SPLITS
    )
# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]
score_iterative_imputer = pd.DataFrame()
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator,
                         max_iter=1000),
        br_estimator
    )
    score_iterative_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(estimator, X_missing, y_missing,
                        scoring='neg_mean_squared_error', cv=N_SPLITS)
scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1
)
Running this emits repeated ConvergenceWarnings from sklearn/impute/_iterative.py: "[IterativeImputer] Early stopping criterion not reached."
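One way to quiet the warning (a sketch, not part of the original example) is to loosen the stopping tolerance so the round-robin loop can satisfy its early-stopping criterion before max_iter runs out; IterativeImputer exposes this as tol:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Hypothetical tweak: tol defaults to 1e-3; a looser value can let the
# round-robin iterations stop earlier, at the cost of a rougher imputation.
imputer = IterativeImputer(random_state=0, max_iter=1000, tol=1e-2)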
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
errors = scores.std()
means.plot.barh(xerr=errors, ax=ax)
ax.set_title('California Housing Regression with Different Imputation Methods')
ax.set_xlabel('MSE (smaller is better)')
ax.set_yticks(np.arange(means.shape[0]))
ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
plt.tight_layout(pad=1)
A best practice is to generate $m$ separate imputations for a single feature matrix, run each through the analysis pipeline, and compare the $m$ results to see how much they vary because of the uncertainty caused by the missing values. This is called multiple imputation.
IterativeImputer
is based on the R MICE package (Multivariate Imputation by Chained Equations), but returns a single imputation instead of multiple imputations. IterativeImputer
can also be used for multiple imputations by applying it repeatedly to the same dataset with different random seeds when sample_posterior=True
.
Note: a single call to IterativeImputer's transform method is not allowed to change the number of samples, so multiple imputations cannot be achieved by one call to transform.
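A minimal sketch of that repeated-application idea (toy data; BayesianRidge, the default estimator, supports posterior sampling):
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

X = np.array([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])
# One imputation per seed; the spread across them reflects the
# uncertainty introduced by the missing values.
imputations = [
    IterativeImputer(sample_posterior=True, random_state=seed).fit_transform(X)
    for seed in range(5)
]
print(np.std(imputations, axis=0))  # nonzero only where values were missing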
KNNImputer
imputes missing values using the k-nearest-neighbors approach. nan_euclidean_distances
(a Euclidean distance metric that supports missing values) is used to find the nearest neighbors. Each missing feature is imputed using values from the n_neighbors
nearest neighbors that have a value for that feature.
The features of the neighbors are averaged uniformly or weighted by distance to each neighbor.
If a sample has more than one missing feature, the neighbors for that sample can be different depending on the feature being imputed. When the number of available neighbors is less than n_neighbors and there are no defined distances to the training set, the training set average for that feature is used during imputation.
If there is at least one neighbor with a defined distance, the weighted or unweighted average of the remaining neighbors will be used during imputation. If a feature is always missing in training, it is removed during transform.
import numpy as np
from sklearn.impute import KNNImputer
nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(X)
array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])
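To weight neighbors by distance instead of uniformly (the other option mentioned above), the same example can be rerun with weights="distance"; a minimal sketch:
import numpy as np
from sklearn.impute import KNNImputer

nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
# Closer neighbors now contribute more to each imputed value.
print(KNNImputer(n_neighbors=2, weights="distance").fit_transform(X))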
MissingIndicator
transforms a dataset into a binary matrix which indicates the presence of missing values.
SimpleImputer
and IterativeImputer
have an add_indicator
option (False by default) which, when True, stacks the missing-indicator matrix onto the imputer's output.
np.nan is the usual placeholder value, but missing_values
also accepts other values, such as an integer.
from sklearn.impute import MissingIndicator as MI
X = np.array([[-1, -1, 1, 3],
              [4, -1, 0, -1],
              [8, -1, 1, 0]])
indicator = MI(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only
array([[ True,  True, False],
       [False,  True,  True],
       [False,  True, False]])
The features
parameter chooses which features to build the mask for; missing-only
is the default setting.
indicator.features_
array([0, 1, 3])
indicator = MI(missing_values=-1, features="all")
mask_all = indicator.fit_transform(X)
print(mask_all)
indicator.features_
[[ True True False False] [False True False True] [False True False False]]
array([0, 1, 2, 3])
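The add_indicator option mentioned earlier gives a similar stacked output without a separate MissingIndicator step; a minimal sketch with SimpleImputer (illustrative data, not from the original text):
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[np.nan, 1.0], [2.0, np.nan], [3.0, 4.0]])
# Output: the imputed columns, followed by one indicator column per
# feature that had missing values during fit.
print(SimpleImputer(strategy="mean", add_indicator=True).fit_transform(X))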
When using MissingIndicator
in a pipeline, be sure to use FeatureUnion
or ColumnTransformer
to add the indicators to the regular features.
Below: first load the iris dataset and add missing values.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier
X, y = load_iris(return_X_y=True)
mask = np.random.randint(0, 2, size=X.shape).astype(bool)
X[mask] = np.nan
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
random_state=0)
Now build a FeatureUnion
that imputes all features with SimpleImputer and adds the indicator variables from MissingIndicator
:
transformer = FeatureUnion(
    transformer_list=[
        ('features', SimpleImputer(strategy='mean')),
        ('indicators', MissingIndicator())])
transformer = transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
results.shape
(100, 8)
clf = make_pipeline(transformer,
DecisionTreeClassifier()).fit(X_train,
y_train)
results = clf.predict(X_test)
results.shape
(100,)
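The same stacking can also be expressed with ColumnTransformer, the other option mentioned above; a sketch assuming all four iris columns should be imputed and flagged:
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[
        ('features', SimpleImputer(strategy='mean'), [0, 1, 2, 3]),
        ('indicators', MissingIndicator(), [0, 1, 2, 3])])
# One indicator column per feature that had missing values in X_train,
# so the output width should match the FeatureUnion result above.
print(transformer.fit_transform(X_train).shape)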