import numpy as np
from sklearn.impute import SimpleImputer as SI

# Univariate imputation: fit a mean-strategy imputer on a small training set
# in which NaN marks the missing entries.
train_rows = [[1, 2], [np.nan, 3], [7, 6]]
imp = SI(strategy='mean', missing_values=np.nan).fit(train_rows)

# Apply the fitted imputer to new rows that contain NaNs of their own.
X = [[np.nan, 2], [6, np.nan], [7, 6]]
filled = imp.transform(X)
print(filled)

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


import scipy.sparse as sp
# Sparse input: here -1 (not NaN) is the marker for a missing entry.
X = sp.csc_matrix([[1, 2], [0, -1], [8, 4]])
imp = SI(strategy='mean', missing_values=-1)
imp.fit(X)

# Transform a second sparse matrix and densify the result for printing.
X_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]])
dense_result = imp.transform(X_test).toarray()
print(dense_result)

[[3. 2.]
 [6. 3.]
 [7. 6.]]


import pandas as pd
# Categorical data: two columns of category dtype, each with one NaN.
category_rows = [
    ["a", "x"],
    [np.nan, "y"],
    ["a", np.nan],
    ["b", "y"],
]
df = pd.DataFrame(category_rows, dtype="category")

# Fit and transform in one step using the 'most_frequent' strategy.
imp = SI(strategy="most_frequent")
most_frequent_filled = imp.fit_transform(df)
print(most_frequent_filled)

[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]


import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer as II

# Multivariate imputation: in every complete training row the second
# feature is twice the first.
train_rows = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
imp = II(random_state=0, max_iter=10)
imp.fit(train_rows)

X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]

# the model learns that the second feature is double the first
imputed = imp.transform(X_test)
print(np.round(imputed))

[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score


N_SPLITS = 5
rng = np.random.RandomState(0)

# Load the data, then keep every tenth row: ~2k samples is enough for the
# purpose of the example. Remove the subsampling line below for a slower run
# with different error bars.
X_full, y_full = fetch_california_housing(return_X_y=True)
X_full, y_full = X_full[::10], y_full[::10]
n_samples, n_features = X_full.shape

# Baseline: cross-validated score on the entire dataset (no missing values).
br_estimator = BayesianRidge()
full_data_scores = cross_val_score(
    br_estimator, X_full, y_full,
    cv=N_SPLITS, scoring='neg_mean_squared_error',
)
score_full_data = pd.DataFrame(full_data_scores, columns=['Full Data'])

# Knock out exactly one value per row: row i loses the feature whose index
# is drawn for it (with replacement) from rng.
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, size=n_samples, replace=True)

X_missing = X_full.copy()  # copy so X_full keeps its complete values
X_missing[missing_samples, missing_features] = np.nan
y_missing = y_full  # targets are used unchanged

# Score a SimpleImputer -> BayesianRidge pipeline on the data with missing
# values, once per strategy; each strategy becomes one column of fold scores.
score_simple_imputer = pd.DataFrame()
for strategy in ('mean', 'median'):
    simple_imputer = SimpleImputer(strategy=strategy, missing_values=np.nan)
    estimator = make_pipeline(simple_imputer, br_estimator)
    fold_scores = cross_val_score(
        estimator, X_missing, y_missing,
        cv=N_SPLITS, scoring='neg_mean_squared_error',
    )
    score_simple_imputer[strategy] = fold_scores

# Regressors to plug into IterativeImputer as the per-feature estimator.
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15),
]

# Score an IterativeImputer -> BayesianRidge pipeline for each regressor;
# the column name is the regressor's class name.
score_iterative_imputer = pd.DataFrame()
for impute_estimator in estimators:
    column_name = type(impute_estimator).__name__
    iterative_imputer = IterativeImputer(
        estimator=impute_estimator, max_iter=1000, random_state=0,
    )
    estimator = make_pipeline(iterative_imputer, br_estimator)
    score_iterative_imputer[column_name] = cross_val_score(
        estimator, X_missing, y_missing,
        cv=N_SPLITS, scoring='neg_mean_squared_error',
    )

# Place the three score tables side by side; the keys become the outer level
# of the resulting column index.
score_frames = [score_full_data, score_simple_imputer, score_iterative_imputer]
group_labels = ['Original', 'SimpleImputer', 'IterativeImputer']
scores = pd.concat(score_frames, keys=group_labels, axis=1)

/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/impute/_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  warnings.warn("[IterativeImputer] Early stopping criterion not"
/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/impute/_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  warnings.warn("[IterativeImputer] Early stopping criterion not"
/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/impute/_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  warnings.warn("[IterativeImputer] Early stopping criterion not"
/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/impute/_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  warnings.warn("[IterativeImputer] Early stopping criterion not"
/home/bjpcjp/.local/lib/python3.8/site-packages/sklearn/impute/_iterative.py:685: ConvergenceWarning: [IterativeImputer] Early stopping criterion not reached.
  warnings.warn("[IterativeImputer] Early stopping criterion not"


# Scores are negated MSE, so flip the sign of the mean to plot plain MSE;
# the spread across folds is used as the error bar.
means = -scores.mean()
errors = scores.std()

fig, ax = plt.subplots(figsize=(13, 6))
means.plot.barh(xerr=errors, ax=ax)
ax.set_title('California Housing Regression with Different Imputation Methods')
ax.set_xlabel('MSE (smaller is better)')

# One tick per bar, labelled by joining the two levels of the column index.
tick_positions = np.arange(means.shape[0])
tick_labels = [" w/ ".join(label) for label in means.index.tolist()]
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_labels)
plt.tight_layout(pad=1)


import numpy as np
from sklearn.impute import KNNImputer
# Nearest-neighbours imputation configured with two neighbours and uniform
# weights; `nan` is just a shorter alias for np.nan in the literal below.
nan = np.nan
X = [
    [1, 2, nan],
    [3, 4, 3],
    [nan, 6, 5],
    [8, 8, 7],
]
imputer = KNNImputer(weights="uniform", n_neighbors=2)
imputer.fit_transform(X)

array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])


from sklearn.impute import MissingIndicator as MI
# -1 marks a missing entry; columns 0, 1 and 3 each contain at least one.
X = np.array([
    [-1, -1, 1, 3],
    [4, -1, 0, -1],
    [8, -1, 1, 0],
])

# Fit with the default `features` setting and display the boolean mask.
indicator = MI(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only

array([[ True,  True, False],
       [False,  True,  True],
       [False,  True, False]])


# Display which input feature indices the fitted indicator's mask columns
# correspond to.
indicator.features_

array([0, 1, 3])


# Same data, but with features="all" so the mask keeps one column per input
# feature, including those without any -1.
indicator = MI(features="all", missing_values=-1)
mask_all = indicator.fit_transform(X)
print(mask_all)
indicator.features_

[[ True  True False False]
 [False  True False  True]
 [False  True False False]]

array([0, 1, 2, 3])


from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Load iris and blank out roughly half of the entries at random.
# The mask is drawn from a locally seeded generator (not the global
# np.random state) so the example is reproducible, matching the fixed
# random_state used for the split below.
X, y = load_iris(return_X_y=True)
mask_rng = np.random.RandomState(0)
mask = mask_rng.randint(0, 2, size=X.shape).astype(bool)
X[mask] = np.nan

# Hold out 100 samples for testing; the test targets are not needed here.
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
                                               random_state=0)


# Concatenate the mean-imputed features with the missing-value indicator
# columns so a downstream model sees both.
union_steps = [
    ('features', SimpleImputer(strategy='mean')),
    ('indicators', MissingIndicator()),
]
transformer = FeatureUnion(transformer_list=union_steps)

transformer = transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
results.shape

(100, 8)


# Chain the feature union with a decision tree, fit on the training split,
# and predict on the held-out rows.
clf = make_pipeline(transformer, DecisionTreeClassifier())
clf.fit(X_train, y_train)
results = clf.predict(X_test)
results.shape

(100,)

Imputation of Missing Values

Univariate feature imputation

Multivariate Feature Imputation

Example: Iterative Imputing Variations

Multiple vs Single Imputation

Nearest Neighbors Imputation

Marking Imputed Values

Imputation of Missing Values

Univariate feature imputation

Multivariate Feature Imputation

Example: Iterative Imputing Variations

Multiple vs Single Imputation

Nearest Neighbors Imputation

Marking Imputed Values

Imputation of Missing Values

Univariate feature imputation

Multivariate Feature Imputation

Example: Iterative Imputing Variations

Nearest Neighbors Imputation

Marking Imputed Values