Learning the parameters of a prediction function and testing it on the same data is a methodological mistake: a model would just repeat the labels of the samples that it has just seen - but fail to predict anything useful on yet-unseen data. This is called overfitting.
When performing a (supervised) machine learning experiment, hold out part of the available data as a test set (X_test, y_test). The best hyperparameters can then be determined by grid-search techniques.
Use the [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split)
helper function to quickly split a dataset into training & test subsets:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
X, y = datasets.load_iris(return_X_y=True)
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.4, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print(clf.score(X_test, y_test))
(150, 4) (150,)
(90, 4) (90,)
(60, 4) (60,)
0.9666666666666667
There is still a risk of overfitting on the test set because the parameters can be tweaked until the estimator performs optimally. Knowledge about the test set can “leak” into the model.
Yet another part of the dataset can be held out as a "validation set" to avoid this problem: training proceeds on the training set, preliminary evaluation is done on the validation set, and when the experiment seems successful, final evaluation is done on the test set.
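One way to carve out such a validation set is to call train_test_split twice. This is a minimal sketch (the 60/20/20 proportions are an arbitrary choice, not prescribed by scikit-learn):

```python
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.load_iris(return_X_y=True)

# Hold out the final test set first (20% of the data)...
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
# ...then split the remainder into training and validation sets.
# 0.25 of the remaining 80% = 20% of the original data.
X_train, X_val, y_train, y_val = train_test_split(
    X_tmp, y_tmp, test_size=0.25, random_state=0)

print(X_train.shape, X_val.shape, X_test.shape)  # (90, 4) (30, 4) (30, 4)
```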
Partitioning the data like this drastically reduces the number of samples available for learning, which cross-validation ("CV") solves. A test set is still held out for final evaluation, but the validation set is no longer needed. The basic approach (k-fold CV) splits the training data into k subsets. For each of the k "folds", a model is trained using the other k-1 folds as training data and validated on the remaining fold; the reported performance is the average of the k fold scores.
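The k-fold procedure can be sketched by hand with KFold, a minimal illustration of what cross_val_score automates:

```python
import numpy as np
from sklearn import datasets, svm
from sklearn.model_selection import KFold

X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC(kernel='linear', C=1)

# For each fold: fit on the other k-1 folds, score on the held-out fold.
fold_scores = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True,
                                 random_state=0).split(X):
    clf.fit(X[train_idx], y[train_idx])
    fold_scores.append(clf.score(X[test_idx], y[test_idx]))

# The reported performance is the average of the k fold scores.
print(np.mean(fold_scores))
```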
The simplest way to do CV is to call cross_val_score on the estimator object & dataset. The cv argument determines the number of "folds". When cv is an integer, cross_val_score uses KFold, or StratifiedKFold if the estimator is a classifier.
# split, fit, cv with 5-fold scoring
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
print("scores:\t", scores)
print("accuracy:\t %0.2f" % (scores.mean()))
print("st dev: \t %0.2f" % (scores.std()))
scores: [0.96666667 1. 0.96666667 0.96666667 1.]
accuracy: 0.98
st dev: 0.02
Use the scoring parameter to specify an alternative scoring method. The available options are listed in scikit-learn's User Guide.
In this case (using the Iris dataset), the accuracy (default) and F1-score metrics are nearly equal. This is partially due to the Iris samples being balanced across target classes.
from sklearn import metrics
print(cross_val_score(clf, X, y, cv=5, scoring='f1_macro'))
[0.96658312 1. 0.96658312 0.96658312 1. ]
from sklearn.model_selection import ShuffleSplit
n_samples = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
print(cross_val_score(clf, X, y, cv=cv))
[0.97777778 0.97777778 1. 0.95555556 1. ]
def custom_cv_2folds(X):
    n = X.shape[0]
    i = 1
    while i <= 2:
        idx = np.arange(n * (i - 1) / 2, n * i / 2, dtype=int)
        yield idx, idx
        i += 1
print(cross_val_score(clf, X, y, cv=custom_cv_2folds(X)))
[1. 0.97333333]
cross_validate allows evaluating multiple metrics at once. It returns both test scores and timing values in a dict.
For a single metric (where scoring is a string, callable or None), the dict contains these keys: ['test_score', 'fit_time', 'score_time'].
For multiple metrics, 'test_score' is replaced by one 'test_<scorer_name>' key per scorer, alongside 'fit_time' and 'score_time'.
return_train_score is False by default to save computation time; set it to True to also return training-set scores.
Use return_estimator=True to retain the estimator fitted on each training fold.
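A minimal sketch of return_estimator=True, reusing the iris data and linear SVC from earlier:

```python
from sklearn import datasets, svm
from sklearn.model_selection import cross_validate

X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC(kernel='linear', C=1)

res = cross_validate(clf, X, y, cv=5, return_estimator=True)
# 'estimator' holds the model fitted on each of the 5 training folds.
print(len(res['estimator']))  # 5
```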
Multiple metrics can be specified as a list, tuple or set of predefined scorer names.
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, X, y, scoring=scoring)
print(sorted(scores.keys()))
print(scores['test_recall_macro'])
['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']
[0.96666667 1. 0.96666667 0.96666667 1.]
from sklearn.metrics import make_scorer
scoring = {'prec_macro': 'precision_macro',
'rec_macro': make_scorer(recall_score, average='macro')}
scores = cross_validate(clf, X, y, scoring=scoring, cv=5,
return_train_score=True)
print(sorted(scores.keys()))
print(scores['train_rec_macro'])
['fit_time', 'score_time', 'test_prec_macro', 'test_rec_macro', 'train_prec_macro', 'train_rec_macro']
[0.975 0.975 0.99166667 0.98333333 0.98333333]
cross_val_predict is similar to cross_val_score, but returns the prediction made for each element when it was in the test set.
It can only be used with CV strategies that assign every element to a test set exactly once.
It is appropriate when visualizing predictions obtained from different models, or when using the predictions of one estimator to train another (i.e., "model blending" in ensembles).
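A minimal cross_val_predict sketch on iris; each sample receives exactly one out-of-fold prediction:

```python
from sklearn import datasets, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC(kernel='linear', C=1)

# Each sample is predicted by the model that did not see it in training.
y_pred = cross_val_predict(clf, X, y, cv=5)
print(y_pred.shape)  # (150,)
print(accuracy_score(y, y_pred))
```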
ROC curves display true positive rate (TPR) on the Y axis, and false positive rate (FPR) on the X axis.
The top left corner is the "ideal" point: FPR = 0 and TPR = 1, so a larger area under the curve (AUC) usually indicates a better classifier.
The “steepness” of ROC curves is important. The goal is to maximize TPR while minimizing FPR.
This example shows the ROC curve of different K-fold CV iterations. We can calculate the mean area under curve, and see the variance from different subsets. It shows how the classifier output is affected by changes in training data, and how different the splits generated by K-fold CV are from one another.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import StratifiedKFold
iris = datasets.load_iris()
X,y = iris.data, iris.target
X,y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape
# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples,
200*n_features)]
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear',
probability=True,
random_state=random_state)
tprs,aucs,mean_fpr = [],[],np.linspace(0, 1, 100)
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = plot_roc_curve(classifier, X[test], y[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr+std_tpr, 1)
tprs_lower = np.maximum(mean_tpr-std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
title="Receiver operating characteristic example")
ax.legend(loc="lower right")
plt.show()
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000,
n_features=25,
n_informative=3,
n_redundant=2,
n_repeated=0,
n_classes=8,
n_clusters_per_class=1,
random_state=0)
min_features = 1
rfecv = RFECV(estimator=SVC(kernel="linear"),
step=1,
cv=StratifiedKFold(2),
scoring='accuracy',
min_features_to_select=min_features).fit(X, y)
plt.figure()
plt.xlabel("# selected features")
plt.ylabel("CV score (accuracy)")
plt.plot(range(min_features,
len(rfecv.grid_scores_)+min_features),
rfecv.grid_scores_)
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
digits = datasets.load_digits()
n_samples = len(digits.images)
X,y = digits.images.reshape((n_samples, -1)), digits.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.5, random_state=0)
tuned_parameters = [{'kernel': ['rbf'],
'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'],
'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']
for score in scores:
    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score).fit(X_train, y_train)
    print("Best params in devt set:", clf.best_params_)
    print("Grid scores in devt set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
    print("Classification report:")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
Best params in devt set: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Grid scores in devt set:
0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.959 (+/-0.028) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.974 (+/-0.012) for {'C': 1, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 10, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 100, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 1000, 'kernel': 'linear'}

Classification report:
The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support
           0       1.00      1.00      1.00        89
           1       0.97      1.00      0.98        90
           2       0.99      0.98      0.98        92
           3       1.00      0.99      0.99        93
           4       1.00      1.00      1.00        76
           5       0.99      0.98      0.99       108
           6       0.99      1.00      0.99        89
           7       0.99      1.00      0.99        78
           8       1.00      0.98      0.99        92
           9       0.99      0.99      0.99        92
    accuracy                           0.99       899
   macro avg       0.99      0.99      0.99       899
weighted avg       0.99      0.99      0.99       899

Best params in devt set: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Grid scores in devt set:
0.986 (+/-0.019) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.957 (+/-0.028) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.987 (+/-0.019) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.981 (+/-0.028) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.987 (+/-0.019) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.987 (+/-0.019) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.971 (+/-0.010) for {'C': 1, 'kernel': 'linear'}
0.971 (+/-0.010) for {'C': 10, 'kernel': 'linear'}
0.971 (+/-0.010) for {'C': 100, 'kernel': 'linear'}
0.971 (+/-0.010) for {'C': 1000, 'kernel': 'linear'}

Classification report:
The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support
           0       1.00      1.00      1.00        89
           1       0.97      1.00      0.98        90
           2       0.99      0.98      0.98        92
           3       1.00      0.99      0.99        93
           4       1.00      1.00      1.00        76
           5       0.99      0.98      0.99       108
           6       0.99      1.00      0.99        89
           7       0.99      1.00      0.99        78
           8       1.00      0.98      0.99        92
           9       0.99      0.99      0.99        92
    accuracy                           0.99       899
   macro avg       0.99      0.99      0.99       899
weighted avg       0.99      0.99      0.99       899
from pprint import pprint
from time import time
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s')
categories = ['alt.atheism','talk.religion.misc']
#categories = None
print(categories)
data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
pipeline = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier()),
])
# more parameters = better exploring power but more CPU run time
parameters = {
'vect__max_df': (0.5, 0.75, 1.0),
# 'vect__max_features': (None, 5000, 10000, 50000),
'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
# 'tfidf__use_idf': (True, False),
# 'tfidf__norm': ('l1', 'l2'),
'clf__max_iter': (20,),
'clf__alpha': (0.00001, 0.000001),
'clf__penalty': ('l2', 'elasticnet'),
# 'clf__max_iter': (10, 50, 80),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
['alt.atheism', 'talk.religion.misc']
857 documents
2 categories
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:"); pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__max_iter': (20,),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 24 candidates, totalling 120 fits
done in 12.090s
Best score: 0.950
Best parameters set:
    clf__alpha: 1e-05
    clf__max_iter: 20
    clf__penalty: 'l2'
    vect__max_df: 0.5
    vect__ngram_range: (1, 2)
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
import matplotlib.pyplot as plt
lr = linear_model.LinearRegression()
X, y = datasets.load_diabetes(return_X_y=True)
predicted = cross_val_predict(lr, X, y, cv=10)
fig, ax = plt.subplots()
ax.scatter(y, predicted, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
Model selection without nested CV uses the same data, so information may “leak” into the model and overfit the data. The magnitude of the problem is mostly dependent on dataset size & model stability.
Nested CV effectively uses a series of train/validation/test set splits. In the inner loop (here: GridSearchCV), the score is approximately maximized by fitting a model to each training set and selecting (hyper)parameters over the validation set. In the outer loop (here: cross_val_score), generalization error is estimated by averaging test-set scores over several dataset splits.
The example uses a support vector classifier (SVC) with a non-linear kernel to build the model with optimized hyperparameters by grid search. We compare the performance of non-nested and nested CV strategies by taking the difference between their scores.
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np
NUM_TRIALS = 30
iris = load_iris()
X,y = iris.data, iris.target
p_grid = {"C": [1, 10, 100],
"gamma": [.01, .1]}
svm = SVC(kernel="rbf")
scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv).fit(X, y)
    scores[i] = clf.best_score_
    nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv)
    nested_scores[i] = nested_score.mean()
score_difference = scores - nested_scores
print("Average difference of {:6f} with std. dev. of {:6f}."
.format(score_difference.mean(), score_difference.std()))
Average difference of 0.007581 with std. dev. of 0.007833.
plt.subplot(211)
scores_line = plt.plot(scores, color='r')
nested_line = plt.plot(nested_scores, color='b')
plt.ylabel("score", fontsize="14")
#plt.legend([scores_line, nested_line], ["CV", "Nested CV"], bbox_to_anchor=(0, .4, .5, 0))
plt.title("Non-Nested and Nested CV on Iris", x=.5, y=1.1, fontsize="15")
plt.subplot(212)
difference_plot = plt.bar(range(NUM_TRIALS), score_difference)
plt.xlabel("Individual Trial #")
#plt.legend([difference_plot],["Non-Nested CV - Nested CV Score"],bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")
import numpy as np
from sklearn.model_selection import KFold
X = ["a", "b", "c", "d", "e", "f", "g", "h"]
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print("%s %s" % (train, test))
[4 5 6 7] [0 1 2 3]
[0 1 2 3] [4 5 6 7]
Example of 2-fold K-Fold repeated 2 times:
import numpy as np
from sklearn.model_selection import RepeatedKFold
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
random_state = 12883823
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
for train, test in rkf.split(X):
    print("%s %s" % (train, test))
[2 3] [0 1]
[0 1] [2 3]
[0 2] [1 3]
[1 3] [0 2]
In leave-one-out CV (LOO), each learning set is created by taking all the samples except one; the excluded sample forms the test set. For $n$ samples we therefore have $n$ training sets and $n$ different test sets. This method does not waste data, as only one sample is removed from each training set.
Compared with KFold cross-validation, LOO builds $n$ models from $n$ samples instead of $k$ models, where $n > k$. Each model is trained on $n-1$ samples rather than $(k-1)n/k$.
LOO is more computationally expensive and often yields test-error estimates with high variance. Since $n-1$ of the $n$ samples are used to build each model, the models constructed from the folds are virtually identical to each other and to the model built from the entire training set.
If the learning curve is steep for the training size in question, 5- or 10-fold cross-validation can overestimate the generalization error. As a general rule, 5- or 10-fold cross-validation is preferred to LOO.
from sklearn.model_selection import LeaveOneOut
X = [1, 2, 3, 4]
for train, test in LeaveOneOut().split(X):
    print("%s %s" % (train, test))
[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]
LeavePOut creates all possible training & test sets by removing $p$ samples from the complete set. This produces ${n \choose p}$ training/test pairs from $n$ samples.
Unlike LOO and KFold, the test sets will overlap for $p>1$.
from sklearn.model_selection import LeavePOut
X = np.ones(6)
for train, test in LeavePOut(p=2).split(X):
    print("%s %s" % (train, test))
[2 3 4 5] [0 1]
[1 3 4 5] [0 2]
[1 2 4 5] [0 3]
[1 2 3 5] [0 4]
[1 2 3 4] [0 5]
[0 3 4 5] [1 2]
[0 2 4 5] [1 3]
[0 2 3 5] [1 4]
[0 2 3 4] [1 5]
[0 1 4 5] [2 3]
[0 1 3 5] [2 4]
[0 1 3 4] [2 5]
[0 1 2 5] [3 4]
[0 1 2 4] [3 5]
[0 1 2 3] [4 5]
Generates a user-defined number of independent training / test splits. Samples are first shuffled and then split.
You can control the randomness for reproducibility by seeding the random_state
pseudo random number generator.
from sklearn.model_selection import ShuffleSplit
X = np.arange(10)
for train_index, test_index in ShuffleSplit(n_splits=5,
                                            test_size=0.25,
                                            random_state=0).split(X):
    print("%s %s" % (train_index, test_index))
[9 1 6 7 3 0 5] [2 8 4]
[2 9 8 0 6 7 4] [3 5 1]
[4 5 1 0 6 9 7] [2 3 8]
[2 7 5 8 0 3 4] [6 1 9]
[4 1 0 6 8 9 3] [5 2 7]
Some classification problems have large class distribution imbalances - for instance, several times more negative samples than positive samples.
In such cases use stratified sampling to ensure relative class frequencies are preserved in each train and validation fold.
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))
skf = StratifiedKFold(n_splits=3)
kf = KFold(n_splits=3)
print("bincounts:\nStratified Kfold")
for train, test in skf.split(X, y):
    print('train: {} | test: {}'.format(
        np.bincount(y[train]), np.bincount(y[test])))
print("Kfold")
for train, test in kf.split(X, y):
    print('train: {} | test: {}'.format(
        np.bincount(y[train]), np.bincount(y[test])))
bincounts:
Stratified Kfold
train: [30 3] | test: [15 2]
train: [30 3] | test: [15 2]
train: [30 4] | test: [15 1]
Kfold
train: [28 5] | test: [17]
train: [28 5] | test: [17]
train: [34] | test: [11 5]
The i.i.d. assumption is broken if the underlying generative process yields groups of dependent samples.
Data groupings are domain specific, for example medical data from multiple patients with multiple samples taken per patient. In this case the patient id for each sample will be its group identifier.
We want to know if a model trained on a specific set of groups generalizes well to unseen groups. To measure this, we need to ensure that all samples in the validation fold come from groups that are not represented in the paired training fold.
GroupKFold ensures the same group is not represented in both the testing and training sets.
For example, if data is obtained from different subjects with several samples per subject, and the model can learn from highly person-specific features, it could fail to generalize to new subjects. GroupKFold makes it possible to detect this kind of overfitting.
# Each subject is in a different testing fold
# the same subject is never in both testing and training
# folds do not have exactly the same size due to the imbalance in the data.
from sklearn.model_selection import GroupKFold
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [ 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
for train, test in GroupKFold(n_splits=3).split(X, y, groups=groups):
    print("%s %s" % (train, test))
[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]
LeaveOneGroupOut (LOGO) reserves samples according to a third-party-provided array of integer groups. This group information can be used to encode arbitrary domain-specific cross-validation folds.
Each training set thus contains all the samples except the ones related to a specific group.
For example, in the cases of multiple experiments, LOGO can be used to create a cross-validation based on the different experiments.
Another common application is to use time information: for example, groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits.
from sklearn.model_selection import LeaveOneGroupOut
X = [1, 5, 10, 50, 60, 70, 80]
y = [0, 1, 1, 2, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3, 3]
for train, test in LeaveOneGroupOut().split(X, y, groups=groups):
    print("%s %s" % (train, test))
[2 3 4 5 6] [0 1]
[0 1 4 5 6] [2 3]
[0 1 2 3] [4 5 6]
from sklearn.model_selection import LeavePGroupsOut
X = np.arange(6)
y = [1, 1, 1, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3]
for train, test in LeavePGroupsOut(n_groups=2).split(X, y, groups=groups):
    print("%s %s" % (train, test))
[4 5] [0 1 2 3]
[2 3] [0 1 4 5]
[0 1] [2 3 4 5]
GroupShuffleSplit (GSS) acts as a combination of ShuffleSplit and LeavePGroupsOut (LPGO): it generates a sequence of random partitions in which a subset of groups is held out for each split.
This method is useful when the behavior of LPGO is desired but the number of groups is so large that generating all possible partitions with $p$ groups withheld would be prohibitively expensive. GSS provides a random sample (with replacement) of the train/test splits generated by LPGO.
from sklearn.model_selection import GroupShuffleSplit
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]
Note that the convenience function train_test_split is a wrapper around ShuffleSplit: it only allows for stratified splitting (using the class labels) and cannot account for groups.
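For example, stratification in train_test_split is requested via its stratify argument; a minimal sketch with an imbalanced label vector:

```python
import numpy as np
from sklearn.model_selection import train_test_split

X = np.ones((50, 1))
y = np.hstack(([0] * 45, [1] * 5))  # imbalanced: 45 vs 5

# stratify=y preserves the 9:1 class ratio in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)
print(np.bincount(y_train), np.bincount(y_test))  # [36 4] [9 1]
```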
To perform the split, use the train and test indices yielded by the generator that the cross-validation splitter's split method returns.
import numpy as np
from sklearn.model_selection import GroupShuffleSplit as GSS
X = np.array([0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001])
y = np.array(["a", "b", "b", "b", "c", "c", "c", "a"])
groups = np.array([ 1, 1, 2, 2, 3, 3, 4, 4])
train_indx, test_indx = next(
GSS(random_state=7).split(X, y, groups)
)
X_train, X_test, y_train, y_test = \
X[train_indx], X[test_indx], y[train_indx], y[test_indx]
print(X_train.shape,
X_test.shape)
print(np.unique(groups[train_indx]),
np.unique(groups[test_indx]))
(6,) (2,)
[1 2 4] [3]
Time series data contains correlation between observations that are near in time (autocorrelation).
Classical CV techniques such as KFold and ShuffleSplit assume the samples are i.i.d., and would result in unreasonable correlation between training and testing instances (yielding poor estimates of generalisation error).
Therefore, it is important to evaluate time series data on the “future” observations least like those that are used to train the model.
TimeSeriesSplit (TSS) returns the first $k$ folds as the training set and the $(k+1)$th fold as the test set.
Unlike classic CV methods, successive training sets are supersets of those that come before them. Also, it adds all surplus data to the first training partition, which is always used to train the model.
This method is used to CV samples that are observed at fixed time intervals.
from sklearn.model_selection import TimeSeriesSplit as TSS
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
for train, test in TSS(n_splits=3).split(X):
    print("%s %s" % (train, test))
[0 1 2] [3]
[0 1 2 3] [4]
[0 1 2 3 4] [5]
permutation_test_score offers another way to evaluate classifier performance. It returns a permutation-based p-value: how likely it is that an observed performance of the classifier would be obtained by chance.
The null hypothesis in this test is that the classifier fails to leverage any statistical dependency between the features and the labels to make correct predictions on left-out data.
permutation_test_score generates a null distribution by calculating n_permutations different permutations of the data. In each permutation the labels are randomly shuffled, thereby removing any dependency between the features and the labels.
The p-value output is the fraction of permutations for which the average CV score is better than the CV score obtained with the original data. For reliable results, n_permutations should typically be >100 and cv between 3 and 10 folds.
A low p-value indicates the dataset contains a real dependency between features and labels - and the classifier was able to use this to obtain good results.
A high p-value could be due to a lack of dependency between features and labels (there is no difference in feature values between the classes) or because the classifier was not able to use the dependency in the data. In the latter case, using a more appropriate classifier that is able to utilize the structure in the data, would result in a low p-value.
A classifier trained on a high-dimensional dataset with no structure may still perform better than expected on CV, just by chance. This can happen with small datasets. permutation_test_score indicates whether the classifier has found a real class structure, and can help in evaluating its performance.
This test can produce low p-values even if there is only a weak structure in the data, because in the corresponding permutated datasets there is absolutely no structure. This test is therefore only able to show when the model reliably outperforms random guessing.
permutation_test_score is computed by brute force, internally fitting (n_permutations + 1) * n_cv models. It is therefore only tractable with small datasets for which fitting an individual model is very fast.
from sklearn.datasets import load_iris
iris = load_iris()
X,y = iris.data, iris.target
# random feature data, 2200 features, uncorrelated with iris class labels:
import numpy as np
n_uncorrelated_features = 2200
rng = np.random.RandomState(seed=0)
# Use same number of samples as in iris and 2200 features
X_rand = rng.normal(size=(X.shape[0], n_uncorrelated_features))
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import permutation_test_score as PTS
clf = SVC(kernel='linear', random_state=7)
cv = SKF(2, shuffle=True, random_state=0)
score_iris, perm_scores_iris, pvalue_iris = PTS(
clf, X, y, scoring="accuracy", cv=cv, n_permutations=1000)
score_rand, perm_scores_rand, pvalue_rand = PTS(
clf, X_rand, y, scoring="accuracy", cv=cv, n_permutations=1000)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(perm_scores_iris, bins=20, density=True)
ax.axvline(score_iris, ls='--', color='r')
score_label = (f"Score on original\ndata: {score_iris:.2f}\n"
f"(p-value: {pvalue_iris:.3f})")
#ax.text(0.7, 260, score_label, fontsize=12)
ax.set_xlabel("Accuracy score")
_ = ax.set_ylabel("Probability")
fig, ax = plt.subplots()
ax.hist(perm_scores_rand, bins=20, density=True)
ax.set_xlim(0.13)
ax.axvline(score_rand, ls='--', color='r')
score_label = (f"Score on original\ndata: {score_rand:.2f}\n"
f"(p-value: {pvalue_rand:.3f})")
#ax.text(0.14, 125, score_label, fontsize=12)
ax.set_xlabel("Accuracy score")
ax.set_ylabel("Probability")
from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit,
StratifiedKFold, GroupShuffleSplit,
GroupKFold, StratifiedShuffleSplit)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
np.random.seed(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 4
n_points = 100
X = np.random.randn(100, 10)
percentiles_classes = [.1, .3, .6]
y = np.hstack([[ii] * int(100 * perc)
for ii, perc in enumerate(percentiles_classes)])
# Evenly spaced groups repeated once
groups = np.hstack([[ii] * 10 for ii in range(10)])
def visualize_groups(classes, groups, name):
    # Visualize the dataset's groups and classes as two horizontal bands
    fig, ax = plt.subplots()
    ax.scatter(range(len(groups)), [.5] * len(groups),
               c=groups, marker='_', lw=50, cmap=cmap_data)
    ax.scatter(range(len(groups)), [3.5] * len(groups),
               c=classes, marker='_', lw=50, cmap=cmap_data)
    ax.set(ylim=[-1, 5], yticks=[.5, 3.5],
           yticklabels=['Data\ngroup', 'Data\nclass'],
           xlabel="Sample index")
visualize_groups(y, groups, 'no groups')
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Mark test indices with 1, train indices with 0
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0
        ax.scatter(range(len(indices)), [ii + .5] * len(indices),
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)
    # Plot the class and group rows below the folds
    ax.scatter(range(len(X)), [ii + 1.5] * len(X),
               c=y, marker='_', lw=lw, cmap=cmap_data)
    ax.scatter(range(len(X)), [ii + 2.5] * len(X),
               c=group, marker='_', lw=lw, cmap=cmap_data)
    yticklabels = list(range(n_splits)) + ['class', 'group']
    ax.set(yticks=np.arange(n_splits + 2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits + 2.2, -.2], xlim=[0, 100])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax
# k-fold
fig, ax = plt.subplots()
cv = KFold(n_splits)
plot_cv_indices(cv, X, y, groups, ax, n_splits)
#stratified k-fold
fig, ax = plt.subplots()
cv = StratifiedKFold(n_splits)
plot_cv_indices(cv, X, y, groups, ax, n_splits)
cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold,
GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit]
for cv in cvs:
    this_cv = cv(n_splits=n_splits)
    fig, ax = plt.subplots(figsize=(6, 3))
    plot_cv_indices(this_cv, X, y, groups, ax, n_splits)
    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
              ['Testing set', 'Training set'], loc=(1.02, .8))
    # Make the legend fit
    plt.tight_layout()
    fig.subplots_adjust(right=.7)