scikit-learn estimators are fitted to data using the fit() method.
from sklearn.ensemble import RandomForestClassifier as RFC
clf = RFC(random_state=0)
X = [[1,2,3], [11,12,13]] # 2 samples, 3 features each
y = [0,1] # the classes of each sample
clf.fit(X,y)
RandomForestClassifier(random_state=0)
# fit() accepts 2 inputs: X (samples), y (target values)
# y not needed in unsupervised learning tasks.
# X & y are expected to be numpy arrays or equivalent array-like data types
# once the estimator is fitted, it can be used for predictions.
clf.predict(X)
array([0, 1])
clf.predict([[4,5,6],[14,15,16]]) # new data
array([0, 1])
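Fitted classifiers also expose class-membership probabilities through predict_proba(); a minimal sketch reusing the clf fitted above:
# probability of each class for each sample (one column per class)
clf.predict_proba([[4,5,6],[14,15,16]])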
Typical workflows involve multiple steps, often one or more pre-processing transformers followed by a final predictor. Transformers follow the same API as other estimators, but expose a transform() method instead of predict().
from sklearn.preprocessing import StandardScaler as SS
X = [[0,15],[1,-10]]
SS().fit(X).transform(X)
array([[-1., 1.], [ 1., -1.]])
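Most transformers also provide fit_transform(), a shorthand that fits and transforms in a single call; a minimal sketch with the same X:
# equivalent to SS().fit(X).transform(X)
SS().fit_transform(X)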
from sklearn.preprocessing import StandardScaler as SS
from sklearn.linear_model import LogisticRegression as LR
from sklearn.pipeline import make_pipeline as MP
from sklearn.datasets import load_iris as LI
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import accuracy_score as AS
# make a pipeline
pipe = MP(SS(), LR())
# load iris data - split into training & test sets
X,y = LI(return_X_y=True)
X_train, X_test, y_train, y_test = TTS(X,y,random_state=0)
# fit entire pipeline
pipe.fit(X_train,y_train)
Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())])
# the pipeline is ready to use - compute the accuracy of its predictions on the test data
AS(y_test, pipe.predict(X_test))
0.9736842105263158
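The fitted pipeline keeps its steps accessible; a minimal sketch that inspects the scaler's learned statistics via named_steps (make_pipeline names each step after its lowercased class) and uses the pipeline's own score() shortcut:
# per-feature means learned by the StandardScaler step during fit
pipe.named_steps['standardscaler'].mean_
# score() predicts on X_test and computes the accuracy internally
pipe.score(X_test, y_test)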
Fitting a model does not guarantee good predictions on unseen data; this needs to be evaluated directly. Below: a 5-fold cross-validation example.
from sklearn.datasets import make_regression as MR
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import cross_validate as CV
X,y = MR(n_samples = 1000, random_state = 0)
lr = LR()
result = CV(lr,X,y) # default: 5-fold
# each fold's R^2 score (this synthetic dataset is easy to fit, so scores are high)
result['test_score']
array([1., 1., 1., 1., 1.])
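cross_validate also accepts cv to change the number of folds and scoring to change the metric; a minimal sketch assuming 10 folds and (negated) mean absolute error:
# 10-fold cross-validation scored with negated mean absolute error
result = CV(lr, X, y, cv=10, scoring='neg_mean_absolute_error')
result['test_score'] # one (negative) MAE value per fold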
An estimator's effectiveness usually depends critically on a few parameters, and it is rarely clear in advance which values are optimal. scikit-learn provides tools to search automatically for good values.
from sklearn.datasets import fetch_california_housing as FCH
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.model_selection import train_test_split as TTS
from scipy.stats import randint
X,y = FCH(return_X_y=True)
X_train, X_test, y_train, y_test = TTS(X,y,random_state=0)
# define param space for searching
param_distributions = {'n_estimators': randint(1,5),
'max_depth': randint(5,10)}
# create a search object
searcher = RSCV(estimator = RFR(random_state = 0),
n_iter = 5,
param_distributions = param_distributions,
random_state = 0)
# fit searcher to training data
searcher.fit(X_train, y_train)
RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x...>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x...>},
                   random_state=0)
# what are the optimal parameter values?
searcher.best_params_
{'max_depth': 9, 'n_estimators': 4}
# the fitted searcher now acts like a normal random forest estimator, using the best parameters found:
searcher.score(X_test,y_test)
0.735363411343253
# note:
# - in practice you almost always want to search over a pipeline, not a single
#   estimator (see the sketch below).
# - applying pre-processing to the entire dataset before cross-validation breaks
#   the fundamental assumption of independence between training & testing data.
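A minimal sketch of searching over a pipeline instead of a bare estimator; note that with make_pipeline, hyper-parameter names are prefixed with the lowercased step name and a double underscore:
from sklearn.pipeline import make_pipeline as MP
from sklearn.preprocessing import StandardScaler as SS
pipe = MP(SS(), RFR(random_state=0))
# parameters of pipeline steps are addressed as '<stepname>__<param>'
param_distributions = {'randomforestregressor__n_estimators': randint(1,5),
                       'randomforestregressor__max_depth': randint(5,10)}
searcher = RSCV(estimator = pipe,
                n_iter = 5,
                param_distributions = param_distributions,
                random_state = 0)
# the scaler is now re-fitted on the training portion of each CV split
searcher.fit(X_train, y_train)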