In the previous notebook, we saw that hyperparameters can affect the statistical performance of a model. In this notebook, we will show how to optimize hyperparameters using a grid-search approach.

Preparation

import pandas as pd
import matplotlib.pyplot as plt
import time
import random
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# for the moment this line is required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import seaborn as sns
myDataFrame = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
myDataFrame = myDataFrame.drop(columns="education-num")
target_column = 'class'
data = myDataFrame.drop(columns=target_column)
target = myDataFrame[target_column]

numerical_columns = selector(dtype_exclude=object)(data)
data_numerical = myDataFrame[numerical_columns]

categorical_columns = selector(dtype_include=object)(data)
data_categorical = myDataFrame[categorical_columns]

all_columns = numerical_columns + categorical_columns
data = data[all_columns]
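
As a quick sanity check (not needed for the rest of the notebook), we can look at which columns the two selectors picked up and at the shape of the reordered data:

# inspect the columns detected as numerical and categorical by the selectors
print("numerical columns:", numerical_columns)
print("categorical columns:", categorical_columns)
print("shape of the data:", data.shape)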

Construction of the model with default hyperparameters

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer([
    ('cat-preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough', sparse_threshold=0)
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier",
     HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4))])
cv_results = cross_validate(model, data, target)

scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print("The accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy is 0.863 +/- 0.002, for 0.829 seconds
for parameter in model.get_params():
    print(parameter)
memory
steps
verbose
preprocessor
classifier
preprocessor__n_jobs
preprocessor__remainder
preprocessor__sparse_threshold
preprocessor__transformer_weights
preprocessor__transformers
preprocessor__verbose
preprocessor__cat-preprocessor
preprocessor__cat-preprocessor__categories
preprocessor__cat-preprocessor__dtype
preprocessor__cat-preprocessor__handle_unknown
preprocessor__cat-preprocessor__unknown_value
classifier__categorical_features
classifier__early_stopping
classifier__l2_regularization
classifier__learning_rate
classifier__loss
classifier__max_bins
classifier__max_depth
classifier__max_iter
classifier__max_leaf_nodes
classifier__min_samples_leaf
classifier__monotonic_cst
classifier__n_iter_no_change
classifier__random_state
classifier__scoring
classifier__tol
classifier__validation_fraction
classifier__verbose
classifier__warm_start
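
The names above follow the <step name>__<parameter name> convention of scikit-learn pipelines: this is exactly the syntax a grid search or a randomized search uses to reach the hyperparameters of the classifier nested inside the pipeline. As a small illustration (not part of the search itself), such a parameter can be changed directly with set_params:

# update the learning rate of the "classifier" step through the pipeline
model.set_params(classifier__learning_rate=0.05)
print(model.get_params()["classifier__learning_rate"])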

Search for hyperparameters with a randomized search and cross-validation

from scipy.stats import uniform
from scipy.stats import loguniform

param_distributions = {
    'classifier__learning_rate': loguniform(0.0001, 5),
    'classifier__max_leaf_nodes': uniform(1, 100)}
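
loguniform draws values uniformly on a logarithmic scale, which suits a parameter such as learning_rate that spans several orders of magnitude. As a quick sketch, we can draw a few samples from each distribution to get a feel for their typical range (the random_state below is arbitrary); note also that uniform(1, 100) yields floating-point values, so depending on the scikit-learn version an integer distribution such as scipy.stats.randint may be required for max_leaf_nodes:

# draw a few samples from each distribution to see their typical range
print(loguniform(0.0001, 5).rvs(size=5, random_state=0))
print(uniform(1, 100).rvs(size=5, random_state=0))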

model_random_search = RandomizedSearchCV(model, param_distributions, n_jobs=4, cv=2)
cv_results = cross_validate(model_random_search, data, target, return_estimator=True)

scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy via cross-validation is "
  f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy via cross-validation is 0.864 +/- 0.004, for 103.672 seconds
print(f"Best parameter found")
for fold_idx, estimator in enumerate(cv_results["estimator"]):
    print(f" on fold #{fold_idx + 1} : {estimator.best_params_}")
Best parameters found
 on fold #1 : {'classifier__learning_rate': 0.16309862397316285, 'classifier__max_leaf_nodes': 45.12999922572979}
 on fold #2 : {'classifier__learning_rate': 0.045232362575395146, 'classifier__max_leaf_nodes': 69.02541921038254}
 on fold #3 : {'classifier__learning_rate': 0.023585278516485758, 'classifier__max_leaf_nodes': 30.014816646691422}
 on fold #4 : {'classifier__learning_rate': 0.030455497715999508, 'classifier__max_leaf_nodes': 83.92313635555773}
 on fold #5 : {'classifier__learning_rate': 0.035781951571963115, 'classifier__max_leaf_nodes': 85.52313915125612}
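
Since return_estimator=True was passed, each entry of cv_results["estimator"] is a fitted RandomizedSearchCV. Besides best_params_, we can also look at the best score found by the inner cross-validation on each fold (a small sketch, output not shown):

# best inner cross-validation score of the search fitted on each outer fold
for fold_idx, estimator in enumerate(cv_results["estimator"]):
    print(f" best inner CV score on fold #{fold_idx + 1}: {estimator.best_score_:.3f}")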

Search for hyperparameters with a grid search and cross-validation

param_grid = {
    'classifier__learning_rate': (0.05, 0.1, 0.5, 1, 5),
    'classifier__max_leaf_nodes': (3, 10, 30, 100)}
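
Before running the search, it can be useful to check how many candidate combinations the grid contains. A small sketch using scikit-learn's ParameterGrid helper: 5 learning-rate values x 4 max_leaf_nodes values give 20 candidates, each fitted cv=2 times per outer fold.

from sklearn.model_selection import ParameterGrid

# number of hyperparameter combinations explored by the grid search
print(len(ParameterGrid(param_grid)))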

model_grid_search = GridSearchCV(model, param_grid, n_jobs=4, cv=2)
cv_results = cross_validate(model_grid_search, data, target, return_estimator=True)

scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy via cross-validation is "
  f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy via cross-validation is 0.874 +/- 0.002, for 14.921 seconds
print(f"Best parameter found")
for fold_idx, estimator in enumerate(cv_results["estimator"]):
    print(f" on fold #{fold_idx + 1} : {estimator.best_params_}")
Best parameters found
 on fold #1 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #2 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #3 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #4 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #5 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}

Search for hyperparameters with a grid search and without cross-validation (bad practice)

Be aware that the evaluation should normally be performed within a cross-validation framework, by passing model_grid_search as the model to the cross_validate function, as we did above.

Here, we use a single train-test split instead, so that we can inspect the attributes that are specific to the fitted model_grid_search instance.

data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)
model_grid_search.fit(data_train, target_train)

print(f"The best set of parameters is: "
      f"{model_grid_search.best_params_}")
The best set of parameters is: {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
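
Because refit=True by default, the grid search has been refitted on the whole training set with these best parameters, so model_grid_search can be used directly as a predictor. A quick sketch scoring it on the held-out test set (the exact value depends on the split):

# accuracy of the refitted best model on the left-out test data
test_accuracy = model_grid_search.score(data_test, target_test)
print(f"Test accuracy of the refitted grid-search model: {test_accuracy:.3f}")
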
cv_results_grid_train = pd.DataFrame(model_grid_search.cv_results_)
column_results = [f"param_{name}" for name in param_grid.keys()]
column_results += [
    "mean_test_score", "std_test_score", "rank_test_score"]
cv_results_grid_train = cv_results_grid_train[column_results]

def shorten_param(param_name):
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name
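
For instance, shorten_param strips the pipeline prefix and keeps only the bare hyperparameter name:

print(shorten_param("classifier__learning_rate"))  # -> learning_rate
print(shorten_param("max_leaf_nodes"))             # unchanged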


cv_results_grid_train = cv_results_grid_train.rename(shorten_param, axis=1)
cv_results_grid_train
    learning_rate  max_leaf_nodes  mean_test_score  std_test_score  rank_test_score
0            0.05               3         0.827196        0.000214               16
1            0.05              10         0.862029        0.000222                9
2            0.05              30         0.867598        0.000932                2
3            0.05             100         0.865797        0.001259                7
4             0.1               3         0.853266        0.000515               13
5             0.1              10         0.866425        0.000359                4
6             0.1              30         0.868063        0.000850                1
7             0.1             100         0.864732        0.000795                8
8             0.5               3         0.865824        0.000952                5
9             0.5              10         0.865824        0.000031                6
10            0.5              30         0.866479        0.000577                3
11            0.5             100         0.859491        0.001069               10
12              1               3         0.857389        0.003545               12
13              1              10         0.858863        0.004036               11
14              1              30         0.851028        0.002707               14
15              1             100         0.835194        0.004609               15
16              5               3         0.283476        0.003775               20
17              5              10         0.527527        0.175411               19
18              5              30         0.638062        0.144696               18
19              5             100         0.727579        0.055780               17
pivoted_cv_results_grid_train = cv_results_grid_train.pivot_table(
    values="mean_test_score", index=["learning_rate"],
    columns=["max_leaf_nodes"])

pivoted_cv_results_grid_train
max_leaf_nodes         3        10        30       100
learning_rate
0.05            0.827196  0.862029  0.867598  0.865797
0.10            0.853266  0.866425  0.868063  0.864732
0.50            0.865824  0.865824  0.866479  0.859491
1.00            0.857389  0.858863  0.851028  0.835194
5.00            0.283476  0.527527  0.638062  0.727579
ax = sns.heatmap(pivoted_cv_results_grid_train, annot=True, cmap="YlGnBu", vmin=0.7,
                 vmax=0.9)
ax.invert_yaxis()

The above tables and heatmap highlight the following:

  • when the value of learning_rate is too high, the statistical performance of the model is degraded, and adjusting the value of max_leaf_nodes cannot fix that problem;
  • outside of this pathological region, we observe that the optimal choice of max_leaf_nodes depends on the value of learning_rate;
  • in particular, we observe a "diagonal" of good models with an accuracy close to the maximum of about 0.87: when the value of max_leaf_nodes is increased, one should decrease the value of learning_rate accordingly to preserve a good accuracy.

For now, we note that, in general, there is no unique optimal parameter setting: 6 models out of the 20 parameter configurations reach an accuracy close to the maximum (up to small random fluctuations caused by the sampling of the training set).
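
One way to see this from the table of results is to list the configurations whose mean test score lies within a small tolerance of the best score; the tolerance of 0.003 used below is an arbitrary choice for illustration, and the number of rows returned depends on it:

# configurations whose mean test score is within 0.003 of the best one
best_score = cv_results_grid_train["mean_test_score"].max()
near_best = cv_results_grid_train[
    cv_results_grid_train["mean_test_score"] >= best_score - 0.003]
print(near_best.sort_values("rank_test_score"))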