In the previous notebook, we saw that hyperparameters can affect the statistical performance of a model. In this notebook, we will show how to optimize hyperparameters using a grid-search approach.

Preparation

import pandas as pd
import matplotlib.pyplot as plt
import time
import random
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# for the moment this line is required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import seaborn as sns
myDataFrame = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
myDataFrame = myDataFrame.drop(columns="education-num")
target_column = 'class'
data = myDataFrame.drop(columns=target_column)
target = myDataFrame[target_column]

numerical_columns = selector(dtype_exclude=object)(data)
data_numerical = myDataFrame[numerical_columns]

categorical_columns = selector(dtype_include=object)(data)
data_categorical = myDataFrame[categorical_columns]

all_columns = numerical_columns + categorical_columns
data = data[all_columns]
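
As a quick sanity check (not needed for the rest of the notebook), we can look at which columns the two selectors picked up and at the shape of the reordered data:

# inspect the columns detected as numerical and categorical by the selectors
print("numerical columns:", numerical_columns)
print("categorical columns:", categorical_columns)
print("shape of the data:", data.shape)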

Construction of the model with default hyperparameters

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer([
    ('cat-preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough', sparse_threshold=0)
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier",
     HistGradientBoostingClassifier(random_state=42, max_leaf_nodes=4))])
cv_results = cross_validate(model, data, target)

scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print("The accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy is 0.863 +/- 0.002, for 0.829 seconds
for parameter in model.get_params():
    print(parameter)
memory
steps
verbose
preprocessor
classifier
preprocessor__n_jobs
preprocessor__remainder
preprocessor__sparse_threshold
preprocessor__transformer_weights
preprocessor__transformers
preprocessor__verbose
preprocessor__cat-preprocessor
preprocessor__cat-preprocessor__categories
preprocessor__cat-preprocessor__dtype
preprocessor__cat-preprocessor__handle_unknown
preprocessor__cat-preprocessor__unknown_value
classifier__categorical_features
classifier__early_stopping
classifier__l2_regularization
classifier__learning_rate
classifier__loss
classifier__max_bins
classifier__max_depth
classifier__max_iter
classifier__max_leaf_nodes
classifier__min_samples_leaf
classifier__monotonic_cst
classifier__n_iter_no_change
classifier__random_state
classifier__scoring
classifier__tol
classifier__validation_fraction
classifier__verbose
classifier__warm_start
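
The names above follow the <step name>__<parameter name> convention of scikit-learn pipelines: this is exactly the syntax a grid search or a randomized search uses to reach the hyperparameters of the classifier nested inside the pipeline. As a small illustration (not part of the search itself), such a parameter can be changed directly with set_params:

# update the learning rate of the "classifier" step through the pipeline
model.set_params(classifier__learning_rate=0.05)
print(model.get_params()["classifier__learning_rate"])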

Search for hyperparameters with a randomized search and cross-validation

from scipy.stats import uniform
from scipy.stats import loguniform

param_distributions = {
    'classifier__learning_rate': loguniform(0.0001, 5),
    'classifier__max_leaf_nodes': uniform(1, 100)}
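
loguniform draws values uniformly on a logarithmic scale, which suits a parameter such as learning_rate that spans several orders of magnitude. As a quick sketch, we can draw a few samples from each distribution to get a feel for their typical range (the random_state below is arbitrary); note also that uniform(1, 100) yields floating-point values, so depending on the scikit-learn version an integer distribution such as scipy.stats.randint may be required for max_leaf_nodes:

# draw a few samples from each distribution to see their typical range
print(loguniform(0.0001, 5).rvs(size=5, random_state=0))
print(uniform(1, 100).rvs(size=5, random_state=0))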

model_random_search = RandomizedSearchCV(model, param_distributions, n_jobs=4, cv=2)
cv_results = cross_validate(model_random_search, data, target, return_estimator=True)

scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy via cross-validation is "
  f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy via cross-validation is 0.864 +/- 0.004, for 103.672 seconds
print(f"Best parameter found")
for fold_idx, estimator in enumerate(cv_results["estimator"]):
    print(f" on fold #{fold_idx + 1} : {estimator.best_params_}")
Best parameters found
 on fold #1 : {'classifier__learning_rate': 0.16309862397316285, 'classifier__max_leaf_nodes': 45.12999922572979}
 on fold #2 : {'classifier__learning_rate': 0.045232362575395146, 'classifier__max_leaf_nodes': 69.02541921038254}
 on fold #3 : {'classifier__learning_rate': 0.023585278516485758, 'classifier__max_leaf_nodes': 30.014816646691422}
 on fold #4 : {'classifier__learning_rate': 0.030455497715999508, 'classifier__max_leaf_nodes': 83.92313635555773}
 on fold #5 : {'classifier__learning_rate': 0.035781951571963115, 'classifier__max_leaf_nodes': 85.52313915125612}
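
Since return_estimator=True was passed, each entry of cv_results["estimator"] is a fitted RandomizedSearchCV. Besides best_params_, we can also look at the best score found by the inner cross-validation on each fold (a small sketch, output not shown):

# best inner cross-validation score of the search fitted on each outer fold
for fold_idx, estimator in enumerate(cv_results["estimator"]):
    print(f" best inner CV score on fold #{fold_idx + 1}: {estimator.best_score_:.3f}")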

Search for hyperparameters with a grid search and cross-validation

param_grid = {
    'classifier__learning_rate': (0.05, 0.1, 0.5, 1, 5),
    'classifier__max_leaf_nodes': (3, 10, 30, 100)}
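
Before running the search, it can be useful to check how many candidate combinations the grid contains. A small sketch using scikit-learn's ParameterGrid helper: 5 learning-rate values x 4 max_leaf_nodes values give 20 candidates, each fitted cv=2 times per outer fold.

from sklearn.model_selection import ParameterGrid

# number of hyperparameter combinations explored by the grid search
print(len(ParameterGrid(param_grid)))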

model_grid_search = GridSearchCV(model, param_grid, n_jobs=4, cv=2)
cv_results = cross_validate(model_grid_search, data, target, return_estimator=True)

scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy via cross-validation is "
  f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy via cross-validation is 0.874 +/- 0.002, for 14.921 seconds
print(f"Best parameter found")
for fold_idx, estimator in enumerate(cv_results["estimator"]):
    print(f" on fold #{fold_idx + 1} : {estimator.best_params_}")
Best parameters found
 on fold #1 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #2 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #3 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #4 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
 on fold #5 : {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}

Search for hyperparameters with a grid search and without cross-validation (bad practice)

Be aware that the evaluation should normally be performed within a cross-validation framework, by passing model_grid_search as the model to the cross_validate function, as we did above.

Here, we use a single train-test split instead, so that we can inspect the attributes that are specific to the fitted model_grid_search instance.

data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)
model_grid_search.fit(data_train, target_train)

print(f"The best set of parameters is: "
      f"{model_grid_search.best_params_}")
The best set of parameters is: {'classifier__learning_rate': 0.1, 'classifier__max_leaf_nodes': 30}
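
Because refit=True by default, the grid search has been refitted on the whole training set with these best parameters, so model_grid_search can be used directly as a predictor. A quick sketch scoring it on the held-out test set (the exact value depends on the split):

# accuracy of the refitted best model on the left-out test data
test_accuracy = model_grid_search.score(data_test, target_test)
print(f"Test accuracy of the refitted grid-search model: {test_accuracy:.3f}")
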
cv_results_grid_train = pd.DataFrame(model_grid_search.cv_results_)
column_results = [f"param_{name}" for name in param_grid.keys()]
column_results += [
    "mean_test_score", "std_test_score", "rank_test_score"]
cv_results_grid_train = cv_results_grid_train[column_results]

def shorten_param(param_name):
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name
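
For instance, shorten_param strips the pipeline prefix and keeps only the bare hyperparameter name:

print(shorten_param("classifier__learning_rate"))  # -> learning_rate
print(shorten_param("max_leaf_nodes"))             # unchanged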


cv_results_grid_train = cv_results_grid_train.rename(shorten_param, axis=1)
cv_results_grid_train
    learning_rate  max_leaf_nodes  mean_test_score  std_test_score  rank_test_score
0            0.05               3         0.827196        0.000214               16
1            0.05              10         0.862029        0.000222                9
2            0.05              30         0.867598        0.000932                2
3            0.05             100         0.865797        0.001259                7
4             0.1               3         0.853266        0.000515               13
5             0.1              10         0.866425        0.000359                4
6             0.1              30         0.868063        0.000850                1
7             0.1             100         0.864732        0.000795                8
8             0.5               3         0.865824        0.000952                5
9             0.5              10         0.865824        0.000031                6
10            0.5              30         0.866479        0.000577                3
11            0.5             100         0.859491        0.001069               10
12              1               3         0.857389        0.003545               12
13              1              10         0.858863        0.004036               11
14              1              30         0.851028        0.002707               14
15              1             100         0.835194        0.004609               15
16              5               3         0.283476        0.003775               20
17              5              10         0.527527        0.175411               19
18              5              30         0.638062        0.144696               18
19              5             100         0.727579        0.055780               17
pivoted_cv_results_grid_train = cv_results_grid_train.pivot_table(
    values="mean_test_score", index=["learning_rate"],
    columns=["max_leaf_nodes"])

pivoted_cv_results_grid_train
max_leaf_nodes         3        10        30       100
learning_rate
0.05            0.827196  0.862029  0.867598  0.865797
0.10            0.853266  0.866425  0.868063  0.864732
0.50            0.865824  0.865824  0.866479  0.859491
1.00            0.857389  0.858863  0.851028  0.835194
5.00            0.283476  0.527527  0.638062  0.727579
ax = sns.heatmap(pivoted_cv_results_grid_train, annot=True, cmap="YlGnBu", vmin=0.7,
                 vmax=0.9)
ax.invert_yaxis()

The above tables and heatmap highlight the following:

  • when the value of learning_rate is too high, the statistical performance of the model is degraded, and adjusting the value of max_leaf_nodes cannot fix that problem;
  • outside of this pathological region, we observe that the optimal choice of max_leaf_nodes depends on the value of learning_rate;
  • in particular, we observe a "diagonal" of good models with an accuracy close to the maximum of about 0.87: when the value of max_leaf_nodes is increased, one should decrease the value of learning_rate accordingly to preserve a good accuracy.

For now, we note that, in general, there is no unique optimal parameter setting: 6 models out of the 20 parameter configurations reach an accuracy close to the maximum (up to small random fluctuations caused by the sampling of the training set).
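
One way to see this from the table of results is to list the configurations whose mean test score lies within a small tolerance of the best score; the tolerance of 0.003 used below is an arbitrary choice for illustration, and the number of rows returned depends on it:

# configurations whose mean test score is within 0.003 of the best one
best_score = cv_results_grid_train["mean_test_score"].max()
near_best = cv_results_grid_train[
    cv_results_grid_train["mean_test_score"] >= best_score - 0.003]
print(near_best.sort_values("rank_test_score"))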