Loading

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
import time
myData = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
myData = myData.drop(columns="education-num")
print(f"The dataset data contains {myData.shape[0]} samples and {myData.shape[1]} features")
The dataset data contains 48842 samples and 13 features
target_column = 'class'
target = myData[target_column]
data = myData.drop(columns=target_column)
from sklearn.compose import make_column_selector as selector
# 
numerical_columns = selector(dtype_exclude=object)(data)
categorical_columns = selector(dtype_include=object)(data)
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]

Here, we know that the object data type is used to represent strings, and thus categorical features. Be aware that this is not always the case: sometimes the object data type can hold other kinds of information, such as dates that were not properly parsed and remain strings even though they actually represent a quantity such as elapsed time.
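
As a quick sanity check, we can print the data types before trusting the object/number split. The date column mentioned below is purely hypothetical, only to illustrate the caveat; the census data contains no such column.

print(data.dtypes)

# Hypothetical example: if a column such as "last_contact_date" held date
# strings, it would also show up as object; parsing it explicitly would turn
# it into a usable quantity instead of an arbitrary categorical feature.
# data["last_contact_date"] = pd.to_datetime(data["last_contact_date"])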

Handling categorical and numerical features together

LogisticRegression + StandardScaler + OrdinalEncoder : not so good

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
# 
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns),
    ('numerical', numerical_preprocessor, numerical_columns)])

model = make_pipeline(preprocessor, 
                      LogisticRegression(max_iter=500))

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

scores = cv_results["test_score"]
train_scores = cv_results["train_score"]
fit_time = cv_results["fit_time"]

print("The accuracy in TRAIN is "
      f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}")
print("The accuracy in TEST  is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy in TRAIN is 0.803 +/- 0.001
The accuracy in TEST  is 0.803 +/- 0.002, for 0.446 seconds
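
To see why the ordinal encoding can hurt a linear model, here is a small illustration on toy data (not part of the census pipeline): the integer codes impose an arbitrary order and spacing between categories, which LogisticRegression then interprets as a real quantity.

toy = pd.DataFrame({"workclass": ["Private", "State-gov", "Self-emp-not-inc"]})
print(OrdinalEncoder().fit_transform(toy))
# [[0.] [2.] [1.]]  (categories are sorted alphabetically)
# "State-gov" is not "twice" "Self-emp-not-inc", yet the linear model treats
# the encoded column as if these distances were meaningful.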

LogisticRegression + StandardScaler + OneHotEncoder : good

Linear models are nice because they are usually cheap to train, small to deploy, fast to predict and give a good baseline.

However, it is often useful to check whether more complex models such as an ensemble of decision trees can lead to higher predictive performance.

from sklearn.preprocessing import OneHotEncoder
# 
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns),
    ('numerical', numerical_preprocessor, numerical_columns)])

model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

scores = cv_results["test_score"]
train_scores = cv_results["train_score"]
fit_time = cv_results["fit_time"]

print("The accuracy in TRAIN is "
      f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}")
print("The accuracy in TEST  is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy in TRAIN is 0.852 +/- 0.001
The accuracy in TEST  is 0.853 +/- 0.001, for 0.836 seconds
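
One way to understand the cost of one-hot encoding is to look at the width of the design matrix it produces. The sketch below fits the preprocessor alone; the exact column count depends on the number of category levels present in the data.

preprocessor.fit(data)
print(preprocessor.transform(data).shape)
# (48842, n_features): the numerical columns keep their count, while each
# categorical column expands into one column per category level.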

Gradient Boosting model + StandardScaler + OneHotEncoder : long

from sklearn.ensemble import HistGradientBoostingClassifier
# 
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse=False)
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns),
    ('numerical', numerical_preprocessor, numerical_columns)])

model = make_pipeline(preprocessor, 
                      HistGradientBoostingClassifier())

cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

scores = cv_results["test_score"]
train_scores = cv_results["train_score"]
fit_time = cv_results["fit_time"]

print("The accuracy in TRAIN is "
      f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}")
print("The accuracy in TEST  is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy in TRAIN is 0.882 +/- 0.001
The accuracy in TEST  is 0.874 +/- 0.002, for 3.753 seconds

Gradient Boosting model + None + OneHotEncoder : still long

from sklearn.ensemble import HistGradientBoostingClassifier
# 
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse=False)

preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns)],
    remainder="passthrough")

model = make_pipeline(preprocessor, 
                      HistGradientBoostingClassifier())

cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

scores = cv_results["test_score"]
train_scores = cv_results["train_score"]
fit_time = cv_results["fit_time"]

print("The accuracy in TRAIN is "
      f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}")
print("The accuracy in TEST  is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy in TRAIN is 0.881 +/- 0.001
The accuracy in TEST  is 0.874 +/- 0.002, for 3.409 seconds
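
As a side check, we can confirm that remainder="passthrough" forwards the numerical columns untouched (there is no scaling step at all). The snippet below is a sketch relying on get_feature_names_out, available in recent scikit-learn versions.

preprocessor.fit(data)
print(preprocessor.get_feature_names_out())
# Passed-through columns appear with a "remainder__" prefix (e.g. "remainder__age"),
# confirming that they reach the model without any transformation.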

Gradient Boosting model + None + OrdinalEncoder : good (the best here)

For tree-based models, the handling of numerical and categorical variables is simpler than for linear models:

  • we do not need to scale the numerical features
  • using an ordinal encoding for the categorical variables is fine even if the encoding results in an arbitrary ordering

We can observe that we get significantly higher accuracies with the Gradient Boosting model. This is often what we observe whenever the dataset has a large number of samples and a limited number of informative features (e.g. less than 1000) with a mix of numerical and categorical variables.

This explains why Gradient Boosting Machines are very popular among data science practitioners who work with tabular data.

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessor, categorical_columns)],
    remainder="passthrough")

model = make_pipeline(preprocessor, 
                      HistGradientBoostingClassifier())

cv_results = cross_validate(model, data, target, cv=cv, return_train_score=True)

scores = cv_results["test_score"]
train_scores = cv_results["train_score"]
fit_time = cv_results["fit_time"]

print("The accuracy in TRAIN is "
      f"{train_scores.mean():.3f} +/- {train_scores.std():.3f}")
print("The accuracy in TEST  is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy in TRAIN is 0.882 +/- 0.001
The accuracy in TEST  is 0.874 +/- 0.002, for 1.504 seconds
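
To make the comparison easier to read, the recap below simply collects the numbers printed above into a small table (a convenience sketch: the values are copied from the cross-validation outputs, not re-computed).

summary = pd.DataFrame(
    {
        "test accuracy": [0.803, 0.853, 0.874, 0.874, 0.874],
        "fit time (s)": [0.446, 0.836, 3.753, 3.409, 1.504],
    },
    index=[
        "LogisticRegression + StandardScaler + OrdinalEncoder",
        "LogisticRegression + StandardScaler + OneHotEncoder",
        "HistGradientBoosting + StandardScaler + OneHotEncoder",
        "HistGradientBoosting + passthrough + OneHotEncoder",
        "HistGradientBoosting + passthrough + OrdinalEncoder",
    ],
)
print(summary)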