Using numerical and categorical variables together
We will show how to combine preprocessing steps on numerical and categorical variables, and how to assemble them into a single predictive pipeline.
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Note: scikit-learn < 1.0 additionally required
# "from sklearn.experimental import enable_hist_gradient_boosting"
# before importing HistGradientBoostingClassifier.
adult_census = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
# Drop "education-num": it duplicates the information in the "education" column.
adult_census = adult_census.drop(columns="education-num")
target_column = "class"
target = adult_census[target_column]

# Inspect the class balance of the target.
class_proportions = target.value_counts(normalize=True)
class_proportions
class_proportions.plot(kind="pie", label="target");
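Given this imbalance, the majority-class proportion is a useful reference point when reading the cross-validated accuracies below; a quick check, reusing class_proportions from above:

# Accuracy that a constant majority-class predictor would reach.
print(f"Majority-class baseline accuracy: {class_proportions.max():.3f}")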
data = adult_census.drop(columns=target_column)
print(f"The dataset contains {data.shape[0]} samples and {data.shape[1]} features")
data.dtypes

# Select column names by dtype: here the object columns hold the categorical values.
numerical_columns = selector(dtype_exclude=object)(data)
categorical_columns = selector(dtype_include=object)(data)

# Reorder the columns so that the numerical features come first.
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
print(f"The dataset still contains {data.shape[0]} samples and {data.shape[1]} features")

# Split the dataframe into its numerical and categorical parts.
data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]
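As a sanity check, we can look at what each selector picked (pure inspection, not required by the pipelines below):

print(f"{len(numerical_columns)} numerical columns: {numerical_columns}")
print(f"{len(categorical_columns)} categorical columns: {categorical_columns}")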
We start with a baseline that uses only the numerical features. A linear model benefits from scaled inputs, so we pipeline a StandardScaler before the LogisticRegression.

model = make_pipeline(
    StandardScaler(),
    LogisticRegression())
cv_results = cross_validate(model, data_numerical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy is {scores.mean():.3f} +/- {scores.std():.3f}, "
      f"with an average fit time of {fit_time.mean():.3f} seconds")
Next we evaluate the categorical features alone, one-hot encoded so that the linear model does not assume any ordering between categories. With handle_unknown="ignore", a category unseen during fit is encoded as all zeros.

model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=500))
cv_results = cross_validate(model, data_categorical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy is {scores.mean():.3f} +/- {scores.std():.3f}, "
      f"with an average fit time of {fit_time.mean():.3f} seconds")
For comparison, the same linear model with an OrdinalEncoder. This imposes an arbitrary ordering on the categories, which a linear model takes at face value, so we expect it to perform worse than one-hot encoding here.

model = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    LogisticRegression(max_iter=500))
cv_results = cross_validate(model, data_categorical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy is {scores.mean():.3f} +/- {scores.std():.3f}, "
      f"with an average fit time of {fit_time.mean():.3f} seconds")
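The difference is easier to understand by comparing the dimensionality each encoder produces; a quick inspection, reusing data_categorical from above:

ohe = OneHotEncoder(handle_unknown="ignore")
ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
# One column per category vs. one column per original feature.
print(ohe.fit_transform(data_categorical).shape)
print(ordinal.fit_transform(data_categorical).shape)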
Now we combine both kinds of preprocessing with a ColumnTransformer, which dispatches each group of columns to its own transformer and concatenates the results before the classifier.

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()
preprocessor = ColumnTransformer([
    ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ("standard-scaler", numerical_preprocessor, numerical_columns)])
model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=1500))
cv_results = cross_validate(model, data, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy is {scores.mean():.3f} +/- {scores.std():.3f}, "
      f"with an average fit time of {fit_time.mean():.3f} seconds")
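cross_validate only reports scores; to obtain a single fitted model we can also train on a held-out split. A minimal sketch using the train_test_split imported above (the test fraction and random_state are arbitrary choices):

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.25, random_state=42)
_ = model.fit(data_train, target_train)
print(f"Held-out accuracy: {model.score(data_test, target_test):.3f}")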
Ordinal encoding + gradient-boosted trees: the best combination here
For tree-based models, handling numerical and categorical variables together is simpler than for linear models:
- numerical features do not need to be scaled;
- an ordinal encoding of the categorical variables is fine, even though it induces an arbitrary ordering: a tree can still isolate any category with successive threshold splits.
categorical_preprocessor_tree = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1)
# Encode only the categorical columns; the numerical ones pass through untouched.
preprocessor_tree = ColumnTransformer([
    ("categorical", categorical_preprocessor_tree, categorical_columns)],
    remainder="passthrough")
model = make_pipeline(
    preprocessor_tree,
    HistGradientBoostingClassifier())
cv_results = cross_validate(model, data, target, cv=10, return_estimator=True)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(f"The accuracy is {scores.mean():.3f} +/- {scores.std():.3f}, "
      f"with an average fit time of {fit_time.mean():.3f} seconds")
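Since we passed return_estimator=True, cross_validate also returns the ten fitted pipelines. A quick sketch of how one might inspect them (the n_iter_ attribute reports how many boosting iterations each fold used):

for fold, fitted_pipeline in enumerate(cv_results["estimator"]):
    hgb = fitted_pipeline[-1]  # the HistGradientBoostingClassifier step
    print(f"fold {fold}: {hgb.n_iter_} boosting iterations")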