Loading

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
import time
myData = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
# Drop "education-num": it duplicates the information in the "education" column
myData = myData.drop(columns="education-num")
print(f"The dataset data contains {myData.shape[0]} samples and {myData.shape[1]} features")
The dataset data contains 48842 samples and 13 features
target_column = 'class'
target = myData[target_column]
data = myData.drop(columns=target_column)
from sklearn.compose import make_column_selector as selector
# Select columns by dtype: object columns hold the categorical variables, the rest are numerical
numerical_columns = selector(dtype_exclude=object)(data)
categorical_columns = selector(dtype_include=object)(data)
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]

Identify categorical variables

data_categorical
workclass education marital-status occupation relationship race sex native-country
0 Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
1 Private HS-grad Married-civ-spouse Farming-fishing Husband White Male United-States
2 Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male United-States
3 Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male United-States
4 ? Some-college Never-married ? Own-child White Female United-States
... ... ... ... ... ... ... ... ...
48837 Private Assoc-acdm Married-civ-spouse Tech-support Wife White Female United-States
48838 Private HS-grad Married-civ-spouse Machine-op-inspct Husband White Male United-States
48839 Private HS-grad Widowed Adm-clerical Unmarried White Female United-States
48840 Private HS-grad Never-married Adm-clerical Own-child White Male United-States
48841 Self-emp-inc HS-grad Married-civ-spouse Exec-managerial Wife White Female United-States

48842 rows × 8 columns

data_categorical["native-country"].value_counts()
 United-States                 43832
 Mexico                          951
 ?                               857
 Philippines                     295
 Germany                         206
 Puerto-Rico                     184
 Canada                          182
 El-Salvador                     155
 India                           151
 Cuba                            138
 England                         127
 China                           122
 South                           115
 Jamaica                         106
 Italy                           105
 Dominican-Republic              103
 Japan                            92
 Guatemala                        88
 Poland                           87
 Vietnam                          86
 Columbia                         85
 Haiti                            75
 Portugal                         67
 Taiwan                           65
 Iran                             59
 Greece                           49
 Nicaragua                        49
 Peru                             46
 Ecuador                          45
 France                           38
 Ireland                          37
 Hong                             30
 Thailand                         30
 Cambodia                         28
 Trinadad&Tobago                  27
 Laos                             23
 Yugoslavia                       23
 Outlying-US(Guam-USVI-etc)       23
 Scotland                         21
 Honduras                         20
 Hungary                          19
 Holand-Netherlands                1
Name: native-country, dtype: int64
data_categorical["native-country"].value_counts().sort_index()
 ?                               857
 Cambodia                         28
 Canada                          182
 China                           122
 Columbia                         85
 Cuba                            138
 Dominican-Republic              103
 Ecuador                          45
 El-Salvador                     155
 England                         127
 France                           38
 Germany                         206
 Greece                           49
 Guatemala                        88
 Haiti                            75
 Holand-Netherlands                1
 Honduras                         20
 Hong                             30
 Hungary                          19
 India                           151
 Iran                             59
 Ireland                          37
 Italy                           105
 Jamaica                         106
 Japan                            92
 Laos                             23
 Mexico                          951
 Nicaragua                        49
 Outlying-US(Guam-USVI-etc)       23
 Peru                             46
 Philippines                     295
 Poland                           87
 Portugal                         67
 Puerto-Rico                     184
 Scotland                         21
 South                           115
 Taiwan                           65
 Thailand                         30
 Trinadad&Tobago                  27
 United-States                 43832
 Vietnam                          86
 Yugoslavia                       23
Name: native-country, dtype: int64

Encoding ordinal categories

Using an OrdinalEncoder will output ordinal categories.

This means that there is an order in the resulting categories (e.g. 0 < 1 < 2). The impact of violating this ordering assumption depends on the downstream model: linear models are affected by misordered categories, while tree-based models are not.

OrdinalEncoder is often a good strategy with tree-based models

You can still use an OrdinalEncoder with linear models, but you need to be sure that:
- the original categories (before encoding) have a meaningful order;
- the encoded categories follow the same order as the original categories (see the sketch just below).
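
As a sketch of how to make the encoded order match a meaningful order, the categories parameter of OrdinalEncoder lets you state that order explicitly. This uses a small hypothetical "size" column, not the census data:

from sklearn.preprocessing import OrdinalEncoder
sizes = pd.DataFrame({"size": ["M", "L", "S", "M"]})
# Force S -> 0, M -> 1, L -> 2 instead of the default alphabetical order (L, M, S)
size_encoder = OrdinalEncoder(categories=[["S", "M", "L"]])
size_encoder.fit_transform(sizes)
# expected: array([[1.], [2.], [0.], [1.]])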

categorical_column = data_categorical[["workclass"]]
categorical_column.value_counts()
workclass        
 Private             33906
 Self-emp-not-inc     3862
 Local-gov            3136
 ?                    2799
 State-gov            1981
 Self-emp-inc         1695
 Federal-gov          1432
 Without-pay            21
 Never-worked           10
dtype: int64
from sklearn.preprocessing import OrdinalEncoder
# Encode each category as an integer; by default the categories are sorted alphabetically
encoder = OrdinalEncoder()
categorical_encoded = encoder.fit_transform(categorical_column)
categorical_encoded[:5]
array([[4.],
       [4.],
       [2.],
       [4.],
       [0.]])
categorical_column[:5]
workclass
0 Private
1 Private
2 Local-gov
3 Private
4 ?
encoder.categories_
[array([' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
        ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
       dtype=object)]

Encoding nominal categories (without assuming any order)

Each category (unique value) becomes a column; for each sample, the encoding puts a 1 in the column of the category it belongs to and 0 in all the other columns.

OneHotEncoder is the encoding strategy to use when the downstream model is a linear model

One-hot encoding categorical variables with high cardinality can make tree-based models computationally inefficient. Because of this, OneHotEncoder is not recommended in such cases, even if the original categories have no natural order (a quick sketch of the column blow-up follows).
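
As a minimal sketch of this blow-up, reusing the data_categorical frame defined above, compare how many columns each encoder produces for the high-cardinality native-country column:

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
high_cardinality = data_categorical[["native-country"]]
# One column per country (42 of them, per the value_counts above) versus a single integer-coded column
print(OneHotEncoder().fit_transform(high_cardinality).shape[1])
print(OrdinalEncoder().fit_transform(high_cardinality).shape[1])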

One category

from sklearn.preprocessing import OneHotEncoder
# One column per category; dense output requested here for easier inspection
encoder = OneHotEncoder(sparse_output=False)  # parameter named "sparse" in scikit-learn < 1.2
categorical_column = data_categorical[["workclass"]]
categorical_encoded = encoder.fit_transform(categorical_column)
categorical_encoded[:5]
array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.]])

sparse_output=False (named sparse in scikit-learn versions before 1.2) is used in the OneHotEncoder for didactic purposes, namely easier visualization of the data. Sparse matrices are efficient data structures when most of the matrix elements are zero.
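
As a small illustration of that default sparse behaviour (a sketch reusing the workclass column from above):

sparse_encoder = OneHotEncoder()  # default: returns a SciPy sparse matrix
sparse_matrix = sparse_encoder.fit_transform(categorical_column)
type(sparse_matrix)               # exact sparse class depends on the scikit-learn version
sparse_matrix[:5].toarray()       # densify only the rows you want to inspect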

feature_names = encoder.get_feature_names_out(input_features=["workclass"])
categorical_encoded = pd.DataFrame(categorical_encoded, columns=feature_names)
categorical_encoded[:5]
workclass_ ? workclass_ Federal-gov workclass_ Local-gov workclass_ Never-worked workclass_ Private workclass_ Self-emp-inc workclass_ Self-emp-not-inc workclass_ State-gov workclass_ Without-pay
0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

Let's apply this encoding to the full set of categorical columns

data_categorical.head()
workclass education marital-status occupation relationship race sex native-country
0 Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
1 Private HS-grad Married-civ-spouse Farming-fishing Husband White Male United-States
2 Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male United-States
3 Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male United-States
4 ? Some-college Never-married ? Own-child White Female United-States
data_encoded = encoder.fit_transform(data_categorical)
data_encoded[:2]
array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0.]])
columns_encoded = encoder.get_feature_names_out(data_categorical.columns)
pd.DataFrame(data_encoded, columns=columns_encoded)[:2]
workclass_ ? workclass_ Federal-gov workclass_ Local-gov workclass_ Never-worked workclass_ Private workclass_ Self-emp-inc workclass_ Self-emp-not-inc workclass_ State-gov workclass_ Without-pay education_ 10th ... native-country_ Portugal native-country_ Puerto-Rico native-country_ Scotland native-country_ South native-country_ Taiwan native-country_ Thailand native-country_ Trinadad&Tobago native-country_ United-States native-country_ Vietnam native-country_ Yugoslavia
0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

2 rows × 102 columns

LogisticRegression on categorical variables

OneHotEncoder + LogisticRegression = good

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
model = make_pipeline(OneHotEncoder(handle_unknown="ignore"), LogisticRegression(max_iter=500))

cv_results = cross_validate(model, data_categorical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print("The accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy is 0.833 +/- 0.003, for 0.669 seconds

OrdinalEncoder + LogisticRegression = not so good

# use_encoded_value maps categories unseen during fit to the arbitrary code 100
model = make_pipeline(OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=100), 
                      LogisticRegression(max_iter=500))

cv_results = cross_validate(model, data_categorical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print("The accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy is 0.755 +/- 0.002, for 0.330 seconds
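
For completeness, since OrdinalEncoder is often a good strategy with tree-based models (see above), here is a hedged sketch of that combination, reusing the imports above. HistGradientBoostingClassifier is one possible tree-based choice; the sketch is not run here, so no scores are reported:

from sklearn.ensemble import HistGradientBoostingClassifier
# Trees only use the integer codes to choose split points, so the arbitrary
# alphabetical ordering produced by OrdinalEncoder is not a problem here.
tree_model = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    HistGradientBoostingClassifier(),
)
cv_results = cross_validate(tree_model, data_categorical, target, cv=10)
print("The accuracy is "
      f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")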