Preparation

import pandas as pd
import time
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
myDataFrame = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
# Drop "education-num": it is redundant with the "education" column
myDataFrame = myDataFrame.drop(columns="education-num")

The target

target_column = 'class'
target = myDataFrame[target_column]
target.value_counts()
 <=50K    37155
 >50K     11687
Name: class, dtype: int64
target.value_counts(normalize=True)
 <=50K    0.760718
 >50K     0.239282
Name: class, dtype: float64
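
The class proportions give a reference point: a model that always predicts " <=50K" already reaches about 76% accuracy. A minimal baseline sketch (the use of DummyClassifier here is my addition, not part of the original notebook):

from sklearn.dummy import DummyClassifier

# Baseline: always predict the most frequent class (" <=50K"); ignores the features entirely
baseline = DummyClassifier(strategy="most_frequent")
cv_baseline = cross_validate(baseline, myDataFrame.drop(columns=target_column), target, cv=10)
print(f"Baseline accuracy: {cv_baseline['test_score'].mean():.3f}")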

Preparation, continued: the data

data = myDataFrame.drop(columns=target_column)
print(f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features")
The dataset data contains 48842 samples and 12 features
data.dtypes
age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object
numerical_columns = selector(dtype_include="int64")(data)
categorical_columns = selector(dtype_include="object")(data)
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
print(f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features")
The dataset data contains 48842 samples and 12 features
data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]

Numerical features

data_n_train, data_n_test, target_n_train, target_n_test = train_test_split(
    data_numerical, 
    target, 
    # random_state=42,  # left commented out, so the split changes between runs
    test_size=0.25)

Cross-validation + normalization

model = make_pipeline(
    StandardScaler(), 
    LogisticRegression())
cv_results = cross_validate(model, data_numerical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print("The accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy is 0.800 +/- 0.004, for 0.074 seconds
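
The held-out split created above can complement the cross-validation with a single train/test evaluation of the same pipeline; a minimal sketch (output omitted since random_state was left unset and the split changes between runs):

# Fit on the training split, then score once on the held-out test split
model_holdout = make_pipeline(StandardScaler(), LogisticRegression())
model_holdout.fit(data_n_train, target_n_train)
print(f"Held-out accuracy: {model_holdout.score(data_n_test, target_n_test):.3f}")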

Categorical features

Encoding ordinal categories

data_categorical[:5]
   workclass  education     marital-status      occupation         relationship  race   sex     native-country
0  Private    11th          Never-married       Machine-op-inspct  Own-child     Black  Male    United-States
1  Private    HS-grad       Married-civ-spouse  Farming-fishing    Husband       White  Male    United-States
2  Local-gov  Assoc-acdm    Married-civ-spouse  Protective-serv    Husband       White  Male    United-States
3  Private    Some-college  Married-civ-spouse  Machine-op-inspct  Husband       Black  Male    United-States
4  ?          Some-college  Never-married       ?                  Own-child     White  Female  United-States
encoder_ordinal = OrdinalEncoder()
data_encoded_ordinal = encoder_ordinal.fit_transform(data_categorical)
data_encoded_ordinal[:5]
array([[ 4.,  1.,  4.,  7.,  3.,  2.,  1., 39.],
       [ 4., 11.,  2.,  5.,  0.,  4.,  1., 39.],
       [ 2.,  7.,  2., 11.,  0.,  4.,  1., 39.],
       [ 4., 15.,  2.,  7.,  0.,  2.,  1., 39.],
       [ 0., 15.,  4.,  0.,  3.,  4.,  0., 39.]])
encoder_ordinal.categories_
[array([' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
        ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
       dtype=object),
 array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object),
 array([' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
        ' Married-spouse-absent', ' Never-married', ' Separated',
        ' Widowed'], dtype=object),
 array([' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair',
        ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
        ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
        ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',
        ' Transport-moving'], dtype=object),
 array([' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
        ' Unmarried', ' Wife'], dtype=object),
 array([' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
        ' White'], dtype=object),
 array([' Female', ' Male'], dtype=object),
 array([' ?', ' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba',
        ' Dominican-Republic', ' Ecuador', ' El-Salvador', ' England',
        ' France', ' Germany', ' Greece', ' Guatemala', ' Haiti',
        ' Holand-Netherlands', ' Honduras', ' Hong', ' Hungary', ' India',
        ' Iran', ' Ireland', ' Italy', ' Jamaica', ' Japan', ' Laos',
        ' Mexico', ' Nicaragua', ' Outlying-US(Guam-USVI-etc)', ' Peru',
        ' Philippines', ' Poland', ' Portugal', ' Puerto-Rico',
        ' Scotland', ' South', ' Taiwan', ' Thailand', ' Trinadad&Tobago',
        ' United-States', ' Vietnam', ' Yugoslavia'], dtype=object)]
print(f"The dataset original contains {data_categorical.shape[1]} features")
The dataset original contains 8 features
print(f"The dataset encoded contains {data_encoded_ordinal.shape[1]} features")
The dataset encoded contains 8 features
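
By default OrdinalEncoder assigns integer codes in lexicographic order, which is arbitrary. When a feature has a real order, as education does, it can be supplied explicitly; the ranking below is an illustrative choice of mine, not something stated in the dataset:

# Encode only "education", supplying a meaningful order instead of the default one
education_order = [
    " Preschool", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", " 10th",
    " 11th", " 12th", " HS-grad", " Some-college", " Assoc-voc",
    " Assoc-acdm", " Bachelors", " Masters", " Prof-school", " Doctorate",
]
encoder_education = OrdinalEncoder(categories=[education_order])
encoder_education.fit_transform(data_categorical[["education"]])[:5]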

One-hot encoding nominal categories (without assuming any order)

data_categorical[:2]
   workclass  education  marital-status      occupation         relationship  race   sex   native-country
0  Private    11th       Never-married       Machine-op-inspct  Own-child     Black  Male  United-States
1  Private    HS-grad    Married-civ-spouse  Farming-fishing    Husband       White  Male  United-States
encoder_onehot = OneHotEncoder(sparse=False)  # use sparse_output=False with scikit-learn >= 1.2
data_encoded_onehot = encoder_onehot.fit_transform(data_categorical)
data_encoded_onehot[:2]
array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0.]])
print(f"The dataset original contains {data_categorical.shape[1]} features")
The dataset original contains 8 features
print(f"The dataset encoded contains {data_encoded_onehot.shape[1]} features")
The dataset encoded contains 102 features
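
The 102 encoded columns are simply the total number of categories across the 8 features (9 + 16 + 7 + 15 + 6 + 5 + 2 + 42 = 102). The fitted encoder can name each column (get_feature_names_out requires scikit-learn >= 1.0):

# Each one-hot column corresponds to one (feature, category) pair
feature_names = encoder_onehot.get_feature_names_out()
print(f"{len(feature_names)} encoded columns, e.g. {feature_names[:3]}")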

Choosing an encoding strategy

One-hot encoding + logistic regression = good

model_oneHotLin = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"), 
    LogisticRegression(max_iter=1500)
)
cv_results = cross_validate(model_oneHotLin, data_categorical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print("The accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy is 0.833 +/- 0.003, for 0.750 seconds
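
handle_unknown="ignore" matters here: some categories are very rare (I believe " Holand-Netherlands" occurs only once in this dataset), so a cross-validation training fold can miss a category that then appears in its test fold. A quick way to check:

# Rare categories explain why unknown values must be tolerated at transform time
data_categorical["native-country"].value_counts().tail()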

Ordinal encoding + logistic regression = not good (the arbitrary integer codes impose an order that the linear model takes at face value)

model_ordLin = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=100), 
    LogisticRegression(max_iter=500)
)
cv_results = cross_validate(model_ordLin, data_categorical, target, cv=10)
scores = cv_results["test_score"]
fit_time = cv_results["fit_time"]
print("The accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds")
The accuracy is 0.755 +/- 0.002, for 0.363 seconds
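
For reference, this is below the accuracy a constant majority-class predictor reaches, which follows directly from the class proportions shown at the top:

# About 0.761, i.e. higher than the 0.755 obtained with ordinal encoding + logistic regression
print(f"Majority-class baseline: {target.value_counts(normalize=True).max():.3f}")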