Loading

import pandas as pd
import numpy as np
import time

myData = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
# Drop "education-num": it is a redundant ordinal encoding of "education"
myData = myData.drop(columns="education-num")
myData.head()
age workclass education marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country class
0 25 Private 11th Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K
1 38 Private HS-grad Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K
2 28 Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K
3 44 Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K
4 18 ? Some-college Never-married ? Own-child White Female 0 0 30 United-States <=50K
print(f"The dataset data contains {myData.shape[0]} samples and {myData.shape[1]} features")
The dataset data contains 48842 samples and 13 features
target_column = 'class'
target = myData[target_column]
data = myData.drop(columns=target_column)
myData.dtypes
age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object
from sklearn.compose import make_column_selector as selector
# Select numerical and categorical columns based on their dtype
numerical_columns = selector(dtype_exclude=object)(data)
categorical_columns = selector(dtype_include=object)(data)
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]
print(f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features")
The dataset data contains 48842 samples and 12 features
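As a side note, the same dtype-based selection can be done with pandas alone; a minimal sketch, equivalent to the make_column_selector calls above (the variable names num_cols_pd and cat_cols_pd are just for illustration):

# Equivalent column selection using pandas' own dtype filtering
num_cols_pd = data.select_dtypes(exclude="object").columns.tolist()
cat_cols_pd = data.select_dtypes(include="object").columns.tolist()
# Both approaches should agree since data's columns were reordered above
assert num_cols_pd == numerical_columns and cat_cols_pd == categorical_columns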

Split the data into train and test sets

from sklearn.model_selection import train_test_split
# Hold out 25% of the numerical data for testing
data_train, data_test, target_train, target_test = train_test_split(
    data_numerical, target, random_state=42, test_size=0.25)
print(f"Number of samples in testing: {data_train.shape[0]} => "
      f"{data_train.shape[0] / data_numerical.shape[0] * 100:.1f}% of the"
      f" original set")
Number of samples in testing: 36631 => 75.0% of the original set
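Since the split is not stratified, it is worth checking that the train and test subsets keep similar class proportions; a quick sanity check (a minimal sketch, output not shown here):

# Compare the class proportions in the two target subsets
print(target_train.value_counts(normalize=True))
print(target_test.value_counts(normalize=True))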
data_train.describe()
                 age  capital-gain  capital-loss  hours-per-week
count   36631.000000  36631.000000  36631.000000    36631.000000
mean       38.642352   1087.077721     89.665311       40.431247
std        13.725748   7522.692939    407.110175       12.423952
min        17.000000      0.000000      0.000000        1.000000
25%        28.000000      0.000000      0.000000       40.000000
50%        37.000000      0.000000      0.000000       40.000000
75%        48.000000      0.000000      0.000000       45.000000
max        90.000000  99999.000000   4356.000000       99.000000

LogisticRegression without preprocessing

from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the raw (unscaled) numerical features
model = LogisticRegression()
start = time.time()
model.fit(data_train, target_train)
elapsed_time = time.time() - start

# Evaluate on the held-out test set; score() returns the accuracy
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model.n_iter_[0]} iterations")
The accuracy using a LogisticRegression is 0.807 with a fitting time of 0.168 seconds in 59 iterations
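Note that the 59 iterations reported above are not far from LogisticRegression's default max_iter of 100; on unscaled data the lbfgs solver can hit that cap and raise a ConvergenceWarning. A quick way to check (a minimal sketch):

# If n_iter_ reaches max_iter, the solver stopped before converging
print(model.n_iter_[0], model.max_iter)  # here: 59 vs. the default 100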

Preprocessing the training data

The describe() output above shows features on very different scales (capital-gain reaches 99999 while age tops out at 90). Just to see what the scaler does to the data:

from sklearn.preprocessing import StandardScaler
# Standardize each feature to zero mean and unit variance
scaler = StandardScaler()
data_train_scaled = scaler.fit_transform(data_train)
data_train_scaled = pd.DataFrame(data_train_scaled,
                                 columns=data_train.columns)
data_train_scaled.describe()
                 age  capital-gain  capital-loss  hours-per-week
count   3.663100e+04  3.663100e+04  3.663100e+04    3.663100e+04
mean   -2.273364e-16  3.530310e-17  3.840667e-17    1.844684e-16
std     1.000014e+00  1.000014e+00  1.000014e+00    1.000014e+00
min    -1.576792e+00 -1.445084e-01 -2.202513e-01   -3.173852e+00
25%    -7.753674e-01 -1.445084e-01 -2.202513e-01   -3.471139e-02
50%    -1.196565e-01 -1.445084e-01 -2.202513e-01   -3.471139e-02
75%     6.817680e-01 -1.445084e-01 -2.202513e-01    3.677425e-01
max     3.741752e+00  1.314865e+01  1.047970e+01    4.714245e+00
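The means are now numerically zero (values around 1e-16 are floating-point round-off) and the standard deviations are essentially one. describe() reports 1.000014 rather than exactly 1 because pandas uses the unbiased estimator (ddof=1) while StandardScaler divides by the biased one (ddof=0); a quick sanity check (a minimal sketch):

# Means should be ~0 and the biased (ddof=0) stds should be ~1
print(np.allclose(data_train_scaled.mean(), 0.0, atol=1e-12))  # expect True
print(np.allclose(data_train_scaled.std(ddof=0), 1.0))         # expect True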

LogisticRegression with preprocessing via pipeline

Same accuracy, but a shorter fitting time: standardized features let the lbfgs solver converge in far fewer iterations

from sklearn.pipeline import make_pipeline

# Chain the scaler and the logistic regression into a single estimator
model = make_pipeline(StandardScaler(), LogisticRegression())
start = time.time()
model.fit(data_train, target_train)
elapsed_time = time.time() - start

# model is now a Pipeline; its last step, model[-1], is the fitted
# LogisticRegression, which carries the n_iter_ attribute
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model[-1].n_iter_[0]} iterations")
The accuracy using a Pipeline is 0.807 with a fitting time of 0.066 seconds in 12 iterations
model.named_steps
{'standardscaler': StandardScaler(),
 'logisticregression': LogisticRegression()}
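The fitted steps can also be retrieved by name to inspect what they learned during fit; a minimal sketch (mean_ and scale_ are the scaler's standard fitted attributes):

# Grab the fitted scaler from the pipeline and look at its statistics
fitted_scaler = model.named_steps["standardscaler"]
print(fitted_scaler.mean_)   # per-feature means learned on the training set
print(fitted_scaler.scale_)  # per-feature standard deviations used for scaling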