Loading

import pandas as pd
import numpy as np
import time

myData = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
# Drop "education-num": it is a redundant ordinal encoding of "education"
myData = myData.drop(columns="education-num")
myData.head()
age workclass education marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country class
0 25 Private 11th Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K
1 38 Private HS-grad Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K
2 28 Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K
3 44 Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K
4 18 ? Some-college Never-married ? Own-child White Female 0 0 30 United-States <=50K
print(f"The dataset data contains {myData.shape[0]} samples and {myData.shape[1]} features")
The dataset data contains 48842 samples and 13 features
target_column = 'class'
target = myData[target_column]
data = myData.drop(columns=target_column)
myData.dtypes
age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object
from sklearn.compose import make_column_selector as selector
# Select numerical and categorical columns based on their dtype
numerical_columns = selector(dtype_exclude=object)(data)
categorical_columns = selector(dtype_include=object)(data)
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]
print(f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features")
The dataset data contains 48842 samples and 12 features
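As a side note, the same dtype-based selection can be done with pandas alone; a minimal sketch, equivalent to the make_column_selector calls above (the variable names num_cols_pd and cat_cols_pd are just for illustration):

# Equivalent column selection using pandas' own dtype filtering
num_cols_pd = data.select_dtypes(exclude="object").columns.tolist()
cat_cols_pd = data.select_dtypes(include="object").columns.tolist()
# Both approaches should agree since data's columns were reordered above
assert num_cols_pd == numerical_columns and cat_cols_pd == categorical_columns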

Split the data into train and test sets

from sklearn.model_selection import train_test_split
# Hold out 25% of the numerical data for testing
data_train, data_test, target_train, target_test = train_test_split(
    data_numerical, target, random_state=42, test_size=0.25)
print(f"Number of samples in testing: {data_train.shape[0]} => "
      f"{data_train.shape[0] / data_numerical.shape[0] * 100:.1f}% of the"
      f" original set")
Number of samples in testing: 36631 => 75.0% of the original set
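Since the split is not stratified, it is worth checking that the train and test subsets keep similar class proportions; a quick sanity check (a minimal sketch, output not shown here):

# Compare the class proportions in the two target subsets
print(target_train.value_counts(normalize=True))
print(target_test.value_counts(normalize=True))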
data_train.describe()
                 age  capital-gain  capital-loss  hours-per-week
count   36631.000000  36631.000000  36631.000000    36631.000000
mean       38.642352   1087.077721     89.665311       40.431247
std        13.725748   7522.692939    407.110175       12.423952
min        17.000000      0.000000      0.000000        1.000000
25%        28.000000      0.000000      0.000000       40.000000
50%        37.000000      0.000000      0.000000       40.000000
75%        48.000000      0.000000      0.000000       45.000000
max        90.000000  99999.000000   4356.000000       99.000000

LogisticRegression without preprocessing

from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the raw (unscaled) numerical features
model = LogisticRegression()
start = time.time()
model.fit(data_train, target_train)
elapsed_time = time.time() - start

# Evaluate on the held-out test set; score() returns the accuracy
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model.n_iter_[0]} iterations")
The accuracy using a LogisticRegression is 0.807 with a fitting time of 0.168 seconds in 59 iterations
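Note that the 59 iterations reported above are not far from LogisticRegression's default max_iter of 100; on unscaled data the lbfgs solver can hit that cap and raise a ConvergenceWarning. A quick way to check (a minimal sketch):

# If n_iter_ reaches max_iter, the solver stopped before converging
print(model.n_iter_[0], model.max_iter)  # here: 59 vs. the default 100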

Preprocessing the training data

The describe() output above shows features on very different scales (capital-gain reaches 99999 while age tops out at 90). Just to see what the scaler does to the data:

from sklearn.preprocessing import StandardScaler
# Standardize each feature to zero mean and unit variance
scaler = StandardScaler()
data_train_scaled = scaler.fit_transform(data_train)
data_train_scaled = pd.DataFrame(data_train_scaled,
                                 columns=data_train.columns)
data_train_scaled.describe()
                 age  capital-gain  capital-loss  hours-per-week
count   3.663100e+04  3.663100e+04  3.663100e+04    3.663100e+04
mean   -2.273364e-16  3.530310e-17  3.840667e-17    1.844684e-16
std     1.000014e+00  1.000014e+00  1.000014e+00    1.000014e+00
min    -1.576792e+00 -1.445084e-01 -2.202513e-01   -3.173852e+00
25%    -7.753674e-01 -1.445084e-01 -2.202513e-01   -3.471139e-02
50%    -1.196565e-01 -1.445084e-01 -2.202513e-01   -3.471139e-02
75%     6.817680e-01 -1.445084e-01 -2.202513e-01    3.677425e-01
max     3.741752e+00  1.314865e+01  1.047970e+01    4.714245e+00
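The means are now numerically zero (values around 1e-16 are floating-point round-off) and the standard deviations are essentially one. describe() reports 1.000014 rather than exactly 1 because pandas uses the unbiased estimator (ddof=1) while StandardScaler divides by the biased one (ddof=0); a quick sanity check (a minimal sketch):

# Means should be ~0 and the biased (ddof=0) stds should be ~1
print(np.allclose(data_train_scaled.mean(), 0.0, atol=1e-12))  # expect True
print(np.allclose(data_train_scaled.std(ddof=0), 1.0))         # expect True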

LogisticRegression with preprocessing via pipeline

Same accuracy, but a shorter fitting time: standardized features let the lbfgs solver converge in far fewer iterations

from sklearn.pipeline import make_pipeline

# Chain the scaler and the logistic regression into a single estimator
model = make_pipeline(StandardScaler(), LogisticRegression())
start = time.time()
model.fit(data_train, target_train)
elapsed_time = time.time() - start

# model is now a Pipeline; its last step, model[-1], is the fitted
# LogisticRegression, which carries the n_iter_ attribute
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model[-1].n_iter_[0]} iterations")
The accuracy using a Pipeline is 0.807 with a fitting time of 0.066 seconds in 12 iterations
model.named_steps
{'standardscaler': StandardScaler(),
 'logisticregression': LogisticRegression()}
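The fitted steps can also be retrieved by name to inspect what they learned during fit; a minimal sketch (mean_ and scale_ are the scaler's standard fitted attributes):

# Grab the fitted scaler from the pipeline and look at its statistics
fitted_scaler = model.named_steps["standardscaler"]
print(fitted_scaler.mean_)   # per-feature means learned on the training set
print(fitted_scaler.scale_)  # per-feature standard deviations used for scaling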