Preprocessing for numerical features (v2)
How to build predictive models on tabular datasets, with only numerical features
- Loading
- Split data into train and test sets
- LogisticRegression without preprocessing
- Preprocessing on the training data
- LogisticRegression with preprocessing via pipeline
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
import time
# Load the adult-census CSV and drop the "education-num" column in one chain.
myData = pd.read_csv(
    "../../scikit-learn-mooc/datasets/adult-census.csv"
).drop(columns="education-num")
# Notebook-style preview: the result is only displayed in an interactive
# session and is discarded when this runs as a plain script.
myData.head()
print(f"The dataset data contains {myData.shape[0]} samples and {myData.shape[1]} features")

# Separate the prediction target (the 'class' column) from the feature columns.
# myData itself keeps every column, including the target.
target_column = 'class'
data = myData.drop(columns=target_column)
target = myData[target_column]
# Notebook-style inspection of the column dtypes (no effect in a script).
myData.dtypes
from sklearn.compose import make_column_selector as selector

# Build one selector per dtype family and apply each to the feature table:
# columns with a non-object dtype are treated as numerical, columns with an
# object dtype as categorical.
pick_numerical = selector(dtype_exclude=object)
pick_categorical = selector(dtype_include=object)
numerical_columns = pick_numerical(data)
categorical_columns = pick_categorical(data)

# Reorder the feature table so numerical columns come first, followed by the
# categorical ones, then keep a separate frame for each group.
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
data_categorical = data[categorical_columns]
data_numerical = data[numerical_columns]
print(f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features")
from sklearn.model_selection import train_test_split

# Hold out 25% of the numerical features (and the matching targets) for
# testing; the fixed random_state makes the split reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data_numerical, target, random_state=42, test_size=0.25)
# Report the size of the held-out *testing* partition, so the count and the
# percentage must come from data_test (not data_train).
print(f"Number of samples in testing: {data_test.shape[0]} => "
      f"{data_test.shape[0] / data_numerical.shape[0] * 100:.1f}% of the"
      f" original set")
# Notebook-style summary statistics of the training features (the result is
# only displayed in an interactive session).
data_train.describe()
from sklearn.linear_model import LogisticRegression

# Baseline: LogisticRegression fitted directly on the numerical training
# features, without any preprocessing step.
model = LogisticRegression()
# time.perf_counter() is the clock intended for measuring elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments.
start = time.perf_counter()
model.fit(data_train, target_train)
elapsed_time = time.perf_counter() - start
# Evaluate on the held-out set once; `accuracy` and `score` are two names for
# the same value, so the prediction pass is not repeated.
accuracy = score = model.score(data_test, target_test)
model_name = model.__class__.__name__
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model.n_iter_[0]} iterations")
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then transform those same features.
scaler = StandardScaler()
scaled_values = scaler.fit(data_train).transform(data_train)
# Wrap the scaled values in a DataFrame that carries the original column names
# (the row index is not passed along, so a default index is used).
data_train_scaled = pd.DataFrame(scaled_values, columns=data_train.columns)
# Notebook-style summary statistics of the scaled training features.
data_train_scaled.describe()
from sklearn.pipeline import make_pipeline

# LogisticRegression with preprocessing: StandardScaler and the classifier are
# chained in a single pipeline, so scaling is applied inside fit() and score().
model = make_pipeline(StandardScaler(), LogisticRegression())
# time.perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()
model.fit(data_train, target_train)
elapsed_time = time.perf_counter() - start
# Evaluate on the held-out set once; `accuracy` and `score` are two names for
# the same value, so the prediction pass is not repeated.
accuracy = score = model.score(data_test, target_test)
# Note: this is the class name of the pipeline object itself, not of its
# final estimator.
model_name = model.__class__.__name__
# model[-1] is the last pipeline step (the LogisticRegression), which holds
# the iteration count.
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model[-1].n_iter_[0]} iterations")
# Notebook-style inspection of the pipeline steps by name (no effect in a
# plain script).
model.named_steps