Preparation

import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn import set_config
set_config(display='diagram')
myDataFrame = pd.read_csv("../../scikit-learn-mooc/datasets/penguins_classification.csv")
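Before going further, a quick look at the first rows confirms that the file loaded as expected (an optional check, not part of the original run):

myDataFrame.head()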

The target

target_column = 'Species'
target = myDataFrame[target_column]
target.value_counts()
Adelie       151
Gentoo       123
Chinstrap     68
Name: Species, dtype: int64
target.value_counts(normalize=True)
Adelie       0.441520
Gentoo       0.359649
Chinstrap    0.198830
Name: Species, dtype: float64
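The classes are moderately imbalanced: Adelie is more than twice as frequent as Chinstrap. A visual check of the class distribution is sketched below (optional, and assuming matplotlib is installed):

import matplotlib.pyplot as plt

# Horizontal bar chart of the number of penguins per species
target.value_counts().plot.barh()
plt.xlabel("Number of penguins")
plt.show()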

Preparation (continued)

data = myDataFrame.drop(columns=target_column)
data.columns
Index(['Culmen Length (mm)', 'Culmen Depth (mm)'], dtype='object')
numerical_columns = ['Culmen Length (mm)', 'Culmen Depth (mm)']
data_numeric = data[numerical_columns]
data_train, data_test, target_train, target_test = train_test_split(
    data_numeric, 
    target, 
    #random_state=42, 
    test_size=0.25)
data_train.describe()
       Culmen Length (mm)  Culmen Depth (mm)
count          256.000000         256.000000
mean            43.847656          17.101953
std              5.381675           1.913470
min             32.100000          13.200000
25%             39.500000          15.675000
50%             43.900000          17.300000
75%             48.500000          18.600000
max             55.900000          21.500000
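Note that without random_state the split (and thus the statistics above) changes at every run. For a reproducible split that also preserves the class proportions of the target, the call can be extended as sketched below (the random_state value is an arbitrary choice, not the one used for the results above):

# Reproducible split that keeps the class proportions of the target
data_train, data_test, target_train, target_test = train_test_split(
    data_numeric,
    target,
    test_size=0.25,
    random_state=42,
    stratify=target)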

Model without normalization

model = LogisticRegression()
start = time.time()
model.fit(data_train, target_train)
elapsed_time = time.time() - start
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model.n_iter_[0]} iterations")
The accuracy using a LogisticRegression is 0.953 with a fitting time of 0.029 seconds in 62 iterations
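The two features live on quite different scales: describe() above shows a mean culmen length around 44 mm with a standard deviation of about 5.4, versus a depth around 17 mm with a standard deviation of about 1.9. This poor conditioning tends to slow down the lbfgs solver that LogisticRegression uses by default, which is what the next section addresses with a StandardScaler. A quick optional check of the raw spreads:

# Standard deviation of each raw feature on the training set
print(data_train.std())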

Model with normalization: Pipeline

With scaling, the solver converges in fewer iterations

model = make_pipeline(StandardScaler(), LogisticRegression())
start = time.time()
model.fit(data_train, target_train)
elapsed_time = time.time() - start
predicted_target = model.predict(data_test)
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(f"The accuracy using a {model_name} is {score:.3f} "
      f"with a fitting time of {elapsed_time:.3f} seconds "
      f"in {model[-1].n_iter_[0]} iterations")
The accuracy using a Pipeline is 0.953 with a fitting time of 0.012 seconds in 14 iterations
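The scaler fitted inside the pipeline can be inspected afterwards to see which means and standard deviations were learned on the training set (a small sketch, using the same step-indexing as model[-1] above):

# Inspect the StandardScaler fitted inside the pipeline (model[0] is the first step)
scaler = model[0]
print("Learned means:", scaler.mean_)
print("Learned standard deviations:", scaler.scale_)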

Cross-validation

model = make_pipeline(StandardScaler(), LogisticRegression())
cv_result = cross_validate(model, data_numeric, target, cv=10)
cv_result
{'fit_time': array([0.00897551, 0.00997257, 0.00801015, 0.0069809 , 0.00698352,
        0.00698447, 0.00598359, 0.00598407, 0.00599074, 0.00598288]),
 'score_time': array([0.00299311, 0.00099707, 0.00099778, 0.00199199, 0.00099421,
        0.00099444, 0.00099492, 0.00099397, 0.00098848, 0.00199533]),
 'test_score': array([1.        , 1.        , 0.94117647, 0.97058824, 0.91176471,
        0.94117647, 0.97058824, 0.97058824, 0.94117647, 0.94117647])}
scores = cv_result["test_score"]
fit_time = cv_result["fit_time"]
print("The mean cross-validation accuracy is "
      f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds in average")
The mean cross-validation accuracy is 0.959 +/- 0.027, for 0.008 seconds in average
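With an integer cv and a classifier, cross_validate uses a stratified 10-fold split without shuffling. For explicit control over the folds, an equivalent call with a shuffled StratifiedKFold is sketched below (the random_state value is an arbitrary choice, not part of the original run):

from sklearn.model_selection import StratifiedKFold

# Same evaluation with explicitly shuffled, stratified folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
cv_result = cross_validate(model, data_numeric, target, cv=cv)
print(pd.DataFrame(cv_result)[["fit_time", "test_score"]].mean())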