# Combine sequential operations
# We chain the operations into a pipeline and use it like any other classifier or regressor.
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn import set_config
# Ask scikit-learn to display estimators as diagrams (affects rich output in
# notebooks; no visible effect when run as a plain script).
set_config(display="diagram")

# Load the penguins classification dataset; the path is relative to the
# current working directory.
myDataFrame = pd.read_csv(
    "../../scikit-learn-mooc/datasets/penguins_classification.csv"
)

# The label to predict lives in the "Species" column.
target_column = "Species"
target = myDataFrame[target_column]

# Class distribution: absolute counts, then proportions. These bare
# expressions only show something in an interactive session.
target.value_counts()
target.value_counts(normalize=True)

# Everything except the label column is a candidate feature.
data = myDataFrame.drop(columns=target_column)
data.columns

# Restrict the features to the two culmen measurements.
numerical_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
data_numeric = data[numerical_columns]
# Hold out 25% of the samples for testing. No random_state is passed, so the
# split is not fixed and will differ from one run to the next.
data_train, data_test, target_train, target_test = train_test_split(
    data_numeric, target, test_size=0.25
)

# Summary statistics of the training features (displayed only in an
# interactive session).
data_train.describe()
# Baseline: logistic regression fitted directly on the unscaled features.
model = LogisticRegression()

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, whereas time.time() follows the wall
# clock and can jump if the system time is adjusted.
start = time.perf_counter()
model.fit(data_train, target_train)
elapsed_time = time.perf_counter() - start

# Report test accuracy, fit duration and the solver's iteration count.
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(
    f"The accuracy using a {model_name} is {score:.3f} "
    f"with a fitting time of {elapsed_time:.3f} seconds "
    f"in {model.n_iter_[0]} iterations"
)
# Same classifier, but preceded by feature standardization; the pipeline
# fits the scaler and the classifier together in a single fit() call.
model = make_pipeline(StandardScaler(), LogisticRegression())

# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# intended for measuring intervals, whereas time.time() follows the wall
# clock and can jump if the system time is adjusted.
start = time.perf_counter()
model.fit(data_train, target_train)
elapsed_time = time.perf_counter() - start

predicted_target = model.predict(data_test)

# model_name is the class name of the pipeline object itself; the iteration
# count is read from the last pipeline step (the logistic regression).
model_name = model.__class__.__name__
score = model.score(data_test, target_test)
print(
    f"The accuracy using a {model_name} is {score:.3f} "
    f"with a fitting time of {elapsed_time:.3f} seconds "
    f"in {model[-1].n_iter_[0]} iterations"
)
# Evaluate a fresh scaler + logistic-regression pipeline by cross-validation
# (cv=10) on the full numeric dataset rather than a single train/test split.
model = make_pipeline(StandardScaler(), LogisticRegression())
cv_result = cross_validate(model, data_numeric, target, cv=10)
cv_result

# Pull out the per-fold fit durations and test scores for the summary.
fit_time = cv_result["fit_time"]
scores = cv_result["test_score"]
print(
    "The mean cross-validation accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, "
    f"for {fit_time.mean():.3f} seconds in average"
)