Encoding of categorical variables
Handling categorical variables by encoding them with two strategies: ordinal encoding and one-hot encoding.
import pandas as pd
import time
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
# Read the adult-census CSV and discard the "education-num" column.
myDataFrame = pd.read_csv(
    "../../scikit-learn-mooc/datasets/adult-census.csv"
).drop(columns=["education-num"])

# The "class" column is the prediction target; everything else is a feature.
target_column = "class"
target = myDataFrame[target_column]

# Absolute, then relative, frequency of each target class
# (bare expressions: only displayed when run interactively).
target.value_counts()
target.value_counts(normalize=True)

# Feature table = full frame minus the target column.
data = myDataFrame.drop(columns=[target_column])
print(
    f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features"
)
data.dtypes
# Pick column names by dtype: int64 columns are treated as numerical,
# object columns as categorical.
select_numerical = selector(dtype_include="int64")
select_categorical = selector(dtype_include="object")
numerical_columns = select_numerical(data)
categorical_columns = select_categorical(data)

# Keep only the selected columns, numerical ones first.
all_columns = numerical_columns + categorical_columns
data = data[all_columns]
print(
    f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features"
)

# Separate views of the two column families for the experiments below.
data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]
# Hold out 25% of the numerical data as a test set.
# No random_state is passed (random_state=42 was tried and disabled), so the
# split is not pinned to a fixed seed.
data_n_train, data_n_test, target_n_train, target_n_test = train_test_split(
    data_numerical,
    target,
    test_size=0.25,
)
# Baseline on numerical features only: standardize, then logistic regression.
model = make_pipeline(StandardScaler(), LogisticRegression())

# Evaluate with cross-validation (cv=10) and report the mean/std of the test
# scores together with the mean fit time.
cv_results = cross_validate(model, data_numerical, target, cv=10)
scores, fit_time = cv_results["test_score"], cv_results["fit_time"]
print(
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
# Peek at the first five rows of the categorical features.
data_categorical.head(5)

# Fit an ordinal encoder on the categorical columns and encode them.
encoder_ordinal = OrdinalEncoder()
data_encoded_ordinal = encoder_ordinal.fit_transform(data_categorical)

# Inspect the first encoded rows and the categories learned per column.
data_encoded_ordinal[:5]
encoder_ordinal.categories_

# Compare the number of columns before and after encoding.
print(f"The dataset original contains {data_categorical.shape[1]} features")
print(f"The dataset encoded contains {data_encoded_ordinal.shape[1]} features")
# Peek at the first two rows before one-hot encoding them.
data_categorical[:2]

# Ask for a dense array so the encoded rows can be inspected directly.
# scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
# 1.4 removed the old spelling, so ``sparse=False`` raises TypeError on
# current releases. Try the new name first and fall back to the old one so
# the script also runs on scikit-learn < 1.2.
try:
    encoder_onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder_onehot = OneHotEncoder(sparse=False)
data_encoded_onehot = encoder_onehot.fit_transform(data_categorical)
data_encoded_onehot[:2]

# Compare the number of columns before and after encoding.
print(f"The dataset original contains {data_categorical.shape[1]} features")
print(f"The dataset encoded contains {data_encoded_onehot.shape[1]} features")
# Categorical features only: one-hot encode, then logistic regression.
# handle_unknown="ignore" lets the encoder transform categories that were not
# present in the data it was fitted on.
model_oneHotLin = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=1500),
)

# Evaluate with cross-validation (cv=10) and report the mean/std of the test
# scores together with the mean fit time.
cv_results = cross_validate(model_oneHotLin, data_categorical, target, cv=10)
scores, fit_time = cv_results["test_score"], cv_results["fit_time"]
print(
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
# Categorical features only: ordinal encode, then logistic regression.
# Categories not seen while fitting the encoder are mapped to the value 100.
model_ordLin = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=100),
    LogisticRegression(max_iter=500),
)

# Evaluate with cross-validation (cv=10) and report the mean/std of the test
# scores together with the mean fit time.
cv_results = cross_validate(model_ordLin, data_categorical, target, cv=10)
scores, fit_time = cv_results["test_score"], cv_results["fit_time"]
print(
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)