Normalization for numerical features
An example of preprocessing: scaling numerical features so that each one has zero mean and unit variance.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the penguins classification dataset. The path is relative, so it is
# resolved against the current working directory.
myDataFrame = pd.read_csv(
    "../../scikit-learn-mooc/datasets/penguins_classification.csv"
)

# Separate the class label from the rest of the table.
target_column = "Species"
target = myDataFrame[target_column]

# Class balance: absolute counts first, then proportions.
# These are bare expressions, so their results are only shown when the code is
# run interactively (e.g. as the last statement of a notebook cell).
target.value_counts()
target.value_counts(normalize=True)

# Every column except the label is a candidate feature.
data = myDataFrame.drop(columns=[target_column])
data.columns
# Restrict the features to the two numerical culmen measurements.
numerical_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
data_numeric = data[numerical_columns]

# Hold out 25% of the rows as a test set.
# NOTE(review): random_state is left commented out, so every run draws a
# different split; re-enable it if the figures below need to be reproducible.
data_train, data_test, target_train, target_test = train_test_split(
    data_numeric,
    target,
    test_size=0.25,
    # random_state=42,
)

# Summary statistics and per-feature histograms of the raw (unscaled)
# training data, for comparison with the scaled version.
data_train.describe()
_ = data_train.hist(figsize=(10, 5))
# Standardize the training features. `fit_transform` learns each column's mean
# and standard deviation from `data_train` and then applies them to that same
# data; the fitted `scaler` can later be reused on other data via `transform`.
scaler = StandardScaler()
data_train_scaled = scaler.fit_transform(data_train)

# With scikit-learn's default output configuration the result is a plain NumPy
# array, so rebuild a DataFrame around it. Both the column names AND the row
# index of `data_train` are carried over: train_test_split shuffles the rows,
# and without `index=` the new frame would get a fresh 0..n-1 index that no
# longer lines up label-wise with `target_train`.
data_train_scaled = pd.DataFrame(
    data_train_scaled,
    columns=data_train.columns,
    index=data_train.index,
)

# Same inspection as for the raw data, now on the scaled features.
data_train_scaled.describe()
_ = data_train_scaled.hist(figsize=(10, 5))