Basic model with scikit-learn
First model with scikit-learn
- Imports
- First analysis
import pandas as pd
# Load the adult census dataset. The path is relative to the current working
# directory, so the script must be launched from the expected folder.
myDataFrame = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census.csv")
print(f"The dataset contains {myDataFrame.shape[0]} samples and "
      f"{myDataFrame.shape[1]} columns")
# NOTE: bare expressions such as the one below only render output when the
# code is run interactively (e.g. as the last statement of a notebook cell).
myDataFrame.head()

# Name of the column to predict. It is defined once and reused below so the
# target and the feature frame can never get out of sync.
target_column = 'class'
target_y = myDataFrame[target_column]
data_X = myDataFrame.drop(columns=target_column)
target_y.value_counts()
data_X.head()

# Cross-tabulate the two education columns to inspect how they relate.
pd.crosstab(index=data_X['education'],
            columns=data_X['education-num'])
# Drop "education-num"; presumably it carries the same information as
# "education" -- confirm from the crosstab output above.
data_X = data_X.drop(columns="education-num")
print(f"The dataset data_X contains {data_X.shape[0]} samples and "
      f"{data_X.shape[1]} columns")
data_X.dtypes
Select the columns by hand
# Column names written out explicitly, grouped by the kind of feature the
# variable names say they hold.
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
numerical_columns = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
Select the columns based on their data type
from sklearn.compose import make_column_selector as selector

# Columns stored with the "object" dtype are treated as categorical.
categorical_columns = selector(dtype_include="object")(data_X)
categorical_columns
# Select every numeric dtype instead of only "int64": a numeric column stored
# as float (or as a narrower integer type) would otherwise be silently left
# out of both column lists and dropped from data_X below.
numerical_columns = selector(dtype_include="number")(data_X)
numerical_columns

# Keep only the selected columns, numerical ones first, then categorical ones.
all_columns = numerical_columns + categorical_columns
data_X = data_X[all_columns]
print(f"The dataset data_X contains {data_X.shape[0]} samples and "
      f"{data_X.shape[1]} columns")
data_X[numerical_columns].describe()

# Numerical-only view of the features, used for the first model.
data_X_numerical = data_X[numerical_columns]
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed random_state makes the
# shuffled split reproducible from run to run.
data_train, data_test, target_train, target_test = train_test_split(
    data_X_numerical, target_y, test_size=0.25, random_state=42
)

# Report the size of each subset, both as a row count and as a share of the
# full numerical dataset.
print(
    f"Number of samples in testing: {data_test.shape[0]} => "
    f"{data_test.shape[0] / data_X_numerical.shape[0] * 100:.1f}% of the"
    f" original set"
)
print(
    f"Number of samples in training: {data_train.shape[0]} => "
    f"{data_train.shape[0] / data_X_numerical.shape[0] * 100:.1f}% of the"
    f" original set"
)
from sklearn import set_config
from sklearn.linear_model import LogisticRegression

# Ask scikit-learn to render estimators as diagrams when they are displayed
# in an interactive front end.
set_config(display="diagram")

# Fit a logistic regression with default settings on the training split.
# The fit call is kept as its own statement so that an interactive session
# still displays the fitted estimator.
model = LogisticRegression()
model.fit(data_train, target_train)

# Evaluate on the held-out split using the estimator's default score.
accuracy = model.score(data_test, target_test)
print(f"Accuracy of logistic regression: {accuracy:.3f}")
</div>