Data set analysis (v2)
Look at the data set
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the adult-census CSV; the path is relative to the current working directory.
dataset_path = "../../scikit-learn-mooc/datasets/adult-census.csv"
myData = pd.read_csv(dataset_path)
# Preview the first rows of the table.
myData.head()
# Report the table dimensions (rows are counted as samples, columns as features).
n_rows, n_cols = myData.shape
print(f"The dataset data contains {n_rows} samples and {n_cols} features")
# Count how often each value of the "class" column occurs.
myData["class"].value_counts()
# List the dtype pandas inferred for each column.
myData.dtypes
from sklearn.compose import make_column_selector as selector
# Split the column names by dtype: int64 columns are treated as numerical,
# object columns as categorical.
select_numerical = selector(dtype_include="int64")
select_categorical = selector(dtype_include="object")
numerical_columns = select_numerical(myData)
categorical_columns = select_categorical(myData)
# Keep only the selected columns, numerical ones first.
all_columns = [*numerical_columns, *categorical_columns]
myData = myData[all_columns]
kept_rows, kept_cols = myData.shape
print(f"The dataset data contains {kept_rows} samples and {kept_cols} features")
# Separate views of the data for each kind of column.
data_numerical = myData[numerical_columns]
data_categorical = myData[categorical_columns]
# Histogram grid for the DataFrame on a 20x14 inch figure.
# Assigning to `_` discards the return value so a notebook does not echo it.
_ = myData.hist(figsize=(20, 14))
# Pairwise relationships using seaborn's default variable selection.
_ = sns.pairplot(myData)
# Same grid, coloured by the value of the "class" column.
_ = sns.pairplot(myData, hue="class")
# Pairwise grid over every column. `vars` is documented as a list of variable
# names, so pass the column labels explicitly instead of the DataFrame itself
# (iterating a DataFrame yields the same labels, so the plotted set is unchanged).
# NOTE(review): this includes the non-numeric columns and may be slow; confirm
# that plotting all columns is intended.
_ = sns.pairplot(myData, vars=list(myData.columns))
The `education` and `education-num` columns seem to be linked, and the cross-tabulation below shows that they are:
# Cross-tabulate "education" (rows) against "education-num" (columns) to see
# how the values of the two columns pair up.
pd.crosstab(myData["education"], myData["education-num"])