# First model with scikit-learn (v2)
# How to build predictive models on tabular datasets, with only numerical features.
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# import time
# Load the numeric-only adult census table used for training.
myData = pd.read_csv("../../scikit-learn-mooc/datasets/adult-census-numeric.csv")

# Inspection expressions: evaluated, but their value is neither stored nor
# printed, so they only show something in an interactive session.
myData.head()
myData.dtypes

# Separate the label column from the predictor columns.
target_column = 'class'
target = myData[target_column]
data = myData.drop(columns=target_column)

# Report the size from `data`, i.e. after the target column has been removed,
# so the label is not counted as a feature. This matches how the test-set
# size is reported further down (from `data_test`, not the raw test table).
print(
    f"The dataset data contains {data.shape[0]} samples and "
    f"{data.shape[1]} features"
)

# Class balance: absolute counts first, then the normalized proportions
# drawn as a pie chart.
target.value_counts()
pie = target.value_counts(normalize=True)
pie.plot(kind="pie", label="target")

# Names of the predictor columns that remain once the target is dropped.
data.columns
from sklearn.neighbors import KNeighborsClassifier
#
# Fit a k-nearest-neighbors classifier, built with its default arguments,
# on the full training table.
model = KNeighborsClassifier()
model.fit(data, target)

# Predict on the very same rows the model was fitted on, so every
# comparison below measures agreement with the training data.
target_predicted = model.predict(data)

# Inspection expressions: evaluated but neither stored nor printed.
target_predicted[:5]
target.iloc[:5] == target_predicted[:5]

# How many of the first five training rows were predicted correctly.
print(
    "Number of correct prediction: "
    f"{(target.iloc[:5] == target_predicted[:5]).sum()} / 5"
)

# Fraction of all training rows whose prediction equals the true label.
(target == target_predicted).mean()
# Load the separate test file that sits in the same directory as the training
# file, and split it into predictors and labels using the same target column.
adult_census_test = pd.read_csv(
    '../../scikit-learn-mooc/datasets/adult-census-numeric-test.csv'
)
data_test = adult_census_test.drop(columns=target_column)
target_test = adult_census_test[target_column]
print(
    f"The testing dataset contains {len(data_test)} samples and "
    f"{len(data_test.columns)} features"
)

# Score the already-fitted model on the test rows and report the result
# together with the estimator's class name.
accuracy = model.score(data_test, target_test)
model_name = type(model).__name__
print(f"The test accuracy using a {model_name} is {accuracy:.3f}")