156 lines
6.4 KiB
Python
156 lines
6.4 KiB
Python
import pandas as pd
|
|
from sklearn.preprocessing import OneHotEncoder
|
|
from typing import Tuple
|
|
import os
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
from sklearn.naive_bayes import BernoulliNB
|
|
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
|
|
|
def process_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """
    Load the adult dataset, keep its categorical attributes, and one-hot encode them.

    Both files are read as header-less CSVs whose fields are separated by a
    comma followed by optional whitespace; ``?`` is treated as a missing value.
    Rows with any missing value are dropped, the continuous attributes are
    removed, and the remaining attributes are one-hot encoded with an encoder
    fitted on the training split only. Categories that appear only in the test
    split are encoded as all zeros (``handle_unknown='ignore'``).

    Args:
        train_path (str): The path to the training data file.
        test_path (str): The path to the test data file. Its first line is
            skipped when reading.

    Returns:
        Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: A tuple containing:
            - X_train_encoded: Processed and one-hot encoded training features.
            - y_train: Training labels, with any ``.`` characters removed.
            - X_test_encoded: Processed and one-hot encoded test features.
            - y_test: Test labels, with any ``.`` characters removed.

        Each feature frame shares a 0..n-1 row index with its label series, so
        the pairs can be combined by index (e.g. ``pd.concat(axis=1)``) directly.
    """
    columns: list[str] = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'income'
    ]

    # Continuous attributes are discarded; only categorical ones are encoded.
    continuous_attributes: list[str] = [
        'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
    ]

    # Both files share the same layout, so the parsing options are defined once.
    read_options = dict(header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
    df_train: pd.DataFrame = pd.read_csv(train_path, **read_options)
    df_test: pd.DataFrame = pd.read_csv(test_path, skiprows=1, **read_options)

    # Drop incomplete rows, then renumber the survivors. Without the reset the
    # labels would keep the gapped original index while the encoded features
    # (rebuilt from a plain array below) would start again at 0, leaving the
    # returned features and labels misaligned.
    df_train = df_train.dropna().reset_index(drop=True)
    df_test = df_test.dropna().reset_index(drop=True)

    # Labels: strip '.' so both splits use the same label spelling.
    y_train: pd.Series = df_train['income'].str.replace('.', '', regex=False)
    y_test: pd.Series = df_test['income'].str.replace('.', '', regex=False)

    # Features: everything except the target and the continuous attributes.
    dropped_columns = ['income'] + continuous_attributes
    X_train: pd.DataFrame = df_train.drop(columns=dropped_columns)
    X_test: pd.DataFrame = df_test.drop(columns=dropped_columns)

    categorical_attributes: list[str] = X_train.columns.tolist()

    # Fit the encoder on the training split only and reuse it for the test split
    # so both encoded frames have identical columns.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    train_matrix = encoder.fit_transform(X_train[categorical_attributes])
    test_matrix = encoder.transform(X_test[categorical_attributes])
    feature_names = encoder.get_feature_names_out(categorical_attributes)

    X_train_encoded: pd.DataFrame = pd.DataFrame(train_matrix, columns=feature_names, index=X_train.index)
    X_test_encoded: pd.DataFrame = pd.DataFrame(test_matrix, columns=feature_names, index=X_test.index)

    return X_train_encoded, y_train, X_test_encoded, y_test
|
|
|
|
def evaluate_model(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, model, model_name: str):
    """
    Trains and evaluates a given model, printing a detailed report.

    The model is fitted on the training data and used to predict the test
    data. The overall accuracy is printed, followed by the TP rate (recall),
    FP rate, precision and F1-score of every class in ``model.classes_``,
    each computed one-vs-rest from the confusion matrix / classification report.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        X_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test labels.
        model: The classifier model to evaluate. It must provide ``fit``,
            ``predict`` and, after fitting, a ``classes_`` attribute.
        model_name (str): The name of the model for reporting.

    Returns:
        None. The results are written to standard output.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    class_labels = model.classes_

    # Pass the class list explicitly so the report has an entry for every class
    # the model knows, even one that is absent from both y_test and y_pred.
    # zero_division=0 keeps undefined precision/F1 values at 0 without warnings.
    report = classification_report(
        y_test, y_pred, labels=class_labels, output_dict=True, zero_division=0
    )

    # Rows are true classes and columns are predicted classes, both ordered
    # like class_labels. For binary classification: [[TN, FP], [FN, TP]].
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    total = cm.sum()

    print(f"--- {model_name} Evaluation ---")
    print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")

    for class_idx, label in enumerate(class_labels):
        # TP is the diagonal element
        tp = cm[class_idx, class_idx]

        # FP is the sum of the column for this class, excluding the TP
        fp = cm[:, class_idx].sum() - tp

        # FN is the sum of the row for this class, excluding the TP
        fn = cm[class_idx, :].sum() - tp

        # TN is the sum of all cells minus the TP, FP, and FN for this class
        tn = total - (tp + fp + fn)

        # Rates (guarded against classes with no positive / no negative samples)
        tp_rate = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

        # The report dictionary is keyed by the string form of each label, so
        # look it up with str(label) to support non-string class labels too.
        class_report = report[str(label)]

        print(f"Class: {label}")
        print(f" TP Rate (Recall): {tp_rate:.4f}")
        print(f" FP Rate : {fp_rate:.4f}")
        print(f" Precision : {class_report['precision']:.4f}")
        print(f" F1-Score : {class_report['f1-score']:.4f}")
        print("-" * 20)
|
|
|
|
if __name__ == '__main__':
    # Locations of the raw splits and of the folder receiving the cleaned CSVs.
    train_file, test_file = 'adult/adult.data', 'adult/adult.test'
    output_dir = 'adult-clean'

    # Make sure the destination folder is there before anything is written.
    os.makedirs(output_dir, exist_ok=True)

    X_train, y_train, X_test, y_test = process_data(train_file, test_file)

    # Renumber every piece from zero so that features and labels line up
    # row-for-row when they are joined column-wise below.
    X_train, y_train, X_test, y_test = (
        part.reset_index(drop=True) for part in (X_train, y_train, X_test, y_test)
    )

    # Glue the labels onto the features as the last column of each split.
    train_cleaned = pd.concat([X_train, y_train], axis=1)
    test_cleaned = pd.concat([X_test, y_test], axis=1)

    # Persist both cleaned splits without the row index.
    for frame, filename in ((train_cleaned, 'train_clean.csv'), (test_cleaned, 'test_clean.csv')):
        frame.to_csv(os.path.join(output_dir, filename), index=False)

    print(f"Preprocessed data saved to '{output_dir}' directory.")
    print(f"Training data shape: {train_cleaned.shape}")
    print(f"Test data shape: {test_cleaned.shape}\n")

    # --- Model Training and Evaluation ---
    # The decision tree is evaluated first, then the naive Bayes classifier.
    classifiers = (
        (DecisionTreeClassifier(random_state=42), "Decision Tree Classifier"),
        (BernoulliNB(), "Naïve Bayesian Classifier"),
    )
    for classifier, title in classifiers:
        evaluate_model(X_train, y_train, X_test, y_test, classifier, title)
|