cleaned up the code some

Author: JISAUAY
Date: 2025-10-06 10:32:18 -05:00
parent 67e1639548
commit 93492bc462
4 changed files with 30 additions and 142 deletions

File 1 of 4

@@ -7,53 +7,32 @@ from sklearn.naive_bayes import BernoulliNB
 from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
 def process_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
-    """
-    Processes the adult dataset by cleaning, removing continuous attributes, and one-hot encoding.
-    Args:
-        train_path (str): The path to the training data file.
-        test_path (str): The path to the test data file.
-    Returns:
-        Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: A tuple containing:
-            - X_train_encoded: Processed and one-hot encoded training features.
-            - y_train: Training labels.
-            - X_test_encoded: Processed and one-hot encoded test features.
-            - y_test: Test labels.
-    """
     columns: list[str] = [
         'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
         'hours-per-week', 'native-country', 'income'
     ]
-    # Load datasets
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
-    # Define continuous attributes to remove
     continuous_attributes: list[str] = [
         'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
     ]
-    # Separate features and target
     X_train: pd.DataFrame = df_train.drop(columns=['income'])
     y_train: pd.Series = df_train['income'].str.replace('.', '', regex=False)
     X_test: pd.DataFrame = df_test.drop(columns=['income'])
     y_test: pd.Series = df_test['income'].str.replace('.', '', regex=False)
-    # Remove continuous attributes
     X_train = X_train.drop(columns=continuous_attributes)
     X_test = X_test.drop(columns=continuous_attributes)
-    # Identify categorical attributes for one-hot encoding
     categorical_attributes: list[str] = X_train.columns.tolist()
-    # One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
     X_train_encoded: pd.DataFrame = pd.DataFrame(encoder.fit_transform(X_train[categorical_attributes]), columns=encoder.get_feature_names_out(categorical_attributes))
     X_test_encoded: pd.DataFrame = pd.DataFrame(encoder.transform(X_test[categorical_attributes]), columns=encoder.get_feature_names_out(categorical_attributes))
@@ -61,51 +40,28 @@ def process_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
     return X_train_encoded, y_train, X_test_encoded, y_test
 def evaluate_model(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, model, model_name: str):
-    """
-    Trains and evaluates a given model, printing a detailed report.
-    Args:
-        X_train (pd.DataFrame): Training features.
-        y_train (pd.Series): Training labels.
-        X_test (pd.DataFrame): Test features.
-        y_test (pd.Series): Test labels.
-        model: The classifier model to evaluate.
-        model_name (str): The name of the model for reporting.
-    """
-    # Train the model
     model.fit(X_train, y_train)
-    # Make predictions
     y_pred = model.predict(X_test)
-    # Generate the classification report
     report = classification_report(y_test, y_pred, output_dict=True)
-    # Calculate confusion matrix to get TP and FP rates
-    # For binary classification: [[TN, FP], [FN, TP]]
-    # For multi-class, we calculate per class
     cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
     print(f"--- {model_name} Evaluation ---")
     print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
     for label in model.classes_:
-        # Get the index for the current class
         class_idx = list(model.classes_).index(label)
-        # TP is the diagonal element
         tp = cm[class_idx, class_idx]
-        # FP is the sum of the column for this class, excluding the TP
         fp = cm[:, class_idx].sum() - tp
-        # FN is the sum of the row for this class, excluding the TP
         fn = cm[class_idx, :].sum() - tp
-        # TN is the sum of all cells minus the TP, FP, and FN for this class
         tn = cm.sum() - (tp + fp + fn)
-        # Rates
         tp_rate = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
         fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
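
The comments deleted in this hunk spelled out how the per-class rates fall out of the confusion matrix (rows = true class, columns = predicted class). As a standalone illustration, a minimal sketch of the same arithmetic on a hypothetical 2x2 matrix; the labels and counts below are made up for the example, not taken from the adult dataset:

    import numpy as np

    # Hypothetical confusion matrix, rows = true class, cols = predicted class,
    # ordered as ['<=50K', '>50K'] (illustrative counts only).
    cm = np.array([[9000, 1000],
                   [1500, 2500]])

    for class_idx, label in enumerate(['<=50K', '>50K']):
        tp = cm[class_idx, class_idx]             # diagonal element
        fp = cm[:, class_idx].sum() - tp          # column sum minus the diagonal
        fn = cm[class_idx, :].sum() - tp          # row sum minus the diagonal
        tn = cm.sum() - (tp + fp + fn)            # everything else
        tp_rate = tp / (tp + fn) if (tp + fn) > 0 else 0  # identical to recall
        fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        print(f"{label}: TP rate = {tp_rate:.4f}, FP rate = {fp_rate:.4f}")

For '<=50K' this prints a TP rate of 0.9000 (9000 of 10000 true rows recovered) and an FP rate of 0.3750 (1500 of 4000 negative rows misclassified).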
@@ -121,22 +77,18 @@ if __name__ == '__main__':
     test_file = 'adult/adult.test'
     output_dir = 'adult-clean'
-    # Create the output directory if it doesn't exist
     os.makedirs(output_dir, exist_ok=True)
     X_train, y_train, X_test, y_test = process_data(train_file, test_file)
-    # Reset index to align features and labels for concatenation
     y_train = y_train.reset_index(drop=True)
     X_train = X_train.reset_index(drop=True)
     y_test = y_test.reset_index(drop=True)
     X_test = X_test.reset_index(drop=True)
-    # Concatenate features and labels
     train_cleaned = pd.concat([X_train, y_train], axis=1)
     test_cleaned = pd.concat([X_test, y_test], axis=1)
-    # Save the cleaned data to new CSV files
     train_cleaned.to_csv(os.path.join(output_dir, 'train_clean.csv'), index=False)
     test_cleaned.to_csv(os.path.join(output_dir, 'test_clean.csv'), index=False)
@@ -144,12 +96,10 @@ if __name__ == '__main__':
     print(f"Training data shape: {train_cleaned.shape}")
     print(f"Test data shape: {test_cleaned.shape}\n")
-    # --- Model Training and Evaluation ---
-    # 1. Decision Tree Classifier
+    # Decision Tree Classifier
     dt_classifier = DecisionTreeClassifier(random_state=42)
     evaluate_model(X_train, y_train, X_test, y_test, dt_classifier, "Decision Tree Classifier")
-    # 2. Naïve Bayesian Classifier
+    # Naive Bayesian Classifier
     nb_classifier = BernoulliNB()
     evaluate_model(X_train, y_train, X_test, y_test, nb_classifier, "Naïve Bayesian Classifier")
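
Nothing in evaluate_model is specific to these two models; it accepts any estimator exposing the scikit-learn fit/predict/classes_ interface. A sketch of swapping in a third classifier, reusing the X_train/y_train/X_test/y_test already produced by process_data above (the random-forest choice here is purely illustrative):

    from sklearn.ensemble import RandomForestClassifier

    # Any estimator with fit/predict/classes_ plugs into evaluate_model unchanged.
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    evaluate_model(X_train, y_train, X_test, y_test, rf_classifier, "Random Forest Classifier")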

File 2 of 4

@@ -8,7 +8,6 @@ import numpy as np
 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
     """
-    Processes the adult dataset for part 2 requirements.
     - Removes unknown values.
     - Binarizes numerical attributes based on the mean.
     - One-hot encodes categorical attributes.
@@ -19,27 +18,20 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
         'hours-per-week', 'native-country', 'income'
     ]
-    # Load datasets
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
-    # Separate features and target, and clean target labels
     X_train_raw = df_train.drop('income', axis=1)
     y_train = df_train['income'].str.replace('.', '', regex=False)
     X_test_raw = df_test.drop('income', axis=1)
     y_test = df_test['income'].str.replace('.', '', regex=False)
-    # Identify numerical and categorical attributes
     numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist()
     categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist()
-    # --- Preprocessing ---
-    # 1. Binarize numerical attributes
     X_train_numerical_processed = pd.DataFrame()
     X_test_numerical_processed = pd.DataFrame()
@@ -48,7 +40,6 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
         X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int)
         X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int)
-    # 2. One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
     X_train_categorical_processed = pd.DataFrame(
@@ -60,22 +51,17 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
         columns=encoder.get_feature_names_out(categorical_cols)
     )
-    # Reset index to ensure concatenation works correctly
     X_train_numerical_processed.index = X_train_categorical_processed.index
     X_test_numerical_processed.index = X_test_categorical_processed.index
     y_train.index = X_train_categorical_processed.index
     y_test.index = X_test_categorical_processed.index
-    # 3. Combine processed features
     X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1)
     X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1)
     return X_train_processed, y_train, X_test_processed, y_test
 def run_kmeans_clustering(X_train: pd.DataFrame, k_values: List[int]):
-    """
-    Runs K-Means clustering for different k values and reports centroids.
-    """
     print("--- K-Means Clustering ---")
     for k in k_values:
         print(f"\nRunning K-Means with k={k}...")
@@ -83,17 +69,12 @@ def run_kmeans_clustering(X_train: pd.DataFrame, k_values: List[int]):
         kmeans.fit(X_train)
         print(f"Centroids for k={k}:")
-        # Printing only the first 5 dimensions for brevity
         print(pd.DataFrame(kmeans.cluster_centers_[:, :5], columns=X_train.columns[:5]))
         print("-" * 20)
 def run_knn_classification(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, k_values: List[int]):
-    """
-    Runs kNN classification on the last 10 test samples and reports accuracy.
-    """
     print("\n--- k-Nearest Neighbors (kNN) Classification ---")
-    # Use the last 10 records from the test set
     X_test_sample = X_test.tail(10)
     y_test_sample = y_test.tail(10)
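
The body of the kNN loop falls between hunks and is not shown in this diff. For context, a minimal sketch of what evaluating each k on the 10 sampled rows could look like, assuming scikit-learn's KNeighborsClassifier; the variable names (k_values, X_test_sample, y_test_sample) are the ones visible above, but the actual elided implementation may differ:

    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score

    # Sketch only: the real loop body is elided from the hunks above.
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)             # fit on the full training set
        y_pred = knn.predict(X_test_sample)   # classify the 10 held-out rows
        acc = accuracy_score(y_test_sample, y_pred)
        print(f"k={k}: accuracy on last 10 test records = {acc:.4f}")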
@@ -116,17 +97,14 @@ if __name__ == '__main__':
     train_file = 'adult/adult.data'
     test_file = 'adult/adult.test'
-    # Process data according to Part 2 requirements
     X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file)
     print("Data processing complete.")
     print(f"Training data shape: {X_train.shape}")
     print(f"Test data shape: {X_test.shape}\n")
-    # Run K-Means Clustering
     kmeans_k_values = [3, 5, 10]
     run_kmeans_clustering(X_train, kmeans_k_values)
-    # Run kNN Classification
     knn_k_values = [3, 5, 10]
     run_knn_classification(X_train, y_train, X_test, y_test, knn_k_values)
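
One detail worth keeping in mind despite the comment cleanup: process_data_part2 computes each binarization threshold as the training-set mean and reuses it unchanged on the test split, so no test statistics leak into preprocessing. A self-contained sketch of that pattern on toy numbers:

    import pandas as pd

    train = pd.DataFrame({'age': [25, 40, 55]})
    test = pd.DataFrame({'age': [30, 60]})

    mean_val = train['age'].mean()                     # 40.0, computed on train only
    train_bin = (train['age'] > mean_val).astype(int)  # [0, 0, 1]
    test_bin = (test['age'] > mean_val).astype(int)    # [0, 1], same threshold reused
    print(mean_val, train_bin.tolist(), test_bin.tolist())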

File 3 of 4

@@ -5,10 +5,8 @@ from sklearn.metrics import accuracy_score
 from typing import Tuple, List
 import numpy as np
-# This is the same data processing function from Part 2
 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
     """
-    Processes the adult dataset for part 2 requirements.
     - Removes unknown values.
     - Binarizes numerical attributes based on the mean.
     - One-hot encodes categorical attributes.
@@ -18,39 +16,31 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
         'hours-per-week', 'native-country', 'income'
     ]
-    # Load datasets
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
-    # Separate features and target, and clean target labels
     X_train_raw = df_train.drop('income', axis=1)
     y_train = df_train['income'].str.replace('.', '', regex=False)
     X_test_raw = df_test.drop('income', axis=1)
     y_test = df_test['income'].str.replace('.', '', regex=False)
-    # Identify numerical and categorical attributes
     numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist()
     categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist()
-    # --- Preprocessing ---
-    # 1. Binarize numerical attributes
     X_train_numerical_processed = pd.DataFrame()
     X_test_numerical_processed = pd.DataFrame()
     for col in numerical_cols:
         mean_val = X_train_raw[col].mean()
         X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int)
         X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int)
-    # 2. One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
     X_train_categorical_processed = pd.DataFrame(
         encoder.fit_transform(X_train_raw[categorical_cols]),
         columns=encoder.get_feature_names_out(categorical_cols),
@@ -61,12 +51,10 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
         columns=encoder.get_feature_names_out(categorical_cols),
         index=X_test_raw.index
     )
-    # 3. Combine processed features
     X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1)
     X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1)
-    # Align y labels with the processed X dataframes
     y_train = y_train.loc[X_train_processed.index]
     y_test = y_test.loc[X_test_processed.index]
@@ -76,32 +64,22 @@ if __name__ == '__main__':
     train_file = 'adult/adult.data'
     test_file = 'adult/adult.test'
-    # Process data using the function from Part 2
     X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file)
     print("Data processing complete.")
     print(f"Training data shape: {X_train.shape}")
     print(f"Test data shape: {X_test.shape}\n")
-    # --- SVM Classifier ---
     print("--- Support Vector Machine (SVM) Classifier ---")
-    # Initialize SVM classifier. A linear kernel is often a good starting point.
-    # Using a smaller subset for training due to SVM's computational complexity
-    # For a full run, you would use the entire X_train, y_train
-    print("Training the SVM classifier... (This may take a few minutes)")
-    # Note: SVM can be slow on large datasets. For demonstration, you might
-    # sample your data, e.g., X_train.sample(n=5000, random_state=42)
+    print("Training the SVM classifier...")
     svm_classifier = SVC(kernel='linear', random_state=42)
-    # Train the model on the full training data
     svm_classifier.fit(X_train, y_train)
-    # Make predictions on the test data
     print("Making predictions on the test data...")
     y_pred = svm_classifier.predict(X_test)
-    # Calculate and report the accuracy
     accuracy = accuracy_score(y_test, y_pred)
     print(f"\nSVM Classifier Accuracy on Test Data: {accuracy:.4f}")

File 4 of 4

@@ -5,7 +5,6 @@ from sklearn.metrics import accuracy_score
 from typing import Tuple, List
 import numpy as np
-# This is the same data processing function from Part 2
 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
     """
     Processes the adult dataset for part 2 requirements.
@@ -18,39 +17,31 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
         'hours-per-week', 'native-country', 'income'
     ]
-    # Load datasets
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
-    # Separate features and target, and clean target labels
     X_train_raw = df_train.drop('income', axis=1)
     y_train = df_train['income'].str.replace('.', '', regex=False)
     X_test_raw = df_test.drop('income', axis=1)
     y_test = df_test['income'].str.replace('.', '', regex=False)
-    # Identify numerical and categorical attributes
     numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist()
     categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist()
-    # --- Preprocessing ---
-    # 1. Binarize numerical attributes
     X_train_numerical_processed = pd.DataFrame()
     X_test_numerical_processed = pd.DataFrame()
     for col in numerical_cols:
         mean_val = X_train_raw[col].mean()
         X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int)
         X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int)
-    # 2. One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
     X_train_categorical_processed = pd.DataFrame(
         encoder.fit_transform(X_train_raw[categorical_cols]),
         columns=encoder.get_feature_names_out(categorical_cols),
@@ -61,12 +52,10 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
         columns=encoder.get_feature_names_out(categorical_cols),
         index=X_test_raw.index
     )
-    # 3. Combine processed features
     X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1)
     X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1)
-    # Align y labels with the processed X dataframes
     y_train = y_train.loc[X_train_processed.index]
     y_test = y_test.loc[X_test_processed.index]
@@ -76,30 +65,23 @@ if __name__ == '__main__':
     train_file = 'adult/adult.data'
     test_file = 'adult/adult.test'
-    # Process data using the function from Part 2
     X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file)
     print("Data processing complete.")
     print(f"Training data shape: {X_train.shape}")
     print(f"Test data shape: {X_test.shape}\n")
-    # --- Neural Network Classifier ---
     print("--- Neural Network (MLP) Classifier ---")
-    # Initialize the Multi-layer Perceptron classifier
-    # hidden_layer_sizes=(100,) means one hidden layer with 100 neurons.
-    # max_iter=500 to ensure the model has enough iterations to converge.
-    # random_state=42 for reproducibility.
     nn_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
     print("Training the Neural Network classifier...")
     nn_classifier.fit(X_train, y_train)
-    # Make predictions on the test data
     print("Making predictions on the test data...")
     y_pred = nn_classifier.predict(X_test)
     # Calculate and report the accuracy
     accuracy = accuracy_score(y_test, y_pred)
     print(f"\nNeural Network Classifier Accuracy on Test Data: {accuracy:.4f}")