from typing import Tuple, List

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder


# This is the same data processing function from Part 2
def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Load and preprocess the adult dataset for the Part 2 requirements.

    Processing steps:

    - Rows containing unknown values (``?`` in the raw files) are removed.
    - Numerical attributes are binarized: 1 if the value is strictly greater
      than the *training-set* mean of that attribute, otherwise 0.
    - Categorical attributes are one-hot encoded with an encoder fitted on
      the training set only; categories that appear only in the test set
      encode as all zeros (``handle_unknown='ignore'``).

    Args:
        train_path: Path to the training CSV file (no header row).
        test_path: Path to the test CSV file; its first line is skipped.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The feature frames
        hold the binarized numerical columns followed by the one-hot
        columns, and each label series shares the row index of its frame.
    """
    columns: List[str] = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ]

    # Both files are parsed identically: the regex separator swallows the
    # whitespace that follows each comma, and '?' is read as a missing value.
    read_options = dict(
        header=None,
        names=columns,
        sep=r',\s*',
        engine='python',
        na_values='?',
    )

    # Load datasets and remove rows with any missing values.
    # NOTE(review): skiprows=1 presumably drops a non-data first line in the
    # test file -- confirm against the actual file.
    df_train: pd.DataFrame = pd.read_csv(train_path, **read_options).dropna()
    df_test: pd.DataFrame = pd.read_csv(test_path, skiprows=1, **read_options).dropna()

    # Separate features and target. Every '.' is stripped from the labels so
    # both splits use the same label strings (presumably the test labels
    # carry a trailing period -- confirm against the data).
    X_train_raw = df_train.drop('income', axis=1)
    y_train = df_train['income'].str.replace('.', '', regex=False)
    X_test_raw = df_test.drop('income', axis=1)
    y_test = df_test['income'].str.replace('.', '', regex=False)

    # Attribute types are decided from the training set only, so both splits
    # are guaranteed to be processed with the same column lists.
    numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist()

    # --- Preprocessing ---

    # 1. Binarize numerical attributes against the training means. The test
    #    set deliberately reuses the training means so that no test-set
    #    statistic influences the features.
    train_means = X_train_raw[numerical_cols].mean()
    X_train_numerical_processed = (
        X_train_raw[numerical_cols].gt(train_means, axis='columns').astype(int)
    )
    X_test_numerical_processed = (
        X_test_raw[numerical_cols].gt(train_means, axis='columns').astype(int)
    )

    # 2. One-hot encode categorical attributes (encoder fitted on train only).
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    train_encoded = encoder.fit_transform(X_train_raw[categorical_cols])
    encoded_names = encoder.get_feature_names_out(categorical_cols)
    X_train_categorical_processed = pd.DataFrame(
        train_encoded,
        columns=encoded_names,
        index=X_train_raw.index,
    )
    X_test_categorical_processed = pd.DataFrame(
        encoder.transform(X_test_raw[categorical_cols]),
        columns=encoded_names,
        index=X_test_raw.index,
    )

    # 3. Combine processed features. Every piece keeps the row index of the
    #    cleaned source frame, so the concatenation aligns row-for-row and
    #    the label series already match the feature frames.
    X_train_processed = pd.concat(
        [X_train_numerical_processed, X_train_categorical_processed], axis=1
    )
    X_test_processed = pd.concat(
        [X_test_numerical_processed, X_test_categorical_processed], axis=1
    )

    return X_train_processed, y_train, X_test_processed, y_test


def main() -> None:
    """Train an MLP classifier on the processed data and print test accuracy."""
    train_file = 'adult/adult.data'
    test_file = 'adult/adult.test'

    # Process data using the function from Part 2
    X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file)
    print("Data processing complete.")
    print(f"Training data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}\n")

    # --- Neural Network Classifier ---
    print("--- Neural Network (MLP) Classifier ---")

    # Initialize the Multi-layer Perceptron classifier
    # hidden_layer_sizes=(100,) means one hidden layer with 100 neurons.
    # max_iter=500 raises the cap on training iterations.
    # random_state=42 for reproducibility.
    nn_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

    print("Training the Neural Network classifier...")
    nn_classifier.fit(X_train, y_train)

    # Make predictions on the test data
    print("Making predictions on the test data...")
    y_pred = nn_classifier.predict(X_test)

    # Calculate and report the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nNeural Network Classifier Accuracy on Test Data: {accuracy:.4f}")


if __name__ == '__main__':
    main()