From 93492bc46272048b1e533cb9a3dd177f04d9eb01 Mon Sep 17 00:00:00 2001
From: JISAUAY <jackson.d.hodge@jbhunt.com>
Date: Mon, 6 Oct 2025 10:32:18 -0500
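Subject: [PATCH] Strip redundant comments/docstrings and trailing whitespace

Remove step-by-step comments and function docstrings from part1.py
through part4.py, and replace whitespace-only lines with empty lines.
Also shorten the SVM training progress message printed by part3.py.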

---
 part1.py | 68 ++++++++------------------------------------------------
 part2.py | 22 ------------------
 part3.py | 44 +++++++++---------------------------
 part4.py | 38 +++++++++----------------------
 4 files changed, 30 insertions(+), 142 deletions(-)

diff --git a/part1.py b/part1.py
index e5e4c18..d24f02b 100644
--- a/part1.py
+++ b/part1.py
@@ -7,53 +7,32 @@ from sklearn.naive_bayes import BernoulliNB
 from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
 
 def process_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
-    """
-    Processes the adult dataset by cleaning, removing continuous attributes, and one-hot encoding.
-
-    Args:
-        train_path (str): The path to the training data file.
-        test_path (str): The path to the test data file.
-
-    Returns:
-        Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: A tuple containing:
-            - X_train_encoded: Processed and one-hot encoded training features.
-            - y_train: Training labels.
-            - X_test_encoded: Processed and one-hot encoded test features.
-            - y_test: Test labels.
-    """
     columns: list[str] = [
         'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
         'hours-per-week', 'native-country', 'income'
     ]
 
-    # Load datasets
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
 
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
 
-    # Define continuous attributes to remove
     continuous_attributes: list[str] = [
         'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
     ]
 
-    # Separate features and target
     X_train: pd.DataFrame = df_train.drop(columns=['income'])
     y_train: pd.Series = df_train['income'].str.replace('.', '', regex=False)
     X_test: pd.DataFrame = df_test.drop(columns=['income'])
     y_test: pd.Series = df_test['income'].str.replace('.', '', regex=False)
 
-    # Remove continuous attributes
     X_train = X_train.drop(columns=continuous_attributes)
     X_test = X_test.drop(columns=continuous_attributes)
 
-    # Identify categorical attributes for one-hot encoding
     categorical_attributes: list[str] = X_train.columns.tolist()
 
-    # One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
     X_train_encoded: pd.DataFrame = pd.DataFrame(encoder.fit_transform(X_train[categorical_attributes]), columns=encoder.get_feature_names_out(categorical_attributes))
     X_test_encoded: pd.DataFrame = pd.DataFrame(encoder.transform(X_test[categorical_attributes]), columns=encoder.get_feature_names_out(categorical_attributes))
@@ -61,51 +40,28 @@ def process_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Seri
     return X_train_encoded, y_train, X_test_encoded, y_test
 
 def evaluate_model(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, model, model_name: str):
-    """
-    Trains and evaluates a given model, printing a detailed report.
-
-    Args:
-        X_train (pd.DataFrame): Training features.
-        y_train (pd.Series): Training labels.
-        X_test (pd.DataFrame): Test features.
-        y_test (pd.Series): Test labels.
-        model: The classifier model to evaluate.
-        model_name (str): The name of the model for reporting.
-    """
-    # Train the model
     model.fit(X_train, y_train)
 
-    # Make predictions
     y_pred = model.predict(X_test)
 
-    # Generate the classification report
     report = classification_report(y_test, y_pred, output_dict=True)
-    
-    # Calculate confusion matrix to get TP and FP rates
-    # For binary classification: [[TN, FP], [FN, TP]]
-    # For multi-class, we calculate per class
+
     cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
-    
+
     print(f"--- {model_name} Evaluation ---")
     print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
-    
+
     for label in model.classes_:
-        # Get the index for the current class
         class_idx = list(model.classes_).index(label)
-        
-        # TP is the diagonal element
+
         tp = cm[class_idx, class_idx]
-        
-        # FP is the sum of the column for this class, excluding the TP
+
         fp = cm[:, class_idx].sum() - tp
-        
-        # FN is the sum of the row for this class, excluding the TP
+
         fn = cm[class_idx, :].sum() - tp
-        
-        # TN is the sum of all cells minus the TP, FP, and FN for this class
+
         tn = cm.sum() - (tp + fp + fn)
 
-        # Rates
         tp_rate = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
         fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
 
@@ -121,22 +77,18 @@ if __name__ == '__main__':
     test_file = 'adult/adult.test'
     output_dir = 'adult-clean'
 
-    # Create the output directory if it doesn't exist
     os.makedirs(output_dir, exist_ok=True)
 
     X_train, y_train, X_test, y_test = process_data(train_file, test_file)
 
-    # Reset index to align features and labels for concatenation
     y_train = y_train.reset_index(drop=True)
     X_train = X_train.reset_index(drop=True)
     y_test = y_test.reset_index(drop=True)
     X_test = X_test.reset_index(drop=True)
 
-    # Concatenate features and labels
     train_cleaned = pd.concat([X_train, y_train], axis=1)
     test_cleaned = pd.concat([X_test, y_test], axis=1)
 
-    # Save the cleaned data to new CSV files
     train_cleaned.to_csv(os.path.join(output_dir, 'train_clean.csv'), index=False)
     test_cleaned.to_csv(os.path.join(output_dir, 'test_clean.csv'), index=False)
 
@@ -144,12 +96,10 @@ if __name__ == '__main__':
     print(f"Training data shape: {train_cleaned.shape}")
     print(f"Test data shape: {test_cleaned.shape}\n")
 
-    # --- Model Training and Evaluation ---
-
-    # 1. Decision Tree Classifier
+    # Decision Tree Classifier
     dt_classifier = DecisionTreeClassifier(random_state=42)
     evaluate_model(X_train, y_train, X_test, y_test, dt_classifier, "Decision Tree Classifier")
 
-    # 2. Naïve Bayesian Classifier
+    # Naive Bayesian Classifier
     nb_classifier = BernoulliNB()
     evaluate_model(X_train, y_train, X_test, y_test, nb_classifier, "Naïve Bayesian Classifier")
diff --git a/part2.py b/part2.py
index 505b51d..1c5d505 100644
--- a/part2.py
+++ b/part2.py
@@ -8,7 +8,6 @@ import numpy as np
 
 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
     """
-    Processes the adult dataset for part 2 requirements.
     - Removes unknown values.
     - Binarizes numerical attributes based on the mean.
     - One-hot encodes categorical attributes.
@@ -19,27 +18,20 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p
         'hours-per-week', 'native-country', 'income'
     ]
 
-    # Load datasets
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
 
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
 
-    # Separate features and target, and clean target labels
     X_train_raw = df_train.drop('income', axis=1)
     y_train = df_train['income'].str.replace('.', '', regex=False)
     X_test_raw = df_test.drop('income', axis=1)
     y_test = df_test['income'].str.replace('.', '', regex=False)
 
-    # Identify numerical and categorical attributes
     numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist()
     categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist()
 
-    # --- Preprocessing ---
-
-    # 1. Binarize numerical attributes
     X_train_numerical_processed = pd.DataFrame()
     X_test_numerical_processed = pd.DataFrame()
 
@@ -48,7 +40,6 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p
         X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int)
         X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int)
 
-    # 2. One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
 
     X_train_categorical_processed = pd.DataFrame(
@@ -60,22 +51,17 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p
         columns=encoder.get_feature_names_out(categorical_cols)
     )
 
-    # Reset index to ensure concatenation works correctly
     X_train_numerical_processed.index = X_train_categorical_processed.index
     X_test_numerical_processed.index = X_test_categorical_processed.index
     y_train.index = X_train_categorical_processed.index
     y_test.index = X_test_categorical_processed.index
 
-    # 3. Combine processed features
     X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1)
     X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1)
 
     return X_train_processed, y_train, X_test_processed, y_test
 
 def run_kmeans_clustering(X_train: pd.DataFrame, k_values: List[int]):
-    """
-    Runs K-Means clustering for different k values and reports centroids.
-    """
     print("--- K-Means Clustering ---")
     for k in k_values:
         print(f"\nRunning K-Means with k={k}...")
@@ -83,17 +69,12 @@ def run_kmeans_clustering(X_train: pd.DataFrame, k_values: List[int]):
         kmeans.fit(X_train)
 
         print(f"Centroids for k={k}:")
-        # Printing only the first 5 dimensions for brevity
         print(pd.DataFrame(kmeans.cluster_centers_[:, :5], columns=X_train.columns[:5]))
         print("-" * 20)
 
 def run_knn_classification(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, k_values: List[int]):
-    """
-    Runs kNN classification on the last 10 test samples and reports accuracy.
-    """
     print("\n--- k-Nearest Neighbors (kNN) Classification ---")
 
-    # Use the last 10 records from the test set
     X_test_sample = X_test.tail(10)
     y_test_sample = y_test.tail(10)
 
@@ -116,17 +97,14 @@ if __name__ == '__main__':
     train_file = 'adult/adult.data'
     test_file = 'adult/adult.test'
 
-    # Process data according to Part 2 requirements
     X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file)
 
     print("Data processing complete.")
     print(f"Training data shape: {X_train.shape}")
     print(f"Test data shape: {X_test.shape}\n")
 
-    # Run K-Means Clustering
     kmeans_k_values = [3, 5, 10]
     run_kmeans_clustering(X_train, kmeans_k_values)
 
-    # Run kNN Classification
     knn_k_values = [3, 5, 10]
     run_knn_classification(X_train, y_train, X_test, y_test, knn_k_values)
diff --git a/part3.py b/part3.py
index 7ad4504..9950ca9 100644
--- a/part3.py
+++ b/part3.py
@@ -5,10 +5,8 @@ from sklearn.metrics import accuracy_score
 from typing import Tuple, List
 import numpy as np
 
-# This is the same data processing function from Part 2
 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
     """
-    Processes the adult dataset for part 2 requirements.
     - Removes unknown values.
     - Binarizes numerical attributes based on the mean.
     - One-hot encodes categorical attributes.
@@ -18,39 +16,31 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p
         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
         'hours-per-week', 'native-country', 'income'
     ]
-    
-    # Load datasets
+
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
 
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
 
-    # Separate features and target, and clean target labels
     X_train_raw = df_train.drop('income', axis=1)
     y_train = df_train['income'].str.replace('.', '', regex=False)
     X_test_raw = df_test.drop('income', axis=1)
     y_test = df_test['income'].str.replace('.', '', regex=False)
 
-    # Identify numerical and categorical attributes
     numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist()
     categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist()
 
-    # --- Preprocessing ---
-    
-    # 1. Binarize numerical attributes
     X_train_numerical_processed = pd.DataFrame()
     X_test_numerical_processed = pd.DataFrame()
-    
+
     for col in numerical_cols:
         mean_val = X_train_raw[col].mean()
         X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int)
         X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int)
 
-    # 2. One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
-    
+
     X_train_categorical_processed = pd.DataFrame(
         encoder.fit_transform(X_train_raw[categorical_cols]),
         columns=encoder.get_feature_names_out(categorical_cols),
@@ -61,12 +51,10 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p
         columns=encoder.get_feature_names_out(categorical_cols),
         index=X_test_raw.index
     )
-    
-    # 3. Combine processed features
+
     X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1)
     X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1)
 
-    # Align y labels with the processed X dataframes
     y_train = y_train.loc[X_train_processed.index]
     y_test = y_test.loc[X_test_processed.index]
 
@@ -76,32 +64,22 @@ if __name__ == '__main__':
     train_file = 'adult/adult.data'
     test_file = 'adult/adult.test'
 
-    # Process data using the function from Part 2
     X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file)
-    
+
     print("Data processing complete.")
     print(f"Training data shape: {X_train.shape}")
     print(f"Test data shape: {X_test.shape}\n")
 
-    # --- SVM Classifier ---
     print("--- Support Vector Machine (SVM) Classifier ---")
-    
-    # Initialize SVM classifier. A linear kernel is often a good starting point.
-    # Using a smaller subset for training due to SVM's computational complexity
-    # For a full run, you would use the entire X_train, y_train
-    print("Training the SVM classifier... (This may take a few minutes)")
-    # Note: SVM can be slow on large datasets. For demonstration, you might
-    # sample your data, e.g., X_train.sample(n=5000, random_state=42)
+
+    print("Training the SVM classifier...")
     svm_classifier = SVC(kernel='linear', random_state=42)
-    
-    # Train the model on the full training data
+
     svm_classifier.fit(X_train, y_train)
-    
-    # Make predictions on the test data
+
     print("Making predictions on the test data...")
     y_pred = svm_classifier.predict(X_test)
-    
-    # Calculate and report the accuracy
+
     accuracy = accuracy_score(y_test, y_pred)
-    
+
     print(f"\nSVM Classifier Accuracy on Test Data: {accuracy:.4f}")
diff --git a/part4.py b/part4.py
index 972fa33..7ffed5d 100644
--- a/part4.py
+++ b/part4.py
@@ -5,7 +5,6 @@ from sklearn.metrics import accuracy_score
 from typing import Tuple, List
 import numpy as np
 
-# This is the same data processing function from Part 2
 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
     """
     Processes the adult dataset for part 2 requirements.
@@ -18,39 +17,31 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p
         'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
         'hours-per-week', 'native-country', 'income'
     ]
-    
-    # Load datasets
+
     df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?')
     df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1)
 
-    # Remove rows with any missing values
     df_train.dropna(inplace=True)
     df_test.dropna(inplace=True)
 
-    # Separate features and target, and clean target labels
     X_train_raw = df_train.drop('income', axis=1)
     y_train = df_train['income'].str.replace('.', '', regex=False)
     X_test_raw = df_test.drop('income', axis=1)
     y_test = df_test['income'].str.replace('.', '', regex=False)
 
-    # Identify numerical and categorical attributes
     numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist()
     categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist()
 
-    # --- Preprocessing ---
-    
-    # 1. Binarize numerical attributes
     X_train_numerical_processed = pd.DataFrame()
     X_test_numerical_processed = pd.DataFrame()
-    
+
     for col in numerical_cols:
         mean_val = X_train_raw[col].mean()
         X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int)
         X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int)
 
-    # 2. One-hot encode categorical attributes
     encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
-    
+
     X_train_categorical_processed = pd.DataFrame(
         encoder.fit_transform(X_train_raw[categorical_cols]),
         columns=encoder.get_feature_names_out(categorical_cols),
@@ -61,12 +52,10 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p
         columns=encoder.get_feature_names_out(categorical_cols),
         index=X_test_raw.index
     )
-    
-    # 3. Combine processed features
+
     X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1)
     X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1)
 
-    # Align y labels with the processed X dataframes
     y_train = y_train.loc[X_train_processed.index]
     y_test = y_test.loc[X_test_processed.index]
 
@@ -76,30 +65,23 @@ if __name__ == '__main__':
     train_file = 'adult/adult.data'
     test_file = 'adult/adult.test'
 
-    # Process data using the function from Part 2
     X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file)
-    
+
     print("Data processing complete.")
     print(f"Training data shape: {X_train.shape}")
     print(f"Test data shape: {X_test.shape}\n")
 
-    # --- Neural Network Classifier ---
     print("--- Neural Network (MLP) Classifier ---")
-    
-    # Initialize the Multi-layer Perceptron classifier
-    # hidden_layer_sizes=(100,) means one hidden layer with 100 neurons.
-    # max_iter=500 to ensure the model has enough iterations to converge.
-    # random_state=42 for reproducibility.
+
     nn_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
-    
+
     print("Training the Neural Network classifier...")
     nn_classifier.fit(X_train, y_train)
-    
-    # Make predictions on the test data
+
     print("Making predictions on the test data...")
     y_pred = nn_classifier.predict(X_test)
-    
+
     # Calculate and report the accuracy
     accuracy = accuracy_score(y_test, y_pred)
-    
+
     print(f"\nNeural Network Classifier Accuracy on Test Data: {accuracy:.4f}")