From 93492bc46272048b1e533cb9a3dd177f04d9eb01 Mon Sep 17 00:00:00 2001 From: JISAUAY Date: Mon, 6 Oct 2025 10:32:18 -0500 Subject: [PATCH] cleaned up the code some --- part1.py | 68 ++++++++------------------------------------------------ part2.py | 22 ------------------ part3.py | 44 +++++++++--------------------------- part4.py | 38 +++++++++---------------------- 4 files changed, 30 insertions(+), 142 deletions(-) diff --git a/part1.py b/part1.py index e5e4c18..d24f02b 100644 --- a/part1.py +++ b/part1.py @@ -7,53 +7,32 @@ from sklearn.naive_bayes import BernoulliNB from sklearn.metrics import classification_report, confusion_matrix, accuracy_score def process_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: - """ - Processes the adult dataset by cleaning, removing continuous attributes, and one-hot encoding. - - Args: - train_path (str): The path to the training data file. - test_path (str): The path to the test data file. - - Returns: - Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: A tuple containing: - - X_train_encoded: Processed and one-hot encoded training features. - - y_train: Training labels. - - X_test_encoded: Processed and one-hot encoded test features. - - y_test: Test labels. - """ columns: list[str] = [ 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income' ] - # Load datasets df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?') df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1) - # Remove rows with any missing values df_train.dropna(inplace=True) df_test.dropna(inplace=True) - # Define continuous attributes to remove continuous_attributes: list[str] = [ 'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week' ] - # Separate features and target X_train: pd.DataFrame = df_train.drop(columns=['income']) y_train: pd.Series = df_train['income'].str.replace('.', '', regex=False) X_test: pd.DataFrame = df_test.drop(columns=['income']) y_test: pd.Series = df_test['income'].str.replace('.', '', regex=False) - # Remove continuous attributes X_train = X_train.drop(columns=continuous_attributes) X_test = X_test.drop(columns=continuous_attributes) - # Identify categorical attributes for one-hot encoding categorical_attributes: list[str] = X_train.columns.tolist() - # One-hot encode categorical attributes encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) X_train_encoded: pd.DataFrame = pd.DataFrame(encoder.fit_transform(X_train[categorical_attributes]), columns=encoder.get_feature_names_out(categorical_attributes)) X_test_encoded: pd.DataFrame = pd.DataFrame(encoder.transform(X_test[categorical_attributes]), columns=encoder.get_feature_names_out(categorical_attributes)) @@ -61,51 +40,28 @@ def process_data(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Seri return X_train_encoded, y_train, X_test_encoded, y_test def evaluate_model(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, model, model_name: str): - """ - Trains and evaluates a given model, printing a detailed report. - - Args: - X_train (pd.DataFrame): Training features. - y_train (pd.Series): Training labels. - X_test (pd.DataFrame): Test features. - y_test (pd.Series): Test labels. - model: The classifier model to evaluate. - model_name (str): The name of the model for reporting. - """ - # Train the model model.fit(X_train, y_train) - # Make predictions y_pred = model.predict(X_test) - # Generate the classification report report = classification_report(y_test, y_pred, output_dict=True) - - # Calculate confusion matrix to get TP and FP rates - # For binary classification: [[TN, FP], [FN, TP]] - # For multi-class, we calculate per class + cm = confusion_matrix(y_test, y_pred, labels=model.classes_) - + print(f"--- {model_name} Evaluation ---") print(f"Overall Accuracy: {accuracy_score(y_test, y_pred):.4f}\n") - + for label in model.classes_: - # Get the index for the current class class_idx = list(model.classes_).index(label) - - # TP is the diagonal element + tp = cm[class_idx, class_idx] - - # FP is the sum of the column for this class, excluding the TP + fp = cm[:, class_idx].sum() - tp - - # FN is the sum of the row for this class, excluding the TP + fn = cm[class_idx, :].sum() - tp - - # TN is the sum of all cells minus the TP, FP, and FN for this class + tn = cm.sum() - (tp + fp + fn) - # Rates tp_rate = tp / (tp + fn) if (tp + fn) > 0 else 0 # Same as recall fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0 @@ -121,22 +77,18 @@ if __name__ == '__main__': test_file = 'adult/adult.test' output_dir = 'adult-clean' - # Create the output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) X_train, y_train, X_test, y_test = process_data(train_file, test_file) - # Reset index to align features and labels for concatenation y_train = y_train.reset_index(drop=True) X_train = X_train.reset_index(drop=True) y_test = y_test.reset_index(drop=True) X_test = X_test.reset_index(drop=True) - # Concatenate features and labels train_cleaned = pd.concat([X_train, y_train], axis=1) test_cleaned = pd.concat([X_test, y_test], axis=1) - # Save the cleaned data to new CSV files train_cleaned.to_csv(os.path.join(output_dir, 'train_clean.csv'), index=False) test_cleaned.to_csv(os.path.join(output_dir, 'test_clean.csv'), index=False) @@ -144,12 +96,10 @@ if __name__ == '__main__': print(f"Training data shape: {train_cleaned.shape}") print(f"Test data shape: {test_cleaned.shape}\n") - # --- Model Training and Evaluation --- - - # 1. Decision Tree Classifier + # Decision Tree Classifier dt_classifier = DecisionTreeClassifier(random_state=42) evaluate_model(X_train, y_train, X_test, y_test, dt_classifier, "Decision Tree Classifier") - # 2. Naïve Bayesian Classifier + # Naive Bayesian Classifier nb_classifier = BernoulliNB() evaluate_model(X_train, y_train, X_test, y_test, nb_classifier, "Naïve Bayesian Classifier") diff --git a/part2.py b/part2.py index 505b51d..1c5d505 100644 --- a/part2.py +++ b/part2.py @@ -8,7 +8,6 @@ import numpy as np def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: """ - Processes the adult dataset for part 2 requirements. - Removes unknown values. - Binarizes numerical attributes based on the mean. - One-hot encodes categorical attributes. @@ -19,27 +18,20 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p 'hours-per-week', 'native-country', 'income' ] - # Load datasets df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?') df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1) - # Remove rows with any missing values df_train.dropna(inplace=True) df_test.dropna(inplace=True) - # Separate features and target, and clean target labels X_train_raw = df_train.drop('income', axis=1) y_train = df_train['income'].str.replace('.', '', regex=False) X_test_raw = df_test.drop('income', axis=1) y_test = df_test['income'].str.replace('.', '', regex=False) - # Identify numerical and categorical attributes numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist() categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist() - # --- Preprocessing --- - - # 1. Binarize numerical attributes X_train_numerical_processed = pd.DataFrame() X_test_numerical_processed = pd.DataFrame() @@ -48,7 +40,6 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int) X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int) - # 2. One-hot encode categorical attributes encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) X_train_categorical_processed = pd.DataFrame( @@ -60,22 +51,17 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p columns=encoder.get_feature_names_out(categorical_cols) ) - # Reset index to ensure concatenation works correctly X_train_numerical_processed.index = X_train_categorical_processed.index X_test_numerical_processed.index = X_test_categorical_processed.index y_train.index = X_train_categorical_processed.index y_test.index = X_test_categorical_processed.index - # 3. Combine processed features X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1) X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1) return X_train_processed, y_train, X_test_processed, y_test def run_kmeans_clustering(X_train: pd.DataFrame, k_values: List[int]): - """ - Runs K-Means clustering for different k values and reports centroids. - """ print("--- K-Means Clustering ---") for k in k_values: print(f"\nRunning K-Means with k={k}...") @@ -83,17 +69,12 @@ def run_kmeans_clustering(X_train: pd.DataFrame, k_values: List[int]): kmeans.fit(X_train) print(f"Centroids for k={k}:") - # Printing only the first 5 dimensions for brevity print(pd.DataFrame(kmeans.cluster_centers_[:, :5], columns=X_train.columns[:5])) print("-" * 20) def run_knn_classification(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, k_values: List[int]): - """ - Runs kNN classification on the last 10 test samples and reports accuracy. - """ print("\n--- k-Nearest Neighbors (kNN) Classification ---") - # Use the last 10 records from the test set X_test_sample = X_test.tail(10) y_test_sample = y_test.tail(10) @@ -116,17 +97,14 @@ if __name__ == '__main__': train_file = 'adult/adult.data' test_file = 'adult/adult.test' - # Process data according to Part 2 requirements X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file) print("Data processing complete.") print(f"Training data shape: {X_train.shape}") print(f"Test data shape: {X_test.shape}\n") - # Run K-Means Clustering kmeans_k_values = [3, 5, 10] run_kmeans_clustering(X_train, kmeans_k_values) - # Run kNN Classification knn_k_values = [3, 5, 10] run_knn_classification(X_train, y_train, X_test, y_test, knn_k_values) diff --git a/part3.py b/part3.py index 7ad4504..9950ca9 100644 --- a/part3.py +++ b/part3.py @@ -5,10 +5,8 @@ from sklearn.metrics import accuracy_score from typing import Tuple, List import numpy as np -# This is the same data processing function from Part 2 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: """ - Processes the adult dataset for part 2 requirements. - Removes unknown values. - Binarizes numerical attributes based on the mean. - One-hot encodes categorical attributes. @@ -18,39 +16,31 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income' ] - - # Load datasets + df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?') df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1) - # Remove rows with any missing values df_train.dropna(inplace=True) df_test.dropna(inplace=True) - # Separate features and target, and clean target labels X_train_raw = df_train.drop('income', axis=1) y_train = df_train['income'].str.replace('.', '', regex=False) X_test_raw = df_test.drop('income', axis=1) y_test = df_test['income'].str.replace('.', '', regex=False) - # Identify numerical and categorical attributes numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist() categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist() - # --- Preprocessing --- - - # 1. Binarize numerical attributes X_train_numerical_processed = pd.DataFrame() X_test_numerical_processed = pd.DataFrame() - + for col in numerical_cols: mean_val = X_train_raw[col].mean() X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int) X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int) - # 2. One-hot encode categorical attributes encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) - + X_train_categorical_processed = pd.DataFrame( encoder.fit_transform(X_train_raw[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols), @@ -61,12 +51,10 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p columns=encoder.get_feature_names_out(categorical_cols), index=X_test_raw.index ) - - # 3. Combine processed features + X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1) X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1) - # Align y labels with the processed X dataframes y_train = y_train.loc[X_train_processed.index] y_test = y_test.loc[X_test_processed.index] @@ -76,32 +64,22 @@ if __name__ == '__main__': train_file = 'adult/adult.data' test_file = 'adult/adult.test' - # Process data using the function from Part 2 X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file) - + print("Data processing complete.") print(f"Training data shape: {X_train.shape}") print(f"Test data shape: {X_test.shape}\n") - # --- SVM Classifier --- print("--- Support Vector Machine (SVM) Classifier ---") - - # Initialize SVM classifier. A linear kernel is often a good starting point. - # Using a smaller subset for training due to SVM's computational complexity - # For a full run, you would use the entire X_train, y_train - print("Training the SVM classifier... (This may take a few minutes)") - # Note: SVM can be slow on large datasets. For demonstration, you might - # sample your data, e.g., X_train.sample(n=5000, random_state=42) + + print("Training the SVM classifier...") svm_classifier = SVC(kernel='linear', random_state=42) - - # Train the model on the full training data + svm_classifier.fit(X_train, y_train) - - # Make predictions on the test data + print("Making predictions on the test data...") y_pred = svm_classifier.predict(X_test) - - # Calculate and report the accuracy + accuracy = accuracy_score(y_test, y_pred) - + print(f"\nSVM Classifier Accuracy on Test Data: {accuracy:.4f}") diff --git a/part4.py b/part4.py index 972fa33..7ffed5d 100644 --- a/part4.py +++ b/part4.py @@ -5,7 +5,6 @@ from sklearn.metrics import accuracy_score from typing import Tuple, List import numpy as np -# This is the same data processing function from Part 2 def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: """ Processes the adult dataset for part 2 requirements. @@ -18,39 +17,31 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income' ] - - # Load datasets + df_train: pd.DataFrame = pd.read_csv(train_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?') df_test: pd.DataFrame = pd.read_csv(test_path, header=None, names=columns, sep=r',\s*', engine='python', na_values='?', skiprows=1) - # Remove rows with any missing values df_train.dropna(inplace=True) df_test.dropna(inplace=True) - # Separate features and target, and clean target labels X_train_raw = df_train.drop('income', axis=1) y_train = df_train['income'].str.replace('.', '', regex=False) X_test_raw = df_test.drop('income', axis=1) y_test = df_test['income'].str.replace('.', '', regex=False) - # Identify numerical and categorical attributes numerical_cols = X_train_raw.select_dtypes(include=np.number).columns.tolist() categorical_cols = X_train_raw.select_dtypes(exclude=np.number).columns.tolist() - # --- Preprocessing --- - - # 1. Binarize numerical attributes X_train_numerical_processed = pd.DataFrame() X_test_numerical_processed = pd.DataFrame() - + for col in numerical_cols: mean_val = X_train_raw[col].mean() X_train_numerical_processed[col] = (X_train_raw[col] > mean_val).astype(int) X_test_numerical_processed[col] = (X_test_raw[col] > mean_val).astype(int) - # 2. One-hot encode categorical attributes encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) - + X_train_categorical_processed = pd.DataFrame( encoder.fit_transform(X_train_raw[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols), @@ -61,12 +52,10 @@ def process_data_part2(train_path: str, test_path: str) -> Tuple[pd.DataFrame, p columns=encoder.get_feature_names_out(categorical_cols), index=X_test_raw.index ) - - # 3. Combine processed features + X_train_processed = pd.concat([X_train_numerical_processed, X_train_categorical_processed], axis=1) X_test_processed = pd.concat([X_test_numerical_processed, X_test_categorical_processed], axis=1) - # Align y labels with the processed X dataframes y_train = y_train.loc[X_train_processed.index] y_test = y_test.loc[X_test_processed.index] @@ -76,30 +65,23 @@ if __name__ == '__main__': train_file = 'adult/adult.data' test_file = 'adult/adult.test' - # Process data using the function from Part 2 X_train, y_train, X_test, y_test = process_data_part2(train_file, test_file) - + print("Data processing complete.") print(f"Training data shape: {X_train.shape}") print(f"Test data shape: {X_test.shape}\n") - # --- Neural Network Classifier --- print("--- Neural Network (MLP) Classifier ---") - - # Initialize the Multi-layer Perceptron classifier - # hidden_layer_sizes=(100,) means one hidden layer with 100 neurons. - # max_iter=500 to ensure the model has enough iterations to converge. - # random_state=42 for reproducibility. + nn_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42) - + print("Training the Neural Network classifier...") nn_classifier.fit(X_train, y_train) - - # Make predictions on the test data + print("Making predictions on the test data...") y_pred = nn_classifier.predict(X_test) - + # Calculate and report the accuracy accuracy = accuracy_score(y_test, y_pred) - + print(f"\nNeural Network Classifier Accuracy on Test Data: {accuracy:.4f}")