133 lines
5.3 KiB
Python
133 lines
5.3 KiB
Python
import pandas as pd
|
|
from sklearn.preprocessing import OneHotEncoder
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
from sklearn.metrics import accuracy_score
|
|
from typing import Tuple, List
|
|
import numpy as np
|
|
|
|
def process_data_part2(
    train_path: str,
    test_path: str,
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Load the adult train/test files and convert them to all-binary features.

    The pipeline, in order:

    1. Read both files with a fixed 15-column schema, treating ``?`` as a
       missing value, and discard every row that contains a missing value.
       The first line of the test file is skipped.
    2. Split off the ``income`` column as the target and remove any ``.``
       characters from its labels.
    3. Replace each numeric feature by 1 when it is strictly greater than
       that column's mean in the *training* data, otherwise 0. The same
       training means are applied to the test data.
    4. One-hot encode the non-numeric features with an encoder fitted on the
       training data only (``handle_unknown='ignore'``).
    5. Place the binarized numeric columns first, followed by the one-hot
       columns.

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test CSV file.

    Returns:
        ``(X_train, y_train, X_test, y_test)``; every returned object carries
        the 0-based positional index of the one-hot encoded frames.
    """
    column_names: List[str] = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ]

    # Both files share the same layout; only the test file needs a line skipped.
    csv_options = dict(
        header=None,
        names=column_names,
        sep=r',\s*',
        engine='python',
        na_values='?',
    )
    train_frame = pd.read_csv(train_path, **csv_options).dropna()
    test_frame = pd.read_csv(test_path, skiprows=1, **csv_options).dropna()

    # Features / target split; labels have every '.' removed.
    train_features = train_frame.drop(columns='income')
    test_features = test_frame.drop(columns='income')
    y_train = train_frame['income'].str.replace('.', '', regex=False)
    y_test = test_frame['income'].str.replace('.', '', regex=False)

    # Column roles are decided from the training data's dtypes.
    numeric_names = train_features.select_dtypes(include=np.number).columns.tolist()
    categorical_names = train_features.select_dtypes(exclude=np.number).columns.tolist()

    # 1. Binarize numeric attributes against the training-set mean.
    train_means = {name: train_features[name].mean() for name in numeric_names}
    train_binary = pd.DataFrame({
        name: (train_features[name] > cutoff).astype(int)
        for name, cutoff in train_means.items()
    })
    test_binary = pd.DataFrame({
        name: (test_features[name] > cutoff).astype(int)
        for name, cutoff in train_means.items()
    })

    # 2. One-hot encode categorical attributes (encoder fitted on train only).
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    train_encoded = encoder.fit_transform(train_features[categorical_names])
    test_encoded = encoder.transform(test_features[categorical_names])
    encoded_names = encoder.get_feature_names_out(categorical_names)
    train_onehot = pd.DataFrame(train_encoded, columns=encoded_names)
    test_onehot = pd.DataFrame(test_encoded, columns=encoded_names)

    # The frames built from the encoder output get a fresh 0..n-1 index, while
    # the binarized columns and labels still carry the row labels left over
    # after dropna(). concat(axis=1) aligns on the index, so they must match.
    train_binary.index = train_onehot.index
    test_binary.index = test_onehot.index
    y_train.index = train_onehot.index
    y_test.index = test_onehot.index

    # 3. Combine: numeric-derived columns first, then the one-hot columns.
    X_train_processed = pd.concat([train_binary, train_onehot], axis=1)
    X_test_processed = pd.concat([test_binary, test_onehot], axis=1)

    return X_train_processed, y_train, X_test_processed, y_test
|
|
|
|
def run_kmeans_clustering(X_train: pd.DataFrame, k_values: List[int]):
    """Fit one K-Means model per requested cluster count and print its centroids.

    Each model is created with ``random_state=42`` and ``n_init=10`` and
    fitted on ``X_train``. Only the first five feature dimensions of every
    centroid are printed, to keep the output short.

    Args:
        X_train: Feature matrix to cluster.
        k_values: Cluster counts to try, processed in the given order.

    Returns:
        None. All results are written to standard output.
    """
    preview_width = 5  # number of leading feature dimensions shown per centroid

    print("--- K-Means Clustering ---")
    for n_clusters in k_values:
        print(f"\nRunning K-Means with k={n_clusters}...")
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X_train)

        print(f"Centroids for k={n_clusters}:")
        # Printing only the first few dimensions for brevity
        centroid_preview = pd.DataFrame(
            model.cluster_centers_[:, :preview_width],
            columns=X_train.columns[:preview_width],
        )
        print(centroid_preview)
        print("-" * 20)
|
|
|
|
def run_knn_classification(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    k_values: List[int],
):
    """Classify the last 10 test records with kNN and print the accuracy.

    For every neighbour count in ``k_values`` a ``KNeighborsClassifier`` is
    fitted on the full training data and used to predict the final 10 rows of
    the test set. The accuracy on those rows, the predicted labels and the
    actual labels are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix; only its last 10 rows are used.
        y_test: Test labels; only its last 10 entries are used.
        k_values: Neighbour counts to evaluate, processed in the given order.

    Returns:
        None. All results are written to standard output.
    """
    sample_size = 10

    print("\n--- k-Nearest Neighbors (kNN) Classification ---")

    # Evaluation is restricted to the tail of the test set.
    held_out_features = X_test.tail(sample_size)
    held_out_labels = y_test.tail(sample_size)

    print(f"Predicting for the last {len(held_out_features)} records of the test set.\n")

    for n_neighbors in k_values:
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
        predicted_labels = classifier.predict(held_out_features)
        score = accuracy_score(held_out_labels, predicted_labels)

        # One report per k, emitted line by line.
        print(
            f"kNN with k={n_neighbors}:",
            f" Prediction Accuracy: {score:.2f}",
            f" Predicted Labels: {predicted_labels}",
            f" Actual Labels: {held_out_labels.values}",
            "-" * 20,
            sep="\n",
        )
|
|
|
|
if __name__ == '__main__':
    # Script entry point: preprocess the adult dataset, then run the
    # clustering and classification experiments on the processed features.
    train_file, test_file = 'adult/adult.data', 'adult/adult.test'

    # Process data according to Part 2 requirements
    features_train, labels_train, features_test, labels_test = process_data_part2(
        train_file, test_file
    )

    print("Data processing complete.")
    print(f"Training data shape: {features_train.shape}")
    print(f"Test data shape: {features_test.shape}\n")

    # K-Means clustering on the training features for k = 3, 5 and 10
    run_kmeans_clustering(features_train, [3, 5, 10])

    # kNN classification of the test-set tail for k = 3, 5 and 10
    run_knn_classification(
        features_train, labels_train, features_test, labels_test, [3, 5, 10]
    )
|