# Datas-Mining-LightGBM/LightGBM/train_lightgbm.py
# 2025-12-04 16:48:17 -06:00
#
# 221 lines
# 7.8 KiB
# Python

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, log_loss
import pickle
import matplotlib.pyplot as plt
def _load_training_artifacts():
    """Load the preprocessed training arrays and their pickled metadata.

    Reads ``X_train.npy``, ``y_train.npy``, ``feature_names.pkl``,
    ``categorical_features.pkl`` and ``target_encoder.pkl`` from the current
    working directory.

    Returns:
        tuple: ``(X_train, y_train, feature_names, categorical_features,
        target_encoder)``.
    """
    X_train = np.load('X_train.npy')
    y_train = np.load('y_train.npy')

    # NOTE(security): pickle.load can execute arbitrary code. These files are
    # expected to be produced by this project's own preprocessing step; do not
    # point this script at pickles from an untrusted source.
    with open('feature_names.pkl', 'rb') as f:
        feature_names = pickle.load(f)
    with open('categorical_features.pkl', 'rb') as f:
        categorical_features = pickle.load(f)
    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    return X_train, y_train, feature_names, categorical_features, target_encoder


def _load_optional_test_frame(feature_names):
    """Return ``X_test.npy`` as a DataFrame, or ``None`` if it cannot be used.

    The test set is optional: a missing file is reported and skipped. Any
    other failure (unreadable file, column-count mismatch, ...) is also
    non-fatal, but is reported instead of being silently swallowed.
    """
    try:
        X_test = np.load('X_test.npy')
        return pd.DataFrame(X_test, columns=feature_names)
    except FileNotFoundError:
        print("\nNo 'X_test.npy' found; skipping test-set predictions.")
    except Exception as exc:  # best-effort: training must still proceed
        print(f"\nWarning: could not use 'X_test.npy' ({exc!r}); "
              "skipping test-set predictions.")
    return None


def _report_oof_metrics(y_train, oof_preds, class_names):
    """Print accuracy, log loss, baseline, report and prediction distribution.

    Args:
        y_train: Integer-encoded true labels for every training row.
        oof_preds: Out-of-fold class probabilities, one row per training row
            and one column per entry of ``class_names``.
        class_names: Class labels indexed by encoded class id.
    """
    n_classes = len(class_names)
    # Pass the full label set explicitly so the metrics stay well-defined even
    # if some encoded class never occurs in y_train.
    class_indices = np.arange(n_classes)

    print("\n" + "="*70)
    print("CROSS-VALIDATION OOF RESULTS")
    print("="*70)
    oof_logloss = log_loss(y_train, oof_preds, labels=class_indices)
    oof_preds_argmax = np.argmax(oof_preds, axis=1)
    oof_accuracy = accuracy_score(y_train, oof_preds_argmax)
    print(f"\nOOF Accuracy: {oof_accuracy:.4f}")
    print(f"OOF Log Loss: {oof_logloss:.4f}")

    # Baseline: always predict the most frequent class of the training set.
    most_common = np.bincount(y_train).argmax()
    baseline = np.sum(y_train == most_common) / len(y_train)
    print(f"Baseline (always {class_names[most_common]}): {baseline:.4f}")
    print(f"Improvement over baseline: {(oof_accuracy - baseline):.4f}")

    print("\nClassification Report (OOF predictions):")
    print(classification_report(
        y_train,
        oof_preds_argmax,
        labels=class_indices,
        target_names=class_names,
        zero_division=0,
    ))

    print("\nOOF Prediction Distribution:")
    print(f"{'Class':<10} {'Actual':<10} {'Predicted':<10} {'Actual %':<12} {'Pred %'}")
    print("-" * 60)
    for idx, class_name in enumerate(class_names):
        actual_count = np.sum(y_train == idx)
        pred_count = np.sum(oof_preds_argmax == idx)
        actual_pct = actual_count / len(y_train) * 100
        pred_pct = pred_count / len(oof_preds_argmax) * 100
        print(f"{class_name:<10} {actual_count:<10} {pred_count:<10} {actual_pct:>6.2f}% {pred_pct:>6.2f}%")


def _save_feature_importance(models, feature_names):
    """Print, plot and export gain importances averaged over the fold models.

    Writes ``feature_importance.png`` (top 20) and
    ``feature_importances_avg.csv`` (top 100) to the working directory.
    """
    print("\n" + "="*70)
    print("TOP 20 FEATURE IMPORTANCES")
    print("="*70)

    total_importance = np.zeros(len(feature_names))
    for m in models:
        total_importance += np.array(m.feature_importance(importance_type='gain'))
    # max(1, ...) guards the division if no model was trained.
    avg_importance = total_importance / max(1, len(models))

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': avg_importance
    }).sort_values('importance', ascending=False)
    print(importance_df.head(20).to_string(index=False))

    fig = plt.figure(figsize=(10, 8))
    top_features = importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Importance (Gain)')
    plt.title('Top 20 Feature Importances')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print("\nFeature importance plot saved as 'feature_importance.png'")

    importance_df.head(100).to_csv('feature_importances_avg.csv', index=False)
    print("Saved averaged feature importances as 'feature_importances_avg.csv'")


def train_lightgbm():
    """Train LightGBM classifier for destination prediction.

    Loads the preprocessed training data from the working directory, trains
    one multiclass LightGBM model per fold of a 5-fold stratified split with
    early stopping, and reports out-of-fold (OOF) metrics.

    Files written to the working directory:
        * ``lightgbm_model_fold{k}.txt`` - one saved model per fold.
        * ``lightgbm_models_list.pkl`` - pickled list of those filenames.
        * ``feature_importance.png`` / ``feature_importances_avg.csv``.
        * ``lightgbm_test_preds_cv.npy`` - fold-averaged class probabilities
          for ``X_test.npy``, only when that file is present and usable.

    Returns:
        list: The per-fold models returned by ``lgb.train``, in fold order.
    """
    print("="*70)
    print("LIGHTGBM CLASSIFIER TRAINING")
    print("="*70)

    print("\nLoading preprocessed data and metadata...")
    (X_train, y_train, feature_names,
     categorical_features, target_encoder) = _load_training_artifacts()
    class_names = target_encoder.classes_
    n_classes = len(class_names)

    print(f"Training samples: {len(y_train)}")
    print(f"Number of classes: {n_classes}")
    print(f"Classes: {class_names}")

    print("\nClass distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    for idx, count in zip(unique, counts):
        pct = count / len(y_train) * 100
        print(f"{class_names[idx]}: {count} ({pct:.2f}%)")

    # A DataFrame keeps the feature names attached so categorical columns can
    # be passed to LightGBM by name.
    X_df = pd.DataFrame(X_train, columns=feature_names)
    print(f"\nUsing categorical features: {categorical_features}")

    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((X_df.shape[0], n_classes))

    # The test set is optional; when present, per-fold predictions are
    # accumulated here and averaged after the last fold.
    X_test_df = _load_optional_test_frame(feature_names)
    test_preds = None
    if X_test_df is not None:
        test_preds = np.zeros((X_test_df.shape[0], n_classes))

    params = {
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 128,
        'learning_rate': 0.05,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'max_depth': -1,
        'min_data_in_leaf': 20,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'seed': 42,
        'boost_from_average': False
    }
    print("\nLightGBM Parameters:")
    for key, value in params.items():
        print(f" {key}: {value}")

    print("\nTraining LightGBM with Stratified K-Fold CV...")
    models = []
    model_files = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_df, y_train), start=1):
        print('\n' + '='*50)
        print(f"Fold {fold}/{n_splits}")
        print('='*50)

        X_tr = X_df.iloc[train_idx]
        X_val = X_df.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_val = y_train[val_idx]

        train_data = lgb.Dataset(
            X_tr,
            label=y_tr,
            feature_name=feature_names,
            categorical_feature=categorical_features,
        )
        val_data = lgb.Dataset(
            X_val,
            label=y_val,
            reference=train_data,
            feature_name=feature_names,
            categorical_feature=categorical_features,
        )

        model = lgb.train(
            params,
            train_data,
            num_boost_round=3000,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100)
            ]
        )
        models.append(model)

        # Each training row is predicted exactly once, by the fold model that
        # did not see it during training.
        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)

        if X_test_df is not None:
            test_preds += model.predict(X_test_df, num_iteration=model.best_iteration)

        model_file = f'lightgbm_model_fold{fold}.txt'
        model.save_model(model_file)
        model_files.append(model_file)
        print(f"Saved model for fold {fold} as 'lightgbm_model_fold{fold}.txt'")
        print("-" * 70)

    # Turn the per-fold sum into a mean and persist it; previously the
    # accumulated test predictions were computed and then discarded.
    if test_preds is not None:
        test_preds /= n_splits
        np.save('lightgbm_test_preds_cv.npy', test_preds)
        print("\nSaved fold-averaged test predictions as 'lightgbm_test_preds_cv.npy'")

    _report_oof_metrics(y_train, oof_preds, class_names)
    _save_feature_importance(models, feature_names)

    with open('lightgbm_models_list.pkl', 'wb') as f:
        pickle.dump(model_files, f)
    print("\n" + "="*70)
    print(f"Saved {len(models)} fold models and model list 'lightgbm_models_list.pkl'")
    print("="*70)
    return models
if __name__ == '__main__':
    # Script entry point: run the cross-validated training, then print a
    # completion banner and a pointer to the prediction script.
    fold_models = train_lightgbm()
    rule = "=" * 70
    closing_lines = (
        "\n" + rule,
        "TRAINING COMPLETE!",
        rule,
        "\nNext step: Use predict_lightgbm.py to make predictions on test data",
    )
    for line in closing_lines:
        print(line)