import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import StratifiedKFold, train_test_split


def _load_pickle(path):
    """Return the object unpickled from *path*."""
    # NOTE: unpickling can execute arbitrary code. Only load artifacts that
    # were written by a trusted preprocessing step.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _load_test_features(feature_names):
    """Return 'X_test.npy' as a DataFrame, or None when the file is absent."""
    try:
        X_test = np.load('X_test.npy')
    except FileNotFoundError:
        # The test set is optional; every other error (e.g. a corrupt file or
        # a column-count mismatch below) is allowed to surface.
        print("\nNo 'X_test.npy' found; skipping test-set predictions.")
        return None
    return pd.DataFrame(X_test, columns=feature_names)


def _report_oof_results(y_train, oof_preds, class_names):
    """Print accuracy, log loss, baseline and per-class stats for OOF predictions."""
    print("\n" + "="*70)
    print("CROSS-VALIDATION OOF RESULTS")
    print("="*70)

    # Pass the label set explicitly: oof_preds has one column per encoder
    # class, which may be more than the classes present in y_train.
    class_ids = np.arange(len(class_names))
    oof_logloss = log_loss(y_train, oof_preds, labels=class_ids)
    oof_preds_argmax = np.argmax(oof_preds, axis=1)
    oof_accuracy = accuracy_score(y_train, oof_preds_argmax)

    print(f"\nOOF Accuracy: {oof_accuracy:.4f}")
    print(f"OOF Log Loss: {oof_logloss:.4f}")

    # Baseline accuracy on full train: always predict the most frequent class
    most_common = np.bincount(y_train).argmax()
    baseline = np.sum(y_train == most_common) / len(y_train)
    print(f"Baseline (always {class_names[most_common]}): {baseline:.4f}")
    print(f"Improvement over baseline: {(oof_accuracy - baseline):.4f}")

    # Show classification report on OOF hard predictions
    print("\nClassification Report (OOF predictions):")
    print(classification_report(
        y_train,
        oof_preds_argmax,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    ))

    # Prediction distribution (OOF)
    print("\nOOF Prediction Distribution:")
    print(f"{'Class':<10} {'Actual':<10} {'Predicted':<10} {'Actual %':<12} {'Pred %'}")
    print("-" * 60)
    for idx, class_name in enumerate(class_names):
        actual_count = np.sum(y_train == idx)
        pred_count = np.sum(oof_preds_argmax == idx)
        actual_pct = actual_count / len(y_train) * 100
        pred_pct = pred_count / len(oof_preds_argmax) * 100
        print(f"{class_name:<10} {actual_count:<10} {pred_count:<10} {actual_pct:>6.2f}% {pred_pct:>6.2f}%")


def _save_feature_importances(models, feature_names):
    """Print, plot and save gain importances averaged over the fold models."""
    print("\n" + "="*70)
    print("TOP 20 FEATURE IMPORTANCES")
    print("="*70)

    # Aggregate feature importances across folds
    total_importance = np.zeros(len(feature_names))
    for m in models:
        total_importance += np.array(m.feature_importance(importance_type='gain'))
    avg_importance = total_importance / max(1, len(models))

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': avg_importance,
    }).sort_values('importance', ascending=False)

    print(importance_df.head(20).to_string(index=False))

    # Plot feature importance
    fig = plt.figure(figsize=(10, 8))
    top_features = importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Importance (Gain)')
    plt.title('Top 20 Feature Importances')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300)
    plt.close(fig)  # release the figure once it is on disk
    print("\nFeature importance plot saved as 'feature_importance.png'")

    # (Optional) Save a simple CSV of top importances
    importance_df.head(100).to_csv('feature_importances_avg.csv', index=False)
    print("Saved averaged feature importances as 'feature_importances_avg.csv'")


def train_lightgbm():
    """Train LightGBM classifier for destination prediction.

    Loads 'X_train.npy', 'y_train.npy', 'feature_names.pkl',
    'categorical_features.pkl' and 'target_encoder.pkl' from the working
    directory, trains one model per fold of a 5-fold stratified CV, and
    reports out-of-fold (OOF) metrics.

    Files written: 'lightgbm_model_fold{k}.txt' per fold,
    'feature_importance.png', 'feature_importances_avg.csv',
    'lightgbm_models_list.pkl', and, when 'X_test.npy' exists,
    'lightgbm_test_preds.npy' holding the fold-averaged class probabilities.

    Returns:
        The list of trained fold models, in fold order.
    """
    print("="*70)
    print("LIGHTGBM CLASSIFIER TRAINING")
    print("="*70)

    # Load preprocessed data and metadata
    print("\nLoading preprocessed data and metadata...")
    X_train = np.load('X_train.npy')
    y_train = np.load('y_train.npy')

    feature_names = _load_pickle('feature_names.pkl')
    categorical_features = _load_pickle('categorical_features.pkl')
    target_encoder = _load_pickle('target_encoder.pkl')
    class_names = target_encoder.classes_
    n_classes = len(class_names)

    print(f"Training samples: {len(y_train)}")
    print(f"Number of classes: {n_classes}")
    print(f"Classes: {class_names}")

    # Show class distribution
    print("\nClass distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    for idx, count in zip(unique, counts):
        pct = count / len(y_train) * 100
        print(f"{class_names[idx]}: {count} ({pct:.2f}%)")

    # Convert to DataFrame to preserve feature names and pass categorical
    # features to LightGBM
    X_df = pd.DataFrame(X_train, columns=feature_names)
    print(f"\nUsing categorical features: {categorical_features}")

    # Prepare CV
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((X_df.shape[0], n_classes))

    # If test set exists, prepare to accumulate test predictions
    X_test_df = _load_test_features(feature_names)
    test_preds = None
    if X_test_df is not None:
        test_preds = np.zeros((X_test_df.shape[0], n_classes))

    # LightGBM parameters - updated for CV and categorical handling
    params = {
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 128,
        'learning_rate': 0.05,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'max_depth': -1,
        'min_data_in_leaf': 20,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'seed': 42,
        'boost_from_average': False,
    }

    print("\nLightGBM Parameters:")
    for key, value in params.items():
        print(f" {key}: {value}")

    # Train with Stratified K-Fold CV
    print("\nTraining LightGBM with Stratified K-Fold CV...")
    models = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_df, y_train), start=1):
        print('\n' + '='*50)
        print(f"Fold {fold}/{n_splits}")
        print('='*50)

        X_tr = X_df.iloc[train_idx]
        X_val = X_df.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_val = y_train[val_idx]

        train_data = lgb.Dataset(
            X_tr,
            label=y_tr,
            feature_name=feature_names,
            categorical_feature=categorical_features,
        )
        val_data = lgb.Dataset(
            X_val,
            label=y_val,
            reference=train_data,
            feature_name=feature_names,
            categorical_feature=categorical_features,
        )

        model = lgb.train(
            params,
            train_data,
            num_boost_round=3000,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100),
            ],
        )
        models.append(model)

        # OOF predictions: each row is predicted only by the model that did
        # not see it during training
        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)

        # Test predictions (if available); summed here, averaged after the loop
        if X_test_df is not None:
            test_preds += model.predict(X_test_df, num_iteration=model.best_iteration)

        # Save fold model
        model.save_model(f'lightgbm_model_fold{fold}.txt')
        print(f"Saved model for fold {fold} as 'lightgbm_model_fold{fold}.txt'")
        print("-" * 70)

    # Turn the per-fold sum into a mean and persist it so the work is not lost
    if test_preds is not None:
        test_preds /= len(models)
        np.save('lightgbm_test_preds.npy', test_preds)
        print("\nSaved fold-averaged test predictions as 'lightgbm_test_preds.npy'")

    # Evaluate using OOF predictions across all folds
    _report_oof_results(y_train, oof_preds, class_names)

    # Feature importance
    _save_feature_importances(models, feature_names)

    # Save list of fold models
    model_files = [f'lightgbm_model_fold{i+1}.txt' for i in range(len(models))]
    with open('lightgbm_models_list.pkl', 'wb') as f:
        pickle.dump(model_files, f)

    print("\n" + "="*70)
    print(f"Saved {len(models)} fold models and model list 'lightgbm_models_list.pkl'")
    print("="*70)

    return models


if __name__ == '__main__':
    models = train_lightgbm()

    print("\n" + "="*70)
    print("TRAINING COMPLETE!")
    print("="*70)
    print("\nNext step: Use predict_lightgbm.py to make predictions on test data")