import os
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd


def predict_lightgbm():
    """Generate top-5 destination predictions using saved LightGBM model(s).

    Files read from the current working directory:
        * ``X_test.npy`` and ``test_ids.npy`` -- test features and row ids.
        * ``lightgbm_models_list.pkl`` -- pickled list of per-fold model file
          paths; if it is absent, ``lightgbm_model.txt`` is used instead.
        * ``target_encoder.pkl`` -- pickled encoder whose
          ``inverse_transform`` maps class indices back to labels.

    File written:
        * ``submission_lightgbm.csv`` with columns ``id`` and ``country``,
          one row per (user, rank) pair, best-ranked first within each user.

    Returns:
        The per-class predictions averaged over all loaded models.

    Raises:
        FileNotFoundError: If no model file could be found or loaded.
        ValueError: If the number of test ids differs from the number of
            test samples.
    """
    banner = "=" * 70
    print(banner)
    print("LIGHTGBM PREDICTION")
    print(banner)

    # Load test data
    print("Loading test data...")
    X_test = np.load('X_test.npy')
    test_ids = np.load('test_ids.npy', allow_pickle=True)
    print(f"Test samples: {len(X_test)}")

    # Ids and features must line up row-for-row; a mismatch would otherwise
    # produce a truncated or mislabeled submission.
    if len(test_ids) != len(X_test):
        raise ValueError(
            f"test_ids has {len(test_ids)} entries but X_test has "
            f"{len(X_test)} rows"
        )

    # Try to load the list of CV fold models; fall back to a single model file.
    # NOTE: pickle.load executes arbitrary code -- only use artifacts produced
    # by a trusted training run.
    models = []
    if os.path.exists('lightgbm_models_list.pkl'):
        with open('lightgbm_models_list.pkl', 'rb') as f:
            model_files = pickle.load(f)
        print(f"Loading {len(model_files)} fold models...")
        for mf in model_files:
            if os.path.exists(mf):
                models.append(lgb.Booster(model_file=mf))
            else:
                # Previously skipped silently; surface it so a partial
                # ensemble is not mistaken for the full one.
                print(f"Warning: fold model file '{mf}' not found; skipping.")
    elif os.path.exists('lightgbm_model.txt'):
        print("Loading single model 'lightgbm_model.txt'...")
        models = [lgb.Booster(model_file='lightgbm_model.txt')]
    else:
        raise FileNotFoundError('No LightGBM model files found. Run training first.')

    # The list file may exist while every fold file it names is missing;
    # fail clearly here instead of dividing None by zero further down.
    if not models:
        raise FileNotFoundError(
            "None of the model files listed in 'lightgbm_models_list.pkl' "
            "exist. Run training first."
        )

    # Load target encoder
    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    # Make predictions (average over fold models) using a running sum so only
    # one extra prediction array is alive at a time.
    print("\nGenerating predictions by averaging fold models...")
    preds = None
    for m in models:
        p = m.predict(X_test)
        if preds is None:
            preds = p
        else:
            preds += p
    y_pred_proba = preds / len(models)

    # Get top-5 destinations for each user: sort ascending, reverse each row,
    # keep the first five columns (highest score first).
    print("\nGenerating top-5 destinations per user...")
    top5_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :5]

    # Create submission with one row per (user, rank). Decoding every index in
    # a single call replaces a per-row inverse_transform loop; ravel() walks
    # the matrix row by row, so each user's ranks stay contiguous and ordered.
    ranks_per_user = top5_indices.shape[1]
    submission_df = pd.DataFrame({
        'id': np.repeat(test_ids, ranks_per_user),
        'country': target_encoder.inverse_transform(top5_indices.ravel()),
    })
    submission_df.to_csv('submission_lightgbm.csv', index=False)

    # Show distribution of top-1 predictions
    print("\nTop-1 Prediction Distribution:")
    top1_preds = target_encoder.inverse_transform(top5_indices[:, 0])
    unique, counts = np.unique(top1_preds, return_counts=True)
    for country, count in sorted(zip(unique, counts), key=lambda x: -x[1]):
        pct = count / len(top1_preds) * 100
        print(f"{country}: {count} ({pct:.2f}%)")

    print("\n" + banner)
    print("Predictions saved as 'submission_lightgbm.csv'")
    print(banner)
    print(f"\nTotal users: {len(test_ids)}")
    print(f"Total rows (5 per user): {len(submission_df)}")
    print(f"Unique destinations in top-1: {len(unique)}")

    return y_pred_proba


if __name__ == '__main__':
    predictions = predict_lightgbm()
    print("\n" + "=" * 70)
    print("PREDICTION COMPLETE!")
    print("=" * 70)