# Datas-Mining-LightGBM/LightGBM/predict_lightgbm.py
# 2025-12-04 16:48:17 -06:00
#
# 91 lines
# 3.0 KiB
# Python

import numpy as np
import lightgbm as lgb
import pickle
def _load_lightgbm_models():
    """Load the trained LightGBM boosters from the current working directory.

    Fold models listed in 'lightgbm_models_list.pkl' are preferred; listed
    files that do not exist on disk are skipped with a warning. If that
    yields no model at all, the single-model file 'lightgbm_model.txt' is
    tried as a fallback.

    Returns:
        A non-empty list of ``lgb.Booster`` objects.

    Raises:
        FileNotFoundError: If no model file could be loaded.
    """
    import os

    models = []
    if os.path.exists('lightgbm_models_list.pkl'):
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # files that were produced by your own training run.
        with open('lightgbm_models_list.pkl', 'rb') as f:
            model_files = pickle.load(f)
        print(f"Loading {len(model_files)} fold models...")
        for mf in model_files:
            if os.path.exists(mf):
                models.append(lgb.Booster(model_file=mf))
            else:
                print(f"Warning: fold model '{mf}' not found, skipping.")

    # Fall back to the single model when the fold list is absent or unusable.
    if not models and os.path.exists('lightgbm_model.txt'):
        print("Loading single model 'lightgbm_model.txt'...")
        models = [lgb.Booster(model_file='lightgbm_model.txt')]

    # Without this guard an empty list would surface later as an obscure
    # "None / 0" TypeError when averaging.
    if not models:
        raise FileNotFoundError('No LightGBM model files found. Run training first.')
    return models


def predict_lightgbm():
    """Generate predictions using LightGBM model.

    Reads 'X_test.npy', 'test_ids.npy', the trained model file(s) and
    'target_encoder.pkl' from the current working directory, averages the
    predictions of all loaded models, writes the highest-scoring
    destinations per user (up to 5, fewer if the model has fewer classes)
    to 'submission_lightgbm.csv', and prints a summary of the top-1
    predictions.

    Returns:
        The averaged prediction array, one row per test sample and one
        column per class.

    Raises:
        FileNotFoundError: If an input file is missing or no model file
            could be loaded.
        ValueError: If the number of test ids does not match the number of
            test samples, or the averaged predictions are not 2-D.
    """
    print("=" * 70)
    print("LIGHTGBM PREDICTION")
    print("=" * 70)

    # Load test data
    print("Loading test data...")
    X_test = np.load('X_test.npy')
    test_ids = np.load('test_ids.npy', allow_pickle=True)
    print(f"Test samples: {len(X_test)}")
    if len(test_ids) != len(X_test):
        raise ValueError(
            f"test_ids has {len(test_ids)} entries but X_test has "
            f"{len(X_test)} rows"
        )

    # Load list of CV models, falling back to the single model file
    models = _load_lightgbm_models()

    # Load target encoder
    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # files that were produced by your own training run.
    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    # Make predictions (average over fold models)
    print("\nGenerating predictions by averaging fold models...")
    pred_sum = None
    for m in models:
        p = m.predict(X_test)
        # Keep a running sum so only one extra array is alive at a time.
        pred_sum = p if pred_sum is None else pred_sum + p
    y_pred_proba = pred_sum / len(models)
    if y_pred_proba.ndim != 2:
        raise ValueError(
            "Expected predictions of shape (n_samples, n_classes), got "
            f"shape {y_pred_proba.shape}"
        )

    # Get top-5 destinations for each user (fewer if there are < 5 classes)
    print("\nGenerating top-5 destinations per user...")
    top_k = min(5, y_pred_proba.shape[1])
    top_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :top_k]

    # Create submission with top_k rows per user. Decoding all indices in one
    # call replaces one inverse_transform call per (user, rank) pair; ravel()
    # is row-major, so rows stay grouped by user and ordered by rank.
    import pandas as pd
    submission_df = pd.DataFrame({
        'id': np.repeat(test_ids, top_k),
        'country': target_encoder.inverse_transform(top_indices.ravel()),
    })
    submission_df.to_csv('submission_lightgbm.csv', index=False)

    # Show distribution of top-1 predictions
    print("\nTop-1 Prediction Distribution:")
    top1_preds = target_encoder.inverse_transform(top_indices[:, 0])
    unique, counts = np.unique(top1_preds, return_counts=True)
    for country, count in sorted(zip(unique, counts), key=lambda x: -x[1]):
        pct = count / len(top1_preds) * 100
        print(f"{country}: {count} ({pct:.2f}%)")

    print("\n" + "=" * 70)
    print("Predictions saved as 'submission_lightgbm.csv'")
    print("=" * 70)
    print(f"\nTotal users: {len(test_ids)}")
    print(f"Total rows ({top_k} per user): {len(submission_df)}")
    print(f"Unique destinations in top-1: {len(unique)}")
    return y_pred_proba
if __name__ == '__main__':
    # Script entry point: run the whole prediction pipeline, then print a
    # closing banner (blank line, rule, message, rule).
    predictions = predict_lightgbm()
    print("\n" + "=" * 70, "PREDICTION COMPLETE!", "=" * 70, sep="\n")