# Listing metadata (from the code viewer): 91 lines, 3.0 KiB, Python
import numpy as np
|
|
import lightgbm as lgb
|
|
import pickle
|
|
|
|
def predict_lightgbm():
    """Generate predictions using LightGBM model(s) and write a submission CSV.

    Reads ``X_test.npy`` and ``test_ids.npy`` from the current directory,
    loads either the fold models listed in ``lightgbm_models_list.pkl`` or the
    single model ``lightgbm_model.txt``, averages the per-model predictions,
    and writes the highest-scoring destinations (up to 5 per user, best
    first) to ``submission_lightgbm.csv`` with columns ``id`` and ``country``.
    Progress and a top-1 prediction distribution are printed to stdout.

    Returns:
        The averaged prediction array (one row per test sample).

    Raises:
        FileNotFoundError: If no LightGBM model could be loaded, or a required
            input file is missing.
        ValueError: If ``test_ids`` and ``X_test`` have different lengths, or
            the model output is not a 2-D per-class score matrix.
    """
    import os

    import pandas as pd

    banner = "=" * 70
    print(banner)
    print("LIGHTGBM PREDICTION")
    print(banner)

    # Load test data
    print("Loading test data...")
    X_test = np.load('X_test.npy')
    # NOTE(security): allow_pickle=True here and pickle.load() below can
    # execute arbitrary code. Only run this against files produced by the
    # project's own training pipeline.
    test_ids = np.load('test_ids.npy', allow_pickle=True)

    print(f"Test samples: {len(X_test)}")

    # Row i of the predictions is attributed to test_ids[i], so the two
    # inputs must line up exactly.
    if len(test_ids) != len(X_test):
        raise ValueError(
            f"test_ids has {len(test_ids)} entries but X_test has "
            f"{len(X_test)} rows"
        )

    # Try to load list of CV models; fall back to single model file
    models = []
    if os.path.exists('lightgbm_models_list.pkl'):
        with open('lightgbm_models_list.pkl', 'rb') as f:
            model_files = pickle.load(f)
        print(f"Loading {len(model_files)} fold models...")
        for model_file in model_files:
            if os.path.exists(model_file):
                models.append(lgb.Booster(model_file=model_file))
            else:
                # Best effort: keep going with the folds that do exist, but
                # make the omission visible.
                print(f"Warning: fold model '{model_file}' not found; skipping.")
    elif os.path.exists('lightgbm_model.txt'):
        print("Loading single model 'lightgbm_model.txt'...")
        models = [lgb.Booster(model_file='lightgbm_model.txt')]

    # Covers both "no model files at all" and "list file present but every
    # listed fold is missing" (which previously failed later with a TypeError).
    if not models:
        raise FileNotFoundError('No LightGBM model files found. Run training first.')

    # Load target encoder (presumably a scikit-learn LabelEncoder; only its
    # inverse_transform method is used here).
    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    # Make predictions (average over fold models)
    print("\nGenerating predictions by averaging fold models...")
    pred_sum = None
    for model in models:
        fold_pred = model.predict(X_test)
        pred_sum = fold_pred if pred_sum is None else pred_sum + fold_pred
    y_pred_proba = pred_sum / len(models)

    if y_pred_proba.ndim != 2:
        raise ValueError(
            "Expected a 2-D (samples x classes) prediction array, got shape "
            f"{y_pred_proba.shape}"
        )
    # Never ask for more ranks than there are classes.
    top_k = min(5, y_pred_proba.shape[1])

    # Get top-5 destinations for each user (highest score first)
    print("\nGenerating top-5 destinations per user...")
    top5_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :top_k]

    # Create submission with top_k rows per user. ravel() walks the index
    # matrix user by user, matching np.repeat's "each id top_k times" order.
    submission_df = pd.DataFrame({
        'id': np.repeat(test_ids, top_k),
        'country': target_encoder.inverse_transform(top5_indices.ravel()),
    })
    submission_df.to_csv('submission_lightgbm.csv', index=False)

    # Show distribution of top-1 predictions, most frequent first
    print("\nTop-1 Prediction Distribution:")
    top1_preds = target_encoder.inverse_transform(top5_indices[:, 0])
    unique, counts = np.unique(top1_preds, return_counts=True)
    for country, count in sorted(zip(unique, counts), key=lambda x: -x[1]):
        pct = count / len(top1_preds) * 100
        print(f"{country}: {count} ({pct:.2f}%)")

    print("\n" + banner)
    print("Predictions saved as 'submission_lightgbm.csv'")
    print(banner)
    print(f"\nTotal users: {len(test_ids)}")
    print(f"Total rows ({top_k} per user): {len(submission_df)}")
    print(f"Unique destinations in top-1: {len(unique)}")

    return y_pred_proba
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run the full prediction pipeline, then print a
    # closing banner once it has finished.
    predictions = predict_lightgbm()

    rule = "=" * 70
    print("\n" + rule)
    print("PREDICTION COMPLETE!")
    print(rule)
|