# File-viewer metadata: 221 lines, 7.8 KiB, Python
import numpy as np
|
|
import pandas as pd
|
|
import lightgbm as lgb
|
|
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
from sklearn.metrics import accuracy_score, classification_report, log_loss
|
|
import pickle
|
|
import matplotlib.pyplot as plt
|
|
|
|
def _load_pickle(path):
    """Return the object unpickled from *path*."""
    # NOTE: unpickling can execute arbitrary code. Only point this at
    # artifacts written by this project's own preprocessing step.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _load_test_frame(feature_names):
    """Return 'X_test.npy' as a DataFrame, or None when the file is absent.

    Only a missing file is treated as "no test set". Any other failure
    (unreadable file, column-count mismatch with *feature_names*) is allowed
    to propagate instead of being silently ignored.
    """
    try:
        X_test = np.load('X_test.npy')
    except FileNotFoundError:
        print("\nNo 'X_test.npy' found; skipping test-set predictions.")
        return None
    return pd.DataFrame(X_test, columns=feature_names)


def train_lightgbm():
    """Train LightGBM classifier for destination prediction.

    Runs 5-fold stratified cross-validation with early stopping, reports
    out-of-fold (OOF) metrics and averaged gain-based feature importances.

    Reads (from the current working directory):
        X_train.npy, y_train.npy, feature_names.pkl,
        categorical_features.pkl, target_encoder.pkl and, if present,
        X_test.npy.

    Writes:
        lightgbm_model_fold{k}.txt  - one saved booster per fold
        lightgbm_models_list.pkl    - pickled list of those file names
        feature_importance.png      - bar plot of the top 20 features
        feature_importances_avg.csv - top 100 averaged importances
        lightgbm_test_preds_avg.npy - fold-averaged class probabilities for
                                      X_test (only when X_test.npy exists)

    Returns:
        list: the trained boosters, one per fold, in fold order.
    """
    banner = "=" * 70
    print(banner)
    print("LIGHTGBM CLASSIFIER TRAINING")
    print(banner)

    # Load preprocessed data and metadata
    print("\nLoading preprocessed data and metadata...")
    X_train = np.load('X_train.npy')
    y_train = np.load('y_train.npy')

    feature_names = _load_pickle('feature_names.pkl')
    categorical_features = _load_pickle('categorical_features.pkl')
    target_encoder = _load_pickle('target_encoder.pkl')

    class_names = target_encoder.classes_
    n_classes = len(class_names)
    # Encoded label ids 0..n_classes-1; passed explicitly to the metrics so
    # they stay aligned with the probability columns even if a class never
    # occurs in y_train.
    class_labels = np.arange(n_classes)
    n_samples = len(y_train)

    print(f"Training samples: {n_samples}")
    print(f"Number of classes: {n_classes}")
    print(f"Classes: {class_names}")

    # Show class distribution
    print("\nClass distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    for idx, count in zip(unique, counts):
        pct = count / n_samples * 100
        print(f"{class_names[idx]}: {count} ({pct:.2f}%)")

    # Convert to DataFrame to preserve feature names and pass categorical
    # features to LightGBM
    X_df = pd.DataFrame(X_train, columns=feature_names)

    print(f"\nUsing categorical features: {categorical_features}")

    # Prepare CV
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((X_df.shape[0], n_classes))

    # If a test set exists, prepare to accumulate test predictions
    X_test_df = _load_test_frame(feature_names)
    test_preds = None
    if X_test_df is not None:
        test_preds = np.zeros((X_test_df.shape[0], n_classes))

    # LightGBM parameters - updated for CV and categorical handling
    params = {
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 128,
        'learning_rate': 0.05,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'max_depth': -1,
        'min_data_in_leaf': 20,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'seed': 42,
        'boost_from_average': False,
    }

    print("\nLightGBM Parameters:")
    for key, value in params.items():
        print(f" {key}: {value}")

    # Train with Stratified K-Fold CV
    print("\nTraining LightGBM with Stratified K-Fold CV...")
    models = []
    model_files = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_df, y_train), start=1):
        print('\n' + '=' * 50)
        print(f"Fold {fold}/{n_splits}")
        print('=' * 50)

        X_tr = X_df.iloc[train_idx]
        X_val = X_df.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_val = y_train[val_idx]

        train_data = lgb.Dataset(
            X_tr,
            label=y_tr,
            feature_name=feature_names,
            categorical_feature=categorical_features,
        )
        val_data = lgb.Dataset(
            X_val,
            label=y_val,
            reference=train_data,
            feature_name=feature_names,
            categorical_feature=categorical_features,
        )

        model = lgb.train(
            params,
            train_data,
            num_boost_round=3000,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100),
            ],
        )
        models.append(model)

        # OOF predictions: each training row is scored by the one fold model
        # that did not see it during fitting.
        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)

        # Test predictions (if available), summed here and averaged after
        # the loop
        if X_test_df is not None:
            test_preds += model.predict(X_test_df, num_iteration=model.best_iteration)

        # Save fold model
        model_path = f'lightgbm_model_fold{fold}.txt'
        model.save_model(model_path)
        model_files.append(model_path)
        print(f"Saved model for fold {fold} as '{model_path}'")

    print("-" * 70)

    # Finish the test-set ensemble: the per-fold probabilities were only
    # summed above, so divide by the fold count and persist the result.
    if test_preds is not None:
        test_preds /= len(models)
        np.save('lightgbm_test_preds_avg.npy', test_preds)
        print("Saved fold-averaged test predictions as 'lightgbm_test_preds_avg.npy'")

    # Evaluate using OOF predictions across all folds
    print("\n" + banner)
    print("CROSS-VALIDATION OOF RESULTS")
    print(banner)

    oof_logloss = log_loss(y_train, oof_preds, labels=class_labels)
    oof_preds_argmax = np.argmax(oof_preds, axis=1)
    oof_accuracy = accuracy_score(y_train, oof_preds_argmax)

    print(f"\nOOF Accuracy: {oof_accuracy:.4f}")
    print(f"OOF Log Loss: {oof_logloss:.4f}")

    # Baseline accuracy on full train: always predict the most frequent class
    most_common = np.bincount(y_train).argmax()
    baseline = np.sum(y_train == most_common) / n_samples
    print(f"Baseline (always {class_names[most_common]}): {baseline:.4f}")
    print(f"Improvement over baseline: {(oof_accuracy - baseline):.4f}")

    # Show classification report on OOF hard predictions
    print("\nClassification Report (OOF predictions):")
    print(classification_report(
        y_train,
        oof_preds_argmax,
        labels=class_labels,
        target_names=class_names,
        zero_division=0,
    ))

    # Prediction distribution (OOF)
    print("\nOOF Prediction Distribution:")
    print(f"{'Class':<10} {'Actual':<10} {'Predicted':<10} {'Actual %':<12} {'Pred %'}")
    print("-" * 60)
    for idx, class_name in enumerate(class_names):
        actual_count = np.sum(y_train == idx)
        pred_count = np.sum(oof_preds_argmax == idx)
        actual_pct = actual_count / n_samples * 100
        pred_pct = pred_count / len(oof_preds_argmax) * 100
        print(f"{class_name:<10} {actual_count:<10} {pred_count:<10} {actual_pct:>6.2f}% {pred_pct:>6.2f}%")

    # Feature importance
    print("\n" + banner)
    print("TOP 20 FEATURE IMPORTANCES")
    print(banner)

    # Aggregate feature importances across folds
    total_importance = np.zeros(len(feature_names))
    for booster in models:
        total_importance += np.array(booster.feature_importance(importance_type='gain'))
    avg_importance = total_importance / max(1, len(models))

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': avg_importance,
    }).sort_values('importance', ascending=False)

    print(importance_df.head(20).to_string(index=False))

    # Plot feature importance
    fig = plt.figure(figsize=(10, 8))
    top_features = importance_df.head(20)
    positions = range(len(top_features))
    plt.barh(positions, top_features['importance'])
    plt.yticks(positions, top_features['feature'])
    plt.xlabel('Importance (Gain)')
    plt.title('Top 20 Feature Importances')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print("\nFeature importance plot saved as 'feature_importance.png'")

    # (Optional) Save a simple CSV of top importances
    importance_df.head(100).to_csv('feature_importances_avg.csv', index=False)
    print("Saved averaged feature importances as 'feature_importances_avg.csv'")

    # Save list of fold models
    with open('lightgbm_models_list.pkl', 'wb') as f:
        pickle.dump(model_files, f)

    print("\n" + banner)
    print(f"Saved {len(models)} fold models and model list 'lightgbm_models_list.pkl'")
    print(banner)

    return models


if __name__ == '__main__':
    # Script entry point: run the full cross-validated training, then print
    # a closing banner that points at the follow-up prediction script.
    fold_models = train_lightgbm()

    closing_rule = "=" * 70
    print("\n" + closing_rule)
    print("TRAINING COMPLETE!")
    print(closing_rule)
    print("\nNext step: Use predict_lightgbm.py to make predictions on test data")