init
This commit is contained in:
commit
e8e36adb24
243
LightGBM/data_preprocessing.py
Normal file
243
LightGBM/data_preprocessing.py
Normal file
@ -0,0 +1,243 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
import pickle
|
||||
|
||||
def load_data(data_dir='../data'):
    """Load the five raw CSV files used by the pipeline.

    Args:
        data_dir: Directory that contains the CSV files. Defaults to
            ``'../data'``, the location that used to be hard-coded, so
            existing zero-argument callers behave exactly as before.

    Returns:
        Tuple ``(train_users, test_users, sessions, countries, age_gender)``
        of DataFrames read from ``train_users_2.csv``, ``test_users.csv``,
        ``sessions.csv``, ``countries.csv`` and ``age_gender_bkts.csv``.
    """
    import os  # local import: this module does not import os at the top

    print("Loading data...")
    file_names = (
        'train_users_2.csv',
        'test_users.csv',
        'sessions.csv',
        'countries.csv',
        'age_gender_bkts.csv',
    )
    return tuple(
        pd.read_csv(os.path.join(data_dir, name)) for name in file_names
    )
|
||||
|
||||
def preprocess_users(df, is_train=True):
    """Clean a users table and expand its date columns into numeric features.

    The input frame is not modified; a processed copy is returned.

    Args:
        df: Users DataFrame. Must contain ``date_account_created``,
            ``timestamp_first_active`` (``YYYYMMDDHHMMSS``), ``age`` and
            ``gender``; the other categorical columns are optional.
        is_train: Only selects the wording of the progress message.

    Returns:
        DataFrame with ``dac_*`` / ``tfa_*`` date parts appended, the raw
        date columns (and ``date_first_booking``, if present) removed,
        out-of-range or missing ages set to ``-1``, and missing categorical
        values replaced by placeholders.
    """
    print(f"Preprocessing {'train' if is_train else 'test'} users...")

    df = df.copy()

    created = pd.to_datetime(df['date_account_created'])
    first_active = pd.to_datetime(
        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S'
    )

    # Date parts of account creation.
    df['dac_year'] = created.dt.year
    df['dac_month'] = created.dt.month
    df['dac_day'] = created.dt.day
    df['dac_weekday'] = created.dt.weekday

    # Date parts of first activity.
    df['tfa_year'] = first_active.dt.year
    df['tfa_month'] = first_active.dt.month
    df['tfa_day'] = first_active.dt.day

    # date_first_booking leaks the target (a booking date exists only if the
    # user booked), so it is dropped without deriving any feature from it.
    if 'date_first_booking' in df.columns:
        df = df.drop('date_first_booking', axis=1)

    df = df.drop(['date_account_created', 'timestamp_first_active'], axis=1)

    # Ages outside [18, 100] and missing ages share the sentinel -1.
    age = df['age'].fillna(-1)
    df['age'] = age.where(age.between(18, 100), -1)

    df['gender'] = df['gender'].fillna('-unknown-')

    categorical_cols = ['signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked',
                        'signup_app', 'first_device_type', 'first_browser']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna('unknown')

    # signup_flow is cast to int by encode_features, so a missing value must
    # stay numeric: the string 'unknown' would make that cast raise.
    if 'signup_flow' in df.columns:
        df['signup_flow'] = df['signup_flow'].fillna(-1)

    return df
|
||||
|
||||
def _most_common(values):
    """Return the most frequent value of a non-empty group."""
    return values.value_counts().index[0]


def aggregate_sessions(sessions):
    """Aggregate the session log into one feature row per user.

    Missing values in the non-timing columns are replaced by the sentinel
    ``-1`` before grouping. Missing ``secs_elapsed`` values are left missing
    so they are skipped by sum/mean/max/min instead of being counted as
    ``-1`` seconds; a user with no timing information at all gets ``0`` for
    every timing feature.

    Args:
        sessions: Session DataFrame with columns ``user_id``, ``action``,
            ``action_type``, ``action_detail``, ``device_type`` and
            ``secs_elapsed``.

    Returns:
        DataFrame with columns ``user_id``, ``num_actions``,
        ``most_common_action_type``, ``most_common_action_detail``,
        ``most_common_device_type``, ``total_secs``, ``mean_secs``,
        ``max_secs`` and ``min_secs``; an empty DataFrame when ``sessions``
        is empty.
    """
    print("Aggregating session data...")

    if sessions.empty:
        return pd.DataFrame()

    # Fill everything except the timings, which must stay NaN to be ignored.
    filled = sessions.fillna(-1)
    filled['secs_elapsed'] = sessions['secs_elapsed']

    session_agg = filled.groupby('user_id').agg(
        num_actions=('action', 'count'),
        most_common_action_type=('action_type', _most_common),
        most_common_action_detail=('action_detail', _most_common),
        most_common_device_type=('device_type', _most_common),
        total_secs=('secs_elapsed', 'sum'),
        mean_secs=('secs_elapsed', 'mean'),
        max_secs=('secs_elapsed', 'max'),
        min_secs=('secs_elapsed', 'min'),
    ).reset_index()

    # Users whose every timing was missing have NaN mean/max/min.
    timing_cols = ['total_secs', 'mean_secs', 'max_secs', 'min_secs']
    session_agg[timing_cols] = session_agg[timing_cols].fillna(0)

    return session_agg
|
||||
|
||||
def encode_features(train_df, test_df, label_encoders=None):
    """Label-encode the categorical columns of the train and test frames.

    The inputs are copied, so the caller's DataFrames are left untouched.

    Args:
        train_df: Training features.
        test_df: Test features; expected to contain the same categorical
            columns as ``train_df``.
        label_encoders: Optional dict mapping column name to an already
            fitted encoder. When ``None``, one ``LabelEncoder`` per
            categorical column is fitted on the train and test values
            combined, so every category seen in either set has a code.

    Returns:
        Tuple ``(train_df, test_df, label_encoders)`` with categorical
        columns replaced by integer codes and ``signup_flow`` cast to int.
    """
    print("Encoding features...")

    train_df = train_df.copy()
    test_df = test_df.copy()

    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']

    # Session-derived categoricals exist only when session data was merged.
    if 'most_common_action_type' in train_df.columns:
        categorical_cols.extend(['most_common_action_type', 'most_common_action_detail',
                                 'most_common_device_type'])

    present_cols = [col for col in categorical_cols if col in train_df.columns]

    if label_encoders is None:
        label_encoders = {}
        for col in present_cols:
            encoder = LabelEncoder()
            # Values are compared as strings because columns may mix types.
            encoder.fit(pd.concat([train_df[col], test_df[col]]).astype(str))
            label_encoders[col] = encoder

    for col in present_cols:
        encoder = label_encoders[col]
        train_df[col] = encoder.transform(train_df[col].astype(str))
        test_df[col] = encoder.transform(test_df[col].astype(str))

    # signup_flow is numeric. Anything unparseable (for example a textual
    # missing-value placeholder) becomes -1 instead of raising in the cast.
    if 'signup_flow' in train_df.columns:
        train_df['signup_flow'] = (
            pd.to_numeric(train_df['signup_flow'], errors='coerce').fillna(-1).astype(int)
        )
        test_df['signup_flow'] = (
            pd.to_numeric(test_df['signup_flow'], errors='coerce').fillna(-1).astype(int)
        )

    return train_df, test_df, label_encoders
|
||||
|
||||
def _attach_session_features(users, session_agg):
    """Left-join per-user session aggregates onto ``users`` and fill the gaps."""
    if session_agg.empty:
        return users

    # session_agg has one row per user_id, so a left join keeps the row
    # count and order of ``users``.
    users = users.merge(session_agg, left_on='id', right_on='user_id', how='left')
    if 'user_id' in users.columns:
        users = users.drop('user_id', axis=1)

    # Users without any session get neutral values.
    for col in ('num_actions', 'total_secs', 'mean_secs', 'max_secs', 'min_secs'):
        if col in users.columns:
            users[col] = users[col].fillna(0)
    for col in ('most_common_action_type', 'most_common_action_detail',
                'most_common_device_type'):
        if col in users.columns:
            users[col] = users[col].fillna('unknown')

    return users


def prepare_datasets():
    """Build the train and test feature matrices and save the metadata.

    Loads the raw CSVs, preprocesses the user tables, merges per-user
    session aggregates, label-encodes categorical features and the target,
    and writes ``label_encoders.pkl``, ``target_encoder.pkl``,
    ``feature_names.pkl`` and ``categorical_features.pkl`` to the current
    working directory.

    Returns:
        Tuple ``(X_train, target_encoded, X_test, test_ids, target_encoder)``.
        ``X_train`` and ``X_test`` are numpy arrays sharing one column
        order, ``target_encoded`` holds the integer class labels,
        ``test_ids`` is the ``id`` column of the test users and
        ``target_encoder`` is the fitted target ``LabelEncoder``.

    Raises:
        ValueError: If the test data lacks a feature column that the
            training data has.
    """
    # countries and age_gender are loaded by load_data but not used here.
    train_users, test_users, sessions, _countries, _age_gender = load_data()

    # Capture IDs and target before these columns are dropped.
    test_ids = test_users['id']
    target = train_users['country_destination']

    train_users = preprocess_users(train_users, is_train=True)
    test_users = preprocess_users(test_users, is_train=False)

    session_agg = aggregate_sessions(sessions)
    train_users = _attach_session_features(train_users, session_agg)
    test_users = _attach_session_features(test_users, session_agg)

    train_users = train_users.drop(['id', 'country_destination'], axis=1)
    test_users = test_users.drop(['id'], axis=1)

    train_users, test_users, label_encoders = encode_features(train_users, test_users)

    # The matrices are handed over as bare numpy arrays, so the test columns
    # must follow the train column order explicitly rather than by luck.
    feature_names = list(train_users.columns)
    missing = [col for col in feature_names if col not in test_users.columns]
    if missing:
        raise ValueError(f"Test data is missing feature columns: {missing}")
    test_users = test_users[feature_names]

    print("Checking for NaN values...")
    print(f"Train NaN count: {train_users.isna().sum().sum()}")
    print(f"Test NaN count: {test_users.isna().sum().sum()}")

    if train_users.isna().any().any():
        print("Warning: Found NaN values in train data. Filling with 0...")
        train_users = train_users.fillna(0)

    if test_users.isna().any().any():
        print("Warning: Found NaN values in test data. Filling with 0...")
        test_users = test_users.fillna(0)

    target_encoder = LabelEncoder()
    target_encoded = target_encoder.fit_transform(target)

    with open('label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)

    with open('target_encoder.pkl', 'wb') as f:
        pickle.dump(target_encoder, f)

    # The categorical features are exactly the columns that were
    # label-encoded; deriving them from the encoders avoids a second,
    # hand-maintained copy of the list.
    categorical_cols = [col for col in label_encoders if col in train_users.columns]

    # No scaling: the downstream model is tree-based.
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)

    with open('categorical_features.pkl', 'wb') as f:
        pickle.dump(categorical_cols, f)

    X_train = train_users.values
    X_test = test_users.values

    print(f"Train shape: {X_train.shape}")
    print(f"Test shape: {X_test.shape}")
    print(f"Number of classes: {len(target_encoder.classes_)}")
    print(f"Classes: {target_encoder.classes_}")

    return X_train, target_encoded, X_test, test_ids, target_encoder
|
||||
|
||||
if __name__ == '__main__':
    X_train, y_train, X_test, test_ids, target_encoder = prepare_datasets()

    # Write each prepared array to its own .npy file in the working directory.
    outputs = {
        'X_train.npy': X_train,
        'y_train.npy': y_train,
        'X_test.npy': X_test,
        'test_ids.npy': test_ids,
    }
    for file_name, array in outputs.items():
        np.save(file_name, array)

    print("\nData preprocessing completed successfully!")
|
||||
90
LightGBM/predict_lightgbm.py
Normal file
90
LightGBM/predict_lightgbm.py
Normal file
@ -0,0 +1,90 @@
|
||||
import numpy as np
|
||||
import lightgbm as lgb
|
||||
import pickle
|
||||
|
||||
def predict_lightgbm():
    """Predict test-set destinations and write ``submission_lightgbm.csv``.

    Reads ``X_test.npy``, ``test_ids.npy`` and ``target_encoder.pkl`` from
    the working directory. The models are the fold models listed in
    ``lightgbm_models_list.pkl`` or, when that list file does not exist, the
    single model ``lightgbm_model.txt``. Class probabilities are averaged
    over the loaded models and the highest-ranked destinations (up to five)
    are written per user, best first.

    Returns:
        Array of averaged class probabilities, one row per test sample.

    Raises:
        FileNotFoundError: If no model file could be loaded.
    """
    import os
    import pandas as pd

    print("="*70)
    print("LIGHTGBM PREDICTION")
    print("="*70)

    print("Loading test data...")
    X_test = np.load('X_test.npy')
    test_ids = np.load('test_ids.npy', allow_pickle=True)

    print(f"Test samples: {len(X_test)}")

    # NOTE(security): the pickle files read below must come from a trusted
    # training run; unpickling untrusted data can execute arbitrary code.
    models = []
    if os.path.exists('lightgbm_models_list.pkl'):
        with open('lightgbm_models_list.pkl', 'rb') as f:
            model_files = pickle.load(f)
        print(f"Loading {len(model_files)} fold models...")
        for model_file in model_files:
            if os.path.exists(model_file):
                models.append(lgb.Booster(model_file=model_file))
            else:
                print(f"Warning: fold model '{model_file}' not found; skipping.")
    elif os.path.exists('lightgbm_model.txt'):
        print("Loading single model 'lightgbm_model.txt'...")
        models = [lgb.Booster(model_file='lightgbm_model.txt')]

    # Covers both "nothing on disk" and "list file present but every listed
    # model missing"; the latter used to fail later with a TypeError.
    if not models:
        raise FileNotFoundError('No LightGBM model files found. Run training first.')

    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    print("\nGenerating predictions by averaging fold models...")
    y_pred_proba = sum(m.predict(X_test) for m in models) / len(models)

    print("\nGenerating top-5 destinations per user...")
    n_top = min(5, y_pred_proba.shape[1])
    # Ascending argsort reversed gives class indices by descending probability.
    top5_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :n_top]

    # One row per (user, rank): ids are repeated n_top times each, matching
    # the row-major flattening of the index matrix. A single vectorized
    # inverse_transform replaces one encoder call per output row.
    submission_df = pd.DataFrame({
        'id': np.repeat(test_ids, n_top),
        'country': target_encoder.inverse_transform(top5_indices.ravel()),
    })
    submission_df.to_csv('submission_lightgbm.csv', index=False)

    print("\nTop-1 Prediction Distribution:")
    top1_preds = target_encoder.inverse_transform(top5_indices[:, 0])
    unique, counts = np.unique(top1_preds, return_counts=True)
    for country, count in sorted(zip(unique, counts), key=lambda x: -x[1]):
        pct = count / len(top1_preds) * 100
        print(f"{country}: {count} ({pct:.2f}%)")

    print("\n" + "="*70)
    print("Predictions saved as 'submission_lightgbm.csv'")
    print("="*70)
    print(f"\nTotal users: {len(test_ids)}")
    print(f"Total rows ({n_top} per user): {len(submission_df)}")
    print(f"Unique destinations in top-1: {len(unique)}")

    return y_pred_proba
|
||||
|
||||
if __name__ == '__main__':
    # Run the prediction pipeline, then print a closing banner.
    predictions = predict_lightgbm()

    rule = "="*70
    print("\n" + rule)
    print("PREDICTION COMPLETE!")
    print(rule)
|
||||
220
LightGBM/train_lightgbm.py
Normal file
220
LightGBM/train_lightgbm.py
Normal file
@ -0,0 +1,220 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import lightgbm as lgb
|
||||
from sklearn.model_selection import train_test_split, StratifiedKFold
|
||||
from sklearn.metrics import accuracy_score, classification_report, log_loss
|
||||
import pickle
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def train_lightgbm():
    """Train LightGBM multiclass models with 5-fold stratified CV.

    Reads ``X_train.npy``, ``y_train.npy``, ``feature_names.pkl``,
    ``categorical_features.pkl`` and ``target_encoder.pkl`` from the working
    directory. For every fold a model is trained with early stopping and
    saved as ``lightgbm_model_fold<k>.txt``. Out-of-fold predictions are used
    to report accuracy, log loss and a classification report. Also writes
    ``feature_importance.png``, ``feature_importances_avg.csv`` and
    ``lightgbm_models_list.pkl`` (the list of saved fold model files).

    Returns:
        List of the trained fold boosters, in fold order.
    """
    print("="*70)
    print("LIGHTGBM CLASSIFIER TRAINING")
    print("="*70)

    print("\nLoading preprocessed data and metadata...")
    X_train = np.load('X_train.npy')
    y_train = np.load('y_train.npy')

    # NOTE(security): these pickles must come from a trusted preprocessing
    # run; unpickling untrusted data can execute arbitrary code.
    with open('feature_names.pkl', 'rb') as f:
        feature_names = pickle.load(f)

    with open('categorical_features.pkl', 'rb') as f:
        categorical_features = pickle.load(f)

    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    classes = target_encoder.classes_
    n_classes = len(classes)

    print(f"Training samples: {len(y_train)}")
    print(f"Number of classes: {n_classes}")
    print(f"Classes: {classes}")

    print("\nClass distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    for idx, count in zip(unique, counts):
        pct = count / len(y_train) * 100
        print(f"{classes[idx]}: {count} ({pct:.2f}%)")

    # A DataFrame keeps feature names attached, so categorical features can
    # be passed to LightGBM by name.
    X_df = pd.DataFrame(X_train, columns=feature_names)

    print(f"\nUsing categorical features: {categorical_features}")

    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((X_df.shape[0], n_classes))

    params = {
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 128,
        'learning_rate': 0.05,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'max_depth': -1,
        'min_data_in_leaf': 20,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'seed': 42,
        'boost_from_average': False
    }

    print("\nLightGBM Parameters:")
    for key, value in params.items():
        print(f"  {key}: {value}")

    print("\nTraining LightGBM with Stratified K-Fold CV...")
    models = []
    model_files = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_df, y_train), start=1):
        print('\n' + '='*50)
        print(f"Fold {fold}/{n_splits}")
        print('='*50)

        X_tr = X_df.iloc[train_idx]
        X_val = X_df.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_val = y_train[val_idx]

        train_data = lgb.Dataset(X_tr, label=y_tr, feature_name=feature_names,
                                 categorical_feature=categorical_features)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data,
                               feature_name=feature_names,
                               categorical_feature=categorical_features)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=3000,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100)
            ]
        )
        models.append(model)

        # Out-of-fold predictions: each sample is scored only by the model
        # that did not train on it.
        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)

        # Record the file name where it is written so the saved list cannot
        # drift from the actual files.
        model_file = f'lightgbm_model_fold{fold}.txt'
        model.save_model(model_file)
        model_files.append(model_file)
        print(f"Saved model for fold {fold} as '{model_file}'")

    print("-" * 70)

    print("\n" + "="*70)
    print("CROSS-VALIDATION OOF RESULTS")
    print("="*70)

    oof_logloss = log_loss(y_train, oof_preds)
    oof_preds_argmax = np.argmax(oof_preds, axis=1)
    oof_accuracy = accuracy_score(y_train, oof_preds_argmax)

    print(f"\nOOF Accuracy: {oof_accuracy:.4f}")
    print(f"OOF Log Loss: {oof_logloss:.4f}")

    # Baseline: always predict the most frequent class.
    most_common = np.bincount(y_train).argmax()
    baseline = np.sum(y_train == most_common) / len(y_train)
    print(f"Baseline (always {classes[most_common]}): {baseline:.4f}")
    print(f"Improvement over baseline: {(oof_accuracy - baseline):.4f}")

    print("\nClassification Report (OOF predictions):")
    print(classification_report(y_train, oof_preds_argmax,
                                target_names=classes, zero_division=0))

    print("\nOOF Prediction Distribution:")
    print(f"{'Class':<10} {'Actual':<10} {'Predicted':<10} {'Actual %':<12} {'Pred %'}")
    print("-" * 60)
    for idx in range(n_classes):
        class_name = classes[idx]
        actual_count = np.sum(y_train == idx)
        pred_count = np.sum(oof_preds_argmax == idx)
        actual_pct = actual_count / len(y_train) * 100
        pred_pct = pred_count / len(oof_preds_argmax) * 100
        print(f"{class_name:<10} {actual_count:<10} {pred_count:<10} {actual_pct:>6.2f}%      {pred_pct:>6.2f}%")

    print("\n" + "="*70)
    print("TOP 20 FEATURE IMPORTANCES")
    print("="*70)

    # Average gain-based importance over the fold models.
    total_importance = np.zeros(len(feature_names))
    for m in models:
        total_importance += np.array(m.feature_importance(importance_type='gain'))
    avg_importance = total_importance / max(1, len(models))

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': avg_importance
    }).sort_values('importance', ascending=False)

    print(importance_df.head(20).to_string(index=False))

    plt.figure(figsize=(10, 8))
    top_features = importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Importance (Gain)')
    plt.title('Top 20 Feature Importances')
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300)
    plt.close()  # release the figure once it is on disk
    print("\nFeature importance plot saved as 'feature_importance.png'")

    importance_df.head(100).to_csv('feature_importances_avg.csv', index=False)
    print("Saved averaged feature importances as 'feature_importances_avg.csv'")

    with open('lightgbm_models_list.pkl', 'wb') as f:
        pickle.dump(model_files, f)

    print("\n" + "="*70)
    print(f"Saved {len(models)} fold models and model list 'lightgbm_models_list.pkl'")
    print("="*70)

    return models
|
||||
|
||||
if __name__ == '__main__':
    # Train the fold models, then print a closing banner and the next step.
    model = train_lightgbm()

    rule = "="*70
    print("\n" + rule)
    print("TRAINING COMPLETE!")
    print(rule)
    print("\nNext step: Use predict_lightgbm.py to make predictions on test data")
|
||||
421
data/age_gender_bkts.csv
Normal file
421
data/age_gender_bkts.csv
Normal file
@ -0,0 +1,421 @@
|
||||
age_bucket,country_destination,gender,population_in_thousands,year
|
||||
100+,AU,male,1.0,2015.0
|
||||
95-99,AU,male,9.0,2015.0
|
||||
90-94,AU,male,47.0,2015.0
|
||||
85-89,AU,male,118.0,2015.0
|
||||
80-84,AU,male,199.0,2015.0
|
||||
75-79,AU,male,298.0,2015.0
|
||||
70-74,AU,male,415.0,2015.0
|
||||
65-69,AU,male,574.0,2015.0
|
||||
60-64,AU,male,636.0,2015.0
|
||||
55-59,AU,male,714.0,2015.0
|
||||
50-54,AU,male,778.0,2015.0
|
||||
45-49,AU,male,778.0,2015.0
|
||||
40-44,AU,male,820.0,2015.0
|
||||
35-39,AU,male,797.0,2015.0
|
||||
30-34,AU,male,881.0,2015.0
|
||||
25-29,AU,male,895.0,2015.0
|
||||
20-24,AU,male,820.0,2015.0
|
||||
15-19,AU,male,768.0,2015.0
|
||||
10-14,AU,male,743.0,2015.0
|
||||
5-9,AU,male,784.0,2015.0
|
||||
0-4,AU,male,824.0,2015.0
|
||||
100+,AU,female,4.0,2015.0
|
||||
95-99,AU,female,25.0,2015.0
|
||||
90-94,AU,female,94.0,2015.0
|
||||
85-89,AU,female,179.0,2015.0
|
||||
80-84,AU,female,252.0,2015.0
|
||||
75-79,AU,female,338.0,2015.0
|
||||
70-74,AU,female,438.0,2015.0
|
||||
65-69,AU,female,592.0,2015.0
|
||||
60-64,AU,female,660.0,2015.0
|
||||
55-59,AU,female,739.0,2015.0
|
||||
50-54,AU,female,798.0,2015.0
|
||||
45-49,AU,female,793.0,2015.0
|
||||
40-44,AU,female,838.0,2015.0
|
||||
35-39,AU,female,801.0,2015.0
|
||||
30-34,AU,female,865.0,2015.0
|
||||
25-29,AU,female,851.0,2015.0
|
||||
20-24,AU,female,787.0,2015.0
|
||||
15-19,AU,female,737.0,2015.0
|
||||
10-14,AU,female,707.0,2015.0
|
||||
5-9,AU,female,745.0,2015.0
|
||||
0-4,AU,female,781.0,2015.0
|
||||
75-79,CA,female,530.0,2015.0
|
||||
75-79,CA,male,446.0,2015.0
|
||||
35-39,CA,female,1192.0,2015.0
|
||||
25-29,CA,female,1220.0,2015.0
|
||||
95-99,CA,male,13.0,2015.0
|
||||
40-44,CA,male,1179.0,2015.0
|
||||
30-34,CA,female,1240.0,2015.0
|
||||
60-64,CA,female,1142.0,2015.0
|
||||
55-59,CA,male,1287.0,2015.0
|
||||
45-49,CA,male,1232.0,2015.0
|
||||
85-89,CA,female,300.0,2015.0
|
||||
50-54,CA,male,1400.0,2015.0
|
||||
100+,CA,male,1.0,2015.0
|
||||
70-74,CA,male,650.0,2015.0
|
||||
95-99,CA,female,42.0,2015.0
|
||||
100+,CA,female,7.0,2015.0
|
||||
90-94,CA,male,68.0,2015.0
|
||||
50-54,CA,female,1391.0,2015.0
|
||||
0-4,CA,male,1045.0,2015.0
|
||||
5-9,CA,male,1011.0,2015.0
|
||||
70-74,CA,female,715.0,2015.0
|
||||
10-14,CA,male,983.0,2015.0
|
||||
55-59,CA,female,1305.0,2015.0
|
||||
45-49,CA,female,1217.0,2015.0
|
||||
20-24,CA,female,1159.0,2015.0
|
||||
15-19,CA,male,1054.0,2015.0
|
||||
20-24,CA,male,1203.0,2015.0
|
||||
65-69,CA,male,914.0,2015.0
|
||||
40-44,CA,female,1169.0,2015.0
|
||||
90-94,CA,female,153.0,2015.0
|
||||
65-69,CA,female,973.0,2015.0
|
||||
60-64,CA,male,1094.0,2015.0
|
||||
85-89,CA,male,183.0,2015.0
|
||||
25-29,CA,male,1273.0,2015.0
|
||||
5-9,CA,female,960.0,2015.0
|
||||
80-84,CA,female,422.0,2015.0
|
||||
30-34,CA,male,1262.0,2015.0
|
||||
10-14,CA,female,929.0,2015.0
|
||||
0-4,CA,female,991.0,2015.0
|
||||
35-39,CA,male,1189.0,2015.0
|
||||
15-19,CA,female,1009.0,2015.0
|
||||
80-84,CA,male,318.0,2015.0
|
||||
70-74,DE,male,2099.0,2015.0
|
||||
80-84,DE,female,1486.0,2015.0
|
||||
60-64,DE,female,2799.0,2015.0
|
||||
100+,DE,male,3.0,2015.0
|
||||
5-9,DE,female,1690.0,2015.0
|
||||
75-79,DE,female,2421.0,2015.0
|
||||
70-74,DE,female,2362.0,2015.0
|
||||
65-69,DE,female,2134.0,2015.0
|
||||
85-89,DE,male,517.0,2015.0
|
||||
25-29,DE,female,2495.0,2015.0
|
||||
60-64,DE,male,2575.0,2015.0
|
||||
0-4,DE,female,1713.0,2015.0
|
||||
55-59,DE,male,2983.0,2015.0
|
||||
50-54,DE,male,3614.0,2015.0
|
||||
30-34,DE,female,2571.0,2015.0
|
||||
45-49,DE,male,3525.0,2015.0
|
||||
40-44,DE,male,2633.0,2015.0
|
||||
35-39,DE,female,2402.0,2015.0
|
||||
65-69,DE,male,2003.0,2015.0
|
||||
35-39,DE,male,2448.0,2015.0
|
||||
20-24,DE,female,2161.0,2015.0
|
||||
30-34,DE,male,2627.0,2015.0
|
||||
95-99,DE,male,17.0,2015.0
|
||||
40-44,DE,female,2559.0,2015.0
|
||||
75-79,DE,male,1932.0,2015.0
|
||||
25-29,DE,male,2593.0,2015.0
|
||||
10-14,DE,female,1800.0,2015.0
|
||||
20-24,DE,male,2266.0,2015.0
|
||||
15-19,DE,male,2076.0,2015.0
|
||||
45-49,DE,female,3357.0,2015.0
|
||||
10-14,DE,male,1892.0,2015.0
|
||||
5-9,DE,male,1781.0,2015.0
|
||||
50-54,DE,female,3513.0,2015.0
|
||||
0-4,DE,male,1811.0,2015.0
|
||||
15-19,DE,female,1974.0,2015.0
|
||||
100+,DE,female,14.0,2015.0
|
||||
90-94,DE,male,154.0,2015.0
|
||||
80-84,DE,male,1016.0,2015.0
|
||||
95-99,DE,female,71.0,2015.0
|
||||
90-94,DE,female,491.0,2015.0
|
||||
55-59,DE,female,2996.0,2015.0
|
||||
85-89,DE,female,988.0,2015.0
|
||||
95-99,ES,male,22.0,2015.0
|
||||
15-19,ES,female,1027.0,2015.0
|
||||
85-89,ES,male,306.0,2015.0
|
||||
75-79,ES,male,688.0,2015.0
|
||||
40-44,ES,female,1924.0,2015.0
|
||||
100+,ES,male,3.0,2015.0
|
||||
90-94,ES,male,112.0,2015.0
|
||||
45-49,ES,male,1909.0,2015.0
|
||||
15-19,ES,male,1087.0,2015.0
|
||||
0-4,ES,female,1198.0,2015.0
|
||||
70-74,ES,female,1040.0,2015.0
|
||||
5-9,ES,male,1307.0,2015.0
|
||||
10-14,ES,female,1124.0,2015.0
|
||||
30-34,ES,male,1748.0,2015.0
|
||||
65-69,ES,female,1251.0,2015.0
|
||||
95-99,ES,female,64.0,2015.0
|
||||
80-84,ES,female,843.0,2015.0
|
||||
55-59,ES,male,1479.0,2015.0
|
||||
10-14,ES,male,1189.0,2015.0
|
||||
50-54,ES,female,1733.0,2015.0
|
||||
20-24,ES,female,1106.0,2015.0
|
||||
40-44,ES,male,2052.0,2015.0
|
||||
70-74,ES,male,880.0,2015.0
|
||||
30-34,ES,female,1646.0,2015.0
|
||||
35-39,ES,male,2117.0,2015.0
|
||||
75-79,ES,female,906.0,2015.0
|
||||
25-29,ES,female,1280.0,2015.0
|
||||
5-9,ES,female,1235.0,2015.0
|
||||
60-64,ES,male,1235.0,2015.0
|
||||
55-59,ES,female,1531.0,2015.0
|
||||
25-29,ES,male,1347.0,2015.0
|
||||
85-89,ES,female,563.0,2015.0
|
||||
65-69,ES,male,1119.0,2015.0
|
||||
100+,ES,female,9.0,2015.0
|
||||
90-94,ES,female,256.0,2015.0
|
||||
35-39,ES,female,1966.0,2015.0
|
||||
80-84,ES,male,559.0,2015.0
|
||||
0-4,ES,male,1272.0,2015.0
|
||||
60-64,ES,female,1316.0,2015.0
|
||||
50-54,ES,male,1727.0,2015.0
|
||||
20-24,ES,male,1175.0,2015.0
|
||||
45-49,ES,female,1852.0,2015.0
|
||||
40-44,FR,male,2212.0,2015.0
|
||||
60-64,FR,male,1891.0,2015.0
|
||||
90-94,FR,male,164.0,2015.0
|
||||
35-39,FR,male,1842.0,2015.0
|
||||
100+,FR,male,3.0,2015.0
|
||||
75-79,FR,female,1223.0,2015.0
|
||||
25-29,FR,female,2041.0,2015.0
|
||||
70-74,FR,male,1103.0,2015.0
|
||||
30-34,FR,male,2035.0,2015.0
|
||||
85-89,FR,male,405.0,2015.0
|
||||
55-59,FR,female,2125.0,2015.0
|
||||
90-94,FR,female,465.0,2015.0
|
||||
25-29,FR,male,2081.0,2015.0
|
||||
95-99,FR,male,20.0,2015.0
|
||||
75-79,FR,male,918.0,2015.0
|
||||
20-24,FR,male,2040.0,2015.0
|
||||
10-14,FR,female,1894.0,2015.0
|
||||
65-69,FR,female,1985.0,2015.0
|
||||
45-49,FR,female,2220.0,2015.0
|
||||
15-19,FR,male,2016.0,2015.0
|
||||
70-74,FR,female,1317.0,2015.0
|
||||
80-84,FR,male,712.0,2015.0
|
||||
10-14,FR,male,1985.0,2015.0
|
||||
80-84,FR,female,1137.0,2015.0
|
||||
5-9,FR,male,1992.0,2015.0
|
||||
50-54,FR,female,2234.0,2015.0
|
||||
5-9,FR,female,1914.0,2015.0
|
||||
60-64,FR,female,2065.0,2015.0
|
||||
0-4,FR,male,2035.0,2015.0
|
||||
40-44,FR,female,2231.0,2015.0
|
||||
15-19,FR,female,1916.0,2015.0
|
||||
85-89,FR,female,838.0,2015.0
|
||||
100+,FR,female,19.0,2015.0
|
||||
0-4,FR,female,1938.0,2015.0
|
||||
20-24,FR,female,1947.0,2015.0
|
||||
55-59,FR,male,1939.0,2015.0
|
||||
30-34,FR,female,2046.0,2015.0
|
||||
50-54,FR,male,2123.0,2015.0
|
||||
95-99,FR,female,82.0,2015.0
|
||||
45-49,FR,male,2194.0,2015.0
|
||||
65-69,FR,male,1780.0,2015.0
|
||||
35-39,FR,female,1856.0,2015.0
|
||||
10-14,GB,female,1690.0,2015.0
|
||||
35-39,GB,male,1979.0,2015.0
|
||||
65-69,GB,female,1858.0,2015.0
|
||||
60-64,GB,male,1693.0,2015.0
|
||||
10-14,GB,male,1771.0,2015.0
|
||||
95-99,GB,female,81.0,2015.0
|
||||
25-29,GB,male,2213.0,2015.0
|
||||
5-9,GB,female,1913.0,2015.0
|
||||
40-44,GB,male,2101.0,2015.0
|
||||
100+,GB,female,13.0,2015.0
|
||||
70-74,GB,female,1422.0,2015.0
|
||||
60-64,GB,female,1775.0,2015.0
|
||||
85-89,GB,female,602.0,2015.0
|
||||
30-34,GB,male,2190.0,2015.0
|
||||
65-69,GB,male,1735.0,2015.0
|
||||
55-59,GB,male,1925.0,2015.0
|
||||
80-84,GB,female,896.0,2015.0
|
||||
100+,GB,male,3.0,2015.0
|
||||
45-49,GB,male,2301.0,2015.0
|
||||
35-39,GB,female,1964.0,2015.0
|
||||
55-59,GB,female,1991.0,2015.0
|
||||
85-89,GB,male,365.0,2015.0
|
||||
40-44,GB,female,2147.0,2015.0
|
||||
95-99,GB,male,29.0,2015.0
|
||||
50-54,GB,female,2306.0,2015.0
|
||||
0-4,GB,female,1888.0,2015.0
|
||||
25-29,GB,female,2122.0,2015.0
|
||||
20-24,GB,female,1957.0,2015.0
|
||||
15-19,GB,male,1864.0,2015.0
|
||||
50-54,GB,male,2220.0,2015.0
|
||||
90-94,GB,female,310.0,2015.0
|
||||
5-9,GB,male,2007.0,2015.0
|
||||
20-24,GB,male,2061.0,2015.0
|
||||
75-79,GB,male,978.0,2015.0
|
||||
15-19,GB,female,1783.0,2015.0
|
||||
80-84,GB,male,661.0,2015.0
|
||||
70-74,GB,male,1273.0,2015.0
|
||||
30-34,GB,female,2112.0,2015.0
|
||||
45-49,GB,female,2349.0,2015.0
|
||||
75-79,GB,female,1166.0,2015.0
|
||||
90-94,GB,male,145.0,2015.0
|
||||
0-4,GB,male,1981.0,2015.0
|
||||
20-24,IT,female,1514.0,2015.0
|
||||
65-69,IT,male,1716.0,2015.0
|
||||
100+,IT,male,3.0,2015.0
|
||||
95-99,IT,male,22.0,2015.0
|
||||
90-94,IT,male,164.0,2015.0
|
||||
85-89,IT,male,440.0,2015.0
|
||||
60-64,IT,male,1749.0,2015.0
|
||||
55-59,IT,male,1976.0,2015.0
|
||||
15-19,IT,female,1411.0,2015.0
|
||||
80-84,IT,male,808.0,2015.0
|
||||
50-54,IT,male,2322.0,2015.0
|
||||
45-49,IT,male,2476.0,2015.0
|
||||
40-44,IT,male,2428.0,2015.0
|
||||
35-39,IT,male,2117.0,2015.0
|
||||
30-34,IT,male,1814.0,2015.0
|
||||
25-29,IT,male,1673.0,2015.0
|
||||
20-24,IT,male,1601.0,2015.0
|
||||
15-19,IT,male,1493.0,2015.0
|
||||
10-14,IT,male,1468.0,2015.0
|
||||
75-79,IT,male,1191.0,2015.0
|
||||
5-9,IT,male,1473.0,2015.0
|
||||
10-14,IT,female,1388.0,2015.0
|
||||
0-4,IT,male,1468.0,2015.0
|
||||
100+,IT,female,15.0,2015.0
|
||||
0-4,IT,female,1383.0,2015.0
|
||||
95-99,IT,female,79.0,2015.0
|
||||
90-94,IT,female,436.0,2015.0
|
||||
85-89,IT,female,855.0,2015.0
|
||||
80-84,IT,female,1231.0,2015.0
|
||||
75-79,IT,female,1534.0,2015.0
|
||||
70-74,IT,female,1567.0,2015.0
|
||||
65-69,IT,female,1893.0,2015.0
|
||||
70-74,IT,male,1338.0,2015.0
|
||||
60-64,IT,female,1880.0,2015.0
|
||||
5-9,IT,female,1395.0,2015.0
|
||||
55-59,IT,female,2069.0,2015.0
|
||||
50-54,IT,female,2373.0,2015.0
|
||||
45-49,IT,female,2480.0,2015.0
|
||||
40-44,IT,female,2411.0,2015.0
|
||||
35-39,IT,female,2090.0,2015.0
|
||||
30-34,IT,female,1791.0,2015.0
|
||||
25-29,IT,female,1610.0,2015.0
|
||||
60-64,NL,female,524.0,2015.0
|
||||
80-84,NL,female,231.0,2015.0
|
||||
5-9,NL,female,450.0,2015.0
|
||||
90-94,NL,male,25.0,2015.0
|
||||
85-89,NL,female,151.0,2015.0
|
||||
70-74,NL,male,351.0,2015.0
|
||||
55-59,NL,female,580.0,2015.0
|
||||
90-94,NL,female,69.0,2015.0
|
||||
95-99,NL,female,15.0,2015.0
|
||||
100+,NL,female,2.0,2015.0
|
||||
50-54,NL,female,636.0,2015.0
|
||||
0-4,NL,male,462.0,2015.0
|
||||
5-9,NL,male,473.0,2015.0
|
||||
10-14,NL,male,517.0,2015.0
|
||||
45-49,NL,female,637.0,2015.0
|
||||
15-19,NL,male,510.0,2015.0
|
||||
20-24,NL,female,504.0,2015.0
|
||||
20-24,NL,male,527.0,2015.0
|
||||
65-69,NL,male,518.0,2015.0
|
||||
25-29,NL,male,518.0,2015.0
|
||||
40-44,NL,female,583.0,2015.0
|
||||
10-14,NL,female,493.0,2015.0
|
||||
30-34,NL,male,503.0,2015.0
|
||||
95-99,NL,male,4.0,2015.0
|
||||
75-79,NL,male,246.0,2015.0
|
||||
35-39,NL,male,490.0,2015.0
|
||||
80-84,NL,male,158.0,2015.0
|
||||
35-39,NL,female,490.0,2015.0
|
||||
40-44,NL,male,582.0,2015.0
|
||||
15-19,NL,female,484.0,2015.0
|
||||
0-4,NL,female,438.0,2015.0
|
||||
45-49,NL,male,650.0,2015.0
|
||||
30-34,NL,female,497.0,2015.0
|
||||
50-54,NL,male,646.0,2015.0
|
||||
100+,NL,male,0.0,2015.0
|
||||
55-59,NL,male,581.0,2015.0
|
||||
25-29,NL,female,505.0,2015.0
|
||||
60-64,NL,male,523.0,2015.0
|
||||
85-89,NL,male,78.0,2015.0
|
||||
65-69,NL,female,527.0,2015.0
|
||||
70-74,NL,female,375.0,2015.0
|
||||
75-79,NL,female,295.0,2015.0
|
||||
80-84,PT,female,194.0,2015.0
|
||||
40-44,PT,female,418.0,2015.0
|
||||
65-69,PT,female,313.0,2015.0
|
||||
60-64,PT,female,341.0,2015.0
|
||||
90-94,PT,male,19.0,2015.0
|
||||
85-89,PT,female,115.0,2015.0
|
||||
80-84,PT,male,122.0,2015.0
|
||||
35-39,PT,male,419.0,2015.0
|
||||
15-19,PT,female,266.0,2015.0
|
||||
60-64,PT,male,308.0,2015.0
|
||||
90-94,PT,female,45.0,2015.0
|
||||
25-29,PT,female,299.0,2015.0
|
||||
95-99,PT,female,9.0,2015.0
|
||||
55-59,PT,female,366.0,2015.0
|
||||
65-69,PT,male,267.0,2015.0
|
||||
40-44,PT,male,416.0,2015.0
|
||||
100+,PT,female,1.0,2015.0
|
||||
75-79,PT,female,244.0,2015.0
|
||||
0-4,PT,male,239.0,2015.0
|
||||
85-89,PT,male,59.0,2015.0
|
||||
35-39,PT,female,414.0,2015.0
|
||||
5-9,PT,male,264.0,2015.0
|
||||
100+,PT,male,0.0,2015.0
|
||||
50-54,PT,female,397.0,2015.0
|
||||
95-99,PT,male,3.0,2015.0
|
||||
10-14,PT,female,269.0,2015.0
|
||||
10-14,PT,male,285.0,2015.0
|
||||
0-4,PT,female,225.0,2015.0
|
||||
75-79,PT,male,177.0,2015.0
|
||||
5-9,PT,female,250.0,2015.0
|
||||
15-19,PT,male,277.0,2015.0
|
||||
45-49,PT,male,386.0,2015.0
|
||||
20-24,PT,male,285.0,2015.0
|
||||
45-49,PT,female,395.0,2015.0
|
||||
20-24,PT,female,275.0,2015.0
|
||||
70-74,PT,male,214.0,2015.0
|
||||
25-29,PT,male,309.0,2015.0
|
||||
50-54,PT,male,378.0,2015.0
|
||||
70-74,PT,female,270.0,2015.0
|
||||
55-59,PT,male,343.0,2015.0
|
||||
30-34,PT,female,362.0,2015.0
|
||||
30-34,PT,male,371.0,2015.0
|
||||
90-94,US,female,1193.0,2015.0
|
||||
75-79,US,male,3641.0,2015.0
|
||||
70-74,US,male,5278.0,2015.0
|
||||
65-69,US,male,7561.0,2015.0
|
||||
60-64,US,male,9217.0,2015.0
|
||||
55-59,US,male,10689.0,2015.0
|
||||
50-54,US,male,11013.0,2015.0
|
||||
45-49,US,male,10454.0,2015.0
|
||||
40-44,US,male,10159.0,2015.0
|
||||
10-14,US,female,10346.0,2015.0
|
||||
35-39,US,male,10329.0,2015.0
|
||||
30-34,US,male,10984.0,2015.0
|
||||
25-29,US,male,11385.0,2015.0
|
||||
20-24,US,male,11601.0,2015.0
|
||||
15-19,US,male,11025.0,2015.0
|
||||
10-14,US,male,10771.0,2015.0
|
||||
5-9,US,male,10632.0,2015.0
|
||||
0-4,US,male,10788.0,2015.0
|
||||
100+,US,female,61.0,2015.0
|
||||
95-99,US,female,361.0,2015.0
|
||||
5-9,US,female,10201.0,2015.0
|
||||
85-89,US,female,2459.0,2015.0
|
||||
80-84,US,female,3394.0,2015.0
|
||||
75-79,US,female,4532.0,2015.0
|
||||
70-74,US,female,6179.0,2015.0
|
||||
65-69,US,female,8483.0,2015.0
|
||||
60-64,US,female,10004.0,2015.0
|
||||
55-59,US,female,11264.0,2015.0
|
||||
50-54,US,female,11413.0,2015.0
|
||||
45-49,US,female,10659.0,2015.0
|
||||
0-4,US,female,10306.0,2015.0
|
||||
40-44,US,female,10308.0,2015.0
|
||||
35-39,US,female,10352.0,2015.0
|
||||
30-34,US,female,10863.0,2015.0
|
||||
25-29,US,female,11011.0,2015.0
|
||||
20-24,US,female,11094.0,2015.0
|
||||
100+,US,male,13.0,2015.0
|
||||
95-99,US,male,115.0,2015.0
|
||||
90-94,US,male,541.0,2015.0
|
||||
15-19,US,female,10570.0,2015.0
|
||||
85-89,US,male,1441.0,2015.0
|
||||
80-84,US,male,2442.0,2015.0
|
||||
|
11
data/countries.csv
Normal file
11
data/countries.csv
Normal file
@ -0,0 +1,11 @@
|
||||
country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language ,language_levenshtein_distance
|
||||
AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
|
||||
CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
|
||||
DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
|
||||
ES,39.896027,-2.4876945,7730.724,505370.0,spa,92.25
|
||||
FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
|
||||
GB,54.63322,-3.4322774,6883.659,243610.0,eng,0.0
|
||||
IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
|
||||
NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
|
||||
PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
|
||||
US,36.966427,-95.84403,0.0,9826675.0,eng,0.0
|
||||
|
35
data/data.info
Normal file
35
data/data.info
Normal file
@ -0,0 +1,35 @@
|
||||
age_gender_bkts.csv
|
||||
Columns: age_bucket,country_destination,gender,population_in_thousands,year
|
||||
|
||||
age_bucket: Five-year age ranges from 0-4 through 95-99 (literal range strings, e.g. 90-94), plus 100+
|
||||
country_destination: Two letter country code out of ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'US']
|
||||
gender: male or female or unknown
|
||||
population_in_thousands: float
|
||||
year: float
|
||||
|
||||
countries.csv
|
||||
Columns: country_destination, lat_destination, lng_destination, distance_km, destination_km2, destination_language, language_levenshtein_distance
|
||||
country_destination: Two letter country code
|
||||
lat_destination: float
|
||||
lng_destination: float
|
||||
distance_km: float
|
||||
destination_km2: float
|
||||
destination_language: 3 letter code ['eng', 'deu', 'spa', 'fra', etc.]
|
||||
language_levenshtein_distance: float
|
||||
|
||||
sample_submission_NDF.csv (This is what the output should look like)
|
||||
Columns: id, country
|
||||
id: user id, string
|
||||
country: the most probable country they will visit (country code)
|
||||
|
||||
sessions.csv
|
||||
Columns: user_id, action, action_type, action_detail, device_type, secs_elapsed
|
||||
user_id: string
|
||||
action: string ['lookup', 'search_results', 'index', etc.]
|
||||
action_type: nullable string ['click', 'data', etc.]
|
||||
action_detail: nullable string. Either a description of the action or possibly a function name (unconfirmed)
|
||||
device_type: string
|
||||
secs_elapsed: float
|
||||
|
||||
test_users.csv
|
||||
Columns: id, date_account_created, timestamp_first_active, date_first_booking, gender, age, signup_method, signup_flow, language, affiliate_channel, affiliate_provider, first_affiliate_tracked, signup_app, first_device_type, first_browser
|
||||
62097
data/sample_submission_NDF.csv
Normal file
62097
data/sample_submission_NDF.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
data/sessions.csv
Normal file
BIN
data/sessions.csv
Normal file
Binary file not shown.
|
Can't render this file because it is too large.
|
62097
data/test_users.csv
Normal file
62097
data/test_users.csv
Normal file
File diff suppressed because it is too large
Load Diff
213452
data/train_users_2.csv
Normal file
213452
data/train_users_2.csv
Normal file
File diff suppressed because it is too large
Load Diff
7
requirements.txt
Normal file
7
requirements.txt
Normal file
@ -0,0 +1,7 @@
|
||||
pandas
|
||||
scikit-learn
|
||||
numpy<2
|
||||
matplotlib
|
||||
tensorflow
|
||||
keras
|
||||
lightgbm
|
||||
Loading…
x
Reference in New Issue
Block a user