init

2025-12-04 16:48:17 -06:00 · 2025-12-04 16:48:17 -06:00 · e8e36adb24
commit e8e36adb24
11 changed files with 338673 additions and 0 deletions
--- a/LightGBM/data_preprocessing.py
+++ b/LightGBM/data_preprocessing.py
@ -0,0 +1,243 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import LabelEncoder
+import pickle
+
+def load_data():
+    """Read the five competition CSV files from ``../data``.
+
+    Returns:
+        A 5-tuple of DataFrames in the order
+        (train_users, test_users, sessions, countries, age_gender).
+    """
+    print("Loading data...")
+    # Order matters: callers unpack the result positionally.
+    csv_paths = (
+        '../data/train_users_2.csv',
+        '../data/test_users.csv',
+        '../data/sessions.csv',
+        '../data/countries.csv',
+        '../data/age_gender_bkts.csv',
+    )
+    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
+
+    return tuple(frames)
+
+def preprocess_users(df, is_train=True):
+    """Clean a users table and derive date features.
+
+    Args:
+        df: Users DataFrame with at least ``date_account_created``,
+            ``timestamp_first_active``, ``age`` and ``gender`` columns.
+            The input is not modified; a copy is processed.
+        is_train: Only used to label the progress message.
+
+    Returns:
+        A new DataFrame with date parts extracted (``dac_*``, ``tfa_*``),
+        the raw date columns removed, and missing values filled.
+    """
+    print(f"Preprocessing {'train' if is_train else 'test'} users...")
+
+    # Work on a copy so the caller's frame is left untouched
+    df = df.copy()
+
+    # Parse date columns; timestamp_first_active is a YYYYMMDDHHMMSS number
+    df['date_account_created'] = pd.to_datetime(df['date_account_created'])
+    df['timestamp_first_active'] = pd.to_datetime(
+        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')
+
+    # Extract date features
+    df['dac_year'] = df['date_account_created'].dt.year
+    df['dac_month'] = df['date_account_created'].dt.month
+    df['dac_day'] = df['date_account_created'].dt.day
+    df['dac_weekday'] = df['date_account_created'].dt.weekday
+
+    df['tfa_year'] = df['timestamp_first_active'].dt.year
+    df['tfa_month'] = df['timestamp_first_active'].dt.month
+    df['tfa_day'] = df['timestamp_first_active'].dt.day
+
+    # NOTE: date_first_booking leaks the target (a booking date exists only
+    # if the user made a booking), so it is dropped rather than featurized.
+    if 'date_first_booking' in df.columns:
+        df = df.drop('date_first_booking', axis=1)
+
+    # Drop original date columns
+    df = df.drop(['date_account_created', 'timestamp_first_active'], axis=1)
+
+    # Handle age: missing or implausible values (<18 or >100) become -1
+    df['age'] = df['age'].fillna(-1)
+    df.loc[(df['age'] < 18) | (df['age'] > 100), 'age'] = -1
+
+    # Handle gender
+    df['gender'] = df['gender'].fillna('-unknown-')
+
+    # Fill other categorical NaN with 'unknown'
+    categorical_cols = ['signup_method', 'language', 'affiliate_channel',
+                       'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
+                       'first_device_type', 'first_browser']
+    for col in categorical_cols:
+        if col in df.columns:
+            df[col] = df[col].fillna('unknown')
+
+    # signup_flow is cast to int downstream (encode_features), so a string
+    # placeholder would make that cast fail; use a numeric sentinel instead.
+    if 'signup_flow' in df.columns:
+        df['signup_flow'] = df['signup_flow'].fillna(-1)
+
+    return df
+
+def aggregate_sessions(sessions):
+    """Aggregate session data per user.
+
+    Args:
+        sessions: Session log DataFrame with ``user_id``, ``action``,
+            ``action_type``, ``action_detail``, ``device_type`` and
+            ``secs_elapsed`` columns.
+
+    Returns:
+        One row per ``user_id`` with the columns ``num_actions``,
+        ``most_common_action_type``, ``most_common_action_detail``,
+        ``most_common_device_type``, ``total_secs``, ``mean_secs``,
+        ``max_secs`` and ``min_secs``. An empty DataFrame is returned when
+        there are no sessions attributable to a user.
+    """
+    print("Aggregating session data...")
+
+    if sessions.empty:
+        return pd.DataFrame()
+
+    # Rows without a user cannot be joined to the users table; dropping them
+    # avoids creating an artificial placeholder user.
+    sessions = sessions.dropna(subset=['user_id']).copy()
+    if sessions.empty:
+        return pd.DataFrame()
+
+    # Only the categorical columns get a placeholder. Missing secs_elapsed is
+    # left as NaN so sum/mean/max/min skip it instead of being skewed by a
+    # sentinel value.
+    text_cols = ['action', 'action_type', 'action_detail', 'device_type']
+    sessions[text_cols] = sessions[text_cols].fillna('unknown')
+
+    def _most_common(values):
+        # value_counts() sorts by frequency, so the first index is the mode
+        return values.value_counts().index[0]
+
+    # Named aggregation ties each output column to its source explicitly
+    # rather than relying on the positional order of flattened columns.
+    session_agg = sessions.groupby('user_id').agg(
+        num_actions=('action', 'count'),
+        most_common_action_type=('action_type', _most_common),
+        most_common_action_detail=('action_detail', _most_common),
+        most_common_device_type=('device_type', _most_common),
+        total_secs=('secs_elapsed', 'sum'),
+        mean_secs=('secs_elapsed', 'mean'),
+        max_secs=('secs_elapsed', 'max'),
+        min_secs=('secs_elapsed', 'min'),
+    ).reset_index()
+
+    return session_agg
+
+def encode_features(train_df, test_df, label_encoders=None):
+    """Label-encode the categorical columns of the train and test frames.
+
+    Args:
+        train_df: Training features; modified in place.
+        test_df: Test features; modified in place.
+        label_encoders: Optional dict of column name -> fitted LabelEncoder.
+            When omitted, one encoder per column is fitted on the combined
+            train and test values.
+
+    Returns:
+        (train_df, test_df, label_encoders)
+    """
+    print("Encoding features...")
+
+    candidate_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
+                      'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
+                      'first_device_type', 'first_browser']
+
+    # Session-derived categoricals are only present after the sessions merge
+    if 'most_common_action_type' in train_df.columns:
+        candidate_cols += ['most_common_action_type', 'most_common_action_detail',
+                           'most_common_device_type']
+
+    present_cols = [name for name in candidate_cols if name in train_df.columns]
+
+    if label_encoders is None:
+        # Fit on train + test together so every category seen in either
+        # frame has a code
+        label_encoders = {
+            name: LabelEncoder().fit(
+                pd.concat([train_df[name], test_df[name]]).astype(str))
+            for name in present_cols
+        }
+
+    # Replace each categorical column with its integer codes
+    for name in present_cols:
+        encoder = label_encoders[name]
+        for frame in (train_df, test_df):
+            frame[name] = encoder.transform(frame[name].astype(str))
+
+    # signup_flow is treated as a plain integer feature
+    if 'signup_flow' in train_df.columns:
+        for frame in (train_df, test_df):
+            frame['signup_flow'] = frame['signup_flow'].astype(int)
+
+    return train_df, test_df, label_encoders
+
+def prepare_datasets():
+    """Main function to prepare train and test datasets.
+
+    Loads the raw CSVs, preprocesses the user tables, merges per-user session
+    aggregates, label-encodes categorical columns and the target, and writes
+    the fitted encoders plus feature metadata to pickle files in the current
+    working directory (label_encoders.pkl, target_encoder.pkl,
+    feature_names.pkl, categorical_features.pkl).
+
+    Returns:
+        (X_train, target_encoded, X_test, test_ids, target_encoder) where
+        X_train / X_test are the ``.values`` arrays of the feature frames,
+        target_encoded is the label-encoded ``country_destination``,
+        test_ids is the ``id`` column of the test users, and target_encoder
+        is the fitted LabelEncoder for the target.
+    """
+    # Load data
+    # NOTE(review): countries and age_gender are loaded but not referenced
+    # again in this function.
+    train_users, test_users, sessions, countries, age_gender = load_data()
+
+    # Store IDs and target before they are dropped from the feature frames
+    train_ids = train_users['id']
+    test_ids = test_users['id']
+    target = train_users['country_destination']
+
+    # Preprocess users
+    train_users = preprocess_users(train_users, is_train=True)
+    test_users = preprocess_users(test_users, is_train=False)
+
+    # Aggregate sessions
+    session_agg = aggregate_sessions(sessions)
+
+    # Merge session data if available
+    if not session_agg.empty:
+        # Left joins keep every user, including those with no session rows
+        train_users = train_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
+        test_users = test_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
+
+        # Drop user_id column from merge
+        if 'user_id' in train_users.columns:
+            train_users = train_users.drop('user_id', axis=1)
+            test_users = test_users.drop('user_id', axis=1)
+
+        # Fill NaN values from merge (users without sessions get zeros)
+        session_cols = ['num_actions', 'total_secs', 'mean_secs', 'max_secs', 'min_secs']
+        for col in session_cols:
+            if col in train_users.columns:
+                train_users[col] = train_users[col].fillna(0)
+                test_users[col] = test_users[col].fillna(0)
+
+        # Users without sessions get the 'unknown' category
+        session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
+        for col in session_cat_cols:
+            if col in train_users.columns:
+                train_users[col] = train_users[col].fillna('unknown')
+                test_users[col] = test_users[col].fillna('unknown')
+
+    # Drop ID and target from train
+    train_users = train_users.drop(['id', 'country_destination'], axis=1)
+    test_users = test_users.drop(['id'], axis=1)
+
+    # Encode categorical features
+    train_users, test_users, label_encoders = encode_features(train_users, test_users)
+    
+    # Fill any remaining NaN values (no scaling is applied for LightGBM)
+    print("Checking for NaN values...")
+    print(f"Train NaN count: {train_users.isna().sum().sum()}")
+    print(f"Test NaN count: {test_users.isna().sum().sum()}")
+    
+    if train_users.isna().any().any():
+        print("Warning: Found NaN values in train data. Filling with 0...")
+        train_users = train_users.fillna(0)
+    
+    if test_users.isna().any().any():
+        print("Warning: Found NaN values in test data. Filling with 0...")
+        test_users = test_users.fillna(0)
+
+    # Encode target variable
+    target_encoder = LabelEncoder()
+    target_encoded = target_encoder.fit_transform(target)
+
+    # Save encoders so the training and prediction scripts can decode labels
+    with open('label_encoders.pkl', 'wb') as f:
+        pickle.dump(label_encoders, f)
+
+    with open('target_encoder.pkl', 'wb') as f:
+        pickle.dump(target_encoder, f)
+
+    # Determine categorical columns (should match those encoded earlier)
+    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
+                       'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
+                       'first_device_type', 'first_browser']
+
+    # Add session categorical columns if they exist
+    session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
+    for col in session_cat_cols:
+        if col in train_users.columns:
+            categorical_cols.append(col)
+
+    # Keep only those categorical cols that actually exist in the dataframe
+    categorical_cols = [c for c in categorical_cols if c in train_users.columns]
+
+    # For LightGBM: No scaling needed (tree-based model)
+    # Keep all features as-is (numeric and categorical)
+    feature_names = list(train_users.columns)
+
+    # Save feature metadata (column order and which columns are categorical)
+    with open('feature_names.pkl', 'wb') as f:
+        pickle.dump(feature_names, f)
+
+    with open('categorical_features.pkl', 'wb') as f:
+        pickle.dump(categorical_cols, f)
+
+    # Convert to numpy arrays
+    X_train = train_users.values
+    X_test = test_users.values
+
+    print(f"Train shape: {X_train.shape}")
+    print(f"Test shape: {X_test.shape}")
+    print(f"Number of classes: {len(target_encoder.classes_)}")
+    print(f"Classes: {target_encoder.classes_}")
+
+    return X_train, target_encoded, X_test, test_ids, target_encoder
+
+if __name__ == '__main__':
+    # Run the full pipeline; the encoders are pickled inside prepare_datasets()
+    X_train, y_train, X_test, test_ids, target_encoder = prepare_datasets()
+
+    # Save preprocessed data, one .npy file per array
+    arrays_by_file = {
+        'X_train.npy': X_train,
+        'y_train.npy': y_train,
+        'X_test.npy': X_test,
+        'test_ids.npy': test_ids,
+    }
+    for out_path, array in arrays_by_file.items():
+        np.save(out_path, array)
+
+    print("\nData preprocessing completed successfully!")
--- a/LightGBM/predict_lightgbm.py
+++ b/LightGBM/predict_lightgbm.py
@ -0,0 +1,90 @@
+import numpy as np
+import lightgbm as lgb
+import pickle
+
+def predict_lightgbm():
+    """Generate predictions using LightGBM model.
+
+    Loads ``X_test.npy`` / ``test_ids.npy``, the saved fold models (or a
+    single ``lightgbm_model.txt``) and ``target_encoder.pkl`` from the current
+    directory, averages the class probabilities over all loaded models and
+    writes the top-5 destinations per user to ``submission_lightgbm.csv``
+    (one row per user and rank, best first).
+
+    Returns:
+        The averaged class-probability array, one row per test sample.
+
+    Raises:
+        FileNotFoundError: If no model file could be loaded.
+    """
+    import os
+    import pandas as pd
+
+    print("="*70)
+    print("LIGHTGBM PREDICTION")
+    print("="*70)
+
+    # Load test data (ids are stored as an object array, hence allow_pickle)
+    print("Loading test data...")
+    X_test = np.load('X_test.npy')
+    test_ids = np.load('test_ids.npy', allow_pickle=True)
+
+    print(f"Test samples: {len(X_test)}")
+
+    # Try to load list of CV models; fall back to single model file
+    models = []
+    if os.path.exists('lightgbm_models_list.pkl'):
+        with open('lightgbm_models_list.pkl', 'rb') as f:
+            model_files = pickle.load(f)
+        print(f"Loading {len(model_files)} fold models...")
+        for mf in model_files:
+            if os.path.exists(mf):
+                models.append(lgb.Booster(model_file=mf))
+            else:
+                print(f"Warning: fold model '{mf}' not found; skipping")
+    elif os.path.exists('lightgbm_model.txt'):
+        print("Loading single model 'lightgbm_model.txt'...")
+        models = [lgb.Booster(model_file='lightgbm_model.txt')]
+    else:
+        raise FileNotFoundError('No LightGBM model files found. Run training first.')
+
+    # The model list may exist while every file it names is gone; fail with a
+    # clear error instead of dividing by zero models below.
+    if not models:
+        raise FileNotFoundError('No LightGBM model files found. Run training first.')
+
+    # Load target encoder
+    with open('target_encoder.pkl', 'rb') as f:
+        target_encoder = pickle.load(f)
+
+    # Make predictions (average over fold models)
+    print("\nGenerating predictions by averaging fold models...")
+    y_pred_proba = sum(m.predict(X_test) for m in models) / len(models)
+
+    # Get top-5 destinations for each user (highest probability first)
+    print("\nGenerating top-5 destinations per user...")
+    top5_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :5]
+
+    # Create submission with 5 rows per user. Flattening row-major keeps each
+    # user's ranks together, and decoding all labels in one call avoids one
+    # encoder call per output row.
+    ranks_per_user = top5_indices.shape[1]
+    submission_df = pd.DataFrame({
+        'id': np.repeat(np.asarray(test_ids), ranks_per_user),
+        'country': target_encoder.inverse_transform(top5_indices.ravel()),
+    })
+    submission_df.to_csv('submission_lightgbm.csv', index=False)
+
+    # Show distribution of top-1 predictions
+    print("\nTop-1 Prediction Distribution:")
+    top1_preds = target_encoder.inverse_transform(top5_indices[:, 0])
+    unique, counts = np.unique(top1_preds, return_counts=True)
+    for country, count in sorted(zip(unique, counts), key=lambda x: -x[1]):
+        pct = count / len(top1_preds) * 100
+        print(f"{country}: {count} ({pct:.2f}%)")
+
+    print("\n" + "="*70)
+    print("Predictions saved as 'submission_lightgbm.csv'")
+    print("="*70)
+    print(f"\nTotal users: {len(test_ids)}")
+    print(f"Total rows (5 per user): {len(submission_df)}")
+    print(f"Unique destinations in top-1: {len(unique)}")
+
+    return y_pred_proba
+
+if __name__ == '__main__':
+    # Script entry point: run prediction, then print a closing banner
+    predictions = predict_lightgbm()
+
+    banner = "="*70
+    print("\n" + banner)
+    print("PREDICTION COMPLETE!")
+    print(banner)
--- a/LightGBM/train_lightgbm.py
+++ b/LightGBM/train_lightgbm.py
@ -0,0 +1,220 @@
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.metrics import accuracy_score, classification_report, log_loss
+import pickle
+import matplotlib.pyplot as plt
+
+def train_lightgbm():
+    """Train LightGBM classifier for destination prediction.
+
+    Loads the preprocessed arrays and metadata written by the preprocessing
+    script, trains one multiclass model per stratified fold with early
+    stopping, reports out-of-fold (OOF) metrics and averaged gain
+    importances, and saves each fold model plus the list of model file names
+    (``lightgbm_models_list.pkl``) to the current directory.
+
+    Test-set inference is not done here; the saved fold models are averaged
+    by the prediction script.
+
+    Returns:
+        The list of trained fold models, in fold order.
+    """
+    print("="*70)
+    print("LIGHTGBM CLASSIFIER TRAINING")
+    print("="*70)
+
+    # Load preprocessed data and metadata
+    print("\nLoading preprocessed data and metadata...")
+    X_train = np.load('X_train.npy')
+    y_train = np.load('y_train.npy')
+
+    # Load feature names and categorical feature list
+    with open('feature_names.pkl', 'rb') as f:
+        feature_names = pickle.load(f)
+
+    with open('categorical_features.pkl', 'rb') as f:
+        categorical_features = pickle.load(f)
+
+    # Load target encoder
+    with open('target_encoder.pkl', 'rb') as f:
+        target_encoder = pickle.load(f)
+
+    n_classes = len(target_encoder.classes_)
+
+    print(f"Training samples: {len(y_train)}")
+    print(f"Number of classes: {n_classes}")
+    print(f"Classes: {target_encoder.classes_}")
+
+    # Show class distribution
+    print("\nClass distribution:")
+    unique, counts = np.unique(y_train, return_counts=True)
+    for idx, count in zip(unique, counts):
+        pct = count / len(y_train) * 100
+        print(f"{target_encoder.classes_[idx]}: {count} ({pct:.2f}%)")
+
+    # Convert to DataFrame to preserve feature names and pass categorical features to LightGBM
+    X_df = pd.DataFrame(X_train, columns=feature_names)
+
+    print(f"\nUsing categorical features: {categorical_features}")
+
+    # Prepare CV; every row receives exactly one out-of-fold prediction
+    n_splits = 5
+    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
+    oof_preds = np.zeros((X_df.shape[0], n_classes))
+
+    # LightGBM parameters - updated for CV and categorical handling
+    params = {
+        'objective': 'multiclass',
+        'num_class': n_classes,
+        'metric': 'multi_logloss',
+        'boosting_type': 'gbdt',
+        'num_leaves': 128,
+        'learning_rate': 0.05,
+        'feature_fraction': 0.7,
+        'bagging_fraction': 0.8,
+        'bagging_freq': 5,
+        'verbose': -1,
+        'max_depth': -1,
+        'min_data_in_leaf': 20,
+        'lambda_l1': 0.1,
+        'lambda_l2': 0.1,
+        'seed': 42,
+        'boost_from_average': False
+    }
+
+    print("\nLightGBM Parameters:")
+    for key, value in params.items():
+        print(f"  {key}: {value}")
+
+    # Train with Stratified K-Fold CV
+    print("\nTraining LightGBM with Stratified K-Fold CV...")
+    models = []
+    model_files = []
+    for fold, (train_idx, val_idx) in enumerate(skf.split(X_df, y_train), start=1):
+        print('\n' + '='*50)
+        print(f"Fold {fold}/{n_splits}")
+        print('='*50)
+
+        X_tr = X_df.iloc[train_idx]
+        X_val = X_df.iloc[val_idx]
+        y_tr = y_train[train_idx]
+        y_val = y_train[val_idx]
+
+        train_data = lgb.Dataset(X_tr, label=y_tr, feature_name=feature_names,
+                                 categorical_feature=categorical_features)
+        # reference=train_data makes the validation set reuse the training binning
+        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data,
+                               feature_name=feature_names,
+                               categorical_feature=categorical_features)
+
+        model = lgb.train(
+            params,
+            train_data,
+            num_boost_round=3000,
+            valid_sets=[train_data, val_data],
+            valid_names=['train', 'valid'],
+            callbacks=[
+                lgb.early_stopping(stopping_rounds=100),
+                lgb.log_evaluation(period=100)
+            ]
+        )
+
+        models.append(model)
+
+        # OOF predictions at the early-stopped iteration
+        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
+
+        # Save fold model
+        model_file = f'lightgbm_model_fold{fold}.txt'
+        model.save_model(model_file)
+        model_files.append(model_file)
+        print(f"Saved model for fold {fold} as '{model_file}'")
+
+    print("-" * 70)
+
+    # Evaluate using OOF predictions across all folds
+    print("\n" + "="*70)
+    print("CROSS-VALIDATION OOF RESULTS")
+    print("="*70)
+
+    oof_logloss = log_loss(y_train, oof_preds)
+    oof_preds_argmax = np.argmax(oof_preds, axis=1)
+    oof_accuracy = accuracy_score(y_train, oof_preds_argmax)
+
+    print(f"\nOOF Accuracy: {oof_accuracy:.4f}")
+    print(f"OOF Log Loss: {oof_logloss:.4f}")
+
+    # Baseline accuracy on full train: always predict the most frequent class
+    most_common = np.bincount(y_train).argmax()
+    baseline = np.sum(y_train == most_common) / len(y_train)
+    print(f"Baseline (always {target_encoder.classes_[most_common]}): {baseline:.4f}")
+    print(f"Improvement over baseline: {(oof_accuracy - baseline):.4f}")
+
+    # Show classification report on OOF hard predictions
+    print("\nClassification Report (OOF predictions):")
+    print(classification_report(y_train, oof_preds_argmax,
+                                target_names=target_encoder.classes_, zero_division=0))
+
+    # Prediction distribution (OOF)
+    print("\nOOF Prediction Distribution:")
+    print(f"{'Class':<10} {'Actual':<10} {'Predicted':<10} {'Actual %':<12} {'Pred %'}")
+    print("-" * 60)
+    for idx, class_name in enumerate(target_encoder.classes_):
+        actual_count = np.sum(y_train == idx)
+        pred_count = np.sum(oof_preds_argmax == idx)
+        actual_pct = actual_count / len(y_train) * 100
+        pred_pct = pred_count / len(oof_preds_argmax) * 100
+        print(f"{class_name:<10} {actual_count:<10} {pred_count:<10} {actual_pct:>6.2f}%      {pred_pct:>6.2f}%")
+
+    # Feature importance
+    print("\n" + "="*70)
+    print("TOP 20 FEATURE IMPORTANCES")
+    print("="*70)
+
+    # Aggregate feature importances across folds
+    total_importance = np.zeros(len(feature_names))
+    for m in models:
+        total_importance += np.array(m.feature_importance(importance_type='gain'))
+    avg_importance = total_importance / max(1, len(models))
+
+    importance_df = pd.DataFrame({
+        'feature': feature_names,
+        'importance': avg_importance
+    }).sort_values('importance', ascending=False)
+
+    print(importance_df.head(20).to_string(index=False))
+
+    # Plot feature importance
+    plt.figure(figsize=(10, 8))
+    top_features = importance_df.head(20)
+    plt.barh(range(len(top_features)), top_features['importance'])
+    plt.yticks(range(len(top_features)), top_features['feature'])
+    plt.xlabel('Importance (Gain)')
+    plt.title('Top 20 Feature Importances')
+    plt.gca().invert_yaxis()
+    plt.tight_layout()
+    plt.savefig('feature_importance.png', dpi=300)
+    # Release the figure so it is not kept alive after being written to disk
+    plt.close()
+    print("\nFeature importance plot saved as 'feature_importance.png'")
+
+    # Save a CSV of the (up to) 100 most important features
+    importance_df.head(100).to_csv('feature_importances_avg.csv', index=False)
+    print("Saved averaged feature importances as 'feature_importances_avg.csv'")
+
+    # Save list of fold models
+    with open('lightgbm_models_list.pkl', 'wb') as f:
+        pickle.dump(model_files, f)
+
+    print("\n" + "="*70)
+    print(f"Saved {len(models)} fold models and model list 'lightgbm_models_list.pkl'")
+    print("="*70)
+
+    return models
+
+if __name__ == '__main__':
+    # Script entry point: train all fold models, then point at the next step
+    model = train_lightgbm()
+
+    banner = "="*70
+    print("\n" + banner)
+    print("TRAINING COMPLETE!")
+    print(banner)
+    print("\nNext step: Use predict_lightgbm.py to make predictions on test data")
--- a/data/age_gender_bkts.csv
+++ b/data/age_gender_bkts.csv
@ -0,0 +1,421 @@
+age_bucket,country_destination,gender,population_in_thousands,year
+100+,AU,male,1.0,2015.0
+95-99,AU,male,9.0,2015.0
+90-94,AU,male,47.0,2015.0
+85-89,AU,male,118.0,2015.0
+80-84,AU,male,199.0,2015.0
+75-79,AU,male,298.0,2015.0
+70-74,AU,male,415.0,2015.0
+65-69,AU,male,574.0,2015.0
+60-64,AU,male,636.0,2015.0
+55-59,AU,male,714.0,2015.0
+50-54,AU,male,778.0,2015.0
+45-49,AU,male,778.0,2015.0
+40-44,AU,male,820.0,2015.0
+35-39,AU,male,797.0,2015.0
+30-34,AU,male,881.0,2015.0
+25-29,AU,male,895.0,2015.0
+20-24,AU,male,820.0,2015.0
+15-19,AU,male,768.0,2015.0
+10-14,AU,male,743.0,2015.0
+5-9,AU,male,784.0,2015.0
+0-4,AU,male,824.0,2015.0
+100+,AU,female,4.0,2015.0
+95-99,AU,female,25.0,2015.0
+90-94,AU,female,94.0,2015.0
+85-89,AU,female,179.0,2015.0
+80-84,AU,female,252.0,2015.0
+75-79,AU,female,338.0,2015.0
+70-74,AU,female,438.0,2015.0
+65-69,AU,female,592.0,2015.0
+60-64,AU,female,660.0,2015.0
+55-59,AU,female,739.0,2015.0
+50-54,AU,female,798.0,2015.0
+45-49,AU,female,793.0,2015.0
+40-44,AU,female,838.0,2015.0
+35-39,AU,female,801.0,2015.0
+30-34,AU,female,865.0,2015.0
+25-29,AU,female,851.0,2015.0
+20-24,AU,female,787.0,2015.0
+15-19,AU,female,737.0,2015.0
+10-14,AU,female,707.0,2015.0
+5-9,AU,female,745.0,2015.0
+0-4,AU,female,781.0,2015.0
+75-79,CA,female,530.0,2015.0
+75-79,CA,male,446.0,2015.0
+35-39,CA,female,1192.0,2015.0
+25-29,CA,female,1220.0,2015.0
+95-99,CA,male,13.0,2015.0
+40-44,CA,male,1179.0,2015.0
+30-34,CA,female,1240.0,2015.0
+60-64,CA,female,1142.0,2015.0
+55-59,CA,male,1287.0,2015.0
+45-49,CA,male,1232.0,2015.0
+85-89,CA,female,300.0,2015.0
+50-54,CA,male,1400.0,2015.0
+100+,CA,male,1.0,2015.0
+70-74,CA,male,650.0,2015.0
+95-99,CA,female,42.0,2015.0
+100+,CA,female,7.0,2015.0
+90-94,CA,male,68.0,2015.0
+50-54,CA,female,1391.0,2015.0
+0-4,CA,male,1045.0,2015.0
+5-9,CA,male,1011.0,2015.0
+70-74,CA,female,715.0,2015.0
+10-14,CA,male,983.0,2015.0
+55-59,CA,female,1305.0,2015.0
+45-49,CA,female,1217.0,2015.0
+20-24,CA,female,1159.0,2015.0
+15-19,CA,male,1054.0,2015.0
+20-24,CA,male,1203.0,2015.0
+65-69,CA,male,914.0,2015.0
+40-44,CA,female,1169.0,2015.0
+90-94,CA,female,153.0,2015.0
+65-69,CA,female,973.0,2015.0
+60-64,CA,male,1094.0,2015.0
+85-89,CA,male,183.0,2015.0
+25-29,CA,male,1273.0,2015.0
+5-9,CA,female,960.0,2015.0
+80-84,CA,female,422.0,2015.0
+30-34,CA,male,1262.0,2015.0
+10-14,CA,female,929.0,2015.0
+0-4,CA,female,991.0,2015.0
+35-39,CA,male,1189.0,2015.0
+15-19,CA,female,1009.0,2015.0
+80-84,CA,male,318.0,2015.0
+70-74,DE,male,2099.0,2015.0
+80-84,DE,female,1486.0,2015.0
+60-64,DE,female,2799.0,2015.0
+100+,DE,male,3.0,2015.0
+5-9,DE,female,1690.0,2015.0
+75-79,DE,female,2421.0,2015.0
+70-74,DE,female,2362.0,2015.0
+65-69,DE,female,2134.0,2015.0
+85-89,DE,male,517.0,2015.0
+25-29,DE,female,2495.0,2015.0
+60-64,DE,male,2575.0,2015.0
+0-4,DE,female,1713.0,2015.0
+55-59,DE,male,2983.0,2015.0
+50-54,DE,male,3614.0,2015.0
+30-34,DE,female,2571.0,2015.0
+45-49,DE,male,3525.0,2015.0
+40-44,DE,male,2633.0,2015.0
+35-39,DE,female,2402.0,2015.0
+65-69,DE,male,2003.0,2015.0
+35-39,DE,male,2448.0,2015.0
+20-24,DE,female,2161.0,2015.0
+30-34,DE,male,2627.0,2015.0
+95-99,DE,male,17.0,2015.0
+40-44,DE,female,2559.0,2015.0
+75-79,DE,male,1932.0,2015.0
+25-29,DE,male,2593.0,2015.0
+10-14,DE,female,1800.0,2015.0
+20-24,DE,male,2266.0,2015.0
+15-19,DE,male,2076.0,2015.0
+45-49,DE,female,3357.0,2015.0
+10-14,DE,male,1892.0,2015.0
+5-9,DE,male,1781.0,2015.0
+50-54,DE,female,3513.0,2015.0
+0-4,DE,male,1811.0,2015.0
+15-19,DE,female,1974.0,2015.0
+100+,DE,female,14.0,2015.0
+90-94,DE,male,154.0,2015.0
+80-84,DE,male,1016.0,2015.0
+95-99,DE,female,71.0,2015.0
+90-94,DE,female,491.0,2015.0
+55-59,DE,female,2996.0,2015.0
+85-89,DE,female,988.0,2015.0
+95-99,ES,male,22.0,2015.0
+15-19,ES,female,1027.0,2015.0
+85-89,ES,male,306.0,2015.0
+75-79,ES,male,688.0,2015.0
+40-44,ES,female,1924.0,2015.0
+100+,ES,male,3.0,2015.0
+90-94,ES,male,112.0,2015.0
+45-49,ES,male,1909.0,2015.0
+15-19,ES,male,1087.0,2015.0
+0-4,ES,female,1198.0,2015.0
+70-74,ES,female,1040.0,2015.0
+5-9,ES,male,1307.0,2015.0
+10-14,ES,female,1124.0,2015.0
+30-34,ES,male,1748.0,2015.0
+65-69,ES,female,1251.0,2015.0
+95-99,ES,female,64.0,2015.0
+80-84,ES,female,843.0,2015.0
+55-59,ES,male,1479.0,2015.0
+10-14,ES,male,1189.0,2015.0
+50-54,ES,female,1733.0,2015.0
+20-24,ES,female,1106.0,2015.0
+40-44,ES,male,2052.0,2015.0
+70-74,ES,male,880.0,2015.0
+30-34,ES,female,1646.0,2015.0
+35-39,ES,male,2117.0,2015.0
+75-79,ES,female,906.0,2015.0
+25-29,ES,female,1280.0,2015.0
+5-9,ES,female,1235.0,2015.0
+60-64,ES,male,1235.0,2015.0
+55-59,ES,female,1531.0,2015.0
+25-29,ES,male,1347.0,2015.0
+85-89,ES,female,563.0,2015.0
+65-69,ES,male,1119.0,2015.0
+100+,ES,female,9.0,2015.0
+90-94,ES,female,256.0,2015.0
+35-39,ES,female,1966.0,2015.0
+80-84,ES,male,559.0,2015.0
+0-4,ES,male,1272.0,2015.0
+60-64,ES,female,1316.0,2015.0
+50-54,ES,male,1727.0,2015.0
+20-24,ES,male,1175.0,2015.0
+45-49,ES,female,1852.0,2015.0
+40-44,FR,male,2212.0,2015.0
+60-64,FR,male,1891.0,2015.0
+90-94,FR,male,164.0,2015.0
+35-39,FR,male,1842.0,2015.0
+100+,FR,male,3.0,2015.0
+75-79,FR,female,1223.0,2015.0
+25-29,FR,female,2041.0,2015.0
+70-74,FR,male,1103.0,2015.0
+30-34,FR,male,2035.0,2015.0
+85-89,FR,male,405.0,2015.0
+55-59,FR,female,2125.0,2015.0
+90-94,FR,female,465.0,2015.0
+25-29,FR,male,2081.0,2015.0
+95-99,FR,male,20.0,2015.0
+75-79,FR,male,918.0,2015.0
+20-24,FR,male,2040.0,2015.0
+10-14,FR,female,1894.0,2015.0
+65-69,FR,female,1985.0,2015.0
+45-49,FR,female,2220.0,2015.0
+15-19,FR,male,2016.0,2015.0
+70-74,FR,female,1317.0,2015.0
+80-84,FR,male,712.0,2015.0
+10-14,FR,male,1985.0,2015.0
+80-84,FR,female,1137.0,2015.0
+5-9,FR,male,1992.0,2015.0
+50-54,FR,female,2234.0,2015.0
+5-9,FR,female,1914.0,2015.0
+60-64,FR,female,2065.0,2015.0
+0-4,FR,male,2035.0,2015.0
+40-44,FR,female,2231.0,2015.0
+15-19,FR,female,1916.0,2015.0
+85-89,FR,female,838.0,2015.0
+100+,FR,female,19.0,2015.0
+0-4,FR,female,1938.0,2015.0
+20-24,FR,female,1947.0,2015.0
+55-59,FR,male,1939.0,2015.0
+30-34,FR,female,2046.0,2015.0
+50-54,FR,male,2123.0,2015.0
+95-99,FR,female,82.0,2015.0
+45-49,FR,male,2194.0,2015.0
+65-69,FR,male,1780.0,2015.0
+35-39,FR,female,1856.0,2015.0
+10-14,GB,female,1690.0,2015.0
+35-39,GB,male,1979.0,2015.0
+65-69,GB,female,1858.0,2015.0
+60-64,GB,male,1693.0,2015.0
+10-14,GB,male,1771.0,2015.0
+95-99,GB,female,81.0,2015.0
+25-29,GB,male,2213.0,2015.0
+5-9,GB,female,1913.0,2015.0
+40-44,GB,male,2101.0,2015.0
+100+,GB,female,13.0,2015.0
+70-74,GB,female,1422.0,2015.0
+60-64,GB,female,1775.0,2015.0
+85-89,GB,female,602.0,2015.0
+30-34,GB,male,2190.0,2015.0
+65-69,GB,male,1735.0,2015.0
+55-59,GB,male,1925.0,2015.0
+80-84,GB,female,896.0,2015.0
+100+,GB,male,3.0,2015.0
+45-49,GB,male,2301.0,2015.0
+35-39,GB,female,1964.0,2015.0
+55-59,GB,female,1991.0,2015.0
+85-89,GB,male,365.0,2015.0
+40-44,GB,female,2147.0,2015.0
+95-99,GB,male,29.0,2015.0
+50-54,GB,female,2306.0,2015.0
+0-4,GB,female,1888.0,2015.0
+25-29,GB,female,2122.0,2015.0
+20-24,GB,female,1957.0,2015.0
+15-19,GB,male,1864.0,2015.0
+50-54,GB,male,2220.0,2015.0
+90-94,GB,female,310.0,2015.0
+5-9,GB,male,2007.0,2015.0
+20-24,GB,male,2061.0,2015.0
+75-79,GB,male,978.0,2015.0
+15-19,GB,female,1783.0,2015.0
+80-84,GB,male,661.0,2015.0
+70-74,GB,male,1273.0,2015.0
+30-34,GB,female,2112.0,2015.0
+45-49,GB,female,2349.0,2015.0
+75-79,GB,female,1166.0,2015.0
+90-94,GB,male,145.0,2015.0
+0-4,GB,male,1981.0,2015.0
+20-24,IT,female,1514.0,2015.0
+65-69,IT,male,1716.0,2015.0
+100+,IT,male,3.0,2015.0
+95-99,IT,male,22.0,2015.0
+90-94,IT,male,164.0,2015.0
+85-89,IT,male,440.0,2015.0
+60-64,IT,male,1749.0,2015.0
+55-59,IT,male,1976.0,2015.0
+15-19,IT,female,1411.0,2015.0
+80-84,IT,male,808.0,2015.0
+50-54,IT,male,2322.0,2015.0
+45-49,IT,male,2476.0,2015.0
+40-44,IT,male,2428.0,2015.0
+35-39,IT,male,2117.0,2015.0
+30-34,IT,male,1814.0,2015.0
+25-29,IT,male,1673.0,2015.0
+20-24,IT,male,1601.0,2015.0
+15-19,IT,male,1493.0,2015.0
+10-14,IT,male,1468.0,2015.0
+75-79,IT,male,1191.0,2015.0
+5-9,IT,male,1473.0,2015.0
+10-14,IT,female,1388.0,2015.0
+0-4,IT,male,1468.0,2015.0
+100+,IT,female,15.0,2015.0
+0-4,IT,female,1383.0,2015.0
+95-99,IT,female,79.0,2015.0
+90-94,IT,female,436.0,2015.0
+85-89,IT,female,855.0,2015.0
+80-84,IT,female,1231.0,2015.0
+75-79,IT,female,1534.0,2015.0
+70-74,IT,female,1567.0,2015.0
+65-69,IT,female,1893.0,2015.0
+70-74,IT,male,1338.0,2015.0
+60-64,IT,female,1880.0,2015.0
+5-9,IT,female,1395.0,2015.0
+55-59,IT,female,2069.0,2015.0
+50-54,IT,female,2373.0,2015.0
+45-49,IT,female,2480.0,2015.0
+40-44,IT,female,2411.0,2015.0
+35-39,IT,female,2090.0,2015.0
+30-34,IT,female,1791.0,2015.0
+25-29,IT,female,1610.0,2015.0
+60-64,NL,female,524.0,2015.0
+80-84,NL,female,231.0,2015.0
+5-9,NL,female,450.0,2015.0
+90-94,NL,male,25.0,2015.0
+85-89,NL,female,151.0,2015.0
+70-74,NL,male,351.0,2015.0
+55-59,NL,female,580.0,2015.0
+90-94,NL,female,69.0,2015.0
+95-99,NL,female,15.0,2015.0
+100+,NL,female,2.0,2015.0
+50-54,NL,female,636.0,2015.0
+0-4,NL,male,462.0,2015.0
+5-9,NL,male,473.0,2015.0
+10-14,NL,male,517.0,2015.0
+45-49,NL,female,637.0,2015.0
+15-19,NL,male,510.0,2015.0
+20-24,NL,female,504.0,2015.0
+20-24,NL,male,527.0,2015.0
+65-69,NL,male,518.0,2015.0
+25-29,NL,male,518.0,2015.0
+40-44,NL,female,583.0,2015.0
+10-14,NL,female,493.0,2015.0
+30-34,NL,male,503.0,2015.0
+95-99,NL,male,4.0,2015.0
+75-79,NL,male,246.0,2015.0
+35-39,NL,male,490.0,2015.0
+80-84,NL,male,158.0,2015.0
+35-39,NL,female,490.0,2015.0
+40-44,NL,male,582.0,2015.0
+15-19,NL,female,484.0,2015.0
+0-4,NL,female,438.0,2015.0
+45-49,NL,male,650.0,2015.0
+30-34,NL,female,497.0,2015.0
+50-54,NL,male,646.0,2015.0
+100+,NL,male,0.0,2015.0
+55-59,NL,male,581.0,2015.0
+25-29,NL,female,505.0,2015.0
+60-64,NL,male,523.0,2015.0
+85-89,NL,male,78.0,2015.0
+65-69,NL,female,527.0,2015.0
+70-74,NL,female,375.0,2015.0
+75-79,NL,female,295.0,2015.0
+80-84,PT,female,194.0,2015.0
+40-44,PT,female,418.0,2015.0
+65-69,PT,female,313.0,2015.0
+60-64,PT,female,341.0,2015.0
+90-94,PT,male,19.0,2015.0
+85-89,PT,female,115.0,2015.0
+80-84,PT,male,122.0,2015.0
+35-39,PT,male,419.0,2015.0
+15-19,PT,female,266.0,2015.0
+60-64,PT,male,308.0,2015.0
+90-94,PT,female,45.0,2015.0
+25-29,PT,female,299.0,2015.0
+95-99,PT,female,9.0,2015.0
+55-59,PT,female,366.0,2015.0
+65-69,PT,male,267.0,2015.0
+40-44,PT,male,416.0,2015.0
+100+,PT,female,1.0,2015.0
+75-79,PT,female,244.0,2015.0
+0-4,PT,male,239.0,2015.0
+85-89,PT,male,59.0,2015.0
+35-39,PT,female,414.0,2015.0
+5-9,PT,male,264.0,2015.0
+100+,PT,male,0.0,2015.0
+50-54,PT,female,397.0,2015.0
+95-99,PT,male,3.0,2015.0
+10-14,PT,female,269.0,2015.0
+10-14,PT,male,285.0,2015.0
+0-4,PT,female,225.0,2015.0
+75-79,PT,male,177.0,2015.0
+5-9,PT,female,250.0,2015.0
+15-19,PT,male,277.0,2015.0
+45-49,PT,male,386.0,2015.0
+20-24,PT,male,285.0,2015.0
+45-49,PT,female,395.0,2015.0
+20-24,PT,female,275.0,2015.0
+70-74,PT,male,214.0,2015.0
+25-29,PT,male,309.0,2015.0
+50-54,PT,male,378.0,2015.0
+70-74,PT,female,270.0,2015.0
+55-59,PT,male,343.0,2015.0
+30-34,PT,female,362.0,2015.0
+30-34,PT,male,371.0,2015.0
+90-94,US,female,1193.0,2015.0
+75-79,US,male,3641.0,2015.0
+70-74,US,male,5278.0,2015.0
+65-69,US,male,7561.0,2015.0
+60-64,US,male,9217.0,2015.0
+55-59,US,male,10689.0,2015.0
+50-54,US,male,11013.0,2015.0
+45-49,US,male,10454.0,2015.0
+40-44,US,male,10159.0,2015.0
+10-14,US,female,10346.0,2015.0
+35-39,US,male,10329.0,2015.0
+30-34,US,male,10984.0,2015.0
+25-29,US,male,11385.0,2015.0
+20-24,US,male,11601.0,2015.0
+15-19,US,male,11025.0,2015.0
+10-14,US,male,10771.0,2015.0
+5-9,US,male,10632.0,2015.0
+0-4,US,male,10788.0,2015.0
+100+,US,female,61.0,2015.0
+95-99,US,female,361.0,2015.0
+5-9,US,female,10201.0,2015.0
+85-89,US,female,2459.0,2015.0
+80-84,US,female,3394.0,2015.0
+75-79,US,female,4532.0,2015.0
+70-74,US,female,6179.0,2015.0
+65-69,US,female,8483.0,2015.0
+60-64,US,female,10004.0,2015.0
+55-59,US,female,11264.0,2015.0
+50-54,US,female,11413.0,2015.0
+45-49,US,female,10659.0,2015.0
+0-4,US,female,10306.0,2015.0
+40-44,US,female,10308.0,2015.0
+35-39,US,female,10352.0,2015.0
+30-34,US,female,10863.0,2015.0
+25-29,US,female,11011.0,2015.0
+20-24,US,female,11094.0,2015.0
+100+,US,male,13.0,2015.0
+95-99,US,male,115.0,2015.0
+90-94,US,male,541.0,2015.0
+15-19,US,female,10570.0,2015.0
+85-89,US,male,1441.0,2015.0
+80-84,US,male,2442.0,2015.0
--- a/data/countries.csv
+++ b/data/countries.csv
@ -0,0 +1,11 @@
+country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language ,language_levenshtein_distance
+AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
+CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
+DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
+ES,39.896027,-2.4876945,7730.724,505370.0,spa,92.25
+FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
+GB,54.63322,-3.4322774,6883.659,243610.0,eng,0.0
+IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
+NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
+PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
+US,36.966427,-95.84403,0.0,9826675.0,eng,0.0
--- a/data/data.info
+++ b/data/data.info
@ -0,0 +1,35 @@
+age_gender_bkts.csv
+Columns: age_bucket,country_destination,gender,population_in_thousands,year
+
+age_bucket: Five-year age ranges given as literal strings (e.g. 0-4, 90-94, 95-99), plus an open-ended 100+ bucket
+country_destination: Two letter country code out of ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'US']
+gender: male or female or unknown
+population_in_thousands: float
+year: float
+
+countries.csv
+Columns: country_destination, lat_destination, lng_destination, distance_km, destination_km2, destination_language, language_levenshtein_distance
+country_destination: Two letter country code
+lat_destination: float
+lng_destination: float
+distance_km: float
+destination_km2: float
+destination_language: 3 letter code ['eng', 'deu', 'spa', 'fra', etc.]. Note: the header in the CSV file has a trailing space ('destination_language ')
+language_levenshtein_distance: float
+
+sample_submission_NDF.csv (This is what the output should look like)
+Columns: id, country
+id: user id, string
+country: the country they will most probably visit (country code)
+
+sessions.csv
+Columns: user_id, action, action_type, action_detail, device_type, secs_elapsed
+user_id: string
+action: string ['lookup', 'search_results', 'index', etc.]
+action_type: nullable string ['click', 'data', etc.]
+action_detail: nullable string. Either a description of the action or possibly a function name (unconfirmed)
+device_type: string
+secs_elapsed: float
+
+test_users.csv
+Columns: id, date_account_created, timestamp_first_active, date_first_booking, gender, age, signup_method, signup_flow, language, affiliate_channel, affiliate_provider, first_affiliate_tracked, signup_app, first_device_type, first_browser
--- a/data/sample_submission_NDF.csv
+++ b/data/sample_submission_NDF.csv
--- a/data/sessions.csv
+++ b/data/sessions.csv
--- a/data/test_users.csv
+++ b/data/test_users.csv
--- a/data/train_users_2.csv
+++ b/data/train_users_2.csv
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,7 @@
+pandas
+scikit-learn
+numpy<2
+matplotlib
+tensorflow
+keras
+lightgbm