This commit is contained in:
0x01FE 2025-12-04 16:48:17 -06:00
commit e8e36adb24
11 changed files with 338673 additions and 0 deletions

View File

@ -0,0 +1,243 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
def load_data(data_dir='../data'):
    """Load the five raw CSV files used by the preprocessing pipeline.

    Args:
        data_dir: Directory that contains the CSV files. Defaults to
            ``'../data'``, the location that was previously hard-coded.

    Returns:
        A 5-tuple of DataFrames in the order
        ``(train_users, test_users, sessions, countries, age_gender)``.
    """
    import os

    print("Loading data...")
    # Order matters: it defines the order of the returned tuple.
    file_names = (
        'train_users_2.csv',
        'test_users.csv',
        'sessions.csv',
        'countries.csv',
        'age_gender_bkts.csv',
    )
    train_users, test_users, sessions, countries, age_gender = (
        pd.read_csv(os.path.join(data_dir, name)) for name in file_names
    )
    return train_users, test_users, sessions, countries, age_gender
def preprocess_users(df, is_train=True):
    """Clean a users DataFrame and derive date features from it.

    The input frame is not modified; a processed copy is returned.

    Args:
        df: Users frame containing at least ``date_account_created``,
            ``timestamp_first_active``, ``age`` and ``gender``.
        is_train: Only used to label the progress message.

    Returns:
        A new DataFrame in which
        * the two date columns are replaced by ``dac_*`` / ``tfa_*`` parts,
        * ``date_first_booking`` is removed when present,
        * missing or out-of-range ages (outside 18..100) are set to ``-1``,
        * missing ``gender`` becomes ``'-unknown-'``,
        * missing values in the other categorical columns become ``'unknown'``,
        * missing ``signup_flow`` becomes ``-1`` so the column stays numeric.
    """
    print(f"Preprocessing {'train' if is_train else 'test'} users...")

    df = df.copy()

    # Parse the two date columns; timestamp_first_active is a YYYYmmddHHMMSS number.
    account_created = pd.to_datetime(df['date_account_created'])
    first_active = pd.to_datetime(
        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S'
    )

    # Derived columns are appended in a fixed order (this becomes feature order).
    df['dac_year'] = account_created.dt.year
    df['dac_month'] = account_created.dt.month
    df['dac_day'] = account_created.dt.day
    df['dac_weekday'] = account_created.dt.weekday
    df['tfa_year'] = first_active.dt.year
    df['tfa_month'] = first_active.dt.month
    df['tfa_day'] = first_active.dt.day

    # NOTE: date_first_booking must NOT be used as a feature: it only exists
    # when the user made a booking, so it leaks the target.
    if 'date_first_booking' in df.columns:
        df = df.drop('date_first_booking', axis=1)

    # The raw date columns are no longer needed once the parts are extracted.
    df = df.drop(['date_account_created', 'timestamp_first_active'], axis=1)

    # Age: missing values and implausible values share the -1 sentinel.
    df['age'] = df['age'].fillna(-1)
    df.loc[(df['age'] < 18) | (df['age'] > 100), 'age'] = -1

    df['gender'] = df['gender'].fillna('-unknown-')

    # signup_flow is numeric and is cast with astype(int) downstream, so a
    # string placeholder would break that cast; use a numeric sentinel instead.
    if 'signup_flow' in df.columns:
        df['signup_flow'] = df['signup_flow'].fillna(-1)

    categorical_cols = ['signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna('unknown')

    return df
def aggregate_sessions(sessions):
    """Collapse the session log into one feature row per ``user_id``.

    Args:
        sessions: Session frame with columns ``user_id``, ``action``,
            ``action_type``, ``action_detail``, ``device_type`` and
            ``secs_elapsed``.

    Returns:
        An empty DataFrame when ``sessions`` is empty; otherwise a frame with
        columns ``user_id``, ``num_actions``, ``most_common_action_type``,
        ``most_common_action_detail``, ``most_common_device_type``,
        ``total_secs``, ``mean_secs``, ``max_secs`` and ``min_secs``.
    """
    print("Aggregating session data...")
    if sessions.empty:
        return pd.DataFrame()

    def most_frequent(values):
        # Most frequent value of the group, with a fallback for an empty group.
        if len(values) > 0:
            return values.value_counts().index[0]
        return 'unknown'

    # Every missing value (in any column) is replaced by -1 before grouping,
    # so the -1 placeholders take part in the counts and the secs statistics.
    filled = sessions.fillna(-1)

    per_user = filled.groupby('user_id').agg(
        num_actions=('action', 'count'),
        most_common_action_type=('action_type', most_frequent),
        most_common_action_detail=('action_detail', most_frequent),
        most_common_device_type=('device_type', most_frequent),
        total_secs=('secs_elapsed', 'sum'),
        mean_secs=('secs_elapsed', 'mean'),
        max_secs=('secs_elapsed', 'max'),
        min_secs=('secs_elapsed', 'min'),
    )
    return per_user.reset_index()
def encode_features(train_df, test_df, label_encoders=None):
    """Label-encode the categorical columns of both frames in place.

    Args:
        train_df: Training features; modified in place.
        test_df: Test features; modified in place.
        label_encoders: Optional mapping ``column -> fitted encoder``. When
            ``None``, a new ``LabelEncoder`` per column is fitted on the
            string-cast values of train and test combined.

    Returns:
        ``(train_df, test_df, label_encoders)`` - the same frame objects that
        were passed in, plus the encoder mapping that was used.
    """
    print("Encoding features...")

    candidates = ['gender', 'signup_method', 'language', 'affiliate_channel',
                  'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                  'first_device_type', 'first_browser']
    # Session-derived categoricals only exist when session data was merged in.
    if 'most_common_action_type' in train_df.columns:
        candidates += ['most_common_action_type', 'most_common_action_detail',
                       'most_common_device_type']
    present = [name for name in candidates if name in train_df.columns]

    if label_encoders is None:
        label_encoders = {}
        for name in present:
            # Fit on train + test together so every category gets a code.
            all_values = pd.concat([train_df[name], test_df[name]]).astype(str)
            encoder = LabelEncoder()
            encoder.fit(all_values)
            label_encoders[name] = encoder

    for name in present:
        encoder = label_encoders[name]
        for frame in (train_df, test_df):
            frame[name] = encoder.transform(frame[name].astype(str))

    # signup_flow is kept as a plain integer feature rather than label-encoded.
    if 'signup_flow' in train_df.columns:
        for frame in (train_df, test_df):
            frame['signup_flow'] = frame['signup_flow'].astype(int)

    return train_df, test_df, label_encoders
def prepare_datasets():
    """Build the train/test feature matrices and persist the fitted metadata.

    Loads the raw data, preprocesses users, merges per-user session
    aggregates, label-encodes categoricals and the target, and writes
    ``label_encoders.pkl``, ``target_encoder.pkl``, ``feature_names.pkl`` and
    ``categorical_features.pkl`` to the working directory.

    Returns:
        ``(X_train, target_encoded, X_test, test_ids, target_encoder)`` where
        ``X_train`` / ``X_test`` are NumPy arrays sharing the same column
        order, ``target_encoded`` is the label-encoded ``country_destination``
        and ``test_ids`` is the ``id`` column of the test users.

    Raises:
        ValueError: If the test features lack a column present in train.
    """
    # countries and age_gender are returned by load_data() but not used here.
    train_users, test_users, sessions, _countries, _age_gender = load_data()

    # Capture test IDs and the target before those columns are dropped.
    test_ids = test_users['id']
    target = train_users['country_destination']

    train_users = preprocess_users(train_users, is_train=True)
    test_users = preprocess_users(test_users, is_train=False)

    session_agg = aggregate_sessions(sessions)
    session_num_cols = ['num_actions', 'total_secs', 'mean_secs', 'max_secs', 'min_secs']
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail',
                        'most_common_device_type']

    if not session_agg.empty:
        train_users = train_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
        test_users = test_users.merge(session_agg, left_on='id', right_on='user_id', how='left')

        # The merge key from the session side duplicates 'id'.
        if 'user_id' in train_users.columns:
            train_users = train_users.drop('user_id', axis=1)
            test_users = test_users.drop('user_id', axis=1)

        # Users without session rows get NaN from the left merge.
        for col in session_num_cols:
            if col in train_users.columns:
                train_users[col] = train_users[col].fillna(0)
                test_users[col] = test_users[col].fillna(0)
        for col in session_cat_cols:
            if col in train_users.columns:
                train_users[col] = train_users[col].fillna('unknown')
                test_users[col] = test_users[col].fillna('unknown')

    train_users = train_users.drop(['id', 'country_destination'], axis=1)
    test_users = test_users.drop(['id'], axis=1)

    train_users, test_users, label_encoders = encode_features(train_users, test_users)

    # Both matrices are later converted with .values, so the test columns must
    # be in exactly the training feature order; enforce that explicitly
    # instead of relying on the two CSVs happening to line up.
    feature_names = list(train_users.columns)
    missing = [col for col in feature_names if col not in test_users.columns]
    if missing:
        raise ValueError(f"Test data is missing feature columns: {missing}")
    test_users = test_users[feature_names]

    print("Checking for NaN values...")
    print(f"Train NaN count: {train_users.isna().sum().sum()}")
    print(f"Test NaN count: {test_users.isna().sum().sum()}")
    if train_users.isna().any().any():
        print("Warning: Found NaN values in train data. Filling with 0...")
        train_users = train_users.fillna(0)
    if test_users.isna().any().any():
        print("Warning: Found NaN values in test data. Filling with 0...")
        test_users = test_users.fillna(0)

    target_encoder = LabelEncoder()
    target_encoded = target_encoder.fit_transform(target)

    with open('label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)
    with open('target_encoder.pkl', 'wb') as f:
        pickle.dump(target_encoder, f)

    # Categorical feature list: the label-encoded user columns plus the
    # session categoricals, restricted to columns that actually exist.
    user_cat_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                     'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                     'first_device_type', 'first_browser']
    categorical_cols = [col for col in user_cat_cols + session_cat_cols
                        if col in train_users.columns]

    # No scaling is applied; features are saved as-is.
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)
    with open('categorical_features.pkl', 'wb') as f:
        pickle.dump(categorical_cols, f)

    X_train = train_users.values
    X_test = test_users.values

    print(f"Train shape: {X_train.shape}")
    print(f"Test shape: {X_test.shape}")
    print(f"Number of classes: {len(target_encoder.classes_)}")
    print(f"Classes: {target_encoder.classes_}")

    return X_train, target_encoded, X_test, test_ids, target_encoder
if __name__ == '__main__':
    X_train, y_train, X_test, test_ids, target_encoder = prepare_datasets()

    # Write each preprocessed array to its .npy file in the working directory.
    outputs = (
        ('X_train.npy', X_train),
        ('y_train.npy', y_train),
        ('X_test.npy', X_test),
        ('test_ids.npy', test_ids),
    )
    for file_name, array in outputs:
        np.save(file_name, array)

    print("\nData preprocessing completed successfully!")

View File

@ -0,0 +1,90 @@
import numpy as np
import lightgbm as lgb
import pickle
def predict_lightgbm():
    """Write a top-5 destination submission using the saved LightGBM model(s).

    Reads ``X_test.npy``, ``test_ids.npy`` and ``target_encoder.pkl`` from the
    working directory. Models come from the files listed in
    ``lightgbm_models_list.pkl`` when that file exists, otherwise from the
    single file ``lightgbm_model.txt``. The models' predictions are averaged
    and the highest-scoring destinations per user are written to
    ``submission_lightgbm.csv`` (columns ``id``, ``country``; rows ordered by
    user, then from best to worst rank).

    Returns:
        The averaged prediction array (one row per test sample).

    Raises:
        FileNotFoundError: If no model file could be loaded.
    """
    import os

    import pandas as pd

    banner = "=" * 70
    print(banner)
    print("LIGHTGBM PREDICTION")
    print(banner)

    print("Loading test data...")
    X_test = np.load('X_test.npy')
    # test_ids is stored as an object array, hence allow_pickle.
    test_ids = np.load('test_ids.npy', allow_pickle=True)
    print(f"Test samples: {len(X_test)}")

    # Prefer the list of CV fold models; fall back to a single model file.
    # NOTE: pickle.load must only be used on files produced by our own runs.
    models = []
    if os.path.exists('lightgbm_models_list.pkl'):
        with open('lightgbm_models_list.pkl', 'rb') as f:
            model_files = pickle.load(f)
        print(f"Loading {len(model_files)} fold models...")
        for model_file in model_files:
            if os.path.exists(model_file):
                models.append(lgb.Booster(model_file=model_file))
            else:
                print(f"Warning: fold model '{model_file}' not found; skipping.")
    elif os.path.exists('lightgbm_model.txt'):
        print("Loading single model 'lightgbm_model.txt'...")
        models = [lgb.Booster(model_file='lightgbm_model.txt')]

    # Covers both "nothing on disk" and "list exists but every fold file is
    # missing"; the latter used to fail later with a TypeError on None / 0.
    if not models:
        raise FileNotFoundError('No LightGBM model files found. Run training first.')

    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    print("\nGenerating predictions by averaging fold models...")
    preds = None
    for booster in models:
        fold_pred = booster.predict(X_test)
        preds = fold_pred if preds is None else preds + fold_pred
    y_pred_proba = preds / len(models)

    print("\nGenerating top-5 destinations per user...")
    # Never ask for more ranks than there are classes.
    top_k = min(5, y_pred_proba.shape[1])
    top5_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1][:, :top_k]

    # Build all rows at once: each id repeated top_k times, paired with the
    # row-major flattened class indices (user 0 rank 0..k-1, user 1 ...).
    submission_df = pd.DataFrame({
        'id': np.repeat(np.asarray(test_ids), top_k),
        'country': target_encoder.inverse_transform(top5_indices.ravel()),
    })
    submission_df.to_csv('submission_lightgbm.csv', index=False)

    print("\nTop-1 Prediction Distribution:")
    top1_preds = target_encoder.inverse_transform(top5_indices[:, 0])
    unique, counts = np.unique(top1_preds, return_counts=True)
    for country, count in sorted(zip(unique, counts), key=lambda x: -x[1]):
        pct = count / len(top1_preds) * 100
        print(f"{country}: {count} ({pct:.2f}%)")

    print("\n" + banner)
    print("Predictions saved as 'submission_lightgbm.csv'")
    print(banner)
    print(f"\nTotal users: {len(test_ids)}")
    print(f"Total rows (5 per user): {len(submission_df)}")
    print(f"Unique destinations in top-1: {len(unique)}")

    return y_pred_proba
if __name__ == '__main__':
    predictions = predict_lightgbm()
    # Closing banner: blank line, rule, message, rule.
    print("\n" + "=" * 70, "PREDICTION COMPLETE!", "=" * 70, sep="\n")

220
LightGBM/train_lightgbm.py Normal file
View File

@ -0,0 +1,220 @@
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, log_loss
import pickle
import matplotlib.pyplot as plt
def _print_class_distribution(y_train, class_names):
    """Print the count and percentage of every encoded class in ``y_train``."""
    print("\nClass distribution:")
    unique, counts = np.unique(y_train, return_counts=True)
    for idx, count in zip(unique, counts):
        pct = count / len(y_train) * 100
        print(f"{class_names[idx]}: {count} ({pct:.2f}%)")


def _report_oof_results(y_train, oof_preds, class_names):
    """Print accuracy, log loss, baseline and per-class stats for OOF predictions."""
    print("\n" + "="*70)
    print("CROSS-VALIDATION OOF RESULTS")
    print("="*70)

    oof_logloss = log_loss(y_train, oof_preds)
    oof_preds_argmax = np.argmax(oof_preds, axis=1)
    oof_accuracy = accuracy_score(y_train, oof_preds_argmax)
    print(f"\nOOF Accuracy: {oof_accuracy:.4f}")
    print(f"OOF Log Loss: {oof_logloss:.4f}")

    # Baseline: always predict the most frequent training class.
    most_common = np.bincount(y_train).argmax()
    baseline = np.sum(y_train == most_common) / len(y_train)
    print(f"Baseline (always {class_names[most_common]}): {baseline:.4f}")
    print(f"Improvement over baseline: {(oof_accuracy - baseline):.4f}")

    print("\nClassification Report (OOF predictions):")
    print(classification_report(y_train, oof_preds_argmax, target_names=class_names, zero_division=0))

    print("\nOOF Prediction Distribution:")
    print(f"{'Class':<10} {'Actual':<10} {'Predicted':<10} {'Actual %':<12} {'Pred %'}")
    print("-" * 60)
    for idx, class_name in enumerate(class_names):
        actual_count = np.sum(y_train == idx)
        pred_count = np.sum(oof_preds_argmax == idx)
        actual_pct = actual_count / len(y_train) * 100
        pred_pct = pred_count / len(oof_preds_argmax) * 100
        print(f"{class_name:<10} {actual_count:<10} {pred_count:<10} {actual_pct:>6.2f}% {pred_pct:>6.2f}%")


def _save_feature_importances(models, feature_names):
    """Average gain importances over ``models``; print, plot and save them."""
    print("\n" + "="*70)
    print("TOP 20 FEATURE IMPORTANCES")
    print("="*70)

    total_importance = np.zeros(len(feature_names))
    for booster in models:
        total_importance += np.array(booster.feature_importance(importance_type='gain'))
    avg_importance = total_importance / max(1, len(models))

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': avg_importance
    }).sort_values('importance', ascending=False)
    print(importance_df.head(20).to_string(index=False))

    fig = plt.figure(figsize=(10, 8))
    top_features = importance_df.head(20)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Importance (Gain)')
    plt.title('Top 20 Feature Importances')
    plt.gca().invert_yaxis()  # largest importance at the top
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300)
    plt.close(fig)  # release the figure once it is on disk
    print("\nFeature importance plot saved as 'feature_importance.png'")

    importance_df.head(100).to_csv('feature_importances_avg.csv', index=False)
    print("Saved averaged feature importances as 'feature_importances_avg.csv'")


def train_lightgbm():
    """Train LightGBM destination classifiers with 5-fold stratified CV.

    Reads ``X_train.npy``, ``y_train.npy``, ``feature_names.pkl``,
    ``categorical_features.pkl`` and ``target_encoder.pkl`` from the working
    directory. For every fold a model is trained with early stopping and
    saved as ``lightgbm_model_fold<k>.txt``; the list of those file names is
    pickled to ``lightgbm_models_list.pkl``. Out-of-fold metrics are printed
    and averaged gain importances are written to ``feature_importance.png``
    and ``feature_importances_avg.csv``.

    Test-set inference is not performed here; the saved fold models are the
    artefact intended for that.

    Returns:
        The list of trained fold boosters, in fold order.
    """
    print("="*70)
    print("LIGHTGBM CLASSIFIER TRAINING")
    print("="*70)

    print("\nLoading preprocessed data and metadata...")
    X_train = np.load('X_train.npy')
    y_train = np.load('y_train.npy')

    # NOTE: pickle.load must only be used on files produced by our own runs.
    with open('feature_names.pkl', 'rb') as f:
        feature_names = pickle.load(f)
    with open('categorical_features.pkl', 'rb') as f:
        categorical_features = pickle.load(f)
    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)

    class_names = target_encoder.classes_
    n_classes = len(class_names)

    print(f"Training samples: {len(y_train)}")
    print(f"Number of classes: {n_classes}")
    print(f"Classes: {class_names}")

    _print_class_distribution(y_train, class_names)

    # A DataFrame keeps the feature names attached so the categorical
    # features can be referenced by name.
    X_df = pd.DataFrame(X_train, columns=feature_names)
    print(f"\nUsing categorical features: {categorical_features}")

    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((X_df.shape[0], n_classes))

    params = {
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 128,
        'learning_rate': 0.05,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'max_depth': -1,
        'min_data_in_leaf': 20,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'seed': 42,
        'boost_from_average': False
    }

    print("\nLightGBM Parameters:")
    for key, value in params.items():
        print(f" {key}: {value}")

    print("\nTraining LightGBM with Stratified K-Fold CV...")
    models = []
    model_files = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_df, y_train), start=1):
        print('\n' + '='*50)
        print(f"Fold {fold}/{n_splits}")
        print('='*50)

        X_tr, X_val = X_df.iloc[train_idx], X_df.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        train_data = lgb.Dataset(X_tr, label=y_tr, feature_name=feature_names, categorical_feature=categorical_features)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, feature_name=feature_names, categorical_feature=categorical_features)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=3000,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100)
            ]
        )
        models.append(model)

        # Out-of-fold predictions: each training row is scored exactly once,
        # by the model that did not see it.
        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)

        model_path = f'lightgbm_model_fold{fold}.txt'
        model.save_model(model_path)
        model_files.append(model_path)
        print(f"Saved model for fold {fold} as 'lightgbm_model_fold{fold}.txt'")
        print("-" * 70)

    _report_oof_results(y_train, oof_preds, class_names)
    _save_feature_importances(models, feature_names)

    with open('lightgbm_models_list.pkl', 'wb') as f:
        pickle.dump(model_files, f)

    print("\n" + "="*70)
    print(f"Saved {len(models)} fold models and model list 'lightgbm_models_list.pkl'")
    print("="*70)

    return models
if __name__ == '__main__':
    model = train_lightgbm()
    # Closing banner: blank line, rule, message, rule, then the follow-up hint.
    print("\n" + "=" * 70, "TRAINING COMPLETE!", "=" * 70, sep="\n")
    print("\nNext step: Use predict_lightgbm.py to make predictions on test data")

421
data/age_gender_bkts.csv Normal file
View File

@ -0,0 +1,421 @@
age_bucket,country_destination,gender,population_in_thousands,year
100+,AU,male,1.0,2015.0
95-99,AU,male,9.0,2015.0
90-94,AU,male,47.0,2015.0
85-89,AU,male,118.0,2015.0
80-84,AU,male,199.0,2015.0
75-79,AU,male,298.0,2015.0
70-74,AU,male,415.0,2015.0
65-69,AU,male,574.0,2015.0
60-64,AU,male,636.0,2015.0
55-59,AU,male,714.0,2015.0
50-54,AU,male,778.0,2015.0
45-49,AU,male,778.0,2015.0
40-44,AU,male,820.0,2015.0
35-39,AU,male,797.0,2015.0
30-34,AU,male,881.0,2015.0
25-29,AU,male,895.0,2015.0
20-24,AU,male,820.0,2015.0
15-19,AU,male,768.0,2015.0
10-14,AU,male,743.0,2015.0
5-9,AU,male,784.0,2015.0
0-4,AU,male,824.0,2015.0
100+,AU,female,4.0,2015.0
95-99,AU,female,25.0,2015.0
90-94,AU,female,94.0,2015.0
85-89,AU,female,179.0,2015.0
80-84,AU,female,252.0,2015.0
75-79,AU,female,338.0,2015.0
70-74,AU,female,438.0,2015.0
65-69,AU,female,592.0,2015.0
60-64,AU,female,660.0,2015.0
55-59,AU,female,739.0,2015.0
50-54,AU,female,798.0,2015.0
45-49,AU,female,793.0,2015.0
40-44,AU,female,838.0,2015.0
35-39,AU,female,801.0,2015.0
30-34,AU,female,865.0,2015.0
25-29,AU,female,851.0,2015.0
20-24,AU,female,787.0,2015.0
15-19,AU,female,737.0,2015.0
10-14,AU,female,707.0,2015.0
5-9,AU,female,745.0,2015.0
0-4,AU,female,781.0,2015.0
75-79,CA,female,530.0,2015.0
75-79,CA,male,446.0,2015.0
35-39,CA,female,1192.0,2015.0
25-29,CA,female,1220.0,2015.0
95-99,CA,male,13.0,2015.0
40-44,CA,male,1179.0,2015.0
30-34,CA,female,1240.0,2015.0
60-64,CA,female,1142.0,2015.0
55-59,CA,male,1287.0,2015.0
45-49,CA,male,1232.0,2015.0
85-89,CA,female,300.0,2015.0
50-54,CA,male,1400.0,2015.0
100+,CA,male,1.0,2015.0
70-74,CA,male,650.0,2015.0
95-99,CA,female,42.0,2015.0
100+,CA,female,7.0,2015.0
90-94,CA,male,68.0,2015.0
50-54,CA,female,1391.0,2015.0
0-4,CA,male,1045.0,2015.0
5-9,CA,male,1011.0,2015.0
70-74,CA,female,715.0,2015.0
10-14,CA,male,983.0,2015.0
55-59,CA,female,1305.0,2015.0
45-49,CA,female,1217.0,2015.0
20-24,CA,female,1159.0,2015.0
15-19,CA,male,1054.0,2015.0
20-24,CA,male,1203.0,2015.0
65-69,CA,male,914.0,2015.0
40-44,CA,female,1169.0,2015.0
90-94,CA,female,153.0,2015.0
65-69,CA,female,973.0,2015.0
60-64,CA,male,1094.0,2015.0
85-89,CA,male,183.0,2015.0
25-29,CA,male,1273.0,2015.0
5-9,CA,female,960.0,2015.0
80-84,CA,female,422.0,2015.0
30-34,CA,male,1262.0,2015.0
10-14,CA,female,929.0,2015.0
0-4,CA,female,991.0,2015.0
35-39,CA,male,1189.0,2015.0
15-19,CA,female,1009.0,2015.0
80-84,CA,male,318.0,2015.0
70-74,DE,male,2099.0,2015.0
80-84,DE,female,1486.0,2015.0
60-64,DE,female,2799.0,2015.0
100+,DE,male,3.0,2015.0
5-9,DE,female,1690.0,2015.0
75-79,DE,female,2421.0,2015.0
70-74,DE,female,2362.0,2015.0
65-69,DE,female,2134.0,2015.0
85-89,DE,male,517.0,2015.0
25-29,DE,female,2495.0,2015.0
60-64,DE,male,2575.0,2015.0
0-4,DE,female,1713.0,2015.0
55-59,DE,male,2983.0,2015.0
50-54,DE,male,3614.0,2015.0
30-34,DE,female,2571.0,2015.0
45-49,DE,male,3525.0,2015.0
40-44,DE,male,2633.0,2015.0
35-39,DE,female,2402.0,2015.0
65-69,DE,male,2003.0,2015.0
35-39,DE,male,2448.0,2015.0
20-24,DE,female,2161.0,2015.0
30-34,DE,male,2627.0,2015.0
95-99,DE,male,17.0,2015.0
40-44,DE,female,2559.0,2015.0
75-79,DE,male,1932.0,2015.0
25-29,DE,male,2593.0,2015.0
10-14,DE,female,1800.0,2015.0
20-24,DE,male,2266.0,2015.0
15-19,DE,male,2076.0,2015.0
45-49,DE,female,3357.0,2015.0
10-14,DE,male,1892.0,2015.0
5-9,DE,male,1781.0,2015.0
50-54,DE,female,3513.0,2015.0
0-4,DE,male,1811.0,2015.0
15-19,DE,female,1974.0,2015.0
100+,DE,female,14.0,2015.0
90-94,DE,male,154.0,2015.0
80-84,DE,male,1016.0,2015.0
95-99,DE,female,71.0,2015.0
90-94,DE,female,491.0,2015.0
55-59,DE,female,2996.0,2015.0
85-89,DE,female,988.0,2015.0
95-99,ES,male,22.0,2015.0
15-19,ES,female,1027.0,2015.0
85-89,ES,male,306.0,2015.0
75-79,ES,male,688.0,2015.0
40-44,ES,female,1924.0,2015.0
100+,ES,male,3.0,2015.0
90-94,ES,male,112.0,2015.0
45-49,ES,male,1909.0,2015.0
15-19,ES,male,1087.0,2015.0
0-4,ES,female,1198.0,2015.0
70-74,ES,female,1040.0,2015.0
5-9,ES,male,1307.0,2015.0
10-14,ES,female,1124.0,2015.0
30-34,ES,male,1748.0,2015.0
65-69,ES,female,1251.0,2015.0
95-99,ES,female,64.0,2015.0
80-84,ES,female,843.0,2015.0
55-59,ES,male,1479.0,2015.0
10-14,ES,male,1189.0,2015.0
50-54,ES,female,1733.0,2015.0
20-24,ES,female,1106.0,2015.0
40-44,ES,male,2052.0,2015.0
70-74,ES,male,880.0,2015.0
30-34,ES,female,1646.0,2015.0
35-39,ES,male,2117.0,2015.0
75-79,ES,female,906.0,2015.0
25-29,ES,female,1280.0,2015.0
5-9,ES,female,1235.0,2015.0
60-64,ES,male,1235.0,2015.0
55-59,ES,female,1531.0,2015.0
25-29,ES,male,1347.0,2015.0
85-89,ES,female,563.0,2015.0
65-69,ES,male,1119.0,2015.0
100+,ES,female,9.0,2015.0
90-94,ES,female,256.0,2015.0
35-39,ES,female,1966.0,2015.0
80-84,ES,male,559.0,2015.0
0-4,ES,male,1272.0,2015.0
60-64,ES,female,1316.0,2015.0
50-54,ES,male,1727.0,2015.0
20-24,ES,male,1175.0,2015.0
45-49,ES,female,1852.0,2015.0
40-44,FR,male,2212.0,2015.0
60-64,FR,male,1891.0,2015.0
90-94,FR,male,164.0,2015.0
35-39,FR,male,1842.0,2015.0
100+,FR,male,3.0,2015.0
75-79,FR,female,1223.0,2015.0
25-29,FR,female,2041.0,2015.0
70-74,FR,male,1103.0,2015.0
30-34,FR,male,2035.0,2015.0
85-89,FR,male,405.0,2015.0
55-59,FR,female,2125.0,2015.0
90-94,FR,female,465.0,2015.0
25-29,FR,male,2081.0,2015.0
95-99,FR,male,20.0,2015.0
75-79,FR,male,918.0,2015.0
20-24,FR,male,2040.0,2015.0
10-14,FR,female,1894.0,2015.0
65-69,FR,female,1985.0,2015.0
45-49,FR,female,2220.0,2015.0
15-19,FR,male,2016.0,2015.0
70-74,FR,female,1317.0,2015.0
80-84,FR,male,712.0,2015.0
10-14,FR,male,1985.0,2015.0
80-84,FR,female,1137.0,2015.0
5-9,FR,male,1992.0,2015.0
50-54,FR,female,2234.0,2015.0
5-9,FR,female,1914.0,2015.0
60-64,FR,female,2065.0,2015.0
0-4,FR,male,2035.0,2015.0
40-44,FR,female,2231.0,2015.0
15-19,FR,female,1916.0,2015.0
85-89,FR,female,838.0,2015.0
100+,FR,female,19.0,2015.0
0-4,FR,female,1938.0,2015.0
20-24,FR,female,1947.0,2015.0
55-59,FR,male,1939.0,2015.0
30-34,FR,female,2046.0,2015.0
50-54,FR,male,2123.0,2015.0
95-99,FR,female,82.0,2015.0
45-49,FR,male,2194.0,2015.0
65-69,FR,male,1780.0,2015.0
35-39,FR,female,1856.0,2015.0
10-14,GB,female,1690.0,2015.0
35-39,GB,male,1979.0,2015.0
65-69,GB,female,1858.0,2015.0
60-64,GB,male,1693.0,2015.0
10-14,GB,male,1771.0,2015.0
95-99,GB,female,81.0,2015.0
25-29,GB,male,2213.0,2015.0
5-9,GB,female,1913.0,2015.0
40-44,GB,male,2101.0,2015.0
100+,GB,female,13.0,2015.0
70-74,GB,female,1422.0,2015.0
60-64,GB,female,1775.0,2015.0
85-89,GB,female,602.0,2015.0
30-34,GB,male,2190.0,2015.0
65-69,GB,male,1735.0,2015.0
55-59,GB,male,1925.0,2015.0
80-84,GB,female,896.0,2015.0
100+,GB,male,3.0,2015.0
45-49,GB,male,2301.0,2015.0
35-39,GB,female,1964.0,2015.0
55-59,GB,female,1991.0,2015.0
85-89,GB,male,365.0,2015.0
40-44,GB,female,2147.0,2015.0
95-99,GB,male,29.0,2015.0
50-54,GB,female,2306.0,2015.0
0-4,GB,female,1888.0,2015.0
25-29,GB,female,2122.0,2015.0
20-24,GB,female,1957.0,2015.0
15-19,GB,male,1864.0,2015.0
50-54,GB,male,2220.0,2015.0
90-94,GB,female,310.0,2015.0
5-9,GB,male,2007.0,2015.0
20-24,GB,male,2061.0,2015.0
75-79,GB,male,978.0,2015.0
15-19,GB,female,1783.0,2015.0
80-84,GB,male,661.0,2015.0
70-74,GB,male,1273.0,2015.0
30-34,GB,female,2112.0,2015.0
45-49,GB,female,2349.0,2015.0
75-79,GB,female,1166.0,2015.0
90-94,GB,male,145.0,2015.0
0-4,GB,male,1981.0,2015.0
20-24,IT,female,1514.0,2015.0
65-69,IT,male,1716.0,2015.0
100+,IT,male,3.0,2015.0
95-99,IT,male,22.0,2015.0
90-94,IT,male,164.0,2015.0
85-89,IT,male,440.0,2015.0
60-64,IT,male,1749.0,2015.0
55-59,IT,male,1976.0,2015.0
15-19,IT,female,1411.0,2015.0
80-84,IT,male,808.0,2015.0
50-54,IT,male,2322.0,2015.0
45-49,IT,male,2476.0,2015.0
40-44,IT,male,2428.0,2015.0
35-39,IT,male,2117.0,2015.0
30-34,IT,male,1814.0,2015.0
25-29,IT,male,1673.0,2015.0
20-24,IT,male,1601.0,2015.0
15-19,IT,male,1493.0,2015.0
10-14,IT,male,1468.0,2015.0
75-79,IT,male,1191.0,2015.0
5-9,IT,male,1473.0,2015.0
10-14,IT,female,1388.0,2015.0
0-4,IT,male,1468.0,2015.0
100+,IT,female,15.0,2015.0
0-4,IT,female,1383.0,2015.0
95-99,IT,female,79.0,2015.0
90-94,IT,female,436.0,2015.0
85-89,IT,female,855.0,2015.0
80-84,IT,female,1231.0,2015.0
75-79,IT,female,1534.0,2015.0
70-74,IT,female,1567.0,2015.0
65-69,IT,female,1893.0,2015.0
70-74,IT,male,1338.0,2015.0
60-64,IT,female,1880.0,2015.0
5-9,IT,female,1395.0,2015.0
55-59,IT,female,2069.0,2015.0
50-54,IT,female,2373.0,2015.0
45-49,IT,female,2480.0,2015.0
40-44,IT,female,2411.0,2015.0
35-39,IT,female,2090.0,2015.0
30-34,IT,female,1791.0,2015.0
25-29,IT,female,1610.0,2015.0
60-64,NL,female,524.0,2015.0
80-84,NL,female,231.0,2015.0
5-9,NL,female,450.0,2015.0
90-94,NL,male,25.0,2015.0
85-89,NL,female,151.0,2015.0
70-74,NL,male,351.0,2015.0
55-59,NL,female,580.0,2015.0
90-94,NL,female,69.0,2015.0
95-99,NL,female,15.0,2015.0
100+,NL,female,2.0,2015.0
50-54,NL,female,636.0,2015.0
0-4,NL,male,462.0,2015.0
5-9,NL,male,473.0,2015.0
10-14,NL,male,517.0,2015.0
45-49,NL,female,637.0,2015.0
15-19,NL,male,510.0,2015.0
20-24,NL,female,504.0,2015.0
20-24,NL,male,527.0,2015.0
65-69,NL,male,518.0,2015.0
25-29,NL,male,518.0,2015.0
40-44,NL,female,583.0,2015.0
10-14,NL,female,493.0,2015.0
30-34,NL,male,503.0,2015.0
95-99,NL,male,4.0,2015.0
75-79,NL,male,246.0,2015.0
35-39,NL,male,490.0,2015.0
80-84,NL,male,158.0,2015.0
35-39,NL,female,490.0,2015.0
40-44,NL,male,582.0,2015.0
15-19,NL,female,484.0,2015.0
0-4,NL,female,438.0,2015.0
45-49,NL,male,650.0,2015.0
30-34,NL,female,497.0,2015.0
50-54,NL,male,646.0,2015.0
100+,NL,male,0.0,2015.0
55-59,NL,male,581.0,2015.0
25-29,NL,female,505.0,2015.0
60-64,NL,male,523.0,2015.0
85-89,NL,male,78.0,2015.0
65-69,NL,female,527.0,2015.0
70-74,NL,female,375.0,2015.0
75-79,NL,female,295.0,2015.0
80-84,PT,female,194.0,2015.0
40-44,PT,female,418.0,2015.0
65-69,PT,female,313.0,2015.0
60-64,PT,female,341.0,2015.0
90-94,PT,male,19.0,2015.0
85-89,PT,female,115.0,2015.0
80-84,PT,male,122.0,2015.0
35-39,PT,male,419.0,2015.0
15-19,PT,female,266.0,2015.0
60-64,PT,male,308.0,2015.0
90-94,PT,female,45.0,2015.0
25-29,PT,female,299.0,2015.0
95-99,PT,female,9.0,2015.0
55-59,PT,female,366.0,2015.0
65-69,PT,male,267.0,2015.0
40-44,PT,male,416.0,2015.0
100+,PT,female,1.0,2015.0
75-79,PT,female,244.0,2015.0
0-4,PT,male,239.0,2015.0
85-89,PT,male,59.0,2015.0
35-39,PT,female,414.0,2015.0
5-9,PT,male,264.0,2015.0
100+,PT,male,0.0,2015.0
50-54,PT,female,397.0,2015.0
95-99,PT,male,3.0,2015.0
10-14,PT,female,269.0,2015.0
10-14,PT,male,285.0,2015.0
0-4,PT,female,225.0,2015.0
75-79,PT,male,177.0,2015.0
5-9,PT,female,250.0,2015.0
15-19,PT,male,277.0,2015.0
45-49,PT,male,386.0,2015.0
20-24,PT,male,285.0,2015.0
45-49,PT,female,395.0,2015.0
20-24,PT,female,275.0,2015.0
70-74,PT,male,214.0,2015.0
25-29,PT,male,309.0,2015.0
50-54,PT,male,378.0,2015.0
70-74,PT,female,270.0,2015.0
55-59,PT,male,343.0,2015.0
30-34,PT,female,362.0,2015.0
30-34,PT,male,371.0,2015.0
90-94,US,female,1193.0,2015.0
75-79,US,male,3641.0,2015.0
70-74,US,male,5278.0,2015.0
65-69,US,male,7561.0,2015.0
60-64,US,male,9217.0,2015.0
55-59,US,male,10689.0,2015.0
50-54,US,male,11013.0,2015.0
45-49,US,male,10454.0,2015.0
40-44,US,male,10159.0,2015.0
10-14,US,female,10346.0,2015.0
35-39,US,male,10329.0,2015.0
30-34,US,male,10984.0,2015.0
25-29,US,male,11385.0,2015.0
20-24,US,male,11601.0,2015.0
15-19,US,male,11025.0,2015.0
10-14,US,male,10771.0,2015.0
5-9,US,male,10632.0,2015.0
0-4,US,male,10788.0,2015.0
100+,US,female,61.0,2015.0
95-99,US,female,361.0,2015.0
5-9,US,female,10201.0,2015.0
85-89,US,female,2459.0,2015.0
80-84,US,female,3394.0,2015.0
75-79,US,female,4532.0,2015.0
70-74,US,female,6179.0,2015.0
65-69,US,female,8483.0,2015.0
60-64,US,female,10004.0,2015.0
55-59,US,female,11264.0,2015.0
50-54,US,female,11413.0,2015.0
45-49,US,female,10659.0,2015.0
0-4,US,female,10306.0,2015.0
40-44,US,female,10308.0,2015.0
35-39,US,female,10352.0,2015.0
30-34,US,female,10863.0,2015.0
25-29,US,female,11011.0,2015.0
20-24,US,female,11094.0,2015.0
100+,US,male,13.0,2015.0
95-99,US,male,115.0,2015.0
90-94,US,male,541.0,2015.0
15-19,US,female,10570.0,2015.0
85-89,US,male,1441.0,2015.0
80-84,US,male,2442.0,2015.0
1 age_bucket country_destination gender population_in_thousands year
2 100+ AU male 1.0 2015.0
3 95-99 AU male 9.0 2015.0
4 90-94 AU male 47.0 2015.0
5 85-89 AU male 118.0 2015.0
6 80-84 AU male 199.0 2015.0
7 75-79 AU male 298.0 2015.0
8 70-74 AU male 415.0 2015.0
9 65-69 AU male 574.0 2015.0
10 60-64 AU male 636.0 2015.0
11 55-59 AU male 714.0 2015.0
12 50-54 AU male 778.0 2015.0
13 45-49 AU male 778.0 2015.0
14 40-44 AU male 820.0 2015.0
15 35-39 AU male 797.0 2015.0
16 30-34 AU male 881.0 2015.0
17 25-29 AU male 895.0 2015.0
18 20-24 AU male 820.0 2015.0
19 15-19 AU male 768.0 2015.0
20 10-14 AU male 743.0 2015.0
21 5-9 AU male 784.0 2015.0
22 0-4 AU male 824.0 2015.0
23 100+ AU female 4.0 2015.0
24 95-99 AU female 25.0 2015.0
25 90-94 AU female 94.0 2015.0
26 85-89 AU female 179.0 2015.0
27 80-84 AU female 252.0 2015.0
28 75-79 AU female 338.0 2015.0
29 70-74 AU female 438.0 2015.0
30 65-69 AU female 592.0 2015.0
31 60-64 AU female 660.0 2015.0
32 55-59 AU female 739.0 2015.0
33 50-54 AU female 798.0 2015.0
34 45-49 AU female 793.0 2015.0
35 40-44 AU female 838.0 2015.0
36 35-39 AU female 801.0 2015.0
37 30-34 AU female 865.0 2015.0
38 25-29 AU female 851.0 2015.0
39 20-24 AU female 787.0 2015.0
40 15-19 AU female 737.0 2015.0
41 10-14 AU female 707.0 2015.0
42 5-9 AU female 745.0 2015.0
43 0-4 AU female 781.0 2015.0
44 75-79 CA female 530.0 2015.0
45 75-79 CA male 446.0 2015.0
46 35-39 CA female 1192.0 2015.0
47 25-29 CA female 1220.0 2015.0
48 95-99 CA male 13.0 2015.0
49 40-44 CA male 1179.0 2015.0
50 30-34 CA female 1240.0 2015.0
51 60-64 CA female 1142.0 2015.0
52 55-59 CA male 1287.0 2015.0
53 45-49 CA male 1232.0 2015.0
54 85-89 CA female 300.0 2015.0
55 50-54 CA male 1400.0 2015.0
56 100+ CA male 1.0 2015.0
57 70-74 CA male 650.0 2015.0
58 95-99 CA female 42.0 2015.0
59 100+ CA female 7.0 2015.0
60 90-94 CA male 68.0 2015.0
61 50-54 CA female 1391.0 2015.0
62 0-4 CA male 1045.0 2015.0
63 5-9 CA male 1011.0 2015.0
64 70-74 CA female 715.0 2015.0
65 10-14 CA male 983.0 2015.0
66 55-59 CA female 1305.0 2015.0
67 45-49 CA female 1217.0 2015.0
68 20-24 CA female 1159.0 2015.0
69 15-19 CA male 1054.0 2015.0
70 20-24 CA male 1203.0 2015.0
71 65-69 CA male 914.0 2015.0
72 40-44 CA female 1169.0 2015.0
73 90-94 CA female 153.0 2015.0
74 65-69 CA female 973.0 2015.0
75 60-64 CA male 1094.0 2015.0
76 85-89 CA male 183.0 2015.0
77 25-29 CA male 1273.0 2015.0
78 5-9 CA female 960.0 2015.0
79 80-84 CA female 422.0 2015.0
80 30-34 CA male 1262.0 2015.0
81 10-14 CA female 929.0 2015.0
82 0-4 CA female 991.0 2015.0
83 35-39 CA male 1189.0 2015.0
84 15-19 CA female 1009.0 2015.0
85 80-84 CA male 318.0 2015.0
86 70-74 DE male 2099.0 2015.0
87 80-84 DE female 1486.0 2015.0
88 60-64 DE female 2799.0 2015.0
89 100+ DE male 3.0 2015.0
90 5-9 DE female 1690.0 2015.0
91 75-79 DE female 2421.0 2015.0
92 70-74 DE female 2362.0 2015.0
93 65-69 DE female 2134.0 2015.0
94 85-89 DE male 517.0 2015.0
95 25-29 DE female 2495.0 2015.0
96 60-64 DE male 2575.0 2015.0
97 0-4 DE female 1713.0 2015.0
98 55-59 DE male 2983.0 2015.0
99 50-54 DE male 3614.0 2015.0
100 30-34 DE female 2571.0 2015.0
101 45-49 DE male 3525.0 2015.0
102 40-44 DE male 2633.0 2015.0
103 35-39 DE female 2402.0 2015.0
104 65-69 DE male 2003.0 2015.0
105 35-39 DE male 2448.0 2015.0
106 20-24 DE female 2161.0 2015.0
107 30-34 DE male 2627.0 2015.0
108 95-99 DE male 17.0 2015.0
109 40-44 DE female 2559.0 2015.0
110 75-79 DE male 1932.0 2015.0
111 25-29 DE male 2593.0 2015.0
112 10-14 DE female 1800.0 2015.0
113 20-24 DE male 2266.0 2015.0
114 15-19 DE male 2076.0 2015.0
115 45-49 DE female 3357.0 2015.0
116 10-14 DE male 1892.0 2015.0
117 5-9 DE male 1781.0 2015.0
118 50-54 DE female 3513.0 2015.0
119 0-4 DE male 1811.0 2015.0
120 15-19 DE female 1974.0 2015.0
121 100+ DE female 14.0 2015.0
122 90-94 DE male 154.0 2015.0
123 80-84 DE male 1016.0 2015.0
124 95-99 DE female 71.0 2015.0
125 90-94 DE female 491.0 2015.0
126 55-59 DE female 2996.0 2015.0
127 85-89 DE female 988.0 2015.0
128 95-99 ES male 22.0 2015.0
129 15-19 ES female 1027.0 2015.0
130 85-89 ES male 306.0 2015.0
131 75-79 ES male 688.0 2015.0
132 40-44 ES female 1924.0 2015.0
133 100+ ES male 3.0 2015.0
134 90-94 ES male 112.0 2015.0
135 45-49 ES male 1909.0 2015.0
136 15-19 ES male 1087.0 2015.0
137 0-4 ES female 1198.0 2015.0
138 70-74 ES female 1040.0 2015.0
139 5-9 ES male 1307.0 2015.0
140 10-14 ES female 1124.0 2015.0
141 30-34 ES male 1748.0 2015.0
142 65-69 ES female 1251.0 2015.0
143 95-99 ES female 64.0 2015.0
144 80-84 ES female 843.0 2015.0
145 55-59 ES male 1479.0 2015.0
146 10-14 ES male 1189.0 2015.0
147 50-54 ES female 1733.0 2015.0
148 20-24 ES female 1106.0 2015.0
149 40-44 ES male 2052.0 2015.0
150 70-74 ES male 880.0 2015.0
151 30-34 ES female 1646.0 2015.0
152 35-39 ES male 2117.0 2015.0
153 75-79 ES female 906.0 2015.0
154 25-29 ES female 1280.0 2015.0
155 5-9 ES female 1235.0 2015.0
156 60-64 ES male 1235.0 2015.0
157 55-59 ES female 1531.0 2015.0
158 25-29 ES male 1347.0 2015.0
159 85-89 ES female 563.0 2015.0
160 65-69 ES male 1119.0 2015.0
161 100+ ES female 9.0 2015.0
162 90-94 ES female 256.0 2015.0
163 35-39 ES female 1966.0 2015.0
164 80-84 ES male 559.0 2015.0
165 0-4 ES male 1272.0 2015.0
166 60-64 ES female 1316.0 2015.0
167 50-54 ES male 1727.0 2015.0
168 20-24 ES male 1175.0 2015.0
169 45-49 ES female 1852.0 2015.0
170 40-44 FR male 2212.0 2015.0
171 60-64 FR male 1891.0 2015.0
172 90-94 FR male 164.0 2015.0
173 35-39 FR male 1842.0 2015.0
174 100+ FR male 3.0 2015.0
175 75-79 FR female 1223.0 2015.0
176 25-29 FR female 2041.0 2015.0
177 70-74 FR male 1103.0 2015.0
178 30-34 FR male 2035.0 2015.0
179 85-89 FR male 405.0 2015.0
180 55-59 FR female 2125.0 2015.0
181 90-94 FR female 465.0 2015.0
182 25-29 FR male 2081.0 2015.0
183 95-99 FR male 20.0 2015.0
184 75-79 FR male 918.0 2015.0
185 20-24 FR male 2040.0 2015.0
186 10-14 FR female 1894.0 2015.0
187 65-69 FR female 1985.0 2015.0
188 45-49 FR female 2220.0 2015.0
189 15-19 FR male 2016.0 2015.0
190 70-74 FR female 1317.0 2015.0
191 80-84 FR male 712.0 2015.0
192 10-14 FR male 1985.0 2015.0
193 80-84 FR female 1137.0 2015.0
194 5-9 FR male 1992.0 2015.0
195 50-54 FR female 2234.0 2015.0
196 5-9 FR female 1914.0 2015.0
197 60-64 FR female 2065.0 2015.0
198 0-4 FR male 2035.0 2015.0
199 40-44 FR female 2231.0 2015.0
200 15-19 FR female 1916.0 2015.0
201 85-89 FR female 838.0 2015.0
202 100+ FR female 19.0 2015.0
203 0-4 FR female 1938.0 2015.0
204 20-24 FR female 1947.0 2015.0
205 55-59 FR male 1939.0 2015.0
206 30-34 FR female 2046.0 2015.0
207 50-54 FR male 2123.0 2015.0
208 95-99 FR female 82.0 2015.0
209 45-49 FR male 2194.0 2015.0
210 65-69 FR male 1780.0 2015.0
211 35-39 FR female 1856.0 2015.0
212 10-14 GB female 1690.0 2015.0
213 35-39 GB male 1979.0 2015.0
214 65-69 GB female 1858.0 2015.0
215 60-64 GB male 1693.0 2015.0
216 10-14 GB male 1771.0 2015.0
217 95-99 GB female 81.0 2015.0
218 25-29 GB male 2213.0 2015.0
219 5-9 GB female 1913.0 2015.0
220 40-44 GB male 2101.0 2015.0
221 100+ GB female 13.0 2015.0
222 70-74 GB female 1422.0 2015.0
223 60-64 GB female 1775.0 2015.0
224 85-89 GB female 602.0 2015.0
225 30-34 GB male 2190.0 2015.0
226 65-69 GB male 1735.0 2015.0
227 55-59 GB male 1925.0 2015.0
228 80-84 GB female 896.0 2015.0
229 100+ GB male 3.0 2015.0
230 45-49 GB male 2301.0 2015.0
231 35-39 GB female 1964.0 2015.0
232 55-59 GB female 1991.0 2015.0
233 85-89 GB male 365.0 2015.0
234 40-44 GB female 2147.0 2015.0
235 95-99 GB male 29.0 2015.0
236 50-54 GB female 2306.0 2015.0
237 0-4 GB female 1888.0 2015.0
238 25-29 GB female 2122.0 2015.0
239 20-24 GB female 1957.0 2015.0
240 15-19 GB male 1864.0 2015.0
241 50-54 GB male 2220.0 2015.0
242 90-94 GB female 310.0 2015.0
243 5-9 GB male 2007.0 2015.0
244 20-24 GB male 2061.0 2015.0
245 75-79 GB male 978.0 2015.0
246 15-19 GB female 1783.0 2015.0
247 80-84 GB male 661.0 2015.0
248 70-74 GB male 1273.0 2015.0
249 30-34 GB female 2112.0 2015.0
250 45-49 GB female 2349.0 2015.0
251 75-79 GB female 1166.0 2015.0
252 90-94 GB male 145.0 2015.0
253 0-4 GB male 1981.0 2015.0
254 20-24 IT female 1514.0 2015.0
255 65-69 IT male 1716.0 2015.0
256 100+ IT male 3.0 2015.0
257 95-99 IT male 22.0 2015.0
258 90-94 IT male 164.0 2015.0
259 85-89 IT male 440.0 2015.0
260 60-64 IT male 1749.0 2015.0
261 55-59 IT male 1976.0 2015.0
262 15-19 IT female 1411.0 2015.0
263 80-84 IT male 808.0 2015.0
264 50-54 IT male 2322.0 2015.0
265 45-49 IT male 2476.0 2015.0
266 40-44 IT male 2428.0 2015.0
267 35-39 IT male 2117.0 2015.0
268 30-34 IT male 1814.0 2015.0
269 25-29 IT male 1673.0 2015.0
270 20-24 IT male 1601.0 2015.0
271 15-19 IT male 1493.0 2015.0
272 10-14 IT male 1468.0 2015.0
273 75-79 IT male 1191.0 2015.0
274 5-9 IT male 1473.0 2015.0
275 10-14 IT female 1388.0 2015.0
276 0-4 IT male 1468.0 2015.0
277 100+ IT female 15.0 2015.0
278 0-4 IT female 1383.0 2015.0
279 95-99 IT female 79.0 2015.0
280 90-94 IT female 436.0 2015.0
281 85-89 IT female 855.0 2015.0
282 80-84 IT female 1231.0 2015.0
283 75-79 IT female 1534.0 2015.0
284 70-74 IT female 1567.0 2015.0
285 65-69 IT female 1893.0 2015.0
286 70-74 IT male 1338.0 2015.0
287 60-64 IT female 1880.0 2015.0
288 5-9 IT female 1395.0 2015.0
289 55-59 IT female 2069.0 2015.0
290 50-54 IT female 2373.0 2015.0
291 45-49 IT female 2480.0 2015.0
292 40-44 IT female 2411.0 2015.0
293 35-39 IT female 2090.0 2015.0
294 30-34 IT female 1791.0 2015.0
295 25-29 IT female 1610.0 2015.0
296 60-64 NL female 524.0 2015.0
297 80-84 NL female 231.0 2015.0
298 5-9 NL female 450.0 2015.0
299 90-94 NL male 25.0 2015.0
300 85-89 NL female 151.0 2015.0
301 70-74 NL male 351.0 2015.0
302 55-59 NL female 580.0 2015.0
303 90-94 NL female 69.0 2015.0
304 95-99 NL female 15.0 2015.0
305 100+ NL female 2.0 2015.0
306 50-54 NL female 636.0 2015.0
307 0-4 NL male 462.0 2015.0
308 5-9 NL male 473.0 2015.0
309 10-14 NL male 517.0 2015.0
310 45-49 NL female 637.0 2015.0
311 15-19 NL male 510.0 2015.0
312 20-24 NL female 504.0 2015.0
313 20-24 NL male 527.0 2015.0
314 65-69 NL male 518.0 2015.0
315 25-29 NL male 518.0 2015.0
316 40-44 NL female 583.0 2015.0
317 10-14 NL female 493.0 2015.0
318 30-34 NL male 503.0 2015.0
319 95-99 NL male 4.0 2015.0
320 75-79 NL male 246.0 2015.0
321 35-39 NL male 490.0 2015.0
322 80-84 NL male 158.0 2015.0
323 35-39 NL female 490.0 2015.0
324 40-44 NL male 582.0 2015.0
325 15-19 NL female 484.0 2015.0
326 0-4 NL female 438.0 2015.0
327 45-49 NL male 650.0 2015.0
328 30-34 NL female 497.0 2015.0
329 50-54 NL male 646.0 2015.0
330 100+ NL male 0.0 2015.0
331 55-59 NL male 581.0 2015.0
332 25-29 NL female 505.0 2015.0
333 60-64 NL male 523.0 2015.0
334 85-89 NL male 78.0 2015.0
335 65-69 NL female 527.0 2015.0
336 70-74 NL female 375.0 2015.0
337 75-79 NL female 295.0 2015.0
338 80-84 PT female 194.0 2015.0
339 40-44 PT female 418.0 2015.0
340 65-69 PT female 313.0 2015.0
341 60-64 PT female 341.0 2015.0
342 90-94 PT male 19.0 2015.0
343 85-89 PT female 115.0 2015.0
344 80-84 PT male 122.0 2015.0
345 35-39 PT male 419.0 2015.0
346 15-19 PT female 266.0 2015.0
347 60-64 PT male 308.0 2015.0
348 90-94 PT female 45.0 2015.0
349 25-29 PT female 299.0 2015.0
350 95-99 PT female 9.0 2015.0
351 55-59 PT female 366.0 2015.0
352 65-69 PT male 267.0 2015.0
353 40-44 PT male 416.0 2015.0
354 100+ PT female 1.0 2015.0
355 75-79 PT female 244.0 2015.0
356 0-4 PT male 239.0 2015.0
357 85-89 PT male 59.0 2015.0
358 35-39 PT female 414.0 2015.0
359 5-9 PT male 264.0 2015.0
360 100+ PT male 0.0 2015.0
361 50-54 PT female 397.0 2015.0
362 95-99 PT male 3.0 2015.0
363 10-14 PT female 269.0 2015.0
364 10-14 PT male 285.0 2015.0
365 0-4 PT female 225.0 2015.0
366 75-79 PT male 177.0 2015.0
367 5-9 PT female 250.0 2015.0
368 15-19 PT male 277.0 2015.0
369 45-49 PT male 386.0 2015.0
370 20-24 PT male 285.0 2015.0
371 45-49 PT female 395.0 2015.0
372 20-24 PT female 275.0 2015.0
373 70-74 PT male 214.0 2015.0
374 25-29 PT male 309.0 2015.0
375 50-54 PT male 378.0 2015.0
376 70-74 PT female 270.0 2015.0
377 55-59 PT male 343.0 2015.0
378 30-34 PT female 362.0 2015.0
379 30-34 PT male 371.0 2015.0
380 90-94 US female 1193.0 2015.0
381 75-79 US male 3641.0 2015.0
382 70-74 US male 5278.0 2015.0
383 65-69 US male 7561.0 2015.0
384 60-64 US male 9217.0 2015.0
385 55-59 US male 10689.0 2015.0
386 50-54 US male 11013.0 2015.0
387 45-49 US male 10454.0 2015.0
388 40-44 US male 10159.0 2015.0
389 10-14 US female 10346.0 2015.0
390 35-39 US male 10329.0 2015.0
391 30-34 US male 10984.0 2015.0
392 25-29 US male 11385.0 2015.0
393 20-24 US male 11601.0 2015.0
394 15-19 US male 11025.0 2015.0
395 10-14 US male 10771.0 2015.0
396 5-9 US male 10632.0 2015.0
397 0-4 US male 10788.0 2015.0
398 100+ US female 61.0 2015.0
399 95-99 US female 361.0 2015.0
400 5-9 US female 10201.0 2015.0
401 85-89 US female 2459.0 2015.0
402 80-84 US female 3394.0 2015.0
403 75-79 US female 4532.0 2015.0
404 70-74 US female 6179.0 2015.0
405 65-69 US female 8483.0 2015.0
406 60-64 US female 10004.0 2015.0
407 55-59 US female 11264.0 2015.0
408 50-54 US female 11413.0 2015.0
409 45-49 US female 10659.0 2015.0
410 0-4 US female 10306.0 2015.0
411 40-44 US female 10308.0 2015.0
412 35-39 US female 10352.0 2015.0
413 30-34 US female 10863.0 2015.0
414 25-29 US female 11011.0 2015.0
415 20-24 US female 11094.0 2015.0
416 100+ US male 13.0 2015.0
417 95-99 US male 115.0 2015.0
418 90-94 US male 541.0 2015.0
419 15-19 US female 10570.0 2015.0
420 85-89 US male 1441.0 2015.0
421 80-84 US male 2442.0 2015.0

11
data/countries.csv Normal file
View File

@ -0,0 +1,11 @@
country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language ,language_levenshtein_distance
AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
ES,39.896027,-2.4876945,7730.724,505370.0,spa,92.25
FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
GB,54.63322,-3.4322774,6883.659,243610.0,eng,0.0
IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
US,36.966427,-95.84403,0.0,9826675.0,eng,0.0
1 country_destination lat_destination lng_destination distance_km destination_km2 destination_language language_levenshtein_distance
2 AU -26.853388 133.27516 15297.744 7741220.0 eng 0.0
3 CA 62.393303 -96.818146 2828.1333 9984670.0 eng 0.0
4 DE 51.165707 10.452764 7879.568 357022.0 deu 72.61
5 ES 39.896027 -2.4876945 7730.724 505370.0 spa 92.25
6 FR 46.232193 2.209667 7682.945 643801.0 fra 92.06
7 GB 54.63322 -3.4322774 6883.659 243610.0 eng 0.0
8 IT 41.87399 12.564167 8636.631 301340.0 ita 89.4
9 NL 52.133057 5.29525 7524.3203 41543.0 nld 63.22
10 PT 39.553444 -7.839319 7355.2534 92090.0 por 95.45
11 US 36.966427 -95.84403 0.0 9826675.0 eng 0.0

35
data/data.info Normal file
View File

@ -0,0 +1,35 @@
age_gender_bkts.csv
Columns: age_bucket,country_destination,gender,population_in_thousands,year
age_bucket: Five-year age ranges from 0-4 through 95-99 (literal range strings, e.g. 90-94), plus 100+
country_destination: Two letter country code out of ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'US']
gender: male or female or unknown
population_in_thousands: float
year: float
countries.csv
Columns: country_destination, lat_destination, lng_destination, distance_km, destination_km2, destination_language, language_levenshtein_distance
country_destination: Two letter country code
lat_destination: float
lng_destination: float
distance_km: float
destination_km2: float
destination_language: 3 letter code ['eng', 'deu', 'spa', 'fra', etc.] (note: the CSV header for this column has a trailing space, i.e. 'destination_language ')
language_levenshtein_distance: float
sample_submission_NDF.csv (This is what the output should look like)
Columns: id, country
id: user id, string
country: most probable country they will visit (country code)
sessions.csv
Columns: user_id, action, action_type, action_detail, device_type, secs_elapsed
user_id: string
action: string ['lookup', 'search_results', 'index', etc.]
action_type: nullable string ['click', 'data', etc.]
action_detail: nullable string. Either a description of the action or possibly a function name (unconfirmed)
device_type: string
secs_elapsed: float
test_users.csv
Columns: id, date_account_created, timestamp_first_active, date_first_booking, gender, age, signup_method, signup_flow, language, affiliate_channel, affiliate_provider, first_affiliate_tracked, signup_app, first_device_type, first_browser

62097
data/sample_submission_NDF.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
data/sessions.csv Normal file

Binary file not shown.
Can't render this file because it is too large.

62097
data/test_users.csv Normal file

File diff suppressed because it is too large Load Diff

213452
data/train_users_2.csv Normal file

File diff suppressed because it is too large Load Diff

7
requirements.txt Normal file
View File

@ -0,0 +1,7 @@
pandas
scikit-learn
numpy<2
matplotlib
tensorflow
keras
lightgbm