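"""Prepare train/test matrices for the booking-destination model.

Loads the raw CSVs, cleans dates, age, and categorical attributes,
aggregates per-user session statistics, label-encodes the categorical
features, and saves the resulting arrays plus the fitted encoders to disk.
"""
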
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle


def load_data():
    """Load all CSV files"""
    print("Loading data...")
    train_users = pd.read_csv('../data/train_users_2.csv')
    test_users = pd.read_csv('../data/test_users.csv')
    sessions = pd.read_csv('../data/sessions.csv')
    countries = pd.read_csv('../data/countries.csv')
    age_gender = pd.read_csv('../data/age_gender_bkts.csv')

    return train_users, test_users, sessions, countries, age_gender


def preprocess_users(df, is_train=True):
    """Preprocess user data"""
    print(f"Preprocessing {'train' if is_train else 'test'} users...")

    # Work on a copy so the caller's dataframe is untouched
    df = df.copy()

    # Parse date columns
    df['date_account_created'] = pd.to_datetime(df['date_account_created'])
    df['timestamp_first_active'] = pd.to_datetime(
        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')

    # Extract date components
    df['dac_year'] = df['date_account_created'].dt.year
    df['dac_month'] = df['date_account_created'].dt.month
    df['dac_day'] = df['date_account_created'].dt.day
    df['dac_weekday'] = df['date_account_created'].dt.weekday

    df['tfa_year'] = df['timestamp_first_active'].dt.year
    df['tfa_month'] = df['timestamp_first_active'].dt.month
    df['tfa_day'] = df['timestamp_first_active'].dt.day

    # NOTE: date_first_booking leaks the target (it exists only if the user
    # actually made a booking), so drop it rather than extract features from it.
    if 'date_first_booking' in df.columns:
        df = df.drop('date_first_booking', axis=1)

    # Drop the original date columns now that their components are extracted
    df = df.drop(['date_account_created', 'timestamp_first_active'], axis=1)

    # Clean age: mark missing and implausible values with a -1 sentinel
    df['age'] = df['age'].fillna(-1)
    df.loc[(df['age'] < 18) | (df['age'] > 100), 'age'] = -1

    # Handle gender
    df['gender'] = df['gender'].fillna('-unknown-')

    # Fill categorical NaNs with 'unknown'. signup_flow is numeric, so it is
    # deliberately excluded here: filling it with a string would break the
    # astype(int) cast in encode_features.
    categorical_cols = ['signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked',
                        'signup_app', 'first_device_type', 'first_browser']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna('unknown')

    return df
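
# A note on the -1 sentinel above: tree-based models split on it like any
# other value, so missing age simply becomes its own branch; no imputation
# or indicator column is needed for the LightGBM model this script feeds.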


def aggregate_sessions(sessions):
    """Aggregate session data per user"""
    print("Aggregating session data...")

    if sessions.empty:
        return pd.DataFrame()

    # Fill categorical NaNs with an 'unknown' sentinel so the per-user mode
    # below always has a value; secs_elapsed is left as NaN so its
    # sum/mean/max/min are computed over real durations only (pandas skips NaN).
    sessions = sessions.copy()
    cat_cols = ['action', 'action_type', 'action_detail', 'device_type']
    sessions[cat_cols] = sessions[cat_cols].fillna('unknown')

    # Aggregate session features per user: an action count, the most common
    # value of each categorical column, and duration statistics
    session_agg = sessions.groupby('user_id').agg({
        'action': 'count',
        'action_type': lambda x: x.value_counts().index[0],
        'action_detail': lambda x: x.value_counts().index[0],
        'device_type': lambda x: x.value_counts().index[0],
        'secs_elapsed': ['sum', 'mean', 'max', 'min']
    }).reset_index()

    # Flatten the MultiIndex columns produced by the mixed aggregation
    session_agg.columns = ['user_id', 'num_actions', 'most_common_action_type',
                           'most_common_action_detail', 'most_common_device_type',
                           'total_secs', 'mean_secs', 'max_secs', 'min_secs']

    return session_agg
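
# The lambda-based per-group mode above is simple but slow on a large
# sessions file. A faster alternative (a sketch, not used above) counts
# (user, value) pairs once and keeps the most frequent value per user:
#
#     def fast_mode(df, key, col):
#         counts = df.groupby([key, col]).size().reset_index(name='n')
#         counts = counts.sort_values('n').drop_duplicates(key, keep='last')
#         return counts.set_index(key)[col]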


def encode_features(train_df, test_df, label_encoders=None):
    """Encode categorical features"""
    print("Encoding features...")

    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']

    # Add session categorical columns if they exist
    if 'most_common_action_type' in train_df.columns:
        categorical_cols.extend(['most_common_action_type', 'most_common_action_detail',
                                 'most_common_device_type'])

    if label_encoders is None:
        label_encoders = {}
        for col in categorical_cols:
            if col in train_df.columns:
                le = LabelEncoder()
                # Fit on combined train and test so every category is seen
                combined = pd.concat([train_df[col], test_df[col]]).astype(str)
                le.fit(combined)
                label_encoders[col] = le

    # Transform
    for col in categorical_cols:
        if col in train_df.columns:
            train_df[col] = label_encoders[col].transform(train_df[col].astype(str))
            test_df[col] = label_encoders[col].transform(test_df[col].astype(str))

    # signup_flow is already numeric; just make sure it is an int
    if 'signup_flow' in train_df.columns:
        train_df['signup_flow'] = train_df['signup_flow'].astype(int)
        test_df['signup_flow'] = test_df['signup_flow'].astype(int)

    return train_df, test_df, label_encoders
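
# Because the encoders are fit on train and test jointly, transform() never
# meets an unseen label here. If the pickled encoders are later reused on new
# data, note that LabelEncoder.transform raises ValueError on unseen
# categories, so new data needs the same handling or a fallback bucket.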


def prepare_datasets():
    """Main function to prepare train and test datasets"""
    # Load data
    train_users, test_users, sessions, countries, age_gender = load_data()

    # Store IDs and target
    train_ids = train_users['id']
    test_ids = test_users['id']
    target = train_users['country_destination']

    # Preprocess users
    train_users = preprocess_users(train_users, is_train=True)
    test_users = preprocess_users(test_users, is_train=False)

    # Aggregate sessions
    session_agg = aggregate_sessions(sessions)

    # Merge session data if available
    if not session_agg.empty:
        train_users = train_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
        test_users = test_users.merge(session_agg, left_on='id', right_on='user_id', how='left')

    # Drop the user_id column introduced by the merge
    if 'user_id' in train_users.columns:
        train_users = train_users.drop('user_id', axis=1)
        test_users = test_users.drop('user_id', axis=1)

    # Users absent from the sessions file get NaN from the left merge;
    # fill numeric session features with 0 ...
    session_cols = ['num_actions', 'total_secs', 'mean_secs', 'max_secs', 'min_secs']
    for col in session_cols:
        if col in train_users.columns:
            train_users[col] = train_users[col].fillna(0)
            test_users[col] = test_users[col].fillna(0)

    # ... and categorical session features with 'unknown'
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
    for col in session_cat_cols:
        if col in train_users.columns:
            train_users[col] = train_users[col].fillna('unknown')
            test_users[col] = test_users[col].fillna('unknown')

    # Drop ID and target from the feature frames
    train_users = train_users.drop(['id', 'country_destination'], axis=1)
    test_users = test_users.drop(['id'], axis=1)

    # Encode categorical features
    train_users, test_users, label_encoders = encode_features(train_users, test_users)

    # Fill any remaining NaN values
    print("Checking for NaN values...")
    print(f"Train NaN count: {train_users.isna().sum().sum()}")
    print(f"Test NaN count: {test_users.isna().sum().sum()}")

    if train_users.isna().any().any():
        print("Warning: Found NaN values in train data. Filling with 0...")
        train_users = train_users.fillna(0)

    if test_users.isna().any().any():
        print("Warning: Found NaN values in test data. Filling with 0...")
        test_users = test_users.fillna(0)

    # Encode target variable
    target_encoder = LabelEncoder()
    target_encoded = target_encoder.fit_transform(target)

    # Save encoders
    with open('label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)

    with open('target_encoder.pkl', 'wb') as f:
        pickle.dump(target_encoder, f)

    # Determine categorical columns (must match those encoded in encode_features)
    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']

    # Add session categorical columns if they exist
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
    for col in session_cat_cols:
        if col in train_users.columns:
            categorical_cols.append(col)

    # Keep only the categorical columns that actually exist in the dataframe
    categorical_cols = [c for c in categorical_cols if c in train_users.columns]

    # LightGBM is tree-based, so no scaling is needed; keep all features
    # as-is (numeric and label-encoded categorical)
    feature_names = list(train_users.columns)

    # Save feature metadata
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)

    with open('categorical_features.pkl', 'wb') as f:
        pickle.dump(categorical_cols, f)

    # Convert to numpy arrays
    X_train = train_users.values
    X_test = test_users.values

    print(f"Train shape: {X_train.shape}")
    print(f"Test shape: {X_test.shape}")
    print(f"Number of classes: {len(target_encoder.classes_)}")
    print(f"Classes: {target_encoder.classes_}")

    return X_train, target_encoded, X_test, test_ids, target_encoder


if __name__ == '__main__':
    X_train, y_train, X_test, test_ids, target_encoder = prepare_datasets()

    # Save preprocessed data
    np.save('X_train.npy', X_train)
    np.save('y_train.npy', y_train)
    np.save('X_test.npy', X_test)
    # Save IDs as a fixed-width string array so np.load can read them back
    # without allow_pickle=True
    np.save('test_ids.npy', test_ids.to_numpy().astype(str))

    print("\nData preprocessing completed successfully!")