import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def load_data():
    """Load all CSV files."""
    print("Loading data...")
    train_users = pd.read_csv('../data/train_users_2.csv')
    test_users = pd.read_csv('../data/test_users.csv')
    sessions = pd.read_csv('../data/sessions.csv')
    countries = pd.read_csv('../data/countries.csv')
    age_gender = pd.read_csv('../data/age_gender_bkts.csv')
    return train_users, test_users, sessions, countries, age_gender


def preprocess_users(df, is_train=True):
    """Preprocess user data."""
    print(f"Preprocessing {'train' if is_train else 'test'} users...")

    # Work on a copy so the caller's dataframe is untouched
    df = df.copy()

    # Parse date features
    df['date_account_created'] = pd.to_datetime(df['date_account_created'])
    df['timestamp_first_active'] = pd.to_datetime(
        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')

    # Extract date components
    df['dac_year'] = df['date_account_created'].dt.year
    df['dac_month'] = df['date_account_created'].dt.month
    df['dac_day'] = df['date_account_created'].dt.day
    df['dac_weekday'] = df['date_account_created'].dt.weekday
    df['tfa_year'] = df['timestamp_first_active'].dt.year
    df['tfa_month'] = df['timestamp_first_active'].dt.month
    df['tfa_day'] = df['timestamp_first_active'].dt.day

    # NOTE: date_first_booking leaks the target (it is populated only when a
    # user actually made a booking), so drop it instead of extracting features.
    if 'date_first_booking' in df.columns:
        df = df.drop('date_first_booking', axis=1)

    # Drop the original date columns
    df = df.drop(['date_account_created', 'timestamp_first_active'], axis=1)

    # Clean age: mark missing and implausible values with -1
    df['age'] = df['age'].fillna(-1)
    df.loc[(df['age'] < 18) | (df['age'] > 100), 'age'] = -1

    # Handle gender (the dataset already uses '-unknown-' as its own sentinel)
    df['gender'] = df['gender'].fillna('-unknown-')

    # Fill remaining categorical NaNs with 'unknown'. signup_flow is excluded:
    # it is numeric in the CSVs and is cast to int in encode_features, so a
    # string fill there would break the cast.
    categorical_cols = ['signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked',
                        'signup_app', 'first_device_type', 'first_browser']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna('unknown')

    return df


def aggregate_sessions(sessions):
    """Aggregate session data per user."""
    print("Aggregating session data...")

    if sessions.empty:
        return pd.DataFrame()

    # Fill NaNs on a copy so the caller's dataframe is untouched: sentinel
    # strings for the categorical columns (so the mode is never a numeric
    # placeholder) and 0 for elapsed seconds
    sessions = sessions.copy()
    for col in ['action', 'action_type', 'action_detail', 'device_type']:
        sessions[col] = sessions[col].fillna('unknown')
    sessions['secs_elapsed'] = sessions['secs_elapsed'].fillna(0)

    # Aggregate session features per user; groupby groups are never empty,
    # so the mode lookup is safe
    session_agg = sessions.groupby('user_id').agg({
        'action': 'count',
        'action_type': lambda x: x.value_counts().index[0],
        'action_detail': lambda x: x.value_counts().index[0],
        'device_type': lambda x: x.value_counts().index[0],
        'secs_elapsed': ['sum', 'mean', 'max', 'min'],
    }).reset_index()

    # Flatten the MultiIndex column names produced by the dict-of-lists agg
    session_agg.columns = ['user_id', 'num_actions', 'most_common_action_type',
                           'most_common_action_detail', 'most_common_device_type',
                           'total_secs', 'mean_secs', 'max_secs', 'min_secs']

    return session_agg


def encode_features(train_df, test_df, label_encoders=None):
    """Encode categorical features."""
    print("Encoding features...")

    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked',
                        'signup_app', 'first_device_type', 'first_browser']

    # Add session categorical columns if they exist
    if 'most_common_action_type' in train_df.columns:
        categorical_cols.extend(['most_common_action_type', 'most_common_action_detail',
                                 'most_common_device_type'])
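
    # The encoders below are fit on the concatenation of train and test so
    # that transform() never raises on a label that appears in only one split.
    # This is common practice for a fixed Kaggle test set, though it does mean
    # the test set's category vocabulary is visible at fit time.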
    if label_encoders is None:
        label_encoders = {}
        for col in categorical_cols:
            if col in train_df.columns:
                le = LabelEncoder()
                # Fit on combined train and test to handle all categories
                combined = pd.concat([train_df[col], test_df[col]]).astype(str)
                le.fit(combined)
                label_encoders[col] = le

    # Transform
    for col in categorical_cols:
        if col in train_df.columns:
            train_df[col] = label_encoders[col].transform(train_df[col].astype(str))
            test_df[col] = label_encoders[col].transform(test_df[col].astype(str))

    # Encode signup_flow as numeric
    if 'signup_flow' in train_df.columns:
        train_df['signup_flow'] = train_df['signup_flow'].astype(int)
        test_df['signup_flow'] = test_df['signup_flow'].astype(int)

    return train_df, test_df, label_encoders


def prepare_datasets():
    """Main function to prepare train and test datasets."""
    # Load data
    train_users, test_users, sessions, countries, age_gender = load_data()

    # Store IDs and target
    train_ids = train_users['id']
    test_ids = test_users['id']
    target = train_users['country_destination']

    # Preprocess users
    train_users = preprocess_users(train_users, is_train=True)
    test_users = preprocess_users(test_users, is_train=False)

    # Aggregate sessions
    session_agg = aggregate_sessions(sessions)

    # Merge session data if available
    if not session_agg.empty:
        train_users = train_users.merge(session_agg, left_on='id',
                                        right_on='user_id', how='left')
        test_users = test_users.merge(session_agg, left_on='id',
                                      right_on='user_id', how='left')

        # Drop the user_id column introduced by the merge
        if 'user_id' in train_users.columns:
            train_users = train_users.drop('user_id', axis=1)
            test_users = test_users.drop('user_id', axis=1)

        # Users without sessions get 0 for the numeric session features...
        session_cols = ['num_actions', 'total_secs', 'mean_secs', 'max_secs', 'min_secs']
        for col in session_cols:
            if col in train_users.columns:
                train_users[col] = train_users[col].fillna(0)
                test_users[col] = test_users[col].fillna(0)

        # ...and 'unknown' for the categorical session features
        session_cat_cols = ['most_common_action_type', 'most_common_action_detail',
                            'most_common_device_type']
        for col in session_cat_cols:
            if col in train_users.columns:
                train_users[col] = train_users[col].fillna('unknown')
                test_users[col] = test_users[col].fillna('unknown')

    # Drop ID and target from train
    train_users = train_users.drop(['id', 'country_destination'], axis=1)
    test_users = test_users.drop(['id'], axis=1)

    # Encode categorical features
    train_users, test_users, label_encoders = encode_features(train_users, test_users)

    # Catch-all fill for any NaN values that slipped past the steps above
    print("Checking for NaN values...")
    print(f"Train NaN count: {train_users.isna().sum().sum()}")
    print(f"Test NaN count: {test_users.isna().sum().sum()}")
    if train_users.isna().any().any():
        print("Warning: Found NaN values in train data. Filling with 0...")
        train_users = train_users.fillna(0)
    if test_users.isna().any().any():
        print("Warning: Found NaN values in test data. Filling with 0...")
        test_users = test_users.fillna(0)
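
    # Train and test must expose the same columns in the same order before the
    # numpy conversion below; both frames went through the same pipeline, so
    # this invariant should always hold
    assert list(train_users.columns) == list(test_users.columns)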
    # Encode target variable
    target_encoder = LabelEncoder()
    target_encoded = target_encoder.fit_transform(target)

    # Save encoders
    with open('label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)
    with open('target_encoder.pkl', 'wb') as f:
        pickle.dump(target_encoder, f)

    # Determine categorical columns (must match those encoded earlier)
    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked',
                        'signup_app', 'first_device_type', 'first_browser']

    # Add session categorical columns if they exist
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail',
                        'most_common_device_type']
    for col in session_cat_cols:
        if col in train_users.columns:
            categorical_cols.append(col)

    # Keep only the categorical columns that actually exist in the dataframe
    categorical_cols = [c for c in categorical_cols if c in train_users.columns]

    # For LightGBM no scaling is needed (tree-based model), so keep all
    # features as-is (numeric and label-encoded categoricals)
    feature_names = list(train_users.columns)

    # Save feature metadata
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)
    with open('categorical_features.pkl', 'wb') as f:
        pickle.dump(categorical_cols, f)

    # Convert to numpy arrays
    X_train = train_users.values
    X_test = test_users.values

    print(f"Train shape: {X_train.shape}")
    print(f"Test shape: {X_test.shape}")
    print(f"Number of classes: {len(target_encoder.classes_)}")
    print(f"Classes: {target_encoder.classes_}")

    return X_train, target_encoded, X_test, test_ids, target_encoder


if __name__ == '__main__':
    X_train, y_train, X_test, test_ids, target_encoder = prepare_datasets()

    # Save preprocessed data. The IDs are strings, so save them as a
    # fixed-width unicode array rather than an object array; otherwise
    # np.load would require allow_pickle=True to read them back.
    np.save('X_train.npy', X_train)
    np.save('y_train.npy', y_train)
    np.save('X_test.npy', X_test)
    np.save('test_ids.npy', test_ids.to_numpy().astype(str))

    print("\nData preprocessing completed successfully!")
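
# A minimal downstream sketch of how these artifacts might be consumed
# (illustrative only: it assumes LightGBM is installed, and the parameters
# shown are placeholders rather than tuned values):
#
#   import pickle
#   import numpy as np
#   import lightgbm as lgb
#
#   X_train = np.load('X_train.npy')
#   y_train = np.load('y_train.npy')
#   with open('feature_names.pkl', 'rb') as f:
#       feature_names = pickle.load(f)
#   with open('categorical_features.pkl', 'rb') as f:
#       categorical_features = pickle.load(f)
#
#   dtrain = lgb.Dataset(X_train, label=y_train,
#                        feature_name=feature_names,
#                        categorical_feature=categorical_features)
#   params = {'objective': 'multiclass',
#             'num_class': 12}  # i.e. len(target_encoder.classes_)
#   booster = lgb.train(params, dtrain, num_boost_round=100)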