# Datas-Mining-LightGBM/LightGBM/data_preprocessing.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle


def load_data():
    """Load all CSV files."""
    print("Loading data...")
    train_users = pd.read_csv('../data/train_users_2.csv')
    test_users = pd.read_csv('../data/test_users.csv')
    sessions = pd.read_csv('../data/sessions.csv')
    countries = pd.read_csv('../data/countries.csv')
    age_gender = pd.read_csv('../data/age_gender_bkts.csv')
    return train_users, test_users, sessions, countries, age_gender
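

# The five CSVs are the Airbnb New User Bookings files from Kaggle
# (train_users_2, test_users, sessions, countries, age_gender_bkts).
# Note that only the user and session tables feed the features below;
# countries and age_gender are loaded but never merged in.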


def preprocess_users(df, is_train=True):
    """Preprocess user data."""
    print(f"Preprocessing {'train' if is_train else 'test'} users...")
    # Work on a copy so the caller's frame is untouched
    df = df.copy()
    # Parse date columns
    df['date_account_created'] = pd.to_datetime(df['date_account_created'])
    df['timestamp_first_active'] = pd.to_datetime(
        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')
    # Extract calendar features
    df['dac_year'] = df['date_account_created'].dt.year
    df['dac_month'] = df['date_account_created'].dt.month
    df['dac_day'] = df['date_account_created'].dt.day
    df['dac_weekday'] = df['date_account_created'].dt.weekday
    df['tfa_year'] = df['timestamp_first_active'].dt.year
    df['tfa_month'] = df['timestamp_first_active'].dt.month
    df['tfa_day'] = df['timestamp_first_active'].dt.day
    # NOTE: date_first_booking leaks the target -- it is populated only when
    # a user actually booked (and exists only in the train set), so we drop
    # it rather than extract features from it.
    if 'date_first_booking' in df.columns:
        df = df.drop('date_first_booking', axis=1)
    # Drop the original date columns
    df = df.drop(['date_account_created', 'timestamp_first_active'], axis=1)
    # Clean age: mark missing and implausible values with -1
    df['age'] = df['age'].fillna(-1)
    df.loc[(df['age'] < 18) | (df['age'] > 100), 'age'] = -1
    # Handle gender
    df['gender'] = df['gender'].fillna('-unknown-')
    # Fill remaining categorical NaNs with 'unknown'.
    # signup_flow is deliberately excluded: it is numeric and is cast to int
    # in encode_features, which a string fill value would break.
    categorical_cols = ['signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked',
                        'signup_app', 'first_device_type', 'first_browser']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna('unknown')
    return df
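

# A stricter alternative for the age column (a sketch, not called anywhere):
# in this dataset some users appear to have entered a birth year instead of
# an age, so year-like values can be converted before the outlier clipping.
def clean_age_birth_years(age, reference_year=2015):
    """Hypothetical helper: map birth-year entries (e.g. 1985) to ages."""
    age = age.copy()
    year_like = (age > 1900) & (age < 2000)
    age.loc[year_like] = reference_year - age.loc[year_like]
    return age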


def aggregate_sessions(sessions):
    """Aggregate session data per user."""
    print("Aggregating session data...")
    if sessions.empty:
        return pd.DataFrame()
    # Fill NaNs: 0 for elapsed seconds and 'unknown' for categorical fields,
    # so the numeric aggregates are not skewed by a sentinel value
    sessions = sessions.copy()
    sessions['secs_elapsed'] = sessions['secs_elapsed'].fillna(0)
    for col in ['action', 'action_type', 'action_detail', 'device_type']:
        sessions[col] = sessions[col].fillna('unknown')
    # Aggregate per user: action count, most common categories, time stats
    session_agg = sessions.groupby('user_id').agg({
        'action': 'count',
        'action_type': lambda x: x.value_counts().index[0],
        'action_detail': lambda x: x.value_counts().index[0],
        'device_type': lambda x: x.value_counts().index[0],
        'secs_elapsed': ['sum', 'mean', 'max', 'min']
    }).reset_index()
    # Flatten the MultiIndex column names
    session_agg.columns = ['user_id', 'num_actions', 'most_common_action_type',
                           'most_common_action_detail', 'most_common_device_type',
                           'total_secs', 'mean_secs', 'max_secs', 'min_secs']
    return session_agg
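

# Optional richer aggregation (a sketch, not used by prepare_datasets):
# per-user counts of each distinct action tend to give a tree model more
# signal than a single most-common value. Assumes the sessions schema above.
def pivot_action_counts(sessions):
    """Hypothetical helper: one count column per distinct session action."""
    counts = (sessions.dropna(subset=['user_id', 'action'])
              .groupby(['user_id', 'action']).size()
              .unstack(fill_value=0)
              .add_prefix('action_cnt_'))
    return counts.reset_index()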


def encode_features(train_df, test_df, label_encoders=None):
    """Encode categorical features."""
    print("Encoding features...")
    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']
    # Add session categorical columns if they exist
    if 'most_common_action_type' in train_df.columns:
        categorical_cols.extend(['most_common_action_type', 'most_common_action_detail',
                                 'most_common_device_type'])
    if label_encoders is None:
        label_encoders = {}
        for col in categorical_cols:
            if col in train_df.columns:
                le = LabelEncoder()
                # Fit on combined train and test so every category is known
                combined = pd.concat([train_df[col], test_df[col]]).astype(str)
                le.fit(combined)
                label_encoders[col] = le
    # Transform both frames with the fitted encoders
    for col in categorical_cols:
        if col in train_df.columns:
            train_df[col] = label_encoders[col].transform(train_df[col].astype(str))
            test_df[col] = label_encoders[col].transform(test_df[col].astype(str))
    # signup_flow is already numeric; just make sure it is an int column
    if 'signup_flow' in train_df.columns:
        train_df['signup_flow'] = train_df['signup_flow'].fillna(-1).astype(int)
        test_df['signup_flow'] = test_df['signup_flow'].fillna(-1).astype(int)
    return train_df, test_df, label_encoders
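

# Note on the integer codes produced above: LightGBM can treat such columns
# as unordered categories rather than ordered numbers if their names or
# indices are passed via the `categorical_feature` parameter at training
# time -- which is why prepare_datasets() pickles the column list below.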


def prepare_datasets():
    """Main function to prepare train and test datasets."""
    # Load data
    train_users, test_users, sessions, countries, age_gender = load_data()
    # Store IDs and target before any columns are dropped
    # (train_ids is kept for reference; only test_ids is returned)
    train_ids = train_users['id']
    test_ids = test_users['id']
    target = train_users['country_destination']
    # Preprocess users
    train_users = preprocess_users(train_users, is_train=True)
    test_users = preprocess_users(test_users, is_train=False)
    # Aggregate sessions
    session_agg = aggregate_sessions(sessions)
    # Merge session data if available
    if not session_agg.empty:
        train_users = train_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
        test_users = test_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
        # Drop the user_id key introduced by the merge
        if 'user_id' in train_users.columns:
            train_users = train_users.drop('user_id', axis=1)
            test_users = test_users.drop('user_id', axis=1)
    # Fill NaNs left by the merge for users with no session data
    session_cols = ['num_actions', 'total_secs', 'mean_secs', 'max_secs', 'min_secs']
    for col in session_cols:
        if col in train_users.columns:
            train_users[col] = train_users[col].fillna(0)
            test_users[col] = test_users[col].fillna(0)
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
    for col in session_cat_cols:
        if col in train_users.columns:
            train_users[col] = train_users[col].fillna('unknown')
            test_users[col] = test_users[col].fillna('unknown')
    # Drop ID and target from the feature frames
    train_users = train_users.drop(['id', 'country_destination'], axis=1)
    test_users = test_users.drop(['id'], axis=1)
    # Encode categorical features
    train_users, test_users, label_encoders = encode_features(train_users, test_users)
    # Fill any remaining NaN values
    print("Checking for NaN values...")
    print(f"Train NaN count: {train_users.isna().sum().sum()}")
    print(f"Test NaN count: {test_users.isna().sum().sum()}")
    if train_users.isna().any().any():
        print("Warning: Found NaN values in train data. Filling with 0...")
        train_users = train_users.fillna(0)
    if test_users.isna().any().any():
        print("Warning: Found NaN values in test data. Filling with 0...")
        test_users = test_users.fillna(0)
    # Encode target variable
    target_encoder = LabelEncoder()
    target_encoded = target_encoder.fit_transform(target)
    # Save encoders
    with open('label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)
    with open('target_encoder.pkl', 'wb') as f:
        pickle.dump(target_encoder, f)
    # Record which columns are categorical (must match encode_features)
    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']
    # Add session categorical columns if they exist
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
    for col in session_cat_cols:
        if col in train_users.columns:
            categorical_cols.append(col)
    # Keep only the categorical columns that actually exist in the frame
    categorical_cols = [c for c in categorical_cols if c in train_users.columns]
    # No scaling needed for LightGBM: tree splits are invariant to monotone
    # transforms of the features, so numeric and categorical stay as-is
    feature_names = list(train_users.columns)
    # Save feature metadata
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)
    with open('categorical_features.pkl', 'wb') as f:
        pickle.dump(categorical_cols, f)
    # Convert to numpy arrays
    X_train = train_users.values
    X_test = test_users.values
    print(f"Train shape: {X_train.shape}")
    print(f"Test shape: {X_test.shape}")
    print(f"Number of classes: {len(target_encoder.classes_)}")
    print(f"Classes: {target_encoder.classes_}")
    return X_train, target_encoded, X_test, test_ids, target_encoder


if __name__ == '__main__':
    X_train, y_train, X_test, test_ids, target_encoder = prepare_datasets()
    # Save preprocessed data. test_ids holds strings, so the resulting .npy
    # has object dtype and must be reloaded with allow_pickle=True.
    np.save('X_train.npy', X_train)
    np.save('y_train.npy', y_train)
    np.save('X_test.npy', X_test)
    np.save('test_ids.npy', test_ids.values)
    print("\nData preprocessing completed successfully!")