# Datas-Mining-LightGBM/LightGBM/data_preprocessing.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle


def load_data():
    """Load all CSV files."""
    print("Loading data...")
    train_users = pd.read_csv('../data/train_users_2.csv')
    test_users = pd.read_csv('../data/test_users.csv')
    sessions = pd.read_csv('../data/sessions.csv')
    countries = pd.read_csv('../data/countries.csv')
    age_gender = pd.read_csv('../data/age_gender_bkts.csv')
    return train_users, test_users, sessions, countries, age_gender
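

# The five CSVs are the Airbnb New User Bookings files from Kaggle
# (train_users_2, test_users, sessions, countries, age_gender_bkts).
# Note that only the user and session tables feed the features below;
# countries and age_gender are loaded but never merged in.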


def preprocess_users(df, is_train=True):
    """Preprocess user data."""
    print(f"Preprocessing {'train' if is_train else 'test'} users...")
    # Work on a copy so the caller's frame is untouched
    df = df.copy()
    # Parse date columns
    df['date_account_created'] = pd.to_datetime(df['date_account_created'])
    df['timestamp_first_active'] = pd.to_datetime(
        df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')
    # Extract calendar features
    df['dac_year'] = df['date_account_created'].dt.year
    df['dac_month'] = df['date_account_created'].dt.month
    df['dac_day'] = df['date_account_created'].dt.day
    df['dac_weekday'] = df['date_account_created'].dt.weekday
    df['tfa_year'] = df['timestamp_first_active'].dt.year
    df['tfa_month'] = df['timestamp_first_active'].dt.month
    df['tfa_day'] = df['timestamp_first_active'].dt.day
    # NOTE: date_first_booking leaks the target -- it is populated only when
    # a user actually booked (and exists only in the train set), so we drop
    # it rather than extract features from it.
    if 'date_first_booking' in df.columns:
        df = df.drop('date_first_booking', axis=1)
    # Drop the original date columns
    df = df.drop(['date_account_created', 'timestamp_first_active'], axis=1)
    # Clean age: mark missing and implausible values with -1
    df['age'] = df['age'].fillna(-1)
    df.loc[(df['age'] < 18) | (df['age'] > 100), 'age'] = -1
    # Handle gender
    df['gender'] = df['gender'].fillna('-unknown-')
    # Fill remaining categorical NaNs with 'unknown'.
    # signup_flow is deliberately excluded: it is numeric and is cast to int
    # in encode_features, which a string fill value would break.
    categorical_cols = ['signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked',
                        'signup_app', 'first_device_type', 'first_browser']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].fillna('unknown')
    return df
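

# A stricter alternative for the age column (a sketch, not called anywhere):
# in this dataset some users appear to have entered a birth year instead of
# an age, so year-like values can be converted before the outlier clipping.
def clean_age_birth_years(age, reference_year=2015):
    """Hypothetical helper: map birth-year entries (e.g. 1985) to ages."""
    age = age.copy()
    year_like = (age > 1900) & (age < 2000)
    age.loc[year_like] = reference_year - age.loc[year_like]
    return age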


def aggregate_sessions(sessions):
    """Aggregate session data per user."""
    print("Aggregating session data...")
    if sessions.empty:
        return pd.DataFrame()
    # Fill NaNs: 0 for elapsed seconds and 'unknown' for categorical fields,
    # so the numeric aggregates are not skewed by a sentinel value
    sessions = sessions.copy()
    sessions['secs_elapsed'] = sessions['secs_elapsed'].fillna(0)
    for col in ['action', 'action_type', 'action_detail', 'device_type']:
        sessions[col] = sessions[col].fillna('unknown')
    # Aggregate per user: action count, most common categories, time stats
    session_agg = sessions.groupby('user_id').agg({
        'action': 'count',
        'action_type': lambda x: x.value_counts().index[0],
        'action_detail': lambda x: x.value_counts().index[0],
        'device_type': lambda x: x.value_counts().index[0],
        'secs_elapsed': ['sum', 'mean', 'max', 'min']
    }).reset_index()
    # Flatten the MultiIndex column names
    session_agg.columns = ['user_id', 'num_actions', 'most_common_action_type',
                           'most_common_action_detail', 'most_common_device_type',
                           'total_secs', 'mean_secs', 'max_secs', 'min_secs']
    return session_agg
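

# Optional richer aggregation (a sketch, not used by prepare_datasets):
# per-user counts of each distinct action tend to give a tree model more
# signal than a single most-common value. Assumes the sessions schema above.
def pivot_action_counts(sessions):
    """Hypothetical helper: one count column per distinct session action."""
    counts = (sessions.dropna(subset=['user_id', 'action'])
              .groupby(['user_id', 'action']).size()
              .unstack(fill_value=0)
              .add_prefix('action_cnt_'))
    return counts.reset_index()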


def encode_features(train_df, test_df, label_encoders=None):
    """Encode categorical features."""
    print("Encoding features...")
    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']
    # Add session categorical columns if they exist
    if 'most_common_action_type' in train_df.columns:
        categorical_cols.extend(['most_common_action_type', 'most_common_action_detail',
                                 'most_common_device_type'])
    if label_encoders is None:
        label_encoders = {}
        for col in categorical_cols:
            if col in train_df.columns:
                le = LabelEncoder()
                # Fit on combined train and test so every category is known
                combined = pd.concat([train_df[col], test_df[col]]).astype(str)
                le.fit(combined)
                label_encoders[col] = le
    # Transform both frames with the fitted encoders
    for col in categorical_cols:
        if col in train_df.columns:
            train_df[col] = label_encoders[col].transform(train_df[col].astype(str))
            test_df[col] = label_encoders[col].transform(test_df[col].astype(str))
    # signup_flow is already numeric; just make sure it is an int column
    if 'signup_flow' in train_df.columns:
        train_df['signup_flow'] = train_df['signup_flow'].fillna(-1).astype(int)
        test_df['signup_flow'] = test_df['signup_flow'].fillna(-1).astype(int)
    return train_df, test_df, label_encoders
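

# Note on the integer codes produced above: LightGBM can treat such columns
# as unordered categories rather than ordered numbers if their names or
# indices are passed via the `categorical_feature` parameter at training
# time -- which is why prepare_datasets() pickles the column list below.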


def prepare_datasets():
    """Main function to prepare train and test datasets."""
    # Load data
    train_users, test_users, sessions, countries, age_gender = load_data()
    # Store IDs and target before any columns are dropped
    # (train_ids is kept for reference; only test_ids is returned)
    train_ids = train_users['id']
    test_ids = test_users['id']
    target = train_users['country_destination']
    # Preprocess users
    train_users = preprocess_users(train_users, is_train=True)
    test_users = preprocess_users(test_users, is_train=False)
    # Aggregate sessions
    session_agg = aggregate_sessions(sessions)
    # Merge session data if available
    if not session_agg.empty:
        train_users = train_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
        test_users = test_users.merge(session_agg, left_on='id', right_on='user_id', how='left')
        # Drop the user_id key introduced by the merge
        if 'user_id' in train_users.columns:
            train_users = train_users.drop('user_id', axis=1)
            test_users = test_users.drop('user_id', axis=1)
    # Fill NaNs left by the merge for users with no session data
    session_cols = ['num_actions', 'total_secs', 'mean_secs', 'max_secs', 'min_secs']
    for col in session_cols:
        if col in train_users.columns:
            train_users[col] = train_users[col].fillna(0)
            test_users[col] = test_users[col].fillna(0)
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
    for col in session_cat_cols:
        if col in train_users.columns:
            train_users[col] = train_users[col].fillna('unknown')
            test_users[col] = test_users[col].fillna('unknown')
    # Drop ID and target from the feature frames
    train_users = train_users.drop(['id', 'country_destination'], axis=1)
    test_users = test_users.drop(['id'], axis=1)
    # Encode categorical features
    train_users, test_users, label_encoders = encode_features(train_users, test_users)
    # Fill any remaining NaN values
    print("Checking for NaN values...")
    print(f"Train NaN count: {train_users.isna().sum().sum()}")
    print(f"Test NaN count: {test_users.isna().sum().sum()}")
    if train_users.isna().any().any():
        print("Warning: Found NaN values in train data. Filling with 0...")
        train_users = train_users.fillna(0)
    if test_users.isna().any().any():
        print("Warning: Found NaN values in test data. Filling with 0...")
        test_users = test_users.fillna(0)
    # Encode target variable
    target_encoder = LabelEncoder()
    target_encoded = target_encoder.fit_transform(target)
    # Save encoders
    with open('label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)
    with open('target_encoder.pkl', 'wb') as f:
        pickle.dump(target_encoder, f)
    # Record which columns are categorical (must match encode_features)
    categorical_cols = ['gender', 'signup_method', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser']
    # Add session categorical columns if they exist
    session_cat_cols = ['most_common_action_type', 'most_common_action_detail', 'most_common_device_type']
    for col in session_cat_cols:
        if col in train_users.columns:
            categorical_cols.append(col)
    # Keep only the categorical columns that actually exist in the frame
    categorical_cols = [c for c in categorical_cols if c in train_users.columns]
    # No scaling needed for LightGBM: tree splits are invariant to monotone
    # transforms of the features, so numeric and categorical stay as-is
    feature_names = list(train_users.columns)
    # Save feature metadata
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)
    with open('categorical_features.pkl', 'wb') as f:
        pickle.dump(categorical_cols, f)
    # Convert to numpy arrays
    X_train = train_users.values
    X_test = test_users.values
    print(f"Train shape: {X_train.shape}")
    print(f"Test shape: {X_test.shape}")
    print(f"Number of classes: {len(target_encoder.classes_)}")
    print(f"Classes: {target_encoder.classes_}")
    return X_train, target_encoded, X_test, test_ids, target_encoder


if __name__ == '__main__':
    X_train, y_train, X_test, test_ids, target_encoder = prepare_datasets()
    # Save preprocessed data. test_ids holds strings, so the resulting .npy
    # has object dtype and must be reloaded with allow_pickle=True.
    np.save('X_train.npy', X_train)
    np.save('y_train.npy', y_train)
    np.save('X_test.npy', X_test)
    np.save('test_ids.npy', test_ids.values)
    print("\nData preprocessing completed successfully!")