diff --git a/.gitignore b/.gitignore index 4cf493c..f3b2080 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,5 @@ data/Create_data/api_football/venues.xlsx data/Create_data/api_football/api_test.py data/Create_data/api_football/api_test.json *ensemble_40 -*prompts \ No newline at end of file +*prompts +.sonarlint/connectedMode.json diff --git a/data/Create_data/add_ELO_scores.py b/data/Create_data/add_ELO_scores.py index 37c1466..00a693d 100644 --- a/data/Create_data/add_ELO_scores.py +++ b/data/Create_data/add_ELO_scores.py @@ -5,7 +5,6 @@ import pandas as pd import python_calamine as calamine from api_football.add_current_features_postgre import export_to_xlsx_fast -from openpyxl import Workbook class ELOCalculator: @@ -24,12 +23,16 @@ def __init__(self, logger=None): # Define data paths # self.api_prediction_data_path = "./data_files/api_football_prediction_newPoisson.xlsx" - self.api_prediction_data_path_new = "./data_files/api_football_prediction_new_newPoisson.xlsx" + self.api_prediction_data_path_new = ( + "./data_files/api_football_prediction_new_newPoisson.xlsx" + ) # self.api_training_data_path = "./data_files/api_football_training_newPoisson.xlsx" # Define export paths # self.api_prediction_export_path = "./data_files/api_football_prediction_newPoisson.xlsx" - self.api_prediction_export_path_new = "./data_files/api_football_prediction_new_newPoisson.xlsx" + self.api_prediction_export_path_new = ( + "./data_files/api_football_prediction_new_newPoisson.xlsx" + ) # self.api_training_export_path = "./data_files/api_football_training_newPoisson.xlsx" # ELO settings diff --git a/data/Create_data/add_poisson_xG.py b/data/Create_data/add_poisson_xG.py index 3a5fc87..ddeee48 100644 --- a/data/Create_data/add_poisson_xG.py +++ b/data/Create_data/add_poisson_xG.py @@ -6,7 +6,6 @@ import python_calamine as calamine import statsmodels.api as sm from api_football.add_current_features_postgre import export_to_xlsx_fast -from openpyxl import Workbook from 
sklearn.linear_model import PoissonRegressor from sklearn.metrics import mean_squared_error, r2_score from sklearn.preprocessing import StandardScaler @@ -283,7 +282,9 @@ def predict(self, df: pd.DataFrame) -> pd.DataFrame: self.logger.error(f"Error in prediction: {str(e)}") raise - def add_poisson_xG(self, df: pd.DataFrame, base_df: pd.DataFrame, type: str, is_training: bool = False) -> pd.DataFrame: + def add_poisson_xG( + self, df: pd.DataFrame, base_df: pd.DataFrame, type: str, is_training: bool = False + ) -> pd.DataFrame: # Sort by date if available if "Datum" in df.columns: df["Datum"] = pd.to_datetime(df["Datum"]) @@ -309,7 +310,7 @@ def add_poisson_xG(self, df: pd.DataFrame, base_df: pd.DataFrame, type: str, is_ "api_prediction": ("./data_files/api_football_prediction_newPoisson.xlsx"), "api_prediction_new": ("./data_files/api_football_prediction_new_newPoisson.xlsx"), "api_training": ("./data_files/api_football_training_newPoisson.xlsx"), - "api_future": ("./data_files/api_football_future_newPoisson.xlsx") + "api_future": ("./data_files/api_football_future_newPoisson.xlsx"), } output_path = datasets[type] @@ -372,8 +373,13 @@ def process_data(self): "away_points_cum": "Away_points_cum", } ) - - self.add_poisson_xG(api_prediction_data_new, api_prediction_data_new, "api_prediction_new", is_training=True) + + self.add_poisson_xG( + api_prediction_data_new, + api_prediction_data_new, + "api_prediction_new", + is_training=True, + ) self.add_poisson_xG(api_future_data, api_future_data, "api_future") except Exception as e: self.logger.error(f"Error in data processing: {str(e)}") diff --git a/data/Create_data/api_football/add_current_features_postgre.py b/data/Create_data/api_football/add_current_features_postgre.py index 4f65bb3..2395a1d 100644 --- a/data/Create_data/api_football/add_current_features_postgre.py +++ b/data/Create_data/api_football/add_current_features_postgre.py @@ -16,6 +16,7 @@ load_dotenv() + class PostgreSQLFeatures: """ A class to interact 
with PostgreSQL and retrieve fixtures data. @@ -28,15 +29,15 @@ def __init__(self, logger=None): self.db_name = os.getenv("POSTGRES_DB") self.db_user = os.getenv("POSTGRES_USER") self.db_password = os.getenv("POSTGRES_PASSWORD") - self.db_port = os.getenv("POSTGRES_PORT", "5432") # Default port for PostgreSQL + self.db_port = os.getenv("POSTGRES_PORT", "5432") # Default port for PostgreSQL self.conn = None self._connect_db() # Placeholder for other collections/tables until schemas are provided - self.predictions_table_name = "api_football.predictions" - self.venues_collection = "api_football.venues" - self.team_stats_collection = "api_football.team_stats" + self.predictions_table_name = "api_football.predictions" + self.venues_collection = "api_football.venues" + self.team_stats_collection = "api_football.team_stats" def _connect_db(self): """Establishes a connection to the PostgreSQL database.""" @@ -47,7 +48,7 @@ def _connect_db(self): database=self.db_name, user=self.db_user, password=self.db_password, - port=self.db_port + port=self.db_port, ) if self.logger: self.logger.info("Successfully connected to PostgreSQL database.") @@ -105,29 +106,35 @@ def get_fixtures_with_home_stats(self) -> list[dict]: df = pd.DataFrame() try: df = pd.read_sql_query(sql_query, self.conn) - + # Add match_outcome based on goals - df['match_outcome'] = df.apply(lambda x: - '1' if x['home_goals'] > x['away_goals'] - else '3' if x['home_goals'] < x['away_goals'] - else '2', axis=1) - + df["match_outcome"] = df.apply( + lambda x: "1" + if x["home_goals"] > x["away_goals"] + else "3" + if x["home_goals"] < x["away_goals"] + else "2", + axis=1, + ) + # Rename passes_accurate columns to passes_accuracy - df = df.rename(columns={ - 'home_total_passes': 'home_passes', - 'away_total_passes': 'away_passes', - 'home_goalkeeper_saves': 'home_saves', - 'away_goalkeeper_saves': 'away_saves', - 'home_corner_kicks': 'home_corners', - 'away_corner_kicks': 'away_corners' - }) - + df = df.rename( + 
columns={ + "home_total_passes": "home_passes", + "away_total_passes": "away_passes", + "home_goalkeeper_saves": "home_saves", + "away_goalkeeper_saves": "away_saves", + "home_corner_kicks": "home_corners", + "away_corner_kicks": "away_corners", + } + ) + print(f"Found {len(df)} fixtures matching criteria from PostgreSQL.") if self.logger: self.logger.info(f"Found {len(df)} fixtures matching criteria from PostgreSQL.") - + return df - + except psycopg2.Error as e: print(f"Error executing query in get_fixtures_with_home_stats: {e}") if self.logger: @@ -217,23 +224,42 @@ def load_and_prepare_data(self, fixtures_dataframe: pd.DataFrame): if self.logger: self.logger.info("Data Collected, Start Cleaning and Feature Engineering...") # Drop irrelevant columns - data = data.drop(columns=["prediction_outcome", "period_first", "period_second", "timestamp", "status_short", "status_elapsed", "status_extra", "status_elapsed", "league_logo", "league_flag", - "home_team_logo", "away_team_logo", "league_standings", "timezone", "model_prediction", "venue_city"], errors="ignore") + data = data.drop( + columns=[ + "prediction_outcome", + "period_first", + "period_second", + "timestamp", + "status_short", + "status_elapsed", + "status_extra", + "status_elapsed", + "league_logo", + "league_flag", + "home_team_logo", + "away_team_logo", + "league_standings", + "timezone", + "model_prediction", + "venue_city", + ], + errors="ignore", + ) # Type conversions and extracting date components data["home_advantage"] = 1 - data["Date"] = pd.to_datetime(data["date"], errors="coerce") + data["Date"] = pd.to_datetime(data["date"], errors="coerce") # Drop the 'date' column since we have 'Date' - data = data.drop(columns=['date'], errors='ignore') + data = data.drop(columns=["date"], errors="ignore") data["year"] = data["Date"].dt.year data["month"] = data["Date"].dt.month data["day_of_month"] = data["Date"].dt.day data["day_of_week"] = data["Date"].dt.dayofweek data["week_of_year"] = 
data["Date"].dt.isocalendar().week - + # Ensure match_outcome is numeric before use - if 'match_outcome' in data.columns: - data["match_outcome"] = pd.to_numeric(data["match_outcome"], errors='coerce') + if "match_outcome" in data.columns: + data["match_outcome"] = pd.to_numeric(data["match_outcome"], errors="coerce") # Label encoding for categorical variables le = LabelEncoder() @@ -248,22 +274,22 @@ def load_and_prepare_data(self, fixtures_dataframe: pd.DataFrame): data = data.sort_values(by=["fixture_id"]) shot_on_goal_weight = 0.20 - shot_inside_box_weight = 0.10 # Value for shots from dangerous areas + shot_inside_box_weight = 0.10 # Value for shots from dangerous areas corner_kick_weight = 0.03 - data['home_xG'] = ( - data.get('home_shots_on_goal', 0) * shot_on_goal_weight + - data.get('home_shots_insidebox', 0) * shot_inside_box_weight + - data.get('home_corner_kicks', 0) * corner_kick_weight + data["home_xG"] = ( + data.get("home_shots_on_goal", 0) * shot_on_goal_weight + + data.get("home_shots_insidebox", 0) * shot_inside_box_weight + + data.get("home_corner_kicks", 0) * corner_kick_weight ) - data['away_xG'] = ( - data.get('away_shots_on_goal', 0) * shot_on_goal_weight + - data.get('away_shots_insidebox', 0) * shot_inside_box_weight + - data.get('away_corner_kicks', 0) * corner_kick_weight + data["away_xG"] = ( + data.get("away_shots_on_goal", 0) * shot_on_goal_weight + + data.get("away_shots_insidebox", 0) * shot_inside_box_weight + + data.get("away_corner_kicks", 0) * corner_kick_weight ) # Ensure xG is not negative (though unlikely with positive weights and counts) - data['home_xG'] = data['home_xG'].clip(lower=0) - data['away_xG'] = data['away_xG'].clip(lower=0) + data["home_xG"] = data["home_xG"].clip(lower=0) + data["away_xG"] = data["away_xG"].clip(lower=0) print("Start possession and shooting...") if self.logger: @@ -304,20 +330,28 @@ def load_and_prepare_data(self, fixtures_dataframe: pd.DataFrame): data["home_saves_accuracy"] = 
data["home_saves"] / data["away_shots_on_goal"] data["away_saves_accuracy"] = data["away_saves"] / data["home_shots_on_goal"] # Fill NaN values with 0 for defensive stats before calculating activity - defensive_cols = ['home_blocked_shots', 'home_yellow_cards', 'home_red_cards', 'home_saves', - 'away_blocked_shots', 'away_yellow_cards', 'away_red_cards', 'away_saves'] + defensive_cols = [ + "home_blocked_shots", + "home_yellow_cards", + "home_red_cards", + "home_saves", + "away_blocked_shots", + "away_yellow_cards", + "away_red_cards", + "away_saves", + ] data[defensive_cols] = data[defensive_cols].fillna(0) - + data["home_defensive_activity"] = ( data["home_blocked_shots"].astype(float) - + data["home_yellow_cards"].astype(float) + + data["home_yellow_cards"].astype(float) + data["home_red_cards"].astype(float) + data["home_saves"].astype(float) ) data["away_defensive_activity"] = ( data["away_blocked_shots"].astype(float) + data["away_yellow_cards"].astype(float) - + data["away_red_cards"].astype(float) + + data["away_red_cards"].astype(float) + data["away_saves"].astype(float) ) # Set-piece threat and foul impact @@ -369,7 +403,7 @@ def load_and_prepare_data(self, fixtures_dataframe: pd.DataFrame): def get_future_matches(self) -> pd.DataFrame: """ - Retrieves all future fixtures (next 14 days, no scores) from the PostgreSQL + Retrieves all future fixtures (next 14 days, no scores) from the PostgreSQL api_football.fixtures table and returns them as a pandas DataFrame. Args: Used to potentially merge some common columns. 
@@ -381,15 +415,23 @@ def get_future_matches(self) -> pd.DataFrame: two_weeks_date = today + timedelta(days=14) two_weeks_str = two_weeks_date.strftime("%Y-%m-%d %H:%M:%S") today_str = today.strftime("%Y-%m-%d %H:%M:%S") - - + query_columns = [ - "fixture_id", "date", "league_id", "league_name", "league_season", - "referee", "venue_id", "venue_name", - "home_team_id", "home_team_name", "away_team_id", "away_team_name", - "league_round" + "fixture_id", + "date", + "league_id", + "league_name", + "league_season", + "referee", + "venue_id", + "venue_name", + "home_team_id", + "home_team_name", + "away_team_id", + "away_team_name", + "league_round", ] - select_cols_str = ", ".join([f'\"{col}\"' if col == "date" else col for col in query_columns]) + select_cols_str = ", ".join([f'"{col}"' if col == "date" else col for col in query_columns]) sql_query = f""" SELECT {select_cols_str} @@ -400,27 +442,39 @@ def get_future_matches(self) -> pd.DataFrame: df_future = pd.DataFrame() try: df_future = pd.read_sql_query(sql_query, self.conn) - print(f"Found {len(df_future)} future fixtures from PostgreSQL (next 14 days, no scores).") + print( + f"Found {len(df_future)} future fixtures from PostgreSQL (next 14 days, no scores)." 
+ ) if self.logger: self.logger.info(f"Found {len(df_future)} future fixtures from PostgreSQL.") if df_future.empty: - return pd.DataFrame() # Return empty if no future matches found + return pd.DataFrame() # Return empty if no future matches found - df_future = df_future.rename(columns={"date": "Date"}) + df_future = df_future.rename(columns={"date": "Date"}) # Ensure essential columns exist, fill with None or a sensible default if not from query expected_cols = [ - "fixture_id", "Date", "league_id", "league_season", "league_name", "referee", - "venue_name", "venue_id", "home_team_id", "home_team_name", - "away_team_id", "away_team_name", "league_round" + "fixture_id", + "Date", + "league_id", + "league_season", + "league_name", + "referee", + "venue_name", + "venue_id", + "home_team_id", + "home_team_name", + "away_team_id", + "away_team_name", + "league_round", ] for col in expected_cols: if col not in df_future.columns: - df_future[col] = None # or np.nan or appropriate default - + df_future[col] = None # or np.nan or appropriate default + # Ensure correct dtype for fixture_id if it was read as float from DB with NaNs (not typical for PK) - if 'fixture_id' in df_future.columns: + if "fixture_id" in df_future.columns: df_future["fixture_id"] = df_future["fixture_id"].astype(int) except (Exception, psycopg2.Error) as e: @@ -428,7 +482,7 @@ def get_future_matches(self) -> pd.DataFrame: if self.logger: self.logger.error(f"Error retrieving future matches: {e}") # self._close_db() # Keep connection open - return pd.DataFrame() # Return empty on error + return pd.DataFrame() # Return empty on error export_path = "data/Create_data/data_files/base/api_future_matches.xlsx" try: @@ -460,8 +514,10 @@ def export_venues(self) -> pd.DataFrame: df = pd.read_sql_query(sql_query, self.conn) print(f"Successfully retrieved {len(df)} venue records from api_football.venues.") if self.logger: - self.logger.info(f"Successfully retrieved {len(df)} venue records from 
api_football.venues.") - + self.logger.info( + f"Successfully retrieved {len(df)} venue records from api_football.venues." + ) + for col in df.columns: if df[col].isnull().any(): # print(f"Missing values found in column: {col}") # Optional: for debugging @@ -514,7 +570,9 @@ def flatten_team_stats(self) -> pd.DataFrame: df = pd.read_sql_query(sql_query, self.conn) print(f"Successfully retrieved {len(df)} records from api_football.team_stats.") if self.logger: - self.logger.info(f"Successfully retrieved {len(df)} records from api_football.team_stats.") + self.logger.info( + f"Successfully retrieved {len(df)} records from api_football.team_stats." + ) except (Exception, psycopg2.Error) as e: error_message = f"Error retrieving data from api_football.team_stats: {e}" print(error_message) @@ -527,12 +585,12 @@ def export_events(self) -> pd.DataFrame: """ Retrieves and exports events data from the PostgreSQL api_football.events table in batches. Exports each batch to Excel and returns the complete DataFrame. - + Returns: pd.DataFrame: Complete DataFrame containing all events data. 
""" self._connect_db() - + # Base SQL query sql_query = """ SELECT @@ -542,46 +600,49 @@ def export_events(self) -> pd.DataFrame: FROM api_football.events e left join api_football.fixtures f on e.fixture_id = f.fixture_id; """ - + all_events = pd.DataFrame() - + try: # Fetch batch df_batch = pd.read_sql_query(sql_query, self.conn) - - + # Append to complete dataset all_events = pd.concat([all_events, df_batch], ignore_index=True) # Add time_extra to time_elapsed where time_extra is not null - all_events['time_elapsed'] = all_events.apply( - lambda x: x['time_elapsed'] + x['time_extra'] if pd.notnull(x['time_extra']) else x['time_elapsed'], - axis=1 + all_events["time_elapsed"] = all_events.apply( + lambda x: x["time_elapsed"] + x["time_extra"] + if pd.notnull(x["time_extra"]) + else x["time_elapsed"], + axis=1, ) # Drop time_extra column after adding it to time_elapsed - all_events = all_events.drop(columns=['time_extra']) - + all_events = all_events.drop(columns=["time_extra"]) + print(f"Successfully retrieved and exported {len(all_events)} total events records") - + # Export complete dataset complete_path = "data/Create_data/data_files/base/api_events_complete.xlsx" export_to_xlsx_fast(all_events, complete_path) print(f"Exported complete events dataset to {complete_path}") - + except (Exception, psycopg2.Error) as e: error_message = f"Error exporting events data: {e}" print(error_message) if self.logger: self.logger.error(error_message) - + return all_events + def export_to_xlsx_fast(df, path): # Replace NaN/None with empty string - df = df.fillna('') + df = df.fillna("") wb = Workbook() wb.new_sheet("Sheet1", data=[df.columns.tolist()] + df.values.tolist()) wb.save(path) + def main(): postgresql_features = PostgreSQLFeatures() print("Getting fixtures with stats") @@ -614,5 +675,6 @@ def main(): export_to_xlsx_fast(team_stats, export_path) print(f"Team stats shape: {team_stats.shape}") + if __name__ == "__main__": main() diff --git 
a/data/Create_data/api_football/event_features_calculator.py b/data/Create_data/api_football/event_features_calculator.py index 8bcdefb..a37568d 100644 --- a/data/Create_data/api_football/event_features_calculator.py +++ b/data/Create_data/api_football/event_features_calculator.py @@ -15,38 +15,65 @@ def _preprocess_events(events_df: pd.DataFrame) -> pd.DataFrame: - Identifies goal events. """ if events_df.empty: - return pd.DataFrame(columns=['fixture_id', 'time_elapsed', 'event_type', 'event_detail', 'team_id', 'home_team_id', 'away_team_id', 'is_goal']) + return pd.DataFrame( + columns=[ + "fixture_id", + "time_elapsed", + "event_type", + "event_detail", + "team_id", + "home_team_id", + "away_team_id", + "is_goal", + ] + ) + + events_df = events_df.sort_values(by=["fixture_id", "time_elapsed"]).copy() - events_df = events_df.sort_values(by=['fixture_id', 'time_elapsed']).copy() - # Ensure essential columns are present - required_cols = ['fixture_id', 'time_elapsed', 'event_type', 'event_detail', 'team_id', 'home_team_id', 'away_team_id'] + required_cols = [ + "fixture_id", + "time_elapsed", + "event_type", + "event_detail", + "team_id", + "home_team_id", + "away_team_id", + ] for col in required_cols: if col not in events_df.columns: # Add missing columns with NaNs or appropriate defaults if known - if col in ['home_team_id', 'away_team_id', 'team_id']: # These are IDs, float is ok for NaN + if col in [ + "home_team_id", + "away_team_id", + "team_id", + ]: # These are IDs, float is ok for NaN events_df[col] = np.nan else: events_df[col] = pd.NA print(f"Warning: Column '{col}' was missing and has been added with NaNs.") - events_df['is_goal'] = events_df['event_type'] == 'Goal' + events_df["is_goal"] = events_df["event_type"] == "Goal" return events_df + def export_to_xlsx_fast(df, path): # Replace NaN/None with empty string - df = df.fillna('') + df = df.fillna("") wb = Workbook() wb.new_sheet("Sheet1", data=[df.columns.tolist()] + df.values.tolist()) 
wb.save(path) -def calculate_event_features_vectorized(events_df: pd.DataFrame, fixtures_df: pd.DataFrame) -> pd.DataFrame: + +def calculate_event_features_vectorized( + events_df: pd.DataFrame, fixtures_df: pd.DataFrame +) -> pd.DataFrame: """ Calculates event-based features for each fixture in a vectorized manner. Args: - events_df (pd.DataFrame): DataFrame containing event data. - Required columns: fixture_id, time_elapsed, event_type, + events_df (pd.DataFrame): DataFrame containing event data. + Required columns: fixture_id, time_elapsed, event_type, event_detail, team_id, home_team_id, away_team_id. fixtures_df (pd.DataFrame): DataFrame containing fixture data. Required columns: fixture_id, home_team_id, away_team_id. @@ -58,49 +85,58 @@ def calculate_event_features_vectorized(events_df: pd.DataFrame, fixtures_df: pd if events_df.empty: print("Warning: events_df is empty. Returning empty features DataFrame.") # Create an empty df with expected feature columns if possible, or just fixture_id - return pd.DataFrame(columns=['fixture_id'] + get_feature_names()) - + return pd.DataFrame(columns=["fixture_id"] + get_feature_names()) # Ensure fixtures_df has the necessary columns - if not all(col in fixtures_df.columns for col in ['fixture_id', 'home_team_id', 'away_team_id']): - raise ValueError("fixtures_df must contain 'fixture_id', 'home_team_id', and 'away_team_id'.") + if not all( + col in fixtures_df.columns for col in ["fixture_id", "home_team_id", "away_team_id"] + ): + raise ValueError( + "fixtures_df must contain 'fixture_id', 'home_team_id', and 'away_team_id'." 
+ ) # Merge home/away team_ids from fixtures_df to events_df to ensure they are present and correct - events_df = pd.merge(events_df.drop(columns=['home_team_id', 'away_team_id'], errors='ignore'), - fixtures_df[['fixture_id', 'home_team_id', 'away_team_id']], - on='fixture_id', - how='left') + events_df = pd.merge( + events_df.drop(columns=["home_team_id", "away_team_id"], errors="ignore"), + fixtures_df[["fixture_id", "home_team_id", "away_team_id"]], + on="fixture_id", + how="left", + ) processed_events_df = _preprocess_events(events_df) - - if processed_events_df.empty and not events_df.empty: # Preprocessing made it empty, likely due to missing critical columns - print("Warning: processed_events_df became empty after preprocessing. Check input events_df columns.") - return pd.DataFrame(columns=['fixture_id'] + get_feature_names()) - elif processed_events_df.empty and events_df.empty: # Was already empty - return pd.DataFrame(columns=['fixture_id'] + get_feature_names()) + + if ( + processed_events_df.empty and not events_df.empty + ): # Preprocessing made it empty, likely due to missing critical columns + print( + "Warning: processed_events_df became empty after preprocessing. Check input events_df columns." 
+ ) + return pd.DataFrame(columns=["fixture_id"] + get_feature_names()) + elif processed_events_df.empty and events_df.empty: # Was already empty + return pd.DataFrame(columns=["fixture_id"] + get_feature_names()) # Initialize features_df with all unique fixture_ids from fixtures_df - features_df = fixtures_df[['fixture_id', 'home_team_id', 'away_team_id']].copy() - features_df = features_df.drop_duplicates(subset=['fixture_id']).set_index('fixture_id') + features_df = fixtures_df[["fixture_id", "home_team_id", "away_team_id"]].copy() + features_df = features_df.drop_duplicates(subset=["fixture_id"]).set_index("fixture_id") # --- Feature Calculation Functions Will Be Called Here --- feature_functions_map = { - 'is_score_tied_at_80_min': _calculate_is_score_tied_at_80_min_vectorized, - 'abs_goal_difference_at_45_min': _calculate_abs_goal_difference_at_45_min_vectorized, - 'late_equalizer_scored_in_last_15_mins': _calculate_late_equalizer_scored_in_last_15_mins_vectorized, - 'red_card_to_leading_team_after_60_min': _calculate_red_card_to_leading_team_after_60_min_vectorized, - 'max_continuous_time_score_tied_overall': _calculate_max_continuous_time_score_tied_overall_vectorized, - 'low_total_goals_match': _calculate_low_total_goals_match_vectorized, - 'first_goal_scored_late_or_no_goals': _calculate_first_goal_scored_late_or_no_goals_vectorized, - 'late_substitutions_intensity_when_close': _calculate_late_substitutions_intensity_when_close_vectorized, - 'rolling_avg_goal_difference_last_30_min_at_end': _calculate_rolling_avg_goal_difference_last_30_min_at_end_vectorized, - 'mean_time_between_goals_overall': _calculate_mean_time_between_goals_overall_vectorized, - 'abs_shots_on_target_diff_last_20_min': _calculate_abs_shots_on_target_diff_last_20_min_vectorized, - 'total_yellow_cards_first_half': _calculate_total_yellow_cards_first_half_vectorized, - 'abs_goal_diff_0_30_min': _calculate_abs_goal_diff_0_30_min_vectorized, - 'abs_goal_diff_30_60_min': 
_calculate_abs_goal_diff_30_60_min_vectorized, - 'abs_goal_diff_60_90_min': _calculate_abs_goal_diff_60_90_min_vectorized, - 'abs_substitutions_diff_overall': _calculate_abs_substitutions_diff_overall_vectorized + "is_score_tied_at_80_min": _calculate_is_score_tied_at_80_min_vectorized, + "abs_goal_difference_at_45_min": _calculate_abs_goal_difference_at_45_min_vectorized, + "late_equalizer_scored_in_last_15_mins": _calculate_late_equalizer_scored_in_last_15_mins_vectorized, + "red_card_to_leading_team_after_60_min": _calculate_red_card_to_leading_team_after_60_min_vectorized, + "max_continuous_time_score_tied_overall": _calculate_max_continuous_time_score_tied_overall_vectorized, + "low_total_goals_match": _calculate_low_total_goals_match_vectorized, + "first_goal_scored_late_or_no_goals": _calculate_first_goal_scored_late_or_no_goals_vectorized, + "late_substitutions_intensity_when_close": _calculate_late_substitutions_intensity_when_close_vectorized, + "rolling_avg_goal_difference_last_30_min_at_end": _calculate_rolling_avg_goal_difference_last_30_min_at_end_vectorized, + "mean_time_between_goals_overall": _calculate_mean_time_between_goals_overall_vectorized, + "abs_shots_on_target_diff_last_20_min": _calculate_abs_shots_on_target_diff_last_20_min_vectorized, + "total_yellow_cards_first_half": _calculate_total_yellow_cards_first_half_vectorized, + "abs_goal_diff_0_30_min": _calculate_abs_goal_diff_0_30_min_vectorized, + "abs_goal_diff_30_60_min": _calculate_abs_goal_diff_30_60_min_vectorized, + "abs_goal_diff_60_90_min": _calculate_abs_goal_diff_60_90_min_vectorized, + "abs_substitutions_diff_overall": _calculate_abs_substitutions_diff_overall_vectorized, # 'proportion_of_game_time_score_is_tied_at_end': _calculate_proportion_of_game_time_score_is_tied_at_end_vectorized } @@ -108,56 +144,67 @@ def calculate_event_features_vectorized(events_df: pd.DataFrame, fixtures_df: pd try: print(f"Calculating {feature_name}...") # Pass fixtures_df as the second argument, 
which corresponds to fixtures_info_df in helpers - feature_series = calc_function(processed_events_df, fixtures_df) - features_df = features_df.join(feature_series) + feature_series = calc_function(processed_events_df, fixtures_df) + features_df = features_df.join(feature_series) except Exception as e: print(f"Error calculating {feature_name}: {e}. Filling with NaNs.") # Create a default series with NaNs, indexed like features_df (which is fixture_id) default_series = pd.Series(np.nan, index=features_df.index, name=feature_name) features_df = features_df.join(default_series) - + # Reset index to make fixture_id a column again features_df.reset_index(inplace=True) - + # Ensure all defined feature columns are present, even if no events led to their calculation - expected_cols = ['fixture_id'] + get_feature_names() + expected_cols = ["fixture_id"] + get_feature_names() for col in expected_cols: if col not in features_df.columns: # Default for many features will be 0 if they are counts/flags, or NaN if ratios/means that couldn't be calculated - if col.startswith('is_') or col.startswith('late_equalizer') or col.startswith('red_card') or col.startswith('low_total') or col.startswith('first_goal_'): - features_df[col] = 0 - elif col.startswith('abs_') or col.startswith('total_') or col.startswith('late_subs'): + if ( + col.startswith("is_") + or col.startswith("late_equalizer") + or col.startswith("red_card") + or col.startswith("low_total") + or col.startswith("first_goal_") + ): features_df[col] = 0 - else: # Means, durations, proportions + elif col.startswith("abs_") or col.startswith("total_") or col.startswith("late_subs"): + features_df[col] = 0 + else: # Means, durations, proportions features_df[col] = np.nan - - return features_df # Return only expected columns in defined order + + return features_df # Return only expected columns in defined order + def get_feature_names(): """Returns a list of all feature names that will be calculated.""" return [ - 
'is_score_tied_at_80_min', - 'abs_goal_difference_at_45_min', - 'late_equalizer_scored_in_last_15_mins', - 'red_card_to_leading_team_after_60_min', - 'max_continuous_time_score_tied_overall', - 'low_total_goals_match', - 'first_goal_scored_late_or_no_goals', - 'late_substitutions_intensity_when_close', - 'rolling_avg_goal_difference_last_30_min_at_end', - 'mean_time_between_goals_overall', - 'abs_shots_on_target_diff_last_20_min', - 'total_yellow_cards_first_half', - 'abs_goal_diff_0_30_min', - 'abs_goal_diff_30_60_min', - 'abs_goal_diff_60_90_min', - 'abs_substitutions_diff_overall' + "is_score_tied_at_80_min", + "abs_goal_difference_at_45_min", + "late_equalizer_scored_in_last_15_mins", + "red_card_to_leading_team_after_60_min", + "max_continuous_time_score_tied_overall", + "low_total_goals_match", + "first_goal_scored_late_or_no_goals", + "late_substitutions_intensity_when_close", + "rolling_avg_goal_difference_last_30_min_at_end", + "mean_time_between_goals_overall", + "abs_shots_on_target_diff_last_20_min", + "total_yellow_cards_first_half", + "abs_goal_diff_0_30_min", + "abs_goal_diff_30_60_min", + "abs_goal_diff_60_90_min", + "abs_substitutions_diff_overall", # 'proportion_of_game_time_score_is_tied_at_end' ] + # --- Individual Feature Calculation Functions Start Here --- -def _get_score_at_time_vectorized(processed_events_df: pd.DataFrame, time_limit: int, fixtures_info_df: pd.DataFrame) -> pd.DataFrame: + +def _get_score_at_time_vectorized( + processed_events_df: pd.DataFrame, time_limit: int, fixtures_info_df: pd.DataFrame +) -> pd.DataFrame: """ Calculates home and away goals at a specific time_limit for all fixtures. 
@@ -171,46 +218,60 @@ def _get_score_at_time_vectorized(processed_events_df: pd.DataFrame, time_limit: """ if processed_events_df.empty: # Create a DataFrame with all fixtures and 0 goals if no events - scores_at_time = fixtures_info_df[['fixture_id']].copy() - scores_at_time['home_goals_at_time'] = 0 - scores_at_time['away_goals_at_time'] = 0 - return scores_at_time.set_index('fixture_id') + scores_at_time = fixtures_info_df[["fixture_id"]].copy() + scores_at_time["home_goals_at_time"] = 0 + scores_at_time["away_goals_at_time"] = 0 + return scores_at_time.set_index("fixture_id") - events_at_time_limit = processed_events_df[processed_events_df['time_elapsed'] <= time_limit].copy() + events_at_time_limit = processed_events_df[ + processed_events_df["time_elapsed"] <= time_limit + ].copy() - if events_at_time_limit.empty: # No events at or before the time limit for any fixture - scores_at_time = fixtures_info_df[['fixture_id']].copy() - scores_at_time['home_goals_at_time'] = 0 - scores_at_time['away_goals_at_time'] = 0 - return scores_at_time.set_index('fixture_id') + if events_at_time_limit.empty: # No events at or before the time limit for any fixture + scores_at_time = fixtures_info_df[["fixture_id"]].copy() + scores_at_time["home_goals_at_time"] = 0 + scores_at_time["away_goals_at_time"] = 0 + return scores_at_time.set_index("fixture_id") # Ensure 'is_goal' is boolean for sum() - events_at_time_limit['is_goal'] = events_at_time_limit['is_goal'].astype(bool) - + events_at_time_limit["is_goal"] = events_at_time_limit["is_goal"].astype(bool) + # Calculate goals for home team and away team - events_at_time_limit['is_home_goal'] = (events_at_time_limit['is_goal']) & (events_at_time_limit['team_id'] == events_at_time_limit['home_team_id']) - events_at_time_limit['is_away_goal'] = (events_at_time_limit['is_goal']) & (events_at_time_limit['team_id'] == events_at_time_limit['away_team_id']) - - scores_at_time = events_at_time_limit.groupby('fixture_id').agg( - 
home_goals_at_time=('is_home_goal', 'sum'), - away_goals_at_time=('is_away_goal', 'sum') - ).reset_index() - + events_at_time_limit["is_home_goal"] = (events_at_time_limit["is_goal"]) & ( + events_at_time_limit["team_id"] == events_at_time_limit["home_team_id"] + ) + events_at_time_limit["is_away_goal"] = (events_at_time_limit["is_goal"]) & ( + events_at_time_limit["team_id"] == events_at_time_limit["away_team_id"] + ) + + scores_at_time = ( + events_at_time_limit.groupby("fixture_id") + .agg(home_goals_at_time=("is_home_goal", "sum"), away_goals_at_time=("is_away_goal", "sum")) + .reset_index() + ) + # Merge with fixtures_info_df to include all fixtures, defaulting missing ones to 0 goals # Need home_team_id, away_team_id from fixtures_info_df for context all_fixtures_scores = pd.merge( - fixtures_info_df[['fixture_id', 'home_team_id', 'away_team_id']], - scores_at_time, - on='fixture_id', - how='left' - ).fillna({'home_goals_at_time': 0, 'away_goals_at_time': 0}) - - all_fixtures_scores['home_goals_at_time'] = all_fixtures_scores['home_goals_at_time'].astype(int) - all_fixtures_scores['away_goals_at_time'] = all_fixtures_scores['away_goals_at_time'].astype(int) - - return all_fixtures_scores.set_index('fixture_id')[['home_goals_at_time', 'away_goals_at_time']] - -def _calculate_is_score_tied_at_80_min_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + fixtures_info_df[["fixture_id", "home_team_id", "away_team_id"]], + scores_at_time, + on="fixture_id", + how="left", + ).fillna({"home_goals_at_time": 0, "away_goals_at_time": 0}) + + all_fixtures_scores["home_goals_at_time"] = all_fixtures_scores["home_goals_at_time"].astype( + int + ) + all_fixtures_scores["away_goals_at_time"] = all_fixtures_scores["away_goals_at_time"].astype( + int + ) + + return all_fixtures_scores.set_index("fixture_id")[["home_goals_at_time", "away_goals_at_time"]] + + +def _calculate_is_score_tied_at_80_min_vectorized( + processed_events_df: 
pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates if the score was tied at the 80th minute for each fixture. Args: @@ -221,12 +282,17 @@ def _calculate_is_score_tied_at_80_min_vectorized(processed_events_df: pd.DataFr Fixtures with no goals or events before 80 min are considered tied 0-0. """ scores_at_80_min = _get_score_at_time_vectorized(processed_events_df, 80, fixtures_info_df) - - is_tied_series = (scores_at_80_min['home_goals_at_time'] == scores_at_80_min['away_goals_at_time']).astype(int) - is_tied_series.name = 'is_score_tied_at_80_min' + + is_tied_series = ( + scores_at_80_min["home_goals_at_time"] == scores_at_80_min["away_goals_at_time"] + ).astype(int) + is_tied_series.name = "is_score_tied_at_80_min" return is_tied_series -def _calculate_abs_goal_difference_at_45_min_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_abs_goal_difference_at_45_min_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates the absolute goal difference at the 45th minute for each fixture. Args: @@ -237,11 +303,16 @@ def _calculate_abs_goal_difference_at_45_min_vectorized(processed_events_df: pd. Fixtures with no goals by 45th min have a difference of 0. """ scores_at_45_min = _get_score_at_time_vectorized(processed_events_df, 45, fixtures_info_df) - - abs_diff_series = (scores_at_45_min['home_goals_at_time'] - scores_at_45_min['away_goals_at_time']).abs().astype(int) - abs_diff_series.name = 'abs_goal_difference_at_45_min' + + abs_diff_series = ( + (scores_at_45_min["home_goals_at_time"] - scores_at_45_min["away_goals_at_time"]) + .abs() + .astype(int) + ) + abs_diff_series.name = "abs_goal_difference_at_45_min" return abs_diff_series + def _late_equalizer_check_fixture(fixture_events: pd.DataFrame) -> int: """ Helper function for a single fixture to check for a late equalizer. 
@@ -251,12 +322,11 @@ def _late_equalizer_check_fixture(fixture_events: pd.DataFrame) -> int: return 0 # Ensure home_team_id and away_team_id are consistent for the fixture - home_team_id = fixture_events['home_team_id'].iloc[0] - away_team_id = fixture_events['away_team_id'].iloc[0] + home_team_id = fixture_events["home_team_id"].iloc[0] + away_team_id = fixture_events["away_team_id"].iloc[0] late_goals = fixture_events[ - (fixture_events['is_goal']) & - (fixture_events['time_elapsed'] >= 75) + (fixture_events["is_goal"]) & (fixture_events["time_elapsed"] >= 75) ].copy() if late_goals.empty: @@ -264,37 +334,53 @@ def _late_equalizer_check_fixture(fixture_events: pd.DataFrame) -> int: # Calculate cumulative scores *within this fixture* up to each event time # This is a simplified version of _get_score_at_time for a single fixture context - fixture_events_sorted = fixture_events.sort_values(by='time_elapsed') - fixture_events_sorted['is_home_goal_event'] = (fixture_events_sorted['is_goal']) & (fixture_events_sorted['team_id'] == home_team_id) - fixture_events_sorted['is_away_goal_event'] = (fixture_events_sorted['is_goal']) & (fixture_events_sorted['team_id'] == away_team_id) - - fixture_events_sorted['current_home_score'] = fixture_events_sorted['is_home_goal_event'].cumsum() - fixture_events_sorted['current_away_score'] = fixture_events_sorted['is_away_goal_event'].cumsum() + fixture_events_sorted = fixture_events.sort_values(by="time_elapsed") + fixture_events_sorted["is_home_goal_event"] = (fixture_events_sorted["is_goal"]) & ( + fixture_events_sorted["team_id"] == home_team_id + ) + fixture_events_sorted["is_away_goal_event"] = (fixture_events_sorted["is_goal"]) & ( + fixture_events_sorted["team_id"] == away_team_id + ) + + fixture_events_sorted["current_home_score"] = fixture_events_sorted[ + "is_home_goal_event" + ].cumsum() + fixture_events_sorted["current_away_score"] = fixture_events_sorted[ + "is_away_goal_event" + ].cumsum() for _, goal_event in 
late_goals.iterrows(): - time_of_goal = goal_event['time_elapsed'] + time_of_goal = goal_event["time_elapsed"] # Score just BEFORE this specific late goal (using events strictly before this one) - events_before_goal = fixture_events_sorted[fixture_events_sorted['time_elapsed'] < time_of_goal] + events_before_goal = fixture_events_sorted[ + fixture_events_sorted["time_elapsed"] < time_of_goal + ] if not events_before_goal.empty: - home_goals_before_event = events_before_goal.iloc[-1]['current_home_score'] - away_goals_before_event = events_before_goal.iloc[-1]['current_away_score'] - else: # This goal is the first event, or first goal + home_goals_before_event = events_before_goal.iloc[-1]["current_home_score"] + away_goals_before_event = events_before_goal.iloc[-1]["current_away_score"] + else: # This goal is the first event, or first goal home_goals_before_event = 0 away_goals_before_event = 0 - + # Score right AFTER (i.e. including) this specific late goal # We need to find the state of current_home_score and current_away_score for *this* goal_event row # from fixture_events_sorted, which has the cumulative scores including this goal. 
current_event_scores = fixture_events_sorted[fixture_events_sorted.index == goal_event.name] - home_goals_after_event = current_event_scores['current_home_score'].iloc[0] - away_goals_after_event = current_event_scores['current_away_score'].iloc[0] + home_goals_after_event = current_event_scores["current_home_score"].iloc[0] + away_goals_after_event = current_event_scores["current_away_score"].iloc[0] - if home_goals_after_event == away_goals_after_event and home_goals_before_event != away_goals_before_event: + if ( + home_goals_after_event == away_goals_after_event + and home_goals_before_event != away_goals_before_event + ): return 1 return 0 -def _calculate_late_equalizer_scored_in_last_15_mins_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_late_equalizer_scored_in_last_15_mins_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates if a late equalizer was scored (from 75th min onwards) for each fixture. Args: @@ -305,27 +391,36 @@ def _calculate_late_equalizer_scored_in_last_15_mins_vectorized(processed_events Defaults to 0 for fixtures with no late goals or no events. """ if processed_events_df.empty: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='late_equalizer_scored_in_last_15_mins') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0) + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="late_equalizer_scored_in_last_15_mins", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(0) # Ensure home_team_id and away_team_id are present from fixtures_info_df merge in main func. # These columns must be in processed_events_df. 
- if not all(col in processed_events_df.columns for col in ['home_team_id', 'away_team_id']): - raise ValueError("processed_events_df must contain 'home_team_id' and 'away_team_id' for late equalizer check.") + if not all(col in processed_events_df.columns for col in ["home_team_id", "away_team_id"]): + raise ValueError( + "processed_events_df must contain 'home_team_id' and 'away_team_id' for late equalizer check." + ) # Apply the helper function to each fixture's events # The `fixture_events` passed to `_late_equalizer_check_fixture` will already have the correct home/away team_id # due to the merge in `calculate_event_features_vectorized`. - late_equalizer_series = processed_events_df.groupby('fixture_id').apply(_late_equalizer_check_fixture, include_groups=False) - late_equalizer_series.name = 'late_equalizer_scored_in_last_15_mins' + late_equalizer_series = processed_events_df.groupby("fixture_id").apply( + _late_equalizer_check_fixture, include_groups=False + ) + late_equalizer_series.name = "late_equalizer_scored_in_last_15_mins" # Ensure all fixtures from fixtures_info_df are present, defaulting to 0 # Create a base series from fixtures_info_df to ensure all fixture_ids are included - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index result_series = late_equalizer_series.reindex(base_fixture_ids).fillna(0).astype(int) - + return result_series + def _red_card_leading_team_check_fixture(fixture_events: pd.DataFrame) -> int: """ Helper for a single fixture to check for a red card to the leading team after 60 mins. 
@@ -334,44 +429,57 @@ def _red_card_leading_team_check_fixture(fixture_events: pd.DataFrame) -> int: if fixture_events.empty: return 0 - home_team_id = fixture_events['home_team_id'].iloc[0] - away_team_id = fixture_events['away_team_id'].iloc[0] + home_team_id = fixture_events["home_team_id"].iloc[0] + away_team_id = fixture_events["away_team_id"].iloc[0] red_card_events_late = fixture_events[ - (fixture_events['event_detail'] == 'Red Card') & # Assuming this is how red cards are marked - (fixture_events['time_elapsed'] >= 60) + (fixture_events["event_detail"] == "Red Card") # Assuming this is how red cards are marked + & (fixture_events["time_elapsed"] >= 60) ].copy() if red_card_events_late.empty: return 0 # Calculate cumulative scores within this fixture up to each event time - fixture_events_sorted = fixture_events.sort_values(by='time_elapsed') - fixture_events_sorted['is_home_goal_event'] = (fixture_events_sorted['is_goal']) & (fixture_events_sorted['team_id'] == home_team_id) - fixture_events_sorted['is_away_goal_event'] = (fixture_events_sorted['is_goal']) & (fixture_events_sorted['team_id'] == away_team_id) - fixture_events_sorted['current_home_score'] = fixture_events_sorted['is_home_goal_event'].cumsum() - fixture_events_sorted['current_away_score'] = fixture_events_sorted['is_away_goal_event'].cumsum() + fixture_events_sorted = fixture_events.sort_values(by="time_elapsed") + fixture_events_sorted["is_home_goal_event"] = (fixture_events_sorted["is_goal"]) & ( + fixture_events_sorted["team_id"] == home_team_id + ) + fixture_events_sorted["is_away_goal_event"] = (fixture_events_sorted["is_goal"]) & ( + fixture_events_sorted["team_id"] == away_team_id + ) + fixture_events_sorted["current_home_score"] = fixture_events_sorted[ + "is_home_goal_event" + ].cumsum() + fixture_events_sorted["current_away_score"] = fixture_events_sorted[ + "is_away_goal_event" + ].cumsum() for _, card_event in red_card_events_late.iterrows(): - time_of_card = 
card_event['time_elapsed'] - carded_team_id = card_event['team_id'] + time_of_card = card_event["time_elapsed"] + carded_team_id = card_event["team_id"] - events_before_card = fixture_events_sorted[fixture_events_sorted['time_elapsed'] < time_of_card] + events_before_card = fixture_events_sorted[ + fixture_events_sorted["time_elapsed"] < time_of_card + ] if not events_before_card.empty: - home_goals_before_card = events_before_card.iloc[-1]['current_home_score'] - away_goals_before_card = events_before_card.iloc[-1]['current_away_score'] - else: # Card is the first event, or no goals before it + home_goals_before_card = events_before_card.iloc[-1]["current_home_score"] + away_goals_before_card = events_before_card.iloc[-1]["current_away_score"] + else: # Card is the first event, or no goals before it home_goals_before_card = 0 away_goals_before_card = 0 - + if home_goals_before_card > away_goals_before_card and carded_team_id == home_team_id: - return 1 # Home team was leading and got a red card + return 1 # Home team was leading and got a red card if away_goals_before_card > home_goals_before_card and carded_team_id == away_team_id: - return 1 # Away team was leading and got a red card - + return 1 # Away team was leading and got a red card + return 0 -def _calculate_red_card_to_leading_team_after_60_min_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_red_card_to_leading_team_after_60_min_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates if a leading team received a red card after the 60th minute. Args: @@ -381,84 +489,112 @@ def _calculate_red_card_to_leading_team_after_60_min_vectorized(processed_events pd.Series: Indexed by fixture_id, value is 1 if condition met, 0 otherwise. 
""" if processed_events_df.empty: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='red_card_to_leading_team_after_60_min') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0) + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="red_card_to_leading_team_after_60_min", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(0) - if not all(col in processed_events_df.columns for col in ['home_team_id', 'away_team_id', 'event_detail']): + if not all( + col in processed_events_df.columns + for col in ["home_team_id", "away_team_id", "event_detail"] + ): # If event_detail is missing, this feature cannot be calculated as specified. - print("Warning: 'home_team_id', 'away_team_id', or 'event_detail' missing in processed_events_df for red card check. Returning 0s.") - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='red_card_to_leading_team_after_60_min') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) - - red_card_series = processed_events_df.groupby('fixture_id').apply(_red_card_leading_team_check_fixture, include_groups=False) - red_card_series.name = 'red_card_to_leading_team_after_60_min' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index + print( + "Warning: 'home_team_id', 'away_team_id', or 'event_detail' missing in processed_events_df for red card check. Returning 0s." 
+ ) + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="red_card_to_leading_team_after_60_min", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) + + red_card_series = processed_events_df.groupby("fixture_id").apply( + _red_card_leading_team_check_fixture, include_groups=False + ) + red_card_series.name = "red_card_to_leading_team_after_60_min" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index result_series = red_card_series.reindex(base_fixture_ids).fillna(0).astype(int) return result_series + def _max_continuous_time_tied_fixture(fixture_events: pd.DataFrame) -> float: """ Helper for a single fixture to calculate max continuous time score was tied. Assumes fixture_events are sorted and contain home_team_id, away_team_id. """ - if fixture_events.empty: # No events, score is 0-0 for the whole match (assuming 90 min) - return 90.0 + if fixture_events.empty: # No events, score is 0-0 for the whole match (assuming 90 min) + return 90.0 - home_team_id = fixture_events['home_team_id'].iloc[0] - away_team_id = fixture_events['away_team_id'].iloc[0] + home_team_id = fixture_events["home_team_id"].iloc[0] + away_team_id = fixture_events["away_team_id"].iloc[0] # Relevant events are goals. Also consider match start and end. - goal_events_times = fixture_events[fixture_events['is_goal']]['time_elapsed'].unique() - + goal_events_times = fixture_events[fixture_events["is_goal"]]["time_elapsed"].unique() + # Define critical time points: start, goal times, and match end. # Assume match end is 90, or max event time if events go beyond that (e.g. 
injury time goals) match_end_time = 90.0 if not fixture_events.empty: - max_event_time = fixture_events['time_elapsed'].max() - if max_event_time > match_end_time: # Handle cases where events (like goals) are recorded past 90 min + max_event_time = fixture_events["time_elapsed"].max() + if ( + max_event_time > match_end_time + ): # Handle cases where events (like goals) are recorded past 90 min match_end_time = max_event_time - + time_points = sorted(list(set([0.0] + list(goal_events_times) + [match_end_time]))) # Filter time_points to be within 0 and match_end_time, and ensure they are unique and sorted. time_points = sorted(list(set(tp for tp in time_points if 0 <= tp <= match_end_time))) if not time_points or time_points[-1] < match_end_time: - if match_end_time not in time_points: + if match_end_time not in time_points: time_points.append(match_end_time) - time_points = sorted(list(set(time_points))) + time_points = sorted(list(set(time_points))) if 0.0 not in time_points: time_points.insert(0, 0.0) max_tied_duration = 0.0 current_tied_streak_start_time = 0.0 - score_is_currently_tied = True # Starts 0-0 at time 0.0 + score_is_currently_tied = True # Starts 0-0 at time 0.0 # Pre-calculate scores at each event to avoid repeated full scans for a single fixture # This part is crucial for performance within the apply function. # We need running scores at each of the *original* event times. 
- _fixture_events_sorted = fixture_events.sort_values(by='time_elapsed').copy() - _fixture_events_sorted['is_home_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == home_team_id) - _fixture_events_sorted['is_away_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == away_team_id) - _fixture_events_sorted['cum_home_g'] = _fixture_events_sorted['is_home_g'].cumsum() - _fixture_events_sorted['cum_away_g'] = _fixture_events_sorted['is_away_g'].cumsum() + _fixture_events_sorted = fixture_events.sort_values(by="time_elapsed").copy() + _fixture_events_sorted["is_home_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == home_team_id + ) + _fixture_events_sorted["is_away_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == away_team_id + ) + _fixture_events_sorted["cum_home_g"] = _fixture_events_sorted["is_home_g"].cumsum() + _fixture_events_sorted["cum_away_g"] = _fixture_events_sorted["is_away_g"].cumsum() # Create a lookup for scores at any given time for this fixture def get_score_at_t(t_limit, _events_with_cum_scores): - relevant_events = _events_with_cum_scores[_events_with_cum_scores['time_elapsed'] <= t_limit] + relevant_events = _events_with_cum_scores[ + _events_with_cum_scores["time_elapsed"] <= t_limit + ] if relevant_events.empty: return 0, 0 # Last event at or before t_limit holds the cumulative score last_event_at_t = relevant_events.iloc[-1] - return last_event_at_t['cum_home_g'], last_event_at_t['cum_away_g'] + return last_event_at_t["cum_home_g"], last_event_at_t["cum_away_g"] for i in range(len(time_points)): current_segment_end_time = time_points[i] - + # Score at the beginning of the current segment (end of last segment) # For the first segment (i=0, current_segment_end_time = 0.0), this is 0-0 # For subsequent segments, it's the score at time_points[i-1] - prev_segment_end_time = time_points[i-1] if i > 0 else 0.0 - + time_points[i 
- 1] if i > 0 else 0.0 + # We need score *at* current_segment_end_time to decide if the state *changes* at this point. # But the duration calculation depends on the state *during* the segment from prev to current. @@ -466,31 +602,42 @@ def get_score_at_t(t_limit, _events_with_cum_scores): if score_is_currently_tied: # This segment (from prev_segment_end_time to current_segment_end_time) was a tied period. # Add its duration to max_tied_duration if it's the end of the streak. - pass # Covered by logic below - + pass # Covered by logic below + # Determine score at current_segment_end_time - h_goals_at_current_t, a_goals_at_current_t = get_score_at_t(current_segment_end_time, _fixture_events_sorted) + h_goals_at_current_t, a_goals_at_current_t = get_score_at_t( + current_segment_end_time, _fixture_events_sorted + ) if score_is_currently_tied: - if h_goals_at_current_t != a_goals_at_current_t: # Score just became un-tied at current_segment_end_time + if ( + h_goals_at_current_t != a_goals_at_current_t + ): # Score just became un-tied at current_segment_end_time duration = current_segment_end_time - current_tied_streak_start_time max_tied_duration = max(max_tied_duration, duration) score_is_currently_tied = False # If it's still tied, the streak continues. current_tied_streak_start_time remains unchanged. 
- else: # Score was not tied leading into this point - if h_goals_at_current_t == a_goals_at_current_t: # Score just became tied at current_segment_end_time + else: # Score was not tied leading into this point + if ( + h_goals_at_current_t == a_goals_at_current_t + ): # Score just became tied at current_segment_end_time score_is_currently_tied = True - current_tied_streak_start_time = current_segment_end_time # New tied streak starts now - + current_tied_streak_start_time = ( + current_segment_end_time # New tied streak starts now + ) + # Special handling for the very last segment if the match ends tied if i == len(time_points) - 1 and score_is_currently_tied: # If it's the last point (match_end_time) and score is still tied duration = match_end_time - current_tied_streak_start_time max_tied_duration = max(max_tied_duration, duration) - + return round(max_tied_duration, 2) -def _calculate_max_continuous_time_score_tied_overall_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_max_continuous_time_score_tied_overall_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates the maximum continuous time the score was tied in minutes. 
Args: @@ -503,24 +650,41 @@ def _calculate_max_continuous_time_score_tied_overall_vectorized(processed_event """ if processed_events_df.empty: # Assume 90 mins tied if no events (0-0 for whole match) - result_series = pd.Series(90.0, index=fixtures_info_df['fixture_id'].unique(), name='max_continuous_time_score_tied_overall') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(90.0) + result_series = pd.Series( + 90.0, + index=fixtures_info_df["fixture_id"].unique(), + name="max_continuous_time_score_tied_overall", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(90.0) - if not all(col in processed_events_df.columns for col in ['home_team_id', 'away_team_id']): + if not all(col in processed_events_df.columns for col in ["home_team_id", "away_team_id"]): # This shouldn't happen if pre-processing and merges are correct - print("Warning: 'home_team_id' or 'away_team_id' missing for max_continuous_time_score_tied. Returning 0s.") - result_series = pd.Series(0.0, index=fixtures_info_df['fixture_id'].unique(), name='max_continuous_time_score_tied_overall') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0.0) - - max_tied_time_series = processed_events_df.groupby('fixture_id').apply(_max_continuous_time_tied_fixture, include_groups=False) - max_tied_time_series.name = 'max_continuous_time_score_tied_overall' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index - result_series = max_tied_time_series.reindex(base_fixture_ids).fillna(90.0) # Default for missing fixtures (no events) - + print( + "Warning: 'home_team_id' or 'away_team_id' missing for max_continuous_time_score_tied. Returning 0s." 
+ ) + result_series = pd.Series( + 0.0, + index=fixtures_info_df["fixture_id"].unique(), + name="max_continuous_time_score_tied_overall", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(0.0) + + max_tied_time_series = processed_events_df.groupby("fixture_id").apply( + _max_continuous_time_tied_fixture, include_groups=False + ) + max_tied_time_series.name = "max_continuous_time_score_tied_overall" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index + result_series = max_tied_time_series.reindex(base_fixture_ids).fillna( + 90.0 + ) # Default for missing fixtures (no events) + return result_series -def _calculate_low_total_goals_match_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_low_total_goals_match_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates if the match had low total goals (0 or 2). Args: @@ -532,25 +696,36 @@ def _calculate_low_total_goals_match_vectorized(processed_events_df: pd.DataFram """ if processed_events_df.empty: # No events means 0 goals, which is a low total goals match. 
- result_series = pd.Series(1, index=fixtures_info_df['fixture_id'].unique(), name='low_total_goals_match') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(1).astype(int) + result_series = pd.Series( + 1, index=fixtures_info_df["fixture_id"].unique(), name="low_total_goals_match" + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(1) + .astype(int) + ) # Count total goals per fixture - total_goals_per_fixture = processed_events_df.groupby('fixture_id')['is_goal'].sum() - total_goals_per_fixture.name = 'total_goals' + total_goals_per_fixture = processed_events_df.groupby("fixture_id")["is_goal"].sum() + total_goals_per_fixture.name = "total_goals" # Check condition (0 or 2 goals) low_goals_series = ((total_goals_per_fixture == 0) | (total_goals_per_fixture == 2)).astype(int) - low_goals_series.name = 'low_total_goals_match' + low_goals_series.name = "low_total_goals_match" # Ensure all fixtures are present, defaulting appropriately # Fixtures not in total_goals_per_fixture had 0 goals, so they satisfy the condition (0 goals). - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index - result_series = low_goals_series.reindex(base_fixture_ids).fillna(1).astype(int) # Default to 1 (0 goals) - + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index + result_series = ( + low_goals_series.reindex(base_fixture_ids).fillna(1).astype(int) + ) # Default to 1 (0 goals) + return result_series -def _calculate_first_goal_scored_late_or_no_goals_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_first_goal_scored_late_or_no_goals_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates if the first goal was scored late (>=70 min) or if there were no goals. 
Args: @@ -562,31 +737,50 @@ def _calculate_first_goal_scored_late_or_no_goals_vectorized(processed_events_df """ if processed_events_df.empty: # No events means no goals, so condition is met. - result_series = pd.Series(1, index=fixtures_info_df['fixture_id'].unique(), name='first_goal_scored_late_or_no_goals') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(1).astype(int) + result_series = pd.Series( + 1, + index=fixtures_info_df["fixture_id"].unique(), + name="first_goal_scored_late_or_no_goals", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(1) + .astype(int) + ) - goal_events = processed_events_df[processed_events_df['is_goal']].copy() + goal_events = processed_events_df[processed_events_df["is_goal"]].copy() if goal_events.empty: # No goal events across all fixtures, so all meet the 'no goals' condition. - result_series = pd.Series(1, index=fixtures_info_df['fixture_id'].unique(), name='first_goal_scored_late_or_no_goals') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(1).astype(int) + result_series = pd.Series( + 1, + index=fixtures_info_df["fixture_id"].unique(), + name="first_goal_scored_late_or_no_goals", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(1) + .astype(int) + ) # Time of the first goal for each fixture that has goals - first_goal_times = goal_events.groupby('fixture_id')['time_elapsed'].min() - first_goal_times.name = 'first_goal_time' + first_goal_times = goal_events.groupby("fixture_id")["time_elapsed"].min() + first_goal_times.name = "first_goal_time" # Check condition: first goal time >= 70 late_first_goal_series = (first_goal_times >= 70).astype(int) - late_first_goal_series.name = 'first_goal_scored_late_or_no_goals' + late_first_goal_series.name = "first_goal_scored_late_or_no_goals" # Merge with all fixtures. 
Fixtures not in late_first_goal_series either had no goals (condition met) # or their first goal was < 70 (condition not met, but fillna(1) handles the no-goal case correctly). - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index - result_series = late_first_goal_series.reindex(base_fixture_ids).fillna(1).astype(int) # Default to 1 (no goals implies condition met) - + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index + result_series = ( + late_first_goal_series.reindex(base_fixture_ids).fillna(1).astype(int) + ) # Default to 1 (no goals implies condition met) + return result_series + def _late_subs_intensity_check_fixture(fixture_events: pd.DataFrame) -> int: """ Helper for a single fixture to count late substitutions when score is close. @@ -595,45 +789,53 @@ def _late_subs_intensity_check_fixture(fixture_events: pd.DataFrame) -> int: if fixture_events.empty: return 0 - home_team_id = fixture_events['home_team_id'].iloc[0] - away_team_id = fixture_events['away_team_id'].iloc[0] + home_team_id = fixture_events["home_team_id"].iloc[0] + away_team_id = fixture_events["away_team_id"].iloc[0] # Assuming 'subst' is the event_type for substitution late_subs = fixture_events[ - (fixture_events['event_type'] == 'subst') & - (fixture_events['time_elapsed'] >= 70) + (fixture_events["event_type"] == "subst") & (fixture_events["time_elapsed"] >= 70) ].copy() if late_subs.empty: return 0 # Pre-calculate cumulative scores for this fixture - _fixture_events_sorted = fixture_events.sort_values(by='time_elapsed').copy() - _fixture_events_sorted['is_home_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == home_team_id) - _fixture_events_sorted['is_away_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == away_team_id) - _fixture_events_sorted['cum_home_g'] = _fixture_events_sorted['is_home_g'].cumsum() - _fixture_events_sorted['cum_away_g'] = _fixture_events_sorted['is_away_g'].cumsum() + 
_fixture_events_sorted = fixture_events.sort_values(by="time_elapsed").copy() + _fixture_events_sorted["is_home_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == home_team_id + ) + _fixture_events_sorted["is_away_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == away_team_id + ) + _fixture_events_sorted["cum_home_g"] = _fixture_events_sorted["is_home_g"].cumsum() + _fixture_events_sorted["cum_away_g"] = _fixture_events_sorted["is_away_g"].cumsum() def get_score_just_before_t(t_event, _events_with_cum_scores): # Events strictly before t_event - relevant_events = _events_with_cum_scores[_events_with_cum_scores['time_elapsed'] < t_event] + relevant_events = _events_with_cum_scores[_events_with_cum_scores["time_elapsed"] < t_event] if relevant_events.empty: return 0, 0 last_event_before_t = relevant_events.iloc[-1] - return last_event_before_t['cum_home_g'], last_event_before_t['cum_away_g'] + return last_event_before_t["cum_home_g"], last_event_before_t["cum_away_g"] intensity_count = 0 for _, sub_event in late_subs.iterrows(): - time_of_sub = sub_event['time_elapsed'] - - home_goals_before_sub, away_goals_before_sub = get_score_just_before_t(time_of_sub, _fixture_events_sorted) - + time_of_sub = sub_event["time_elapsed"] + + home_goals_before_sub, away_goals_before_sub = get_score_just_before_t( + time_of_sub, _fixture_events_sorted + ) + if abs(home_goals_before_sub - away_goals_before_sub) <= 1: intensity_count += 1 - + return intensity_count -def _calculate_late_substitutions_intensity_when_close_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_late_substitutions_intensity_when_close_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates intensity of late substitutions when the score is close. 
Args: @@ -643,58 +845,88 @@ def _calculate_late_substitutions_intensity_when_close_vectorized(processed_even pd.Series: Indexed by fixture_id, value is count of such substitutions. Defaults to 0 for fixtures with no relevant events. """ - if processed_events_df.empty or 'subst' not in processed_events_df['event_type'].unique(): - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='late_substitutions_intensity_when_close') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + if processed_events_df.empty or "subst" not in processed_events_df["event_type"].unique(): + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="late_substitutions_intensity_when_close", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) - if not all(col in processed_events_df.columns for col in ['home_team_id', 'away_team_id', 'event_type']): + if not all( + col in processed_events_df.columns for col in ["home_team_id", "away_team_id", "event_type"] + ): print("Warning: Essential columns missing for late_substitutions_intensity. 
Returning 0s.") - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='late_substitutions_intensity_when_close') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="late_substitutions_intensity_when_close", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) + + intensity_series = processed_events_df.groupby("fixture_id").apply( + _late_subs_intensity_check_fixture, include_groups=False + ) + intensity_series.name = "late_substitutions_intensity_when_close" - intensity_series = processed_events_df.groupby('fixture_id').apply(_late_subs_intensity_check_fixture, include_groups=False) - intensity_series.name = 'late_substitutions_intensity_when_close' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index result_series = intensity_series.reindex(base_fixture_ids).fillna(0).astype(int) return result_series + def _rolling_avg_goal_diff_last_30_fixture(fixture_events: pd.DataFrame) -> float: """ Helper for a single fixture to calculate rolling avg goal diff in last 30 mins (61-90). 
""" if fixture_events.empty: - return 0.0 # No events, assume 0-0, so goal diff is 0 + return 0.0 # No events, assume 0-0, so goal diff is 0 - home_team_id = fixture_events['home_team_id'].iloc[0] - away_team_id = fixture_events['away_team_id'].iloc[0] + home_team_id = fixture_events["home_team_id"].iloc[0] + away_team_id = fixture_events["away_team_id"].iloc[0] # Pre-calculate cumulative scores for this fixture - _fixture_events_sorted = fixture_events.sort_values(by='time_elapsed').copy() - _fixture_events_sorted['is_home_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == home_team_id) - _fixture_events_sorted['is_away_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == away_team_id) - _fixture_events_sorted['cum_home_g'] = _fixture_events_sorted['is_home_g'].cumsum() - _fixture_events_sorted['cum_away_g'] = _fixture_events_sorted['is_away_g'].cumsum() + _fixture_events_sorted = fixture_events.sort_values(by="time_elapsed").copy() + _fixture_events_sorted["is_home_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == home_team_id + ) + _fixture_events_sorted["is_away_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == away_team_id + ) + _fixture_events_sorted["cum_home_g"] = _fixture_events_sorted["is_home_g"].cumsum() + _fixture_events_sorted["cum_away_g"] = _fixture_events_sorted["is_away_g"].cumsum() def get_score_at_minute_t(minute_val, _events_with_cum_scores): - relevant_events = _events_with_cum_scores[_events_with_cum_scores['time_elapsed'] <= minute_val] + relevant_events = _events_with_cum_scores[ + _events_with_cum_scores["time_elapsed"] <= minute_val + ] if relevant_events.empty: return 0, 0 last_event_at_t = relevant_events.iloc[-1] - return last_event_at_t['cum_home_g'], last_event_at_t['cum_away_g'] + return last_event_at_t["cum_home_g"], last_event_at_t["cum_away_g"] goal_differences = [] start_minute = 60 - end_minute = 90 
+ end_minute = 90 # Original logic used range(start_minute + 1, end_minute + 1) -> 61 to 90. - for minute in range(start_minute + 1, end_minute + 1): # 61, 62, ..., 90 + for minute in range(start_minute + 1, end_minute + 1): # 61, 62, ..., 90 h_goals, a_goals = get_score_at_minute_t(minute, _fixture_events_sorted) goal_differences.append(h_goals - a_goals) - + return round(np.mean(goal_differences), 2) if goal_differences else 0.0 -def _calculate_rolling_avg_goal_difference_last_30_min_at_end_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_rolling_avg_goal_difference_last_30_min_at_end_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates rolling average goal difference in the last 30 minutes (61-90 min). Args: @@ -705,35 +937,51 @@ def _calculate_rolling_avg_goal_difference_last_30_min_at_end_vectorized(process Defaults to 0.0 for fixtures with no events. """ if processed_events_df.empty: - result_series = pd.Series(0.0, index=fixtures_info_df['fixture_id'].unique(), name='rolling_avg_goal_difference_last_30_min_at_end') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0.0) + result_series = pd.Series( + 0.0, + index=fixtures_info_df["fixture_id"].unique(), + name="rolling_avg_goal_difference_last_30_min_at_end", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(0.0) - if not all(col in processed_events_df.columns for col in ['home_team_id', 'away_team_id']): + if not all(col in processed_events_df.columns for col in ["home_team_id", "away_team_id"]): print("Warning: Essential columns missing for rolling_avg_goal_difference. 
Returning 0.0s.") - result_series = pd.Series(0.0, index=fixtures_info_df['fixture_id'].unique(), name='rolling_avg_goal_difference_last_30_min_at_end') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0.0) - - avg_goal_diff_series = processed_events_df.groupby('fixture_id').apply(_rolling_avg_goal_diff_last_30_fixture, include_groups=False) - avg_goal_diff_series.name = 'rolling_avg_goal_difference_last_30_min_at_end' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index - result_series = avg_goal_diff_series.reindex(base_fixture_ids).fillna(0.0) # Default for no events is 0.0 diff + result_series = pd.Series( + 0.0, + index=fixtures_info_df["fixture_id"].unique(), + name="rolling_avg_goal_difference_last_30_min_at_end", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(0.0) + + avg_goal_diff_series = processed_events_df.groupby("fixture_id").apply( + _rolling_avg_goal_diff_last_30_fixture, include_groups=False + ) + avg_goal_diff_series.name = "rolling_avg_goal_difference_last_30_min_at_end" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index + result_series = avg_goal_diff_series.reindex(base_fixture_ids).fillna( + 0.0 + ) # Default for no events is 0.0 diff return result_series + def _mean_time_between_goals_fixture(fixture_goal_events: pd.DataFrame) -> float: """ Helper for a single fixture to calculate mean time between goals. Assumes fixture_goal_events contains only goal events for that fixture, sorted by time. 
""" if fixture_goal_events.shape[0] < 2: - return np.nan # Not enough goals for a duration "between" - - goal_times = fixture_goal_events['time_elapsed'].to_list() - inter_goal_durations = [goal_times[i] - goal_times[i-1] for i in range(1, len(goal_times))] - + return np.nan # Not enough goals for a duration "between" + + goal_times = fixture_goal_events["time_elapsed"].to_list() + inter_goal_durations = [goal_times[i] - goal_times[i - 1] for i in range(1, len(goal_times))] + return round(np.mean(inter_goal_durations), 2) if inter_goal_durations else np.nan -def _calculate_mean_time_between_goals_overall_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_mean_time_between_goals_overall_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates the mean time between goals for each fixture. Args: @@ -744,25 +992,40 @@ def _calculate_mean_time_between_goals_overall_vectorized(processed_events_df: p Defaults to NaN for fixtures with < 2 goals. 
""" if processed_events_df.empty: - result_series = pd.Series(np.nan, index=fixtures_info_df['fixture_id'].unique(), name='mean_time_between_goals_overall') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(np.nan) + result_series = pd.Series( + np.nan, + index=fixtures_info_df["fixture_id"].unique(), + name="mean_time_between_goals_overall", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(np.nan) - goal_events = processed_events_df[processed_events_df['is_goal']].copy() + goal_events = processed_events_df[processed_events_df["is_goal"]].copy() if goal_events.empty: - result_series = pd.Series(np.nan, index=fixtures_info_df['fixture_id'].unique(), name='mean_time_between_goals_overall') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(np.nan) + result_series = pd.Series( + np.nan, + index=fixtures_info_df["fixture_id"].unique(), + name="mean_time_between_goals_overall", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(np.nan) # Sort goal events by fixture and time before applying - goal_events_sorted = goal_events.sort_values(by=['fixture_id', 'time_elapsed']) - - mean_time_series = goal_events_sorted.groupby('fixture_id').apply(_mean_time_between_goals_fixture, include_groups=False) - mean_time_series.name = 'mean_time_between_goals_overall' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index - result_series = mean_time_series.reindex(base_fixture_ids).fillna(np.nan) # Default for no/few goals is NaN + goal_events_sorted = goal_events.sort_values(by=["fixture_id", "time_elapsed"]) + + mean_time_series = goal_events_sorted.groupby("fixture_id").apply( + _mean_time_between_goals_fixture, include_groups=False + ) + mean_time_series.name = "mean_time_between_goals_overall" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index + result_series = 
mean_time_series.reindex(base_fixture_ids).fillna( + np.nan + ) # Default for no/few goals is NaN return result_series -def _calculate_abs_shots_on_target_diff_last_20_min_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_abs_shots_on_target_diff_last_20_min_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates absolute difference in shots on target in the last 20 mins (>=70 min). Args: @@ -773,45 +1036,78 @@ def _calculate_abs_shots_on_target_diff_last_20_min_vectorized(processed_events_ Defaults to 0 for fixtures with no late SOTs. """ if processed_events_df.empty: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='abs_shots_on_target_diff_last_20_min') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="abs_shots_on_target_diff_last_20_min", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) # Define conditions for a shot on target based on original logic # This part is crucial and must match how SOTs are identified. 
sot_condition = ( - ((processed_events_df['event_type'] == 'Shot') & (processed_events_df['event_detail'] == 'Shot on Target')) | - (processed_events_df['event_detail'] == 'Shot on target') | # common variation - (processed_events_df['event_detail'] == 'On Target') # common variation for event_type 'shot' + ( + (processed_events_df["event_type"] == "Shot") + & (processed_events_df["event_detail"] == "Shot on Target") + ) + | (processed_events_df["event_detail"] == "Shot on target") # common variation + | ( + processed_events_df["event_detail"] == "On Target" + ) # common variation for event_type 'shot' ) sot_events = processed_events_df[sot_condition].copy() - + if sot_events.empty: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='abs_shots_on_target_diff_last_20_min') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="abs_shots_on_target_diff_last_20_min", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) - late_sot_events = sot_events[sot_events['time_elapsed'] >= 70].copy() + late_sot_events = sot_events[sot_events["time_elapsed"] >= 70].copy() if late_sot_events.empty: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='abs_shots_on_target_diff_last_20_min') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + result_series = pd.Series( + 0, + index=fixtures_info_df["fixture_id"].unique(), + name="abs_shots_on_target_diff_last_20_min", + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) # Identify home and away SOTs - late_sot_events['is_home_sot'] = late_sot_events['team_id'] == late_sot_events['home_team_id'] - late_sot_events['is_away_sot'] = late_sot_events['team_id'] == 
late_sot_events['away_team_id'] + late_sot_events["is_home_sot"] = late_sot_events["team_id"] == late_sot_events["home_team_id"] + late_sot_events["is_away_sot"] = late_sot_events["team_id"] == late_sot_events["away_team_id"] - sot_counts = late_sot_events.groupby('fixture_id').agg( - home_sots_late=('is_home_sot', 'sum'), - away_sots_late=('is_away_sot', 'sum') + sot_counts = late_sot_events.groupby("fixture_id").agg( + home_sots_late=("is_home_sot", "sum"), away_sots_late=("is_away_sot", "sum") ) - abs_diff_sot_series = (sot_counts['home_sots_late'] - sot_counts['away_sots_late']).abs().astype(int) - abs_diff_sot_series.name = 'abs_shots_on_target_diff_last_20_min' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index + abs_diff_sot_series = ( + (sot_counts["home_sots_late"] - sot_counts["away_sots_late"]).abs().astype(int) + ) + abs_diff_sot_series.name = "abs_shots_on_target_diff_last_20_min" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index result_series = abs_diff_sot_series.reindex(base_fixture_ids).fillna(0).astype(int) return result_series -def _calculate_total_yellow_cards_first_half_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_total_yellow_cards_first_half_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates the total number of yellow cards in the first half (<= 45 min). Args: @@ -821,53 +1117,82 @@ def _calculate_total_yellow_cards_first_half_vectorized(processed_events_df: pd. pd.Series: Indexed by fixture_id, value is the count of yellow cards (int). Defaults to 0 for fixtures with no first-half yellow cards. 
""" - if processed_events_df.empty or 'event_detail' not in processed_events_df.columns: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='total_yellow_cards_first_half') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + if processed_events_df.empty or "event_detail" not in processed_events_df.columns: + result_series = pd.Series( + 0, index=fixtures_info_df["fixture_id"].unique(), name="total_yellow_cards_first_half" + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) # Filter for yellow card events in the first half first_half_yellow_cards = processed_events_df[ - (processed_events_df['event_detail'] == 'Yellow Card') & - (processed_events_df['time_elapsed'] <= 45) + (processed_events_df["event_detail"] == "Yellow Card") + & (processed_events_df["time_elapsed"] <= 45) ] if first_half_yellow_cards.empty: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='total_yellow_cards_first_half') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + result_series = pd.Series( + 0, index=fixtures_info_df["fixture_id"].unique(), name="total_yellow_cards_first_half" + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) # Count yellow cards per fixture - yc_counts_series = first_half_yellow_cards.groupby('fixture_id').size() - yc_counts_series.name = 'total_yellow_cards_first_half' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index + yc_counts_series = first_half_yellow_cards.groupby("fixture_id").size() + yc_counts_series.name = "total_yellow_cards_first_half" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index result_series = yc_counts_series.reindex(base_fixture_ids).fillna(0).astype(int) return result_series -def 
_calculate_abs_goal_diff_0_30_min_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_abs_goal_diff_0_30_min_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates absolute goal difference in the first 30 minutes (0-30 min). """ scores_at_30_min = _get_score_at_time_vectorized(processed_events_df, 30, fixtures_info_df) - abs_diff_series = (scores_at_30_min['home_goals_at_time'] - scores_at_30_min['away_goals_at_time']).abs().astype(int) - abs_diff_series.name = 'abs_goal_diff_0_30_min' + abs_diff_series = ( + (scores_at_30_min["home_goals_at_time"] - scores_at_30_min["away_goals_at_time"]) + .abs() + .astype(int) + ) + abs_diff_series.name = "abs_goal_diff_0_30_min" # _get_score_at_time_vectorized already ensures all fixtures from fixtures_info_df are present and defaults to 0 goals. return abs_diff_series -def _calculate_abs_goal_diff_30_60_min_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_abs_goal_diff_30_60_min_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates absolute goal difference in the middle 30 minutes (30-60 min). 
""" scores_at_30_min = _get_score_at_time_vectorized(processed_events_df, 30, fixtures_info_df) scores_at_60_min = _get_score_at_time_vectorized(processed_events_df, 60, fixtures_info_df) - home_goals_in_period = scores_at_60_min['home_goals_at_time'] - scores_at_30_min['home_goals_at_time'] - away_goals_in_period = scores_at_60_min['away_goals_at_time'] - scores_at_30_min['away_goals_at_time'] - + home_goals_in_period = ( + scores_at_60_min["home_goals_at_time"] - scores_at_30_min["home_goals_at_time"] + ) + away_goals_in_period = ( + scores_at_60_min["away_goals_at_time"] - scores_at_30_min["away_goals_at_time"] + ) + abs_diff_series = (home_goals_in_period - away_goals_in_period).abs().astype(int) - abs_diff_series.name = 'abs_goal_diff_30_60_min' + abs_diff_series.name = "abs_goal_diff_30_60_min" return abs_diff_series -def _calculate_abs_goal_diff_60_90_min_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_abs_goal_diff_60_90_min_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates absolute goal difference in the last 30 regular minutes (60-90 min). Note: Original logic used max_time (max(90, fixture_events_df['time_elapsed'].max())). @@ -879,14 +1204,21 @@ def _calculate_abs_goal_diff_60_90_min_vectorized(processed_events_df: pd.DataFr # If strict adherence to original max_time is needed, _get_score_at_time_vectorized would need adjustment or a new variant. 
scores_at_90_min = _get_score_at_time_vectorized(processed_events_df, 90, fixtures_info_df) - home_goals_in_period = scores_at_90_min['home_goals_at_time'] - scores_at_60_min['home_goals_at_time'] - away_goals_in_period = scores_at_90_min['away_goals_at_time'] - scores_at_60_min['away_goals_at_time'] - + home_goals_in_period = ( + scores_at_90_min["home_goals_at_time"] - scores_at_60_min["home_goals_at_time"] + ) + away_goals_in_period = ( + scores_at_90_min["away_goals_at_time"] - scores_at_60_min["away_goals_at_time"] + ) + abs_diff_series = (home_goals_in_period - away_goals_in_period).abs().astype(int) - abs_diff_series.name = 'abs_goal_diff_60_90_min' + abs_diff_series.name = "abs_goal_diff_60_90_min" return abs_diff_series -def _calculate_abs_substitutions_diff_overall_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: + +def _calculate_abs_substitutions_diff_overall_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates the absolute difference in total substitutions made by home and away teams. Args: @@ -896,57 +1228,77 @@ def _calculate_abs_substitutions_diff_overall_vectorized(processed_events_df: pd pd.Series: Indexed by fixture_id, value is the absolute difference (int). Defaults to 0 for fixtures with no substitutions. 
""" - if processed_events_df.empty or 'subst' not in processed_events_df['event_type'].unique(): - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='abs_substitutions_diff_overall') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + if processed_events_df.empty or "subst" not in processed_events_df["event_type"].unique(): + result_series = pd.Series( + 0, index=fixtures_info_df["fixture_id"].unique(), name="abs_substitutions_diff_overall" + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) - sub_events = processed_events_df[processed_events_df['event_type'] == 'subst'].copy() + sub_events = processed_events_df[processed_events_df["event_type"] == "subst"].copy() if sub_events.empty: - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='abs_substitutions_diff_overall') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) + result_series = pd.Series( + 0, index=fixtures_info_df["fixture_id"].unique(), name="abs_substitutions_diff_overall" + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) # Identify home and away substitutions # Ensure home_team_id and away_team_id are present from the merge in the main function - if not all(col in sub_events.columns for col in ['home_team_id', 'away_team_id']): - # This should not happen if main merge was successful - print("Warning: home_team_id or away_team_id missing in sub_events. Cannot calculate abs_substitutions_diff_overall accurately. 
Returning 0s.") - result_series = pd.Series(0, index=fixtures_info_df['fixture_id'].unique(), name='abs_substitutions_diff_overall') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(0).astype(int) - - sub_events['is_home_sub'] = sub_events['team_id'] == sub_events['home_team_id'] - sub_events['is_away_sub'] = sub_events['team_id'] == sub_events['away_team_id'] - - sub_counts = sub_events.groupby('fixture_id').agg( - home_subs=('is_home_sub', 'sum'), - away_subs=('is_away_sub', 'sum') + if not all(col in sub_events.columns for col in ["home_team_id", "away_team_id"]): + # This should not happen if main merge was successful + print( + "Warning: home_team_id or away_team_id missing in sub_events. Cannot calculate abs_substitutions_diff_overall accurately. Returning 0s." + ) + result_series = pd.Series( + 0, index=fixtures_info_df["fixture_id"].unique(), name="abs_substitutions_diff_overall" + ) + return ( + result_series.reindex(fixtures_info_df.set_index("fixture_id").index) + .fillna(0) + .astype(int) + ) + + sub_events["is_home_sub"] = sub_events["team_id"] == sub_events["home_team_id"] + sub_events["is_away_sub"] = sub_events["team_id"] == sub_events["away_team_id"] + + sub_counts = sub_events.groupby("fixture_id").agg( + home_subs=("is_home_sub", "sum"), away_subs=("is_away_sub", "sum") ) - abs_diff_subs_series = (sub_counts['home_subs'] - sub_counts['away_subs']).abs().astype(int) - abs_diff_subs_series.name = 'abs_substitutions_diff_overall' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index + abs_diff_subs_series = (sub_counts["home_subs"] - sub_counts["away_subs"]).abs().astype(int) + abs_diff_subs_series.name = "abs_substitutions_diff_overall" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index result_series = abs_diff_subs_series.reindex(base_fixture_ids).fillna(0).astype(int) return result_series + def _proportion_time_tied_fixture(fixture_events: pd.DataFrame) -> float: """ Helper 
for a single fixture to calculate the proportion of game time the score was tied. Assumes fixture_events are sorted and contain home_team_id, away_team_id. """ if fixture_events.empty: - return 1.0 # No events, score 0-0 for the whole match (assumed 90 min) + return 1.0 # No events, score 0-0 for the whole match (assumed 90 min) - home_team_id = fixture_events['home_team_id'].iloc[0] - away_team_id = fixture_events['away_team_id'].iloc[0] + home_team_id = fixture_events["home_team_id"].iloc[0] + away_team_id = fixture_events["away_team_id"].iloc[0] + + goal_event_times = fixture_events[fixture_events["is_goal"]]["time_elapsed"].unique() - goal_event_times = fixture_events[fixture_events['is_goal']]['time_elapsed'].unique() - match_end_time = 90.0 if not fixture_events.empty(): - max_event_time = fixture_events['time_elapsed'].max() + max_event_time = fixture_events["time_elapsed"].max() if max_event_time > match_end_time: match_end_time = max_event_time - + # Critical time points: start, goal times, and match end. time_points = sorted(list(set([0.0] + list(goal_event_times) + [match_end_time]))) time_points = sorted(list(set(tp for tp in time_points if 0 <= tp <= match_end_time))) @@ -955,9 +1307,9 @@ def _proportion_time_tied_fixture(fixture_events: pd.DataFrame) -> float: time_points.append(match_end_time) time_points = sorted(list(set(time_points))) if 0.0 not in time_points: - time_points.insert(0,0.0) # Ensure 0 is the start + time_points.insert(0, 0.0) # Ensure 0 is the start - if not time_points or match_end_time == 0: # No duration or invalid match end time + if not time_points or match_end_time == 0: # No duration or invalid match end time # If match end time is 0, but events might exist, this is tricky. # Let's assume if match_end_time is 0, proportion is 1.0 if no goals, 0.0 if goals (implies some duration) # However, the current logic for match_end_time means it's at least 90 unless events are truly all at 0. 
@@ -970,50 +1322,67 @@ def _proportion_time_tied_fixture(fixture_events: pd.DataFrame) -> float: # Calculate score at t=0. If tied, 1.0, else 0.0. if match_end_time == 0: h_goals_at_0, a_goals_at_0 = 0, 0 - if not fixture_events[fixture_events['time_elapsed'] == 0].empty: - goals_at_0 = fixture_events[(fixture_events['time_elapsed'] == 0) & (fixture_events['is_goal'])].copy() + if not fixture_events[fixture_events["time_elapsed"] == 0].empty: + goals_at_0 = fixture_events[ + (fixture_events["time_elapsed"] == 0) & (fixture_events["is_goal"]) + ].copy() if not goals_at_0.empty: - h_goals_at_0 = goals_at_0[goals_at_0['team_id'] == home_team_id].shape[0] - a_goals_at_0 = goals_at_0[goals_at_0['team_id'] == away_team_id].shape[0] + h_goals_at_0 = goals_at_0[goals_at_0["team_id"] == home_team_id].shape[0] + a_goals_at_0 = goals_at_0[goals_at_0["team_id"] == away_team_id].shape[0] return 1.0 if h_goals_at_0 == a_goals_at_0 else 0.0 total_time_score_tied = 0.0 # Pre-calculate cumulative scores for this fixture - _fixture_events_sorted = fixture_events.sort_values(by='time_elapsed').copy() - _fixture_events_sorted['is_home_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == home_team_id) - _fixture_events_sorted['is_away_g'] = (_fixture_events_sorted['is_goal']) & (_fixture_events_sorted['team_id'] == away_team_id) - _fixture_events_sorted['cum_home_g'] = _fixture_events_sorted['is_home_g'].cumsum() - _fixture_events_sorted['cum_away_g'] = _fixture_events_sorted['is_away_g'].cumsum() + _fixture_events_sorted = fixture_events.sort_values(by="time_elapsed").copy() + _fixture_events_sorted["is_home_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == home_team_id + ) + _fixture_events_sorted["is_away_g"] = (_fixture_events_sorted["is_goal"]) & ( + _fixture_events_sorted["team_id"] == away_team_id + ) + _fixture_events_sorted["cum_home_g"] = _fixture_events_sorted["is_home_g"].cumsum() + 
_fixture_events_sorted["cum_away_g"] = _fixture_events_sorted["is_away_g"].cumsum() def get_score_at_t_for_proportion(t_limit, _events_with_cum_scores): - relevant_events = _events_with_cum_scores[_events_with_cum_scores['time_elapsed'] <= t_limit] + relevant_events = _events_with_cum_scores[ + _events_with_cum_scores["time_elapsed"] <= t_limit + ] if relevant_events.empty: return 0, 0 last_event_at_t = relevant_events.iloc[-1] - return last_event_at_t['cum_home_g'], last_event_at_t['cum_away_g'] + return last_event_at_t["cum_home_g"], last_event_at_t["cum_away_g"] # Iterate through segments defined by the unique time points - for i in range(len(time_points) -1): # Iterate up to the second to last point + for i in range(len(time_points) - 1): # Iterate up to the second to last point segment_start_time = time_points[i] - segment_end_time = time_points[i+1] + segment_end_time = time_points[i + 1] segment_duration = segment_end_time - segment_start_time - if segment_duration <= 0: # Should not happen if time_points are unique and sorted + if segment_duration <= 0: # Should not happen if time_points are unique and sorted continue # Score at the START of this segment (which is the score at segment_start_time) - h_goals_at_segment_start, a_goals_at_segment_start = get_score_at_t_for_proportion(segment_start_time, _fixture_events_sorted) + h_goals_at_segment_start, a_goals_at_segment_start = get_score_at_t_for_proportion( + segment_start_time, _fixture_events_sorted + ) if h_goals_at_segment_start == a_goals_at_segment_start: total_time_score_tied += segment_duration - + # Handle the very first state at t=0. If time_points starts with 0, the first segment is [0, time_points[1]]. # The logic above correctly uses score at t=0 (0-0) for the first segment. - return round(total_time_score_tied / match_end_time, 3) if match_end_time > 0 else (1.0 if total_time_score_tied == 0 else 0.0) # Avoid div by zero, though match_end_time logic tries to prevent it. 
- # If match_end_time is 0 and total_time_score_tied is also 0 (e.g. only event at t=0 was a goal), this is 1.0. Correct as it was tied for its non-duration. + return ( + round(total_time_score_tied / match_end_time, 3) + if match_end_time > 0 + else (1.0 if total_time_score_tied == 0 else 0.0) + ) # Avoid div by zero, though match_end_time logic tries to prevent it. + # If match_end_time is 0 and total_time_score_tied is also 0 (e.g. only event at t=0 was a goal), this is 1.0. Correct as it was tied for its non-duration. + -def _calculate_proportion_of_game_time_score_is_tied_at_end_vectorized(processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame) -> pd.Series: +def _calculate_proportion_of_game_time_score_is_tied_at_end_vectorized( + processed_events_df: pd.DataFrame, fixtures_info_df: pd.DataFrame +) -> pd.Series: """ Calculates the proportion of game time the score was tied. Args: @@ -1024,30 +1393,51 @@ def _calculate_proportion_of_game_time_score_is_tied_at_end_vectorized(processed Defaults to 1.0 for fixtures with no events (tied 0-0 for full match). """ if processed_events_df.empty: - result_series = pd.Series(1.0, index=fixtures_info_df['fixture_id'].unique(), name='proportion_of_game_time_score_is_tied_at_end') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(1.0) + result_series = pd.Series( + 1.0, + index=fixtures_info_df["fixture_id"].unique(), + name="proportion_of_game_time_score_is_tied_at_end", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(1.0) - if not all(col in processed_events_df.columns for col in ['home_team_id', 'away_team_id']): - print("Warning: Essential columns missing for proportion_of_game_time_score_is_tied. Returning NaNs or 0s might be better here.") + if not all(col in processed_events_df.columns for col in ["home_team_id", "away_team_id"]): + print( + "Warning: Essential columns missing for proportion_of_game_time_score_is_tied. 
Returning NaNs or 0s might be better here." + ) # Defaulting to NaN as calculation is unreliable - result_series = pd.Series(np.nan, index=fixtures_info_df['fixture_id'].unique(), name='proportion_of_game_time_score_is_tied_at_end') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(np.nan) + result_series = pd.Series( + np.nan, + index=fixtures_info_df["fixture_id"].unique(), + name="proportion_of_game_time_score_is_tied_at_end", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(np.nan) try: - proportion_series = processed_events_df.groupby('fixture_id').apply(lambda x: _proportion_time_tied_fixture(x)) - proportion_series.name = 'proportion_of_game_time_score_is_tied_at_end' - - base_fixture_ids = fixtures_info_df.set_index('fixture_id').index - result_series = proportion_series.reindex(base_fixture_ids).fillna(1.0) # Default for no events is 1.0 (tied 0-0) + proportion_series = processed_events_df.groupby("fixture_id").apply( + lambda x: _proportion_time_tied_fixture(x) + ) + proportion_series.name = "proportion_of_game_time_score_is_tied_at_end" + + base_fixture_ids = fixtures_info_df.set_index("fixture_id").index + result_series = proportion_series.reindex(base_fixture_ids).fillna( + 1.0 + ) # Default for no events is 1.0 (tied 0-0) return result_series except Exception as e: - print(f"Error calculating proportion_of_game_time_score_is_tied_at_end: {str(e)}. Filling with NaNs.") - result_series = pd.Series(np.nan, index=fixtures_info_df['fixture_id'].unique(), name='proportion_of_game_time_score_is_tied_at_end') - return result_series.reindex(fixtures_info_df.set_index('fixture_id').index).fillna(np.nan) + print( + f"Error calculating proportion_of_game_time_score_is_tied_at_end: {str(e)}. Filling with NaNs." 
+ ) + result_series = pd.Series( + np.nan, + index=fixtures_info_df["fixture_id"].unique(), + name="proportion_of_game_time_score_is_tied_at_end", + ) + return result_series.reindex(fixtures_info_df.set_index("fixture_id").index).fillna(np.nan) + def _get_fixtures_from_db() -> pd.DataFrame: """Fetches fixtures data from the PostgreSQL database.""" - load_dotenv() # Load environment variables from .env file + load_dotenv() # Load environment variables from .env file db_host = os.getenv("POSTGRES_HOST") db_name = os.getenv("POSTGRES_DB") @@ -1084,35 +1474,35 @@ def _get_fixtures_from_db() -> pd.DataFrame: """ sample_fallback_fixtures_data = { - 'fixture_id': [1, 2, 3, 4], - 'home_team_id': [101, 103, 105, 107], - 'away_team_id': [102, 104, 106, 108] + "fixture_id": [1, 2, 3, 4], + "home_team_id": [101, 103, 105, 107], + "away_team_id": [102, 104, 106, 108], } try: if not all([db_host, db_name, db_user, db_password, db_port]): - print("Database environment variables not fully set. Falling back to sample fixtures data.") + print( + "Database environment variables not fully set. Falling back to sample fixtures data." + ) return pd.DataFrame(sample_fallback_fixtures_data) conn = psycopg2.connect( - host=db_host, - database=db_name, - user=db_user, - password=db_password, - port=db_port + host=db_host, database=db_name, user=db_user, password=db_password, port=db_port ) print("Successfully connected to PostgreSQL database for fixtures.") fixtures_df = pd.read_sql_query(sql_query, conn) print(f"Successfully fetched {len(fixtures_df)} fixtures from the database.") - + if fixtures_df.empty: print("Query returned no fixtures. 
Falling back to sample fixtures data.") return pd.DataFrame(sample_fallback_fixtures_data) - + # Ensure necessary columns for calculator are present - required_cols = ['fixture_id', 'home_team_id', 'away_team_id'] + required_cols = ["fixture_id", "home_team_id", "away_team_id"] if not all(col in fixtures_df.columns for col in required_cols): - print(f"Fetched fixtures_df is missing one of required columns: {required_cols}. Falling back to sample data.") + print( + f"Fetched fixtures_df is missing one of required columns: {required_cols}. Falling back to sample data." + ) return pd.DataFrame(sample_fallback_fixtures_data) except psycopg2.Error as e: @@ -1127,12 +1517,13 @@ def _get_fixtures_from_db() -> pd.DataFrame: if conn: conn.close() print("PostgreSQL connection closed.") - + return fixtures_df + def _get_events_from_db() -> pd.DataFrame: """Fetches events data from the PostgreSQL database.""" - load_dotenv() # Ensure env vars are loaded + load_dotenv() # Ensure env vars are loaded db_host = os.getenv("POSTGRES_HOST") db_name = os.getenv("POSTGRES_DB") @@ -1153,47 +1544,66 @@ def _get_events_from_db() -> pd.DataFrame: """ sample_fallback_events_data = { - 'fixture_id': [1, 1, 1, 1, 1, 2, 2, 2, 3], - 'time_elapsed': [30, 40, 70, 78, 85, 25, 60, 65, 10], - 'event_type': ['Card','Goal','subst','Goal','subst','Shot','Goal','Card','subst'], - 'event_detail': ['Yellow Card', None, None, None, None, 'On Target',None,'Red Card', None], - 'team_id': [101, 101, 101, 102, 102, 103, 103, 104, 105], + "fixture_id": [1, 1, 1, 1, 1, 2, 2, 2, 3], + "time_elapsed": [30, 40, 70, 78, 85, 25, 60, 65, 10], + "event_type": ["Card", "Goal", "subst", "Goal", "subst", "Shot", "Goal", "Card", "subst"], + "event_detail": [ + "Yellow Card", + None, + None, + None, + None, + "On Target", + None, + "Red Card", + None, + ], + "team_id": [101, 101, 101, 102, 102, 103, 103, 104, 105], # Sample data needs home_team_id and away_team_id if DB fails - 'home_team_id': [101, 101, 101, 101, 101, 103, 
103, 103, 105], - 'away_team_id': [102, 102, 102, 102, 102, 104, 104, 104, 106] + "home_team_id": [101, 101, 101, 101, 101, 103, 103, 103, 105], + "away_team_id": [102, 102, 102, 102, 102, 104, 104, 104, 106], } try: if not all([db_host, db_name, db_user, db_password, db_port]): - print("Database environment variables not fully set. Falling back to sample events data.") + print( + "Database environment variables not fully set. Falling back to sample events data." + ) return pd.DataFrame(sample_fallback_events_data) conn = psycopg2.connect( - host=db_host, - database=db_name, - user=db_user, - password=db_password, - port=db_port + host=db_host, database=db_name, user=db_user, password=db_password, port=db_port ) print("Successfully connected to PostgreSQL database for events.") events_df = pd.read_sql_query(sql_query, conn) # Add time_extra to time_elapsed if time_extra exists and is not null - if 'time_extra' in events_df.columns: - events_df['time_elapsed'] = events_df.apply( - lambda row: row['time_elapsed'] + row['time_extra'] - if pd.notnull(row['time_extra']) else row['time_elapsed'], - axis=1 + if "time_extra" in events_df.columns: + events_df["time_elapsed"] = events_df.apply( + lambda row: row["time_elapsed"] + row["time_extra"] + if pd.notnull(row["time_extra"]) + else row["time_elapsed"], + axis=1, ) - events_df = events_df.drop(columns=['time_extra']) + events_df = events_df.drop(columns=["time_extra"]) print(f"Successfully fetched {len(events_df)} events from the database.") if events_df.empty: print("Query returned no events. 
Falling back to sample events data.") return pd.DataFrame(sample_fallback_events_data) - - required_cols = ['fixture_id', 'time_elapsed', 'event_type', 'event_detail', 'team_id', 'home_team_id', 'away_team_id'] + + required_cols = [ + "fixture_id", + "time_elapsed", + "event_type", + "event_detail", + "team_id", + "home_team_id", + "away_team_id", + ] if not all(col in events_df.columns for col in required_cols): - print(f"Fetched events_df is missing one of required columns: {required_cols}. Falling back to sample data.") + print( + f"Fetched events_df is missing one of required columns: {required_cols}. Falling back to sample data." + ) return pd.DataFrame(sample_fallback_events_data) except psycopg2.Error as e: @@ -1208,12 +1618,13 @@ def _get_events_from_db() -> pd.DataFrame: if conn: conn.close() print("PostgreSQL connection for events closed.") - + return events_df + def main_test(): """Main function for testing the event feature calculation.""" - + # Get fixtures from DB or fallback to sample fixtures_df = _get_fixtures_from_db() @@ -1229,12 +1640,15 @@ def main_test(): # Calculate features if not fixtures_df.empty: - event_features = calculate_event_features_vectorized(events_df.copy(), fixtures_df.copy()) # Pass copies + event_features = calculate_event_features_vectorized( + events_df.copy(), fixtures_df.copy() + ) # Pass copies print("\n--- Calculated Event Features ---") export_to_xlsx_fast(event_features, "data/Create_data/data_files/base/event_features.xlsx") else: print("\n--- Could not load fixtures_df, skipping feature calculation. 
---") + if __name__ == "__main__": - main_test() \ No newline at end of file + main_test() diff --git a/data/Create_data/api_football/get_fixtures_postgre.py b/data/Create_data/api_football/get_fixtures_postgre.py index 6e4a3a4..c824fad 100644 --- a/data/Create_data/api_football/get_fixtures_postgre.py +++ b/data/Create_data/api_football/get_fixtures_postgre.py @@ -4,9 +4,8 @@ import time from datetime import datetime, timedelta from pathlib import Path -from typing import Any +from typing import Any, Optional -import pandas as pd import psycopg2 import requests from dotenv import load_dotenv @@ -26,27 +25,41 @@ except Exception as e: print(f"Error setting project root path: {e}") # Fallback to current directory if path resolution fails - sys.path.append(os.getcwd().parent.parent.parent) - print(f"Current directory get_fixtures: {os.getcwd().parent.parent.parent}") + fallback_path = Path(os.getcwd()).parent.parent.parent + sys.path.append(str(fallback_path)) + print(f"Current directory get_fixtures: {fallback_path}") from src.utils.logger import ExperimentLogger -# PostgreSQL engine setup -engine = create_engine('postgresql+psycopg2://postgres:ronaldo99@localhost:5432/api_football') +# PostgreSQL engine setup with environment variables +db_host = os.getenv("POSTGRES_HOST", "localhost") +db_name = os.getenv("POSTGRES_DB", "api_football") +db_user = os.getenv("POSTGRES_USER", "postgres") +db_password = os.getenv("POSTGRES_PASSWORD") +db_port = os.getenv("POSTGRES_PORT", "5432") + +if not db_password: + raise ValueError("POSTGRES_PASSWORD environment variable is required") + +engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}") metadata = MetaData() # Reflect the fixtures table -fixtures_table = Table('fixtures', metadata, autoload_with=engine, schema='api_football') -predictions_table = Table('predictions', metadata, autoload_with=engine, schema='api_football') -team_stats_table = Table('team_stats', metadata, 
autoload_with=engine, schema='api_football') -fixture_events_table = Table('events', metadata, autoload_with=engine, schema='api_football') +fixtures_table = Table("fixtures", metadata, autoload_with=engine, schema="api_football") +predictions_table = Table("predictions", metadata, autoload_with=engine, schema="api_football") +team_stats_table = Table("team_stats", metadata, autoload_with=engine, schema="api_football") +fixture_events_table = Table("events", metadata, autoload_with=engine, schema="api_football") + +# Constants for event types +VAR_CHECK_DETAIL = "VAR Check" + class ApiFootball: """ A class to interact with the API-Football API and store data in PostgreSQL. """ - def __init__(self, api_key: str, logger: ExperimentLogger = None): + def __init__(self, api_key: str, logger: Optional[ExperimentLogger] = None): self.api_key = api_key self.logger = logger or ExperimentLogger() self.base_url = "https://v3.football.api-sports.io/" @@ -58,7 +71,7 @@ def __init__(self, api_key: str, logger: ExperimentLogger = None): self.data_dir = os.path.join(self.project_root, "data", "create_data", "api-football") os.makedirs(self.data_dir, exist_ok=True) - def _get_request(self, endpoint: str, params: dict = None) -> dict: + def _get_request(self, endpoint: str, params: Optional[dict] = None) -> dict: """ Sends a GET request to the specified endpoint. 
Args: @@ -101,54 +114,54 @@ def safe_get(d, *keys, default=None): return d row = { - 'fixture_id': safe_get(fixture, 'fixture', 'id'), - 'referee': safe_get(fixture, 'fixture', 'referee'), - 'timezone': safe_get(fixture, 'fixture', 'timezone'), - 'date': safe_get(fixture, 'fixture', 'date'), - 'timestamp': safe_get(fixture, 'fixture', 'timestamp'), - 'period_first': safe_get(fixture, 'fixture', 'periods', 'first'), - 'period_second': safe_get(fixture, 'fixture', 'periods', 'second'), - 'venue_id': safe_get(fixture, 'fixture', 'venue', 'id'), - 'venue_name': safe_get(fixture, 'fixture', 'venue', 'name'), - 'venue_city': safe_get(fixture, 'fixture', 'venue', 'city'), - 'status_long': safe_get(fixture, 'fixture', 'status', 'long'), - 'status_short': safe_get(fixture, 'fixture', 'status', 'short'), - 'status_elapsed': safe_get(fixture, 'fixture', 'status', 'elapsed'), - 'status_extra': safe_get(fixture, 'fixture', 'status', 'extra'), - 'league_id': safe_get(fixture, 'league', 'id'), - 'league_name': safe_get(fixture, 'league', 'name'), - 'league_country': safe_get(fixture, 'league', 'country'), - 'league_logo': safe_get(fixture, 'league', 'logo'), - 'league_flag': safe_get(fixture, 'league', 'flag'), - 'league_season': safe_get(fixture, 'league', 'season'), - 'league_round': safe_get(fixture, 'league', 'round'), - 'league_standings': safe_get(fixture, 'league', 'standings'), - 'home_team_id': safe_get(fixture, 'teams', 'home', 'id'), - 'home_team_name': safe_get(fixture, 'teams', 'home', 'name'), - 'home_team_logo': safe_get(fixture, 'teams', 'home', 'logo'), - 'home_team_winner': safe_get(fixture, 'teams', 'home', 'winner'), - 'away_team_id': safe_get(fixture, 'teams', 'away', 'id'), - 'away_team_name': safe_get(fixture, 'teams', 'away', 'name'), - 'away_team_logo': safe_get(fixture, 'teams', 'away', 'logo'), - 'away_team_winner': safe_get(fixture, 'teams', 'away', 'winner'), - 'home_goals': safe_get(fixture, 'goals', 'home'), - 'away_goals': safe_get(fixture, 
'goals', 'away'), - 'home_halftime_goals': safe_get(fixture, 'score', 'halftime', 'home'), - 'away_halftime_goals': safe_get(fixture, 'score', 'halftime', 'away'), - 'home_fulltime_goals': safe_get(fixture, 'score', 'fulltime', 'home'), - 'away_fulltime_goals': safe_get(fixture, 'score', 'fulltime', 'away'), - 'home_extratime_goals': safe_get(fixture, 'score', 'extratime', 'home'), - 'away_extratime_goals': safe_get(fixture, 'score', 'extratime', 'away'), - 'home_penalty_goals': safe_get(fixture, 'score', 'penalty', 'home'), - 'away_penalty_goals': safe_get(fixture, 'score', 'penalty', 'away') + "fixture_id": safe_get(fixture, "fixture", "id"), + "referee": safe_get(fixture, "fixture", "referee"), + "timezone": safe_get(fixture, "fixture", "timezone"), + "date": safe_get(fixture, "fixture", "date"), + "timestamp": safe_get(fixture, "fixture", "timestamp"), + "period_first": safe_get(fixture, "fixture", "periods", "first"), + "period_second": safe_get(fixture, "fixture", "periods", "second"), + "venue_id": safe_get(fixture, "fixture", "venue", "id"), + "venue_name": safe_get(fixture, "fixture", "venue", "name"), + "venue_city": safe_get(fixture, "fixture", "venue", "city"), + "status_long": safe_get(fixture, "fixture", "status", "long"), + "status_short": safe_get(fixture, "fixture", "status", "short"), + "status_elapsed": safe_get(fixture, "fixture", "status", "elapsed"), + "status_extra": safe_get(fixture, "fixture", "status", "extra"), + "league_id": safe_get(fixture, "league", "id"), + "league_name": safe_get(fixture, "league", "name"), + "league_country": safe_get(fixture, "league", "country"), + "league_logo": safe_get(fixture, "league", "logo"), + "league_flag": safe_get(fixture, "league", "flag"), + "league_season": safe_get(fixture, "league", "season"), + "league_round": safe_get(fixture, "league", "round"), + "league_standings": safe_get(fixture, "league", "standings"), + "home_team_id": safe_get(fixture, "teams", "home", "id"), + "home_team_name": 
safe_get(fixture, "teams", "home", "name"), + "home_team_logo": safe_get(fixture, "teams", "home", "logo"), + "home_team_winner": safe_get(fixture, "teams", "home", "winner"), + "away_team_id": safe_get(fixture, "teams", "away", "id"), + "away_team_name": safe_get(fixture, "teams", "away", "name"), + "away_team_logo": safe_get(fixture, "teams", "away", "logo"), + "away_team_winner": safe_get(fixture, "teams", "away", "winner"), + "home_goals": safe_get(fixture, "goals", "home"), + "away_goals": safe_get(fixture, "goals", "away"), + "home_halftime_goals": safe_get(fixture, "score", "halftime", "home"), + "away_halftime_goals": safe_get(fixture, "score", "halftime", "away"), + "home_fulltime_goals": safe_get(fixture, "score", "fulltime", "home"), + "away_fulltime_goals": safe_get(fixture, "score", "fulltime", "away"), + "home_extratime_goals": safe_get(fixture, "score", "extratime", "home"), + "away_extratime_goals": safe_get(fixture, "score", "extratime", "away"), + "home_penalty_goals": safe_get(fixture, "score", "penalty", "home"), + "away_penalty_goals": safe_get(fixture, "score", "penalty", "away"), } return row def upsert_fixture(self, row: dict): # Upsert a single fixture row into PostgreSQL stmt = pg_insert(fixtures_table).values(**row) - update_dict = {c: stmt.excluded[c] for c in row.keys() if c != 'fixture_id'} - stmt = stmt.on_conflict_do_update(index_elements=['fixture_id'], set_=update_dict) + update_dict = {c: stmt.excluded[c] for c in row.keys() if c != "fixture_id"} + stmt = stmt.on_conflict_do_update(index_elements=["fixture_id"], set_=update_dict) try: with engine.begin() as conn: conn.execute(stmt) @@ -166,8 +179,10 @@ def get_fixtures(self, league_id: int, season: int) -> int: # print(fixture) row = self._map_fixture_to_row(fixture) self.upsert_fixture(row) - - self.logger.info(f"Upserted {len(fixtures)} fixtures for league {league_id} season {season}") + + self.logger.info( + f"Upserted {len(fixtures)} fixtures for league {league_id} season 
{season}" + ) return len(fixtures) else: self.logger.warning(f"No fixtures found for league {league_id} season {season}") @@ -205,45 +220,95 @@ def get_fixtures_for_leagues(self) -> None: def _parse_api_numeric(self, value: Any) -> Any: """Helper to parse numeric values that might be strings, ints, floats, or None.""" - if value is None: + if value is None: return None - if isinstance(value, (int, float)): + if isinstance(value, (int, float)): return value try: # Attempt to remove % if it's a percentage string before float conversion str_value = str(value).strip() - if str_value.endswith('%'): + if str_value.endswith("%"): return float(str_value[:-1]) return float(str_value) except (ValueError, TypeError): return None - def _parse_team_stat_response(self, api_response_data: dict, fixture_id: int, target_team_id: Any) -> dict: + def _parse_team_stat_response( + self, api_response_data: dict, fixture_id: int, target_team_id: Any + ) -> Optional[dict]: """Parses the teams/statistics API response (a dictionary) for a specific team's season stats, and filters the output to match the known columns in api_football.team_stats table.""" known_team_stats_columns = [ - "fixture_id", "team_id", "updated_at", "league_id", "league_name", "league_country", "league_season", - "team_name", "form", "played_home", "wins_home", "draws_home", "loses_home", "played_away", - "wins_away", "draws_away", "loses_away", "played_total", "wins_total", "draws_total", "loses_total", - "goals_for_home", "goals_for_avg_home", "goals_for_away", "goals_for_avg_away", "goals_for_total", - "goals_for_avg_total", "goals_against_home", "goals_against_avg_home", "goals_against_away", - "goals_against_avg_away", "goals_against_total", "goals_against_avg_total", "clean_sheet_home", - "failed_to_score_home", "clean_sheet_away", "failed_to_score_away", "clean_sheet_total", - "failed_to_score_total", "penalty_scored", "penalty_missed", "penalty_total", "streak_wins", - "streak_draws", "streak_loses", 
"biggest_wins_home", "biggest_wins_away", "biggest_loses_home", - "biggest_loses_away", "biggest_goals_for_home", "biggest_goals_for_away", - "biggest_goals_against_home", "biggest_goals_against_away" + "fixture_id", + "team_id", + "updated_at", + "league_id", + "league_name", + "league_country", + "league_season", + "team_name", + "form", + "played_home", + "wins_home", + "draws_home", + "loses_home", + "played_away", + "wins_away", + "draws_away", + "loses_away", + "played_total", + "wins_total", + "draws_total", + "loses_total", + "goals_for_home", + "goals_for_avg_home", + "goals_for_away", + "goals_for_avg_away", + "goals_for_total", + "goals_for_avg_total", + "goals_against_home", + "goals_against_avg_home", + "goals_against_away", + "goals_against_avg_away", + "goals_against_total", + "goals_against_avg_total", + "clean_sheet_home", + "failed_to_score_home", + "clean_sheet_away", + "failed_to_score_away", + "clean_sheet_total", + "failed_to_score_total", + "penalty_scored", + "penalty_missed", + "penalty_total", + "streak_wins", + "streak_draws", + "streak_loses", + "biggest_wins_home", + "biggest_wins_away", + "biggest_loses_home", + "biggest_loses_away", + "biggest_goals_for_home", + "biggest_goals_for_away", + "biggest_goals_against_home", + "biggest_goals_against_away", ] if not api_response_data or not isinstance(api_response_data, dict): - self.logger.warning(f"_parse_team_stat_response: Invalid api_response_data for team {target_team_id}, fixture_id context {fixture_id}") + self.logger.warning( + f"_parse_team_stat_response: Invalid api_response_data for team {target_team_id}, fixture_id context {fixture_id}" + ) return None # Ensure target_team_id is an int try: - target_team_id_int = int(target_team_id[0] if isinstance(target_team_id, tuple) else target_team_id) + target_team_id_int = int( + target_team_id[0] if isinstance(target_team_id, tuple) else target_team_id + ) except (ValueError, TypeError): - self.logger.error(f"_parse_team_stat_response: 
Invalid target_team_id type. Got {target_team_id}") + self.logger.error( + f"_parse_team_stat_response: Invalid target_team_id type. Got {target_team_id}" + ) return None # Extract main sections from API response @@ -256,16 +321,18 @@ def _parse_team_stat_response(self, api_response_data: dict, fixture_id: int, ta clean_sheet_api = api_response_data.get("clean_sheet", {}) failed_to_score_api = api_response_data.get("failed_to_score", {}) - if not team_api.get("id") == target_team_id_int: - self.logger.warning(f"API response team ID {team_api.get('id')} does not match target_team_id {target_team_id_int}") + if team_api.get("id") != target_team_id_int: + self.logger.warning( + f"API response team ID {team_api.get('id')} does not match target_team_id {target_team_id_int}" + ) parsed_data = { - "fixture_id": fixture_id, # Contextual: links this season's stats to a fixture for fetching trigger - "team_id": team_api.get("id", target_team_id_int), # Prefer API's team_id if available + "fixture_id": fixture_id, # Contextual: links this season's stats to a fixture for fetching trigger + "team_id": team_api.get("id", target_team_id_int), # Prefer API's team_id if available "league_id": league_api.get("id"), "league_name": league_api.get("name"), "league_country": league_api.get("country"), - "league_season": league_api.get("season"), # This is the season of the stats + "league_season": league_api.get("season"), # This is the season of the stats "team_name": team_api.get("name"), "form": api_response_data.get("form"), "played_home": fixtures_api.get("played", {}).get("home"), @@ -281,17 +348,29 @@ def _parse_team_stat_response(self, api_response_data: dict, fixture_id: int, ta "loses_away": fixtures_api.get("loses", {}).get("away"), "loses_total": fixtures_api.get("loses", {}).get("total"), "goals_for_home": goals_api.get("for", {}).get("total", {}).get("home"), - "goals_for_avg_home": self._parse_api_numeric(goals_api.get("for", {}).get("average", {}).get("home")), + 
"goals_for_avg_home": self._parse_api_numeric( + goals_api.get("for", {}).get("average", {}).get("home") + ), "goals_for_away": goals_api.get("for", {}).get("total", {}).get("away"), - "goals_for_avg_away": self._parse_api_numeric(goals_api.get("for", {}).get("average", {}).get("away")), + "goals_for_avg_away": self._parse_api_numeric( + goals_api.get("for", {}).get("average", {}).get("away") + ), "goals_for_total": goals_api.get("for", {}).get("total", {}).get("total"), - "goals_for_avg_total": self._parse_api_numeric(goals_api.get("for", {}).get("average", {}).get("total")), + "goals_for_avg_total": self._parse_api_numeric( + goals_api.get("for", {}).get("average", {}).get("total") + ), "goals_against_home": goals_api.get("against", {}).get("total", {}).get("home"), - "goals_against_avg_home": self._parse_api_numeric(goals_api.get("against", {}).get("average", {}).get("home")), + "goals_against_avg_home": self._parse_api_numeric( + goals_api.get("against", {}).get("average", {}).get("home") + ), "goals_against_away": goals_api.get("against", {}).get("total", {}).get("away"), - "goals_against_avg_away": self._parse_api_numeric(goals_api.get("against", {}).get("average", {}).get("away")), + "goals_against_avg_away": self._parse_api_numeric( + goals_api.get("against", {}).get("average", {}).get("away") + ), "goals_against_total": goals_api.get("against", {}).get("total", {}).get("total"), - "goals_against_avg_total": self._parse_api_numeric(goals_api.get("against", {}).get("average", {}).get("total")), + "goals_against_avg_total": self._parse_api_numeric( + goals_api.get("against", {}).get("average", {}).get("total") + ), "clean_sheet_home": clean_sheet_api.get("home"), "clean_sheet_away": clean_sheet_api.get("away"), "clean_sheet_total": clean_sheet_api.get("total"), @@ -310,16 +389,28 @@ def _parse_team_stat_response(self, api_response_data: dict, fixture_id: int, ta "biggest_loses_away": biggest_api.get("loses", {}).get("away"), "biggest_goals_for_home": 
biggest_api.get("goals", {}).get("for", {}).get("home"), "biggest_goals_for_away": biggest_api.get("goals", {}).get("for", {}).get("away"), - "biggest_goals_against_home": biggest_api.get("goals", {}).get("against", {}).get("home"), - "biggest_goals_against_away": biggest_api.get("goals", {}).get("against", {}).get("away"), - "updated_at": datetime.now() + "biggest_goals_against_home": biggest_api.get("goals", {}) + .get("against", {}) + .get("home"), + "biggest_goals_against_away": biggest_api.get("goals", {}) + .get("against", {}) + .get("away"), + "updated_at": datetime.now(), } - + # Filter to only include keys that exist in the known_team_stats_columns list and are not None - filtered_data = {k: v for k, v in parsed_data.items() if k in known_team_stats_columns and v is not None} - - if not filtered_data.get("team_id") or not filtered_data.get("league_id") or not filtered_data.get("league_season"): - self.logger.warning(f"Filtered data for team {target_team_id_int} (fixture context {fixture_id}) is missing essential keys (team_id, league_id, league_season) after filtering.") + filtered_data = { + k: v for k, v in parsed_data.items() if k in known_team_stats_columns and v is not None + } + + if ( + not filtered_data.get("team_id") + or not filtered_data.get("league_id") + or not filtered_data.get("league_season") + ): + self.logger.warning( + f"Filtered data for team {target_team_id_int} (fixture context {fixture_id}) is missing essential keys (team_id, league_id, league_season) after filtering." 
+ ) return None return filtered_data @@ -335,7 +426,7 @@ def get_team_stats_for_fixtures(self): left join api_football.team_stats ts on f.fixture_id = ts.fixture_id WHERE date <= now() and ts.fixture_id is null; """) - + fixtures_to_query_teams_for = [] with engine.connect() as conn: result = conn.execute(query) @@ -345,12 +436,14 @@ def get_team_stats_for_fixtures(self): self.logger.info("No fixtures found to trigger team statistics processing.") return - self.logger.info(f"Found {len(fixtures_to_query_teams_for)} fixtures to check for team statistics updates.") + self.logger.info( + f"Found {len(fixtures_to_query_teams_for)} fixtures to check for team statistics updates." + ) api_call_count_total = 0 api_calls_in_current_batch = 0 - rate_limit_threshold_per_batch = 250 - pause_duration = 60 + rate_limit_threshold_per_batch = 250 + pause_duration = 60 batch_start_time = time.time() for i, fixture_row in enumerate(fixtures_to_query_teams_for): @@ -362,10 +455,14 @@ def get_team_stats_for_fixtures(self): date = fixture_row.date.strftime("%Y-%m-%d") if not all([fixture_id, league_id, home_team_id, away_team_id, season]): - self.logger.warning(f"Skipping fixture {fixture_id} due to missing key DB information for team stat processing.") + self.logger.warning( + f"Skipping fixture {fixture_id} due to missing key DB information for team stat processing." 
+ ) continue - - self.logger.info(f"Processing fixture context {fixture_id} ({i+1}/{len(fixtures_to_query_teams_for)}) for league {league_id}, season {season}") + + self.logger.info( + f"Processing fixture context {fixture_id} ({i + 1}/{len(fixtures_to_query_teams_for)}) for league {league_id}, season {season}" + ) teams_to_process_for_stats = [home_team_id, away_team_id] stats_for_db_batch = [] @@ -374,20 +471,26 @@ def get_team_stats_for_fixtures(self): with engine.connect() as conn: check_stmt = team_stats_table.select().where( team_stats_table.c.team_id == team_id_to_fetch, - team_stats_table.c.fixture_id == fixture_id + team_stats_table.c.fixture_id == fixture_id, ) existing_stat_for_season = conn.execute(check_stmt).first() if existing_stat_for_season: - self.logger.info(f"Season stats for team {team_id_to_fetch}, league {league_id}, season {season} already exist in DB. Skipping API call.") + self.logger.info( + f"Season stats for team {team_id_to_fetch}, league {league_id}, season {season} already exist in DB. Skipping API call." + ) continue except SQLAlchemyError as e_check: - self.logger.error(f"DB error checking existing season stats for team {team_id_to_fetch}, league {league_id}, season {season}: {e_check}") + self.logger.error( + f"DB error checking existing season stats for team {team_id_to_fetch}, league {league_id}, season {season}: {e_check}" + ) if api_calls_in_current_batch >= rate_limit_threshold_per_batch: elapsed_time_in_batch = time.time() - batch_start_time if elapsed_time_in_batch < pause_duration: sleep_time = pause_duration - elapsed_time_in_batch - self.logger.info(f"Rate limit threshold hit. Sleeping for {sleep_time:.1f} seconds.") + self.logger.info( + f"Rate limit threshold hit. Sleeping for {sleep_time:.1f} seconds." 
+ ) time.sleep(sleep_time) api_calls_in_current_batch = 0 batch_start_time = time.time() @@ -397,58 +500,80 @@ def get_team_stats_for_fixtures(self): "league": league_id, "team": team_id_to_fetch, "season": season, - "date": date - } - - self.logger.info(f"Calling API for team {team_id_to_fetch}, league {league_id}, season {season} (context fixture {fixture_id})") + "date": date, + } + + self.logger.info( + f"Calling API for team {team_id_to_fetch}, league {league_id}, season {season} (context fixture {fixture_id})" + ) response_json = self._get_request(endpoint, params) api_calls_in_current_batch += 1 api_call_count_total += 1 if response_json and response_json.get("response"): parsed_stats = self._parse_team_stat_response( - response_json["response"], + response_json["response"], fixture_id, - team_id_to_fetch, - # league_id, + team_id_to_fetch, + # league_id, # season ) if parsed_stats: - self.logger.info(f"Parsed season stats for team {team_id_to_fetch}, league {league_id}") - if not parsed_stats.get("team_id") or parsed_stats.get("team_id") <= 0: - self.logger.warning(f"Skipping stats record - missing or invalid team_id for fixture {fixture_id}") + self.logger.info( + f"Parsed season stats for team {team_id_to_fetch}, league {league_id}" + ) + team_id_value = parsed_stats.get("team_id") + if not team_id_value or (isinstance(team_id_value, (int, float)) and team_id_value <= 0): + self.logger.warning( + f"Skipping stats record - missing or invalid team_id for fixture {fixture_id}" + ) continue stats_for_db_batch.append(parsed_stats) else: - self.logger.warning(f"Failed to parse API season stats for team {team_id_to_fetch}, L:{league_id}, S:{season}. Response: {response_json.get('response')}") + self.logger.warning( + f"Failed to parse API season stats for team {team_id_to_fetch}, L:{league_id}, S:{season}. Response: {response_json.get('response')}" + ) else: - self.logger.error(f"API request failed for team {team_id_to_fetch}, L:{league_id}, S:{season}. 
Params: {params}. Response: {response_json}") - + self.logger.error( + f"API request failed for team {team_id_to_fetch}, L:{league_id}, S:{season}. Params: {params}. Response: {response_json}" + ) + if stats_for_db_batch: try: with engine.begin() as conn: for stat_data in stats_for_db_batch: - conflict_elements = ['team_id', 'fixture_id'] - + conflict_elements = ["team_id", "fixture_id"] + values_to_insert = stat_data.copy() - values_to_insert['fixture_id'] = fixture_id + values_to_insert["fixture_id"] = fixture_id stmt = pg_insert(team_stats_table).values(**values_to_insert) update_cols = { - c.name: stmt.excluded[c.name] for c in team_stats_table.c + c.name: stmt.excluded[c.name] + for c in team_stats_table.c if c.name not in conflict_elements } - stmt = stmt.on_conflict_do_update(index_elements=conflict_elements, set_=update_cols) + stmt = stmt.on_conflict_do_update( + index_elements=conflict_elements, set_=update_cols + ) conn.execute(stmt) - self.logger.info(f"Successfully upserted {len(stats_for_db_batch)} team season stats records (context fixture {fixture_id}).") + self.logger.info( + f"Successfully upserted {len(stats_for_db_batch)} team season stats records (context fixture {fixture_id})." + ) except SQLAlchemyError as e_upsert: - self.logger.error(f"DB error upserting batch season stats (context fixture {fixture_id}): {e_upsert}") - - if (i + 1) % 10 == 0 : - self.logger.info(f"--- Progress: Checked {i+1}/{len(fixtures_to_query_teams_for)} fixture contexts. Total API calls this run: {api_call_count_total} ---") - time.sleep(0.2) # Shorter delay after each fixture context check + self.logger.error( + f"DB error upserting batch season stats (context fixture {fixture_id}): {e_upsert}" + ) + + if (i + 1) % 10 == 0: + self.logger.info( + f"--- Progress: Checked {i + 1}/{len(fixtures_to_query_teams_for)} fixture contexts. 
Total API calls this run: {api_call_count_total} ---" + ) + time.sleep(0.2) # Shorter delay after each fixture context check - self.logger.info(f"Finished get_team_stats_for_fixtures (season stats). Total API calls: {api_call_count_total}") + self.logger.info( + f"Finished get_team_stats_for_fixtures (season stats). Total API calls: {api_call_count_total}" + ) except Exception as e: self.logger.error(f"General error in get_team_stats_for_fixtures: {e}") @@ -527,75 +652,94 @@ def get_statistics(self, fixture_id: int) -> dict: # Map statistics to fixture table columns def stat_map(prefix, stats): mapping = { - 'shots_on_goal': f'{prefix}_shots_on_goal', - 'shots_off_goal': f'{prefix}_shots_off_goal', - 'total_shots': f'{prefix}_total_shots', - 'blocked_shots': f'{prefix}_blocked_shots', - 'shots_insidebox': f'{prefix}_shots_insidebox', - 'shots_outsidebox': f'{prefix}_shots_outsidebox', - 'fouls': f'{prefix}_fouls', - 'corner_kicks': f'{prefix}_corner_kicks', - 'offsides': f'{prefix}_offsides', - 'ball_possession': f'{prefix}_ball_possession', - 'yellow_cards': f'{prefix}_yellow_cards', - 'red_cards': f'{prefix}_red_cards', - 'goalkeeper_saves': f'{prefix}_goalkeeper_saves', - 'total_passes': f'{prefix}_total_passes', - 'passes_accurate': f'{prefix}_passes_accurate', - 'passes_%': f'{prefix}_passes_percent', - 'expected_goals': f'{prefix}_expected_goals' + "shots_on_goal": f"{prefix}_shots_on_goal", + "shots_off_goal": f"{prefix}_shots_off_goal", + "total_shots": f"{prefix}_total_shots", + "blocked_shots": f"{prefix}_blocked_shots", + "shots_insidebox": f"{prefix}_shots_insidebox", + "shots_outsidebox": f"{prefix}_shots_outsidebox", + "fouls": f"{prefix}_fouls", + "corner_kicks": f"{prefix}_corner_kicks", + "offsides": f"{prefix}_offsides", + "ball_possession": f"{prefix}_ball_possession", + "yellow_cards": f"{prefix}_yellow_cards", + "red_cards": f"{prefix}_red_cards", + "goalkeeper_saves": f"{prefix}_goalkeeper_saves", + "total_passes": f"{prefix}_total_passes", + 
"passes_accurate": f"{prefix}_passes_accurate", + "passes_%": f"{prefix}_passes_percent", + "expected_goals": f"{prefix}_expected_goals", } result = {} for k, v in mapping.items(): val = stats.get(k) # Convert ball_possession from '55%' to float if needed - if k == 'ball_possession' and isinstance(val, str) and val.endswith('%'): + if k == "ball_possession" and isinstance(val, str) and val.endswith("%"): try: - val = float(val.replace('%', '')) - except Exception: + val = float(val.replace("%", "")) + except (ValueError, AttributeError): val = None # Convert passes_percent from '55%' to float if needed - if k == 'passes_percent' or k == 'passes_%' and isinstance(val, str) and val.endswith('%'): + elif ( + k == "passes_percent" + or k == "passes_%" + and isinstance(val, str) + and val.endswith("%") + ): try: - val = float(val.replace('%', '')) - except Exception: + val = float(val.replace("%", "")) + except (ValueError, AttributeError): val = None result[v] = val return result - home_stats = stat_map('home', statistics['home']['stats']) - away_stats = stat_map('away', statistics['away']['stats']) + home_stats = stat_map("home", statistics["home"]["stats"]) + away_stats = stat_map("away", statistics["away"]["stats"]) # Prepare update dict update_dict = {**home_stats, **away_stats} # Update the fixture row in PostgreSQL - stmt = update(fixtures_table).where(fixtures_table.c.fixture_id == fixture_id).values(**update_dict) + stmt = ( + update(fixtures_table) + .where(fixtures_table.c.fixture_id == fixture_id) + .values(**update_dict) + ) try: with engine.begin() as conn: conn.execute(stmt) self.logger.info(f"Statistics for fixture {fixture_id} updated in PostgreSQL.") except Exception as e: - self.logger.error(f"Error updating statistics for fixture {fixture_id} in PostgreSQL: {e}") + self.logger.error( + f"Error updating statistics for fixture {fixture_id} in PostgreSQL: {e}" + ) return statistics else: self.logger.warning(f"No statistics found for fixture 
{fixture_id}") try: # Get fixture date from PostgreSQL - query = text("SELECT date FROM api_football.fixtures WHERE fixture_id = :fixture_id") + query = text( + "SELECT date FROM api_football.fixtures WHERE fixture_id = :fixture_id" + ) with engine.connect() as conn: result = conn.execute(query, {"fixture_id": fixture_id}).first() if result: fixture_date = datetime.strptime(str(result[0]), "%Y-%m-%d %H:%M:%S") if fixture_date < datetime.now() - timedelta(days=7): # Delete old fixture from PostgreSQL - delete_query = text("DELETE FROM api_football.fixtures WHERE fixture_id = :fixture_id") + delete_query = text( + "DELETE FROM api_football.fixtures WHERE fixture_id = :fixture_id" + ) with engine.begin() as conn: result = conn.execute(delete_query, {"fixture_id": fixture_id}) if result.rowcount == 1: - self.logger.info(f"Fixture {fixture_id} dropped from PostgreSQL due to date constraint.") + self.logger.info( + f"Fixture {fixture_id} dropped from PostgreSQL due to date constraint." + ) else: - self.logger.warning(f"Fixture {fixture_id} was NOT deleted (rowcount={result.rowcount}).") + self.logger.warning( + f"Fixture {fixture_id} was NOT deleted (rowcount={result.rowcount})." + ) return {} except Exception as e: self.logger.error(f"Error checking date for fixture {fixture_id}: {e}") @@ -634,11 +778,8 @@ def get_fixture_ids_without_statistics(self) -> list[int]: fixture_ids = [] with engine.connect() as conn: - result = conn.execute( - query, - {"today": today, "league_ids": target_league_ids} - ) - + result = conn.execute(query, {"today": today, "league_ids": target_league_ids}) + for row in result: fixture_ids.append(row.fixture_id) league_id = row.league_id @@ -659,154 +800,162 @@ def get_fixture_ids_without_statistics(self) -> list[int]: return [] def get_prediction_for_fixture(self, fixture_id: int) -> bool: - """ - Fetches prediction data for a specific fixture_id from the API, - transforms it, and upserts it into the api_football.predictions table. 
- Args: - fixture_id (int): The ID of the fixture to get predictions for. - Returns: - bool: True if the operation was successful, False otherwise. - """ - api_url = f"{self.base_url}/predictions?fixture={fixture_id}" - try: - response = requests.get(api_url, headers=self.headers) - response.raise_for_status() # Raises an HTTPError for bad responses (4XX or 5XX) - - api_data = response.json() - - if not api_data.get("response"): - message = f"No prediction data found in API response for fixture ID: {fixture_id}" - self.logger.warning(message) - print(message) - return False - - # The main object containing all prediction related data for the fixture - prediction_api_obj = api_data["response"][0] - - # Extract parts of the API response - predictions_part = prediction_api_obj.get("predictions", {}) - teams_api_part = prediction_api_obj.get("teams", {}) - comparison_part = prediction_api_obj.get("comparison", {}) - h2h_list_api = prediction_api_obj.get("h2h", []) - - fixture_home_team_id = teams_api_part.get("home", {}).get("id") - fixture_away_team_id = teams_api_part.get("away", {}).get("id") - - # Helper to safely get and convert percentage strings (e.g., "50%" or "50.5") to float - def parse_percent(percent_val): - if percent_val is None: - return None - if isinstance(percent_val, (int, float)): - return float(percent_val) - try: - s_val = str(percent_val).replace('%', '') - return float(s_val) - except ValueError: - return None - - # Prepare data for database, matching the provided schema - data_to_upsert = { - "fixture_id": fixture_id, - - # From 'predictions' part - "winner_id": predictions_part.get("winner", {}).get("id"), - "winner_name": predictions_part.get("winner", {}).get("name"), - "win_or_draw": predictions_part.get("win_or_draw"), - "under_over": predictions_part.get("under_over"), - "goals_home": predictions_part.get("goals", {}).get("home"), - "goals_away": predictions_part.get("goals", {}).get("away"), - "advice": predictions_part.get("advice"), - 
"home_win_percent": parse_percent(predictions_part.get("percent", {}).get("home")), - "draw_percent": parse_percent(predictions_part.get("percent", {}).get("draw")), - "away_win_percent": parse_percent(predictions_part.get("percent", {}).get("away")), - - # From 'comparison' part - parse percentages to remove % signs - "comparison_home_form": parse_percent(comparison_part.get("form", {}).get("home")), - "comparison_away_form": parse_percent(comparison_part.get("form", {}).get("away")), - "comparison_home_att": parse_percent(comparison_part.get("att", {}).get("home")), - "comparison_away_att": parse_percent(comparison_part.get("att", {}).get("away")), - "comparison_home_def": parse_percent(comparison_part.get("def", {}).get("home")), - "comparison_away_def": parse_percent(comparison_part.get("def", {}).get("away")), - "comparison_home_poisson_distribution": parse_percent(comparison_part.get("poisson_distribution", {}).get("home")), - "comparison_away_poisson_distribution": parse_percent(comparison_part.get("poisson_distribution", {}).get("away")), - "comparison_home_h2h": parse_percent(comparison_part.get("h2h", {}).get("home")), - "comparison_away_h2h": parse_percent(comparison_part.get("h2h", {}).get("away")), - "comparison_home_goals": parse_percent(comparison_part.get("goals", {}).get("home")), - "comparison_away_goals": parse_percent(comparison_part.get("goals", {}).get("away")), - "comparison_home_total": parse_percent(comparison_part.get("total", {}).get("home")), - "comparison_away_total": parse_percent(comparison_part.get("total", {}).get("away")), - - "updated_at": datetime.now(), - } + """ + Fetches prediction data for a specific fixture_id from the API, + transforms it, and upserts it into the api_football.predictions table. + Args: + fixture_id (int): The ID of the fixture to get predictions for. + Returns: + bool: True if the operation was successful, False otherwise. 
+ """ + api_url = f"{self.base_url}/predictions?fixture={fixture_id}" + try: + response = requests.get(api_url, headers=self.headers) + response.raise_for_status() # Raises an HTTPError for bad responses (4XX or 5XX) - # Calculate H2H stats - h2h_home_wins_calc, h2h_away_wins_calc, h2h_draws_calc = 0, 0, 0 - if fixture_home_team_id is not None and fixture_away_team_id is not None and h2h_list_api: - for match in h2h_list_api: - match_teams = match.get("teams", {}) - match_goals = match.get("goals", {}) - - h2h_match_home_id = match_teams.get("home", {}).get("id") - - h2h_gh = match_goals.get("home") - h2h_ga = match_goals.get("away") - - if isinstance(h2h_gh, (int, float)) and isinstance(h2h_ga, (int, float)): - if h2h_gh == h2h_ga: - h2h_draws_calc += 1 - elif h2h_match_home_id == fixture_home_team_id: - if h2h_gh > h2h_ga: - h2h_home_wins_calc +=1 - else: - h2h_away_wins_calc +=1 - elif h2h_match_home_id == fixture_away_team_id: - if h2h_ga > h2h_gh: - h2h_home_wins_calc += 1 - else: - h2h_away_wins_calc += 1 - - data_to_upsert["h2h_home_wins"] = h2h_home_wins_calc - data_to_upsert["h2h_away_wins"] = h2h_away_wins_calc - data_to_upsert["h2h_draws"] = h2h_draws_calc - data_to_upsert["h2h_total_matches"] = len(h2h_list_api) if h2h_list_api else 0 - - # Upsert to PostgreSQL - stmt = pg_insert(predictions_table).values(**data_to_upsert) - update_dict = {c: stmt.excluded[c] for c in data_to_upsert.keys() if c != 'fixture_id'} - stmt = stmt.on_conflict_do_update(index_elements=['fixture_id'], set_=update_dict) - try: - with engine.begin() as conn: - conn.execute(stmt) - message = f"Successfully upserted prediction for fixture ID: {fixture_id}" - self.logger.info(message) - print(message) - return True - except SQLAlchemyError as e: - message = f"Error upserting prediction for fixture ID {fixture_id}: {e}" - self.logger.error(message) - print(message) - return False - - except requests.exceptions.RequestException as e: - message = f"API request failed for fixture ID 
{fixture_id}: {e}" - self.logger.error(message) + api_data = response.json() + + if not api_data.get("response"): + message = f"No prediction data found in API response for fixture ID: {fixture_id}" + self.logger.warning(message) print(message) return False - except (Exception, psycopg2.Error) as e: - message = f"Error processing or upserting prediction for fixture ID {fixture_id}: {e}" + + # The main object containing all prediction related data for the fixture + prediction_api_obj = api_data["response"][0] + + # Extract parts of the API response + predictions_part = prediction_api_obj.get("predictions", {}) + teams_api_part = prediction_api_obj.get("teams", {}) + comparison_part = prediction_api_obj.get("comparison", {}) + h2h_list_api = prediction_api_obj.get("h2h", []) + + fixture_home_team_id = teams_api_part.get("home", {}).get("id") + fixture_away_team_id = teams_api_part.get("away", {}).get("id") + + # Helper to safely get and convert percentage strings (e.g., "50%" or "50.5") to float + def parse_percent(percent_val): + if percent_val is None: + return None + if isinstance(percent_val, (int, float)): + return float(percent_val) + try: + s_val = str(percent_val).replace("%", "") + return float(s_val) + except ValueError: + return None + + # Prepare data for database, matching the provided schema + data_to_upsert = { + "fixture_id": fixture_id, + # From 'predictions' part + "winner_id": predictions_part.get("winner", {}).get("id"), + "winner_name": predictions_part.get("winner", {}).get("name"), + "win_or_draw": predictions_part.get("win_or_draw"), + "under_over": predictions_part.get("under_over"), + "goals_home": predictions_part.get("goals", {}).get("home"), + "goals_away": predictions_part.get("goals", {}).get("away"), + "advice": predictions_part.get("advice"), + "home_win_percent": parse_percent(predictions_part.get("percent", {}).get("home")), + "draw_percent": parse_percent(predictions_part.get("percent", {}).get("draw")), + "away_win_percent": 
parse_percent(predictions_part.get("percent", {}).get("away")), + # From 'comparison' part - parse percentages to remove % signs + "comparison_home_form": parse_percent(comparison_part.get("form", {}).get("home")), + "comparison_away_form": parse_percent(comparison_part.get("form", {}).get("away")), + "comparison_home_att": parse_percent(comparison_part.get("att", {}).get("home")), + "comparison_away_att": parse_percent(comparison_part.get("att", {}).get("away")), + "comparison_home_def": parse_percent(comparison_part.get("def", {}).get("home")), + "comparison_away_def": parse_percent(comparison_part.get("def", {}).get("away")), + "comparison_home_poisson_distribution": parse_percent( + comparison_part.get("poisson_distribution", {}).get("home") + ), + "comparison_away_poisson_distribution": parse_percent( + comparison_part.get("poisson_distribution", {}).get("away") + ), + "comparison_home_h2h": parse_percent(comparison_part.get("h2h", {}).get("home")), + "comparison_away_h2h": parse_percent(comparison_part.get("h2h", {}).get("away")), + "comparison_home_goals": parse_percent( + comparison_part.get("goals", {}).get("home") + ), + "comparison_away_goals": parse_percent( + comparison_part.get("goals", {}).get("away") + ), + "comparison_home_total": parse_percent( + comparison_part.get("total", {}).get("home") + ), + "comparison_away_total": parse_percent( + comparison_part.get("total", {}).get("away") + ), + "updated_at": datetime.now(), + } + + # Calculate H2H stats + h2h_home_wins_calc, h2h_away_wins_calc, h2h_draws_calc = 0, 0, 0 + if ( + fixture_home_team_id is not None + and fixture_away_team_id is not None + and h2h_list_api + ): + for match in h2h_list_api: + match_teams = match.get("teams", {}) + match_goals = match.get("goals", {}) + + h2h_match_home_id = match_teams.get("home", {}).get("id") + + h2h_gh = match_goals.get("home") + h2h_ga = match_goals.get("away") + + if isinstance(h2h_gh, (int, float)) and isinstance(h2h_ga, (int, float)): + if h2h_gh == 
h2h_ga: + h2h_draws_calc += 1 + elif h2h_match_home_id == fixture_home_team_id: + if h2h_gh > h2h_ga: + h2h_home_wins_calc += 1 + else: + h2h_away_wins_calc += 1 + elif h2h_match_home_id == fixture_away_team_id: + if h2h_ga > h2h_gh: + h2h_home_wins_calc += 1 + else: + h2h_away_wins_calc += 1 + + data_to_upsert["h2h_home_wins"] = h2h_home_wins_calc + data_to_upsert["h2h_away_wins"] = h2h_away_wins_calc + data_to_upsert["h2h_draws"] = h2h_draws_calc + data_to_upsert["h2h_total_matches"] = len(h2h_list_api) if h2h_list_api else 0 + + # Upsert to PostgreSQL + stmt = pg_insert(predictions_table).values(**data_to_upsert) + update_dict = {c: stmt.excluded[c] for c in data_to_upsert.keys() if c != "fixture_id"} + stmt = stmt.on_conflict_do_update(index_elements=["fixture_id"], set_=update_dict) + try: + with engine.begin() as conn: + conn.execute(stmt) + message = f"Successfully upserted prediction for fixture ID: {fixture_id}" + self.logger.info(message) + print(message) + return True + except SQLAlchemyError as e: + message = f"Error upserting prediction for fixture ID {fixture_id}: {e}" self.logger.error(message) print(message) - if self.conn and not self.conn.closed: - try: - self.conn.rollback() - except psycopg2.Error as rb_e: - print(f"Rollback failed: {rb_e}") return False + except requests.exceptions.RequestException as e: + message = f"API request failed for fixture ID {fixture_id}: {e}" + self.logger.error(message) + print(message) + return False + except (Exception, psycopg2.Error) as e: + message = f"Error processing or upserting prediction for fixture ID {fixture_id}: {e}" + self.logger.error(message) + print(message) + return False + def get_fixture_ids_without_predictions(self) -> list[int]: """ Retrieves fixture IDs from PostgreSQL where predictions do not exist. - + Returns: List[int]: List of fixture IDs without predictions. 
""" @@ -876,7 +1025,7 @@ def update_venues(self) -> None: if team_data["results"] > 0: team = team_data["response"][0] venue = team["venue"] - + # Upsert to venues table upsert_query = text(""" INSERT INTO api_football.venues ( @@ -920,12 +1069,12 @@ def update_venues(self) -> None: "venue_capacity": venue["capacity"], "venue_surface": venue["surface"], "venue_image": venue["image"], - "updated_at": datetime.now() + "updated_at": datetime.now(), } with engine.begin() as conn: conn.execute(upsert_query, params) - + self.logger.info(f"Updated venue data for team ID: {team_id}") else: self.logger.error( @@ -942,24 +1091,30 @@ def delete_old_unscored_fixtures(self) -> None: try: # Calculate cutoff date (7 days ago) cutoff_date = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d") - + # First count the number of fixtures that match criteria with engine.connect() as conn: - result = conn.execute(text(""" + result = conn.execute( + text(""" SELECT COUNT(*) FROM api_football.fixtures WHERE date < :cutoff_date AND home_goals IS NULL - """), {"cutoff_date": cutoff_date}) + """), + {"cutoff_date": cutoff_date}, + ) count = result.scalar() - + self.logger.info(f"Found {count} old unscored fixtures to delete") - + # Then delete them with engine.begin() as conn: - result = conn.execute(text(""" + result = conn.execute( + text(""" DELETE FROM api_football.fixtures WHERE date < :cutoff_date AND home_goals IS NULL - """), {"cutoff_date": cutoff_date}) + """), + {"cutoff_date": cutoff_date}, + ) deleted_count = result.rowcount self.logger.info(f"Deleted {deleted_count} old unscored fixtures") @@ -968,10 +1123,12 @@ def delete_old_unscored_fixtures(self) -> None: except Exception as e: self.logger.error(f"Error deleting old unscored fixtures: {e}") - def _map_api_event_to_row(self, event_data: dict, fixture_id_context: int) -> dict: + def _map_api_event_to_row(self, event_data: dict, fixture_id_context: int) -> Optional[dict]: """Maps a single event object from the API response 
to a dictionary for the fixture_events table.""" if not event_data or not isinstance(event_data, dict): - self.logger.warning(f"_map_api_event_to_row: Invalid event_data for fixture_id {fixture_id_context}") + self.logger.warning( + f"_map_api_event_to_row: Invalid event_data for fixture_id {fixture_id_context}" + ) return None # Helper to safely access nested dictionary keys @@ -985,24 +1142,27 @@ def safe_get(d, *keys, default=None): # Get the raw event detail raw_event_detail = safe_get(event_data, "detail") event_type = safe_get(event_data, "type") - + # Handle missing event_detail with appropriate defaults based on event type if raw_event_detail is None and event_type: # Map event types to default details when API doesn't provide them default_details = { - "Var": "VAR Check", - "var": "VAR Check", # Handle case variations - "VAR": "VAR Check", + "Var": VAR_CHECK_DETAIL, + "var": VAR_CHECK_DETAIL, # Handle case variations + "VAR": VAR_CHECK_DETAIL, "Goal": "Goal", "Card": "Card", "subst": "Substitution", - "Substitution": "Substitution" + "Substitution": "Substitution", } - - event_detail = default_details.get(event_type, f"{event_type} Event") - + + event_type_str = str(event_type) if event_type is not None else "" + event_detail = default_details.get(event_type_str, f"{event_type_str} Event") + # Log when we apply a default value for tracking - self.logger.info(f"Applied default event_detail '{event_detail}' for event_type '{event_type}' in fixture {fixture_id_context}") + self.logger.info( + f"Applied default event_detail '{event_detail}' for event_type '{event_type}' in fixture {fixture_id_context}" + ) else: event_detail = raw_event_detail @@ -1019,18 +1179,22 @@ def safe_get(d, *keys, default=None): "event_type": event_type, "event_detail": event_detail, "event_comments": safe_get(event_data, "comments"), - "updated_at": datetime.now() + "updated_at": datetime.now(), } - + if not mapped_event["team_id"] or not mapped_event["event_type"]: - 
self.logger.warning(f"Essential event data missing for fixture {fixture_id_context}: {event_data}") + self.logger.warning( + f"Essential event data missing for fixture {fixture_id_context}: {event_data}" + ) return None - + # Final validation that event_detail is not None (database constraint) if mapped_event["event_detail"] is None: - self.logger.warning(f"Could not determine event_detail for event_type '{event_type}' in fixture {fixture_id_context}. Original data: {event_data}") + self.logger.warning( + f"Could not determine event_detail for event_type '{event_type}' in fixture {fixture_id_context}. Original data: {event_data}" + ) return None - + return mapped_event def get_and_upsert_fixture_events(self, fixture_id: int) -> bool: @@ -1044,13 +1208,15 @@ def get_and_upsert_fixture_events(self, fixture_id: int) -> bool: """ endpoint = "fixtures/events" params = {"fixture": fixture_id} - + self.logger.info(f"Fetching events for fixture_id: {fixture_id}") api_response = self._get_request(endpoint, params) if not api_response or "response" not in api_response or not api_response["response"]: - self.logger.warning(f"No event data found in API response for fixture ID: {fixture_id}. API Response: {api_response}") - return True + self.logger.warning( + f"No event data found in API response for fixture ID: {fixture_id}. API Response: {api_response}" + ) + return True events_from_api = api_response["response"] events_to_insert = [] @@ -1059,27 +1225,37 @@ def get_and_upsert_fixture_events(self, fixture_id: int) -> bool: mapped_row = self._map_api_event_to_row(event_data, fixture_id) if mapped_row: events_to_insert.append(mapped_row) - + if not events_to_insert: - self.logger.info(f"No valid events parsed to insert for fixture_id: {fixture_id} after mapping.") - return True # No valid events to insert, consider it done. + self.logger.info( + f"No valid events parsed to insert for fixture_id: {fixture_id} after mapping." 
+ ) + return True # No valid events to insert, consider it done. try: with engine.begin() as conn: # Delete existing events for this fixture_id to prevent duplicates/stale data - delete_stmt = delete(fixture_events_table).where(fixture_events_table.c.fixture_id == fixture_id) + delete_stmt = delete(fixture_events_table).where( + fixture_events_table.c.fixture_id == fixture_id + ) delete_result = conn.execute(delete_stmt) - self.logger.info(f"Deleted {delete_result.rowcount} existing event(s) for fixture_id: {fixture_id}") + self.logger.info( + f"Deleted {delete_result.rowcount} existing event(s) for fixture_id: {fixture_id}" + ) # Bulk insert the new events conn.execute(fixture_events_table.insert(), events_to_insert) - self.logger.info(f"Successfully inserted {len(events_to_insert)} events for fixture_id: {fixture_id}") + self.logger.info( + f"Successfully inserted {len(events_to_insert)} events for fixture_id: {fixture_id}" + ) return True except SQLAlchemyError as e: self.logger.error(f"Database error processing events for fixture_id {fixture_id}: {e}") return False except Exception as e: - self.logger.error(f"Unexpected error processing events for fixture_id {fixture_id}: {e}") + self.logger.error( + f"Unexpected error processing events for fixture_id {fixture_id}: {e}" + ) return False def get_events_for_missing_fixtures(self): @@ -1087,8 +1263,10 @@ def get_events_for_missing_fixtures(self): Identifies finished fixtures without events and fetches/stores their event data. Includes API rate limiting. """ - self.logger.info("Starting to get fixture events for finished fixtures missing event data...") - + self.logger.info( + "Starting to get fixture events for finished fixtures missing event data..." 
+ ) + fixtures_to_query = [] try: query = text(""" @@ -1099,7 +1277,7 @@ def get_events_for_missing_fixtures(self): AND f.date <= NOW() AND e.fixture_id IS NULL; """) - + with engine.connect() as conn: result = conn.execute(query) fixtures_to_query = [row.fixture_id for row in result] @@ -1108,7 +1286,9 @@ def get_events_for_missing_fixtures(self): self.logger.info("No finished fixtures found missing event data.") return - self.logger.info(f"Found {len(fixtures_to_query)} finished fixtures missing event data.") + self.logger.info( + f"Found {len(fixtures_to_query)} finished fixtures missing event data." + ) except SQLAlchemyError as e: self.logger.error(f"DB error querying for fixtures missing events: {e}") @@ -1122,35 +1302,44 @@ def get_events_for_missing_fixtures(self): rate_limit_threshold_per_batch = 250 # As used in get_team_stats_for_fixtures pause_duration = 60 # Seconds batch_start_time = time.time() - + processed_count = 0 for fixture_id in fixtures_to_query: if api_calls_in_current_batch >= rate_limit_threshold_per_batch: elapsed_time_in_batch = time.time() - batch_start_time if elapsed_time_in_batch < pause_duration: sleep_time = pause_duration - elapsed_time_in_batch - self.logger.info(f"Event fetching rate limit: Sleeping for {sleep_time:.1f} seconds.") + self.logger.info( + f"Event fetching rate limit: Sleeping for {sleep_time:.1f} seconds." + ) time.sleep(sleep_time) api_calls_in_current_batch = 0 batch_start_time = time.time() success = self.get_and_upsert_fixture_events(fixture_id) - api_calls_in_current_batch += 1 - api_call_count_total +=1 - + api_calls_in_current_batch += 1 + api_call_count_total += 1 + if success: processed_count += 1 else: - self.logger.warning(f"Failed to get/upsert events for fixture_id: {fixture_id}. Will not retry in this run.") + self.logger.warning( + f"Failed to get/upsert events for fixture_id: {fixture_id}. Will not retry in this run." 
+ ) if (api_call_count_total % 10 == 0) or (api_call_count_total == len(fixtures_to_query)): - self.logger.info(f"--- Event Fetching Progress: Attempted {api_call_count_total}/{len(fixtures_to_query)} fixtures. " - f"Successfully processed: {processed_count}. ---") - - time.sleep(0.1) # Small delay between individual fixture event calls + self.logger.info( + f"--- Event Fetching Progress: Attempted {api_call_count_total}/{len(fixtures_to_query)} fixtures. " + f"Successfully processed: {processed_count}. ---" + ) + + time.sleep(0.1) # Small delay between individual fixture event calls + + self.logger.info( + f"Finished getting fixture events. Total API calls: {api_call_count_total}. " + f"Successfully processed fixtures with events: {processed_count}." + ) - self.logger.info(f"Finished getting fixture events. Total API calls: {api_call_count_total}. " - f"Successfully processed fixtures with events: {processed_count}.") def main(): api_key = os.getenv("API_FOOTBALL_API_KEY") @@ -1166,5 +1355,7 @@ def main(): api_football.get_team_stats_for_fixtures() api_football.get_events_for_missing_fixtures() api_football.update_venues() + + if __name__ == "__main__": main() diff --git a/data/Create_data/utils/feature_selection.py b/data/Create_data/utils/feature_selection.py index b039710..97e7ada 100644 --- a/data/Create_data/utils/feature_selection.py +++ b/data/Create_data/utils/feature_selection.py @@ -33,7 +33,7 @@ sys.path.append(str(project_root)) except Exception as e: print(f"Error setting project root path: {e}") - sys.path.append(os.getcwd().parent) + sys.path.append(str(Path(os.getcwd()).parent)) # Import ExperimentLogger and evaluation set creation from utils.create_evaluation_set import ( @@ -54,7 +54,7 @@ def __init__( target_features: tuple[int, int] = (50, 80), random_state: int = 42, experiment_name: str = "feature_selection_optimization", - logger: ExperimentLogger = None, + logger: Optional[ExperimentLogger] = None, ): """Initialize feature selector. 
@@ -110,70 +110,95 @@ def optimize_composite_weights( """ self.logger.info("Starting composite weight optimization") - # Default weight grid if none provided + weight_grid = self._get_default_weight_grid(weight_grid) + weight_combinations = self._generate_weight_combinations(weight_grid) + best_weights, best_score = self._evaluate_weight_combinations(X, y, model, weight_combinations) + + return self._finalize_optimization_results(best_weights, best_score) + + def _get_default_weight_grid(self, weight_grid: Optional[dict[str, list[float]]]) -> dict[str, list[float]]: + """Get default weight grid if none provided.""" if weight_grid is None: - weight_grid = { + return { "gain": [0.4, 0.5, 0.6], "weight": [0.2, 0.3, 0.4], "cover": [0.1, 0.2, 0.3], } + return weight_grid - best_score = -np.inf - best_weights = None - - # Generate all weight combinations - weight_combinations = [] + def _generate_weight_combinations(self, weight_grid: dict[str, list[float]]) -> list[tuple[float, float, float]]: + """Generate all valid weight combinations that sum to 1.""" + combinations = [] for gain in weight_grid["gain"]: for weight in weight_grid["weight"]: for cover in weight_grid["cover"]: if abs(gain + weight + cover - 1.0) < 1e-10: # Sum to 1 - weight_combinations.append((gain, weight, cover)) + combinations.append((gain, weight, cover)) + return combinations + + def _evaluate_weight_combinations( + self, + X: pd.DataFrame, + y: pd.Series, + model: xgb.XGBClassifier, + weight_combinations: list[tuple[float, float, float]] + ) -> tuple[Optional[dict[str, float]], float]: + """Evaluate all weight combinations and return the best.""" + best_score = -np.inf + best_weights = None - # Evaluate each combination for gain, weight, cover in weight_combinations: try: - # Calculate composite scores - importance_scores = self._calculate_composite_scores( - X, y, model, {"gain": gain, "weight": weight, "cover": cover} - ) + score = self._evaluate_single_weight_combination(X, y, model, gain, 
weight, cover) + if score > best_score: + best_score = score + best_weights = {"gain": gain, "weight": weight, "cover": cover} + except Exception as e: + self.logger.error(f"Error evaluating weights {(gain, weight, cover)}: {str(e)}") + continue - # Convert scores to DataFrame for proper indexing - scores_df = pd.DataFrame.from_dict( - importance_scores, orient="index", columns=["score"] - ) + return best_weights, best_score - # Select top features based on scores - top_features = self._select_top_features( - scores_df["score"].to_dict(), # Convert back to dict with proper indexing - X, - min_features=self.target_features[0], - ) + def _evaluate_single_weight_combination( + self, + X: pd.DataFrame, + y: pd.Series, + model: xgb.XGBClassifier, + gain: float, + weight_val: float, + cover: float + ) -> float: + """Evaluate a single weight combination.""" + weights = {"gain": gain, "weight": weight_val, "cover": cover} - # Evaluate feature set - score = self._evaluate_feature_set(X[top_features], y, model) + # Calculate composite scores + importance_scores = self._calculate_composite_scores(pd.DataFrame(X), y, model, weights) # type: ignore[arg-type] - # Log to MLflow - mlflow.log_metrics({"cv_score": score, "n_features": len(top_features)}) + # Select top features + top_features = self._select_top_features( + importance_scores, X, min_features=self.target_features[0] + ) - if score > best_score: - best_score = score - best_weights = {"gain": gain, "weight": weight, "cover": cover} + # Evaluate feature set + score = self._evaluate_feature_set(pd.DataFrame(X[top_features]), y, model) # type: ignore - # Log best weights - mlflow.log_metrics( - { - "best_gain_weight": gain, - "best_weight_weight": weight, - "best_cover_weight": cover, - "best_cv_score": score, - } - ) + # Log to MLflow + mlflow.log_metrics({"cv_score": score, "n_features": len(top_features)}) - except Exception as e: - self.logger.error(f"Error evaluating weights {(gain, weight, cover)}: {str(e)}") - 
continue + # Log best weights + mlflow.log_metrics({ + "best_gain_weight": gain, + "best_weight_weight": weight_val, + "best_cover_weight": cover, + "best_cv_score": score, + }) + + return score - # Handle case where no valid weights are found + def _finalize_optimization_results( + self, best_weights: Optional[dict[str, float]], best_score: float + ) -> dict[str, float]: + """Finalize optimization results and handle edge cases.""" if best_weights is None: self.logger.warning("No valid weights found, using defaults") best_weights = {"gain": 0.5, "weight": 0.3, "cover": 0.2} @@ -198,6 +223,9 @@ def _calculate_composite_scores( Returns: Dictionary of composite scores for each feature """ + # Ensure X is a DataFrame + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) model.fit(X, y) # Get importance scores for each metric @@ -212,7 +240,8 @@ def _calculate_composite_scores( normalized_metrics = {} for metric, scores in importance_metrics.items(): # Convert to DataFrame for normalization - score_df = pd.DataFrame.from_dict(scores, orient="index", columns=[metric]) + score_df = pd.DataFrame.from_dict(scores, orient="index") + score_df.columns = [metric] normalized_metrics[metric] = pd.DataFrame( scaler.fit_transform(score_df), index=score_df.index ) @@ -265,6 +294,9 @@ def _evaluate_feature_set( Returns: Mean cross-validation score """ + # Ensure X is a DataFrame + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) try: # Create a fresh model instance for cross-validation cv_model = xgb.XGBClassifier( @@ -293,7 +325,7 @@ def analyze_correlations( List of correlated feature groups """ # Calculate correlation matrix - correlation_matrix = X[selected_features].corr() + correlation_matrix = X[selected_features].corr() # type: ignore # Find highly correlated features correlated_groups = [] @@ -338,23 +370,24 @@ def perform_stability_selection( """ self.logger.info(f"Starting stability selection with {self.n_bootstrap} iterations") - feature_counts = {feature: 
0 for feature in X.columns} - feature_scores = {feature: [] for feature in X.columns} + feature_counts = dict.fromkeys(X.columns, 0) + feature_scores = dict.fromkeys(X.columns, []) for i in range(self.n_bootstrap): try: # Create bootstrap sample - indices = np.random.choice(len(X), size=len(X), replace=True) - X_boot = X.iloc[indices] + rng = np.random.default_rng(42) + indices = rng.choice(len(X), size=len(X), replace=True) + x_boot = X.iloc[indices] y_boot = y.iloc[indices] # Calculate feature importance for this bootstrap # Note: eval_metric removed from fit() calls in _calculate_composite_scores - importance_scores = self._calculate_composite_scores(X_boot, y_boot, model, weights) + importance_scores = self._calculate_composite_scores(x_boot, y_boot, model, weights) # Select top features selected = self._select_top_features( - importance_scores, X_boot, min_features=self.target_features[0] + importance_scores, x_boot, min_features=self.target_features[0] ) # Update counts and scores @@ -388,7 +421,7 @@ def perform_stability_selection( mlflow.log_metrics( { "n_stable_features": len(stable_features), - "mean_stability_score": np.mean(list(stability_scores.values())), + "mean_stability_score": float(np.mean(list(stability_scores.values()))), } ) @@ -430,7 +463,7 @@ def perform_iterative_elimination( while len(remaining_features) > min_features: try: # Evaluate current feature set - score = self._evaluate_feature_set(X[remaining_features], y, model, cv=cv) + score = self._evaluate_feature_set(X[remaining_features], y, model, cv=cv) # type: ignore[arg-type] scores_history.append((len(remaining_features), score)) # Update best score and features @@ -440,7 +473,7 @@ def perform_iterative_elimination( # Calculate feature importance importance_scores = self._calculate_composite_scores( - X[remaining_features], + X[remaining_features], # type: ignore[arg-type] y, model, weights={"gain": 0.5, "weight": 0.3, "cover": 0.2}, @@ -505,7 +538,7 @@ def select_features(self, X: 
pd.DataFrame, y: pd.Series, model: xgb.XGBClassifie ] # 4. Analyze correlations among stable features - correlation_groups = self.analyze_correlations(X[stable_features], stable_features) + correlation_groups = self.analyze_correlations(pd.DataFrame(X[stable_features]), stable_features) # 5. Remove redundant features from each correlation group unique_features = [] @@ -521,7 +554,7 @@ def select_features(self, X: pd.DataFrame, y: pd.Series, model: xgb.XGBClassifie unique_features.extend(uncorrelated) # 6. Perform iterative elimination on remaining features - final_features = self.perform_iterative_elimination(X[unique_features], y, model) + final_features = self.perform_iterative_elimination(pd.DataFrame(X[unique_features]), y, model) # Store selected features self.selected_features = final_features @@ -540,7 +573,7 @@ def __init__( self, min_recall: float = 0.20, target_precision: float = 0.50, - logger: ExperimentLogger = None, + logger: Optional[ExperimentLogger] = None, handle_imbalance: bool = True, calibrate_probas: bool = True, ): @@ -585,7 +618,8 @@ def _handle_class_imbalance( self.logger.info("Applying SMOTE for class imbalance") smote = SMOTE(random_state=42) - X_resampled, y_resampled = smote.fit_resample(X, y) + result = smote.fit_resample(X, y) + x_resampled, y_resampled = result[0], result[1] self.logger.info( f"Original class distribution: {pd.Series(y).value_counts(normalize=True)}" @@ -594,7 +628,7 @@ def _handle_class_imbalance( f"Resampled class distribution: {pd.Series(y_resampled).value_counts(normalize=True)}" ) - return pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled) + return pd.DataFrame(x_resampled, columns=X.columns), pd.Series(y_resampled) except Exception as e: self.logger.error(f"Error in class imbalance handling: {str(e)}") @@ -626,35 +660,29 @@ def analyze_feature_importance( self, model: xgb.XGBClassifier, feature_names: list[str], - X_val: pd.DataFrame, + x_val: pd.DataFrame, y_val: pd.Series, ) -> 
pd.DataFrame: """Analyze feature importance with enhanced precision focus.""" try: self.logger.info("Handling class imbalance") - X_balanced, y_balanced = self._handle_class_imbalance(X_val, y_val) + x_balanced, y_balanced = self._handle_class_imbalance(x_val, y_val) self.logger.info("Calibrating model if enabled") - calibrated_model = self._calibrate_model(model, X_balanced, y_balanced) + calibrated_model = self._calibrate_model(model, x_balanced, y_balanced) self.logger.info("Getting base importance scores") importance_base = model.feature_importances_ self.logger.info("Calculating precision impact scores with balanced data") precision_impact = self._calculate_precision_impact( - calibrated_model, X_balanced, y_balanced, feature_names + calibrated_model, x_balanced, y_balanced, feature_names ) - # self.logger.info("Calculating interaction importance") - # interaction_importance = self._calculate_interaction_importance( - # calibrated_model, X_balanced, y_balanced, feature_names - # ) - self.logger.info("Combining scores with updated weights") combined_scores = ( 0.5 * precision_impact # Increased weight for precision impact + 0.3 * importance_base # Base importance - # 0.2 * interaction_importance # New interaction component ) self.logger.info("Creating and sorting importance DataFrame") @@ -663,7 +691,6 @@ def analyze_feature_importance( "feature": feature_names, "importance": importance_base, "precision_impact": precision_impact, - # 'interaction_importance': interaction_importance, "combined_score": combined_scores, } ).sort_values("combined_score", ascending=False) @@ -685,13 +712,13 @@ def _calculate_interaction_importance( for feature2 in feature_names[i + 1 :]: # Create interaction feature interaction_name = f"{feature1}_{feature2}" - X_interaction = X.copy() - X_interaction[interaction_name] = X[feature1] * X[feature2] + x_interaction = X.copy() + x_interaction[interaction_name] = X[feature1] * X[feature2] self.created_interactions.add(interaction_name) # 
Get predictions with interaction base_score = model.score(X, y) - interaction_score = model.score(X_interaction, y) + interaction_score = model.score(x_interaction, y) # Add interaction impact to both features impact = interaction_score - base_score @@ -731,14 +758,14 @@ def _calculate_precision_impact( for i, feature in enumerate(feature_names): try: # Create copy of X without the current feature - X_reduced = X.drop(columns=[feature]) + x_reduced = X.drop(columns=[feature]) # Retrain model on reduced feature set reduced_model = xgb.XGBClassifier(tree_method="hist", device="cpu", random_state=42) - reduced_model.fit(X_reduced, y) + reduced_model.fit(x_reduced, y) # Calculate precision with reduced feature set - reduced_precision = precision_score(y, reduced_model.predict(X_reduced)) + reduced_precision = precision_score(y, reduced_model.predict(x_reduced)) # Calculate precision impact precision_impact[i] = baseline_precision - reduced_precision @@ -756,19 +783,19 @@ def _calculate_precision_impact( return precision_impact def select_features( - self, importance_df: pd.DataFrame, X_val: pd.DataFrame, correlation_threshold: float = 0.85 + self, importance_df: pd.DataFrame, x_val: pd.DataFrame, correlation_threshold: float = 0.85 ) -> list[str]: """Select optimal feature set with enhanced precision focus.""" try: # Először ellenőrizzük az adatok minőségét - self._validate_data_quality(X_val) + self._validate_data_quality(x_val) # Candidate features validálása candidate_features = importance_df.sort_values("combined_score", ascending=False)[ "feature" ].tolist() - valid_features = self._get_valid_features(X_val, candidate_features) + valid_features = self._get_valid_features(x_val, candidate_features) if not valid_features: raise ValueError("No valid features found after validation") @@ -788,7 +815,7 @@ def select_features( continue # Biztonságos korrelációszámítás - correlations = self._calculate_safe_correlations(X_val, selected, feature) + correlations = 
self._calculate_safe_correlations(x_val, selected, feature) # Feature csoportosítás self._group_feature( @@ -810,21 +837,29 @@ def select_features( self.logger.error(f"Error in feature selection: {str(e)}") raise + def _get_valid_features(self, X: pd.DataFrame, candidate_features: list[str]) -> list[str]: + """Get valid features that exist in the DataFrame and have valid data.""" + valid_features = [] + for feature in candidate_features: + if feature in X.columns and X[feature].notna().any(): # type: ignore + valid_features.append(feature) + return valid_features + def _validate_data_quality(self, X: pd.DataFrame) -> None: """Validate data quality and handle problematic values.""" # Ellenőrizzük a NaN értékeket nan_cols = X.isna().sum() - if nan_cols.any(): + if (nan_cols > 0).any(): self.logger.warning(f"Columns with NaN values: {nan_cols[nan_cols > 0]}") # Ellenőrizzük a konstans oszlopokat constant_cols = X.std() == 0 - if constant_cols.any(): + if constant_cols.any(): # type: ignore self.logger.warning(f"Constant columns detected: {X.columns[constant_cols].tolist()}") # Ellenőrizzük a végtelen értékeket inf_cols = np.isinf(X).sum() - if inf_cols.any(): + if (inf_cols > 0).any(): self.logger.warning(f"Columns with infinite values: {inf_cols[inf_cols > 0]}") def _calculate_safe_correlations( @@ -833,17 +868,17 @@ def _calculate_safe_correlations( """Calculate correlations safely handling numerical issues.""" try: # Kezeljük a NaN és végtelen értékeket - X_clean = X.copy() - X_clean = X_clean.replace([np.inf, -np.inf], np.nan) + x_clean = X.copy() + x_clean = x_clean.replace([np.inf, -np.inf], np.nan) # Töltsük ki a hiányzó értékeket az oszlop mediánjával for col in [feature] + selected: - if X_clean[col].isna().any(): - median_val = X_clean[col].median() - X_clean[col].fillna(median_val, inplace=True) + if x_clean[col].isna().any(): # type: ignore + median_val = x_clean[col].median() + x_clean[col].fillna(median_val, inplace=True) # Számítsuk ki a korrelációkat - 
correlations = abs(X_clean[selected].corrwith(X_clean[feature])) + correlations = abs(x_clean[selected].corrwith(x_clean[feature])) # Ellenőrizzük az eredményeket if correlations.isna().any(): @@ -896,7 +931,7 @@ def _select_best_features( return final_features -def run_feature_selection(experiment_name: str = "feature_selection_optimization") -> list[str]: +def run_feature_selection(experiment_name: str = "feature_selection_optimization") -> list[str]: # type: ignore """Run the complete feature selection process with precision focus. Args: @@ -910,11 +945,19 @@ def run_feature_selection(experiment_name: str = "feature_selection_optimization try: # Load data logger.info("Loading and preparing data") - X_train, y_train, X_test, y_test = import_feature_select_draws_api() - X_val, y_val = create_evaluation_sets_draws_api(use_selected_columns=False) - X_train, X_val, X_test = align_columns(logger, X_train, X_val, X_test) + x_train_raw, y_train_raw, x_test_raw, _ = import_feature_select_draws_api() + x_val_raw, y_val_raw = create_evaluation_sets_draws_api(use_selected_columns=False) + + # Ensure correct types + X_train = pd.DataFrame(x_train_raw) + y_train = pd.Series(y_train_raw) + X_test = pd.DataFrame(x_test_raw) + x_val = pd.DataFrame(x_val_raw) + y_val = pd.Series(y_val_raw) + + X_train, x_val, X_test = align_columns(logger, X_train, x_val, X_test) logger.info( - f"Loaded data shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}" + f"Loaded data shapes - Train: {X_train.shape}, Val: {x_val.shape}, Test: {X_test.shape}" ) # Initialize both feature selectors @@ -947,20 +990,20 @@ def run_feature_selection(experiment_name: str = "feature_selection_optimization model.fit( X_train[standard_features], y_train, - eval_set=[(X_val[standard_features], y_val)], + eval_set=[(x_val[standard_features], y_val)], verbose=False, ) # Now analyze precision impact logger.info("Analyzing precision impact of features") importance_df = 
precision_selector.analyze_feature_importance( - model, standard_features, X_val[standard_features], y_val + model, standard_features, pd.DataFrame(x_val[standard_features]), y_val ) # Select final feature set logger.info("Selecting final feature set with precision focus") final_features = precision_selector.select_features( - importance_df, X_val, correlation_threshold=0.85 + importance_df, x_val, correlation_threshold=0.85 ) # Log results @@ -992,11 +1035,11 @@ def run_feature_selection(experiment_name: str = "feature_selection_optimization def evaluate_feature_set(features): model = xgb.XGBClassifier(tree_method="hist", device="cpu", random_state=42) model.fit( - X_train[features], y_train, eval_set=[(X_val[features], y_val)], verbose=False + X_train[features], y_train, eval_set=[(x_val[features], y_val)], verbose=False ) # Get predictions - val_probs = model.predict_proba(X_val[features])[:, 1] + val_probs = model.predict_proba(x_val[features])[:, 1] val_preds = (val_probs >= 0.5).astype(int) return { @@ -1010,10 +1053,10 @@ def evaluate_feature_set(features): mlflow.log_metrics( { - "standard_precision": standard_metrics["precision"], - "standard_recall": standard_metrics["recall"], - "precision_focused_precision": precision_metrics["precision"], - "precision_focused_recall": precision_metrics["recall"], + "standard_precision": float(standard_metrics["precision"]), + "standard_recall": float(standard_metrics["recall"]), + "precision_focused_precision": float(precision_metrics["precision"]), + "precision_focused_recall": float(precision_metrics["recall"]), } ) @@ -1037,7 +1080,7 @@ def evaluate_feature_set(features): def align_columns( logger: ExperimentLogger, train_df: pd.DataFrame, test_df: pd.DataFrame, eval_df: pd.DataFrame -) -> pd.DataFrame: +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Aligns the columns of the evaluation set with the training set by dropping columns that are missing in either DataFrame. 
@@ -1058,13 +1101,13 @@ def align_columns( # Drop columns that are missing in either DataFrame # Preserve the original column order from the training set - ordered_columns = [col for col in common_columns] + ordered_columns = list(common_columns) logger.info(f"Aligned columns: {ordered_columns}") - train_df = train_df[ordered_columns] - test_df = test_df[ordered_columns] - eval_df = eval_df[ordered_columns] + train_aligned = pd.DataFrame(train_df[ordered_columns]) + test_aligned = pd.DataFrame(test_df[ordered_columns]) + eval_aligned = pd.DataFrame(eval_df[ordered_columns]) - return train_df, test_df, eval_df + return train_aligned, test_aligned, eval_aligned def verify_interactions(train_df, test_df, eval_df, feature_names): diff --git a/data/new_api_training_final.parquet b/data/new_api_training_final.parquet index 4e80487..d8cfa9d 100644 Binary files a/data/new_api_training_final.parquet and b/data/new_api_training_final.parquet differ diff --git a/data/new_api_training_final.xlsx b/data/new_api_training_final.xlsx index 98e3135..9f90822 100644 Binary files a/data/new_api_training_final.xlsx and b/data/new_api_training_final.xlsx differ diff --git a/src/models/StackedEnsemble/base/kernel_based/svm_model.py b/src/models/StackedEnsemble/base/kernel_based/svm_model.py index dd21977..dfae915 100644 --- a/src/models/StackedEnsemble/base/kernel_based/svm_model.py +++ b/src/models/StackedEnsemble/base/kernel_based/svm_model.py @@ -73,7 +73,7 @@ "probability": True, # MUST be True for predict_proba "class_weight": "balanced", # Good for imbalanced classes "random_state": SEED, - 'kernel': 'rbf', # Can be fixed here or tuned + "kernel": "rbf", # Can be fixed here or tuned "verbose": False, # Set to True for more SVC logs } @@ -101,8 +101,13 @@ def load_hyperparameter_space_svm(): }, "cache_size": {"type": "int", "low": 3000, "high": 10000, "step": 50}, # 'kernel': {'type': 'categorical', 'choices': ['rbf', 'poly', 'sigmoid']}, - 'degree': {'type': 'int', 'low': 3, 
'high': 6}, # Only if kernel='poly' - 'coef0': {'type': 'float', 'low': 0.4, 'high': 1.0, 'log': True}, # Only if kernel='poly' or 'sigmoid' + "degree": {"type": "int", "low": 3, "high": 6}, # Only if kernel='poly' + "coef0": { + "type": "float", + "low": 0.4, + "high": 1.0, + "log": True, + }, # Only if kernel='poly' or 'sigmoid' "tol": {"type": "float", "low": 1e-5, "high": 1e-3, "log": True}, } return hyperparameter_space @@ -147,7 +152,9 @@ def train_model_svm(X_train_scaled, y_train, X_eval_scaled, y_eval, model_params # --- Optuna Objective --- -def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space, best_score): +def objective( + trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space, best_score +): """ Objective function for Optuna hyperparameter optimization. Uses SCALED data. @@ -169,18 +176,14 @@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara param_name, param_config["low"], param_config["high"] ) elif param_config["type"] == "categorical": - params[param_name] = trial.suggest_categorical( - param_name, param_config["choices"] - ) + params[param_name] = trial.suggest_categorical(param_name, param_config["choices"]) # --- Preprocessing --- (Scale the data) # X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data( # X_train, X_test, X_eval # ) # Train model and get metrics # Pass only the suggested params, train_model_svm combines with base_params - model, metrics = train_model_svm( - X_train_scaled, y_train, X_eval_scaled, y_eval, params - ) + model, metrics = train_model_svm(X_train_scaled, y_train, X_eval_scaled, y_eval, params) recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) @@ -284,22 +287,22 @@ def callback(study, trial): sampler=sampler, ) logger.info( - f"Starting Optuna batch {batch+1}/{num_batches} (Sampler: {type(sampler).__name__})" + f"Starting Optuna batch {batch + 1}/{num_batches} (Sampler: 
{type(sampler).__name__})" ) study.optimize( - objective_func, - n_trials=batch_size, - show_progress_bar=True, + objective_func, + n_trials=batch_size, + show_progress_bar=True, callbacks=[callback], - n_jobs=8 # Use all available CPU cores for parallel trials + n_jobs=8, # Use all available CPU cores for parallel trials ) # Update overall best score from the study instance after batch if study.best_trial and study.best_value > best_score: best_score = study.best_value best_params_from_hpo = study.best_params - logger.info(f"Batch {batch+1} completed. New overall best score: {best_score:.4f}") + logger.info(f"Batch {batch + 1} completed. New overall best score: {best_score:.4f}") logger.info("Optuna optimization finished for SVM.") if best_params_from_hpo: @@ -328,16 +331,14 @@ def log_top_trials_svm(trials_list, title="Top SVM Trials"): score_str = f"{score:.4f}" if score is not None else "N/A" prec_str = f"{precision:.4f}" if precision is not None else "N/A" rec_str = f"{recall:.4f}" if recall is not None else "N/A" - logger.info(f"| {i+1} | {number} | {score_str} | {prec_str} | {rec_str} | {params} |") + logger.info(f"| {i + 1} | {number} | {score_str} | {prec_str} | {rec_str} | {params} |") # --- MLflow Logging --- def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): """Logs parameters, metrics, scaler, and model to MLflow.""" logger.info("Logging results to MLflow for SVM...") - with mlflow.start_run( - run_name=f"svm_fixed_params_{datetime.now().strftime('%Y%m%d_%H%M')}" - ): + with mlflow.start_run(run_name=f"svm_fixed_params_{datetime.now().strftime('%Y%m%d_%H%M')}"): run_id = mlflow.active_run().info.run_id # Log combined final parameters used for the model @@ -355,7 +356,7 @@ def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): pickle.dump(fitted_scaler, f) mlflow.log_artifact(scaler_path) logger.info(f"Scaler artifact logged as {scaler_path}") - os.remove(scaler_path) # Clean up local scaler file + 
os.remove(scaler_path) # Clean up local scaler file except Exception as e: logger.error(f"Failed to save or log scaler artifact: {e}") @@ -366,7 +367,7 @@ def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): try: # Ensure dtypes are float for numeric cols num_cols = input_example.select_dtypes(include=np.number).columns - input_example[num_cols] = input_example[num_cols].astype('float64') + input_example[num_cols] = input_example[num_cols].astype("float64") logger.info("Created input_example from DataFrame for signature.") model_prediction = model.predict(input_example) @@ -375,7 +376,9 @@ def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): except Exception as sig_err: logger.error(f"Error inferring model signature: {sig_err}") - logger.error(f"Input example details:\n{input_example.head()}\n{input_example.dtypes}") + logger.error( + f"Input example details:\n{input_example.head()}\n{input_example.dtypes}" + ) # Define registered model name model_name_suffix = datetime.now().strftime("%Y%m%d_%H%M") @@ -393,6 +396,7 @@ def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): logger.info(f"Scikit-learn SVM model logged successfully as {registered_model_name}.") mlflow.end_run() + # --- Main Hypertuning Orchestration --- def hypertune_svm(X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name: str): """ @@ -422,6 +426,7 @@ def hypertune_svm(X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_n logger.error(f"Error in hypertune_svm: {str(e)}") return None, None + # --- (Optional) Precision Target Training --- def train_with_precision_target_svm( X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name: str @@ -441,9 +446,9 @@ def train_with_precision_target_svm( # Define fixed parameters (Update these based on prior tuning or best guess) fixed_params = { - 'C': 10.0, - 'gamma': 0.01, - 'tol': 0.001, + "C": 10.0, + "gamma": 0.01, + "tol": 0.001, # Add other relevant fixed 
params like kernel if not 'rbf' } model_params = base_params.copy() @@ -454,9 +459,13 @@ def train_with_precision_target_svm( logger.info("Combining scaled training and test sets...") if isinstance(X_train_scaled, pd.DataFrame) and isinstance(X_test_scaled, pd.DataFrame): X_train_combined_scaled = pd.concat([X_train_scaled, X_test_scaled], ignore_index=True) - else: # Assume numpy arrays + else: # Assume numpy arrays X_train_combined_scaled = np.vstack((X_train_scaled, X_test_scaled)) - y_train_combined = pd.concat([y_train, y_test], ignore_index=True) if isinstance(y_train, pd.Series) else np.concatenate((y_train, y_test)) + y_train_combined = ( + pd.concat([y_train, y_test], ignore_index=True) + if isinstance(y_train, pd.Series) + else np.concatenate((y_train, y_test)) + ) logger.info(f"Combined scaled training set shape: {X_train_combined_scaled.shape}") # --- Create and Train Model --- @@ -485,9 +494,7 @@ def train_with_precision_target_svm( # --- Log to MLflow --- input_example_data = X_eval_scaled[:5] - log_to_mlflow_svm( - model, metrics, fixed_params, scaler, input_example_data - ) + log_to_mlflow_svm(model, metrics, fixed_params, scaler, input_example_data) return model, metrics @@ -517,26 +524,26 @@ def train_with_precision_target_svm( features = import_selected_features_ensemble(model_type="all") if not features: logger.error("Failed to load any features. 
Exiting.") - sys.exit(1) # Or handle differently + sys.exit(1) # Or handle differently X_train = prepare_data(X_train, features) X_test = prepare_data(X_test, features) X_eval = prepare_data(X_eval, features) try: - with open('src/models/scalers/scaler_svm.pkl', 'rb') as f: + with open("src/models/scalers/scaler_svm.pkl", "rb") as f: scaler = pickle.load(f) except Exception as e: logger.error(f"Error loading SVM scaler: {str(e)}") scaler = StandardScaler() logger.info("Created new SVM scaler") scaler.fit(X_train) - with open('src/models/scalers/scaler_svm.pkl', 'wb') as f: + with open("src/models/scalers/scaler_svm.pkl", "wb") as f: pickle.dump(scaler, f) - + X_train_scaled = scaler.transform(X_train) X_test_scaled = scaler.transform(X_test) X_eval_scaled = scaler.transform(X_eval) - + # Log data shapes logger.info(f"Training data shape after selection: {X_train.shape}") logger.info(f"Testing data shape after selection: {X_test.shape}") @@ -547,7 +554,7 @@ def train_with_precision_target_svm( # --- Choose Mode: hypertune or fixed params --- # mode = "hypertune" - mode = "hypertune" # Or "hypertune" + mode = "hypertune" # Or "hypertune" best_model_params = None final_metrics = None @@ -565,7 +572,7 @@ def train_with_precision_target_svm( X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name ) if final_model: - best_model_params = final_model.get_params() # Get params from trained model + best_model_params = final_model.get_params() # Get params from trained model else: logger.error(f"Invalid mode selected: {mode}") @@ -581,4 +588,4 @@ def train_with_precision_target_svm( logger.error(f"Error in main execution: {str(e)}") finally: gc.collect() # Force garbage collection - logger.info("--- SVM Main execution finished. ---") \ No newline at end of file + logger.info("--- SVM Main execution finished. 
---") diff --git a/src/models/StackedEnsemble/base/kernel_based/svm_model_25.py b/src/models/StackedEnsemble/base/kernel_based/svm_model_25.py index 8fc03fa..3ccacfa 100644 --- a/src/models/StackedEnsemble/base/kernel_based/svm_model_25.py +++ b/src/models/StackedEnsemble/base/kernel_based/svm_model_25.py @@ -13,7 +13,6 @@ import os import pickle # For saving the scaler import random -import sys import time from datetime import datetime @@ -72,7 +71,7 @@ base_params = { "probability": True, # MUST be True for predict_proba "random_state": SEED, - 'kernel': 'rbf', # Using RBF kernel + "kernel": "rbf", # Using RBF kernel "verbose": False, # Set to True for more SVC logs } @@ -83,14 +82,14 @@ def load_hyperparameter_space_svm(): Defines the hyperparameter search space for SVC tuning using Optuna. Optimized based on top-performing trial analysis - narrowed ranges to focus on regions that produced the best precision scores (0.32-0.34). - + Key optimizations: - gamma: Narrowed from [0.001, 1.0] to [0.001, 0.03] (top trials: 0.001-0.024) - C: Slightly narrowed from [0.001, 1.0] to [0.005, 0.95] (top trials: 0.01-0.89) - class_weight_positive: Narrowed from [1.8, 4.0] to [2.0, 3.5] (top trials: 2.1-3.2) - degree: Start from 2 instead of 1 (no top trials had degree=1) - coef0: Slightly narrowed to focus on middle range [0.1, 0.9] - + Returns: dict: Optimized hyperparameter space configuration. 
""" @@ -102,30 +101,40 @@ def load_hyperparameter_space_svm(): "log": True, }, "gamma": { # RBF kernel parameter - SIGNIFICANTLY NARROWED - "type": "float", - "low": 0.001, # Keep lower bound (min top trial: 0.00102) - "high": 0.03, # MAJOR reduction from 1.0 (max top trial: 0.0239) - "log": True + "type": "float", + "low": 0.001, # Keep lower bound (min top trial: 0.00102) + "high": 0.03, # MAJOR reduction from 1.0 (max top trial: 0.0239) + "log": True, }, "class_weight_positive": { # Narrowed to focus on effective range "type": "float", - "low": 2.0, # Narrowed from 1.8 (min top trial: 2.136) - "high": 3.5, # Narrowed from 4.0 (max top trial: 3.221) - "step": 0.05 + "low": 2.0, # Narrowed from 1.8 (min top trial: 2.136) + "high": 3.5, # Narrowed from 4.0 (max top trial: 3.221) + "step": 0.05, }, - "cache_size": {"type": "int", "low": 4000, "high": 14000, "step": 100}, # Keep current - good coverage - "tol": {"type": "float", "low": 1e-7, "high": 1e-5, "log": True}, # Keep current - good coverage + "cache_size": { + "type": "int", + "low": 4000, + "high": 14000, + "step": 100, + }, # Keep current - good coverage + "tol": { + "type": "float", + "low": 1e-7, + "high": 1e-5, + "log": True, + }, # Keep current - good coverage "coef0": { # Slightly narrowed to focus on more effective middle range - "type": "float", - "low": 0.1, # Narrowed from 0.0 (min top trial: 0.130) - "high": 0.9, # Narrowed from 1.0 (max top trial: 0.873) - "step": 0.05 + "type": "float", + "low": 0.1, # Narrowed from 0.0 (min top trial: 0.130) + "high": 0.9, # Narrowed from 1.0 (max top trial: 0.873) + "step": 0.05, }, "degree": { # Start from 2 since no top trials used degree=1 - "type": "int", - "low": 2, # Changed from 1 (min top trial: 2) - "high": 10, # Keep current (max top trial: 10) - "step": 1 + "type": "int", + "low": 2, # Changed from 1 (min top trial: 2) + "high": 10, # Keep current (max top trial: 10) + "step": 1, }, } return hyperparameter_space @@ -147,15 +156,17 @@ def 
train_model_svm(X_train_scaled, y_train, X_eval_scaled, y_eval, model_params try: # Combine base and suggested params full_params = base_params.copy() - + # Special handling for class_weight from Optuna - if 'class_weight_positive' in model_params: - positive_weight = model_params.pop('class_weight_positive') # Remove from model_params before update - full_params['class_weight'] = {0: 1.0, 1: positive_weight} - elif 'class_weight' not in full_params: # Fallback if not tuned and not in base_params - full_params['class_weight'] = 'balanced' - - full_params.update(model_params) # Add other tuned params like C, tol, cache_size + if "class_weight_positive" in model_params: + positive_weight = model_params.pop( + "class_weight_positive" + ) # Remove from model_params before update + full_params["class_weight"] = {0: 1.0, 1: positive_weight} + elif "class_weight" not in full_params: # Fallback if not tuned and not in base_params + full_params["class_weight"] = "balanced" + + full_params.update(model_params) # Add other tuned params like C, tol, cache_size logger.info(f"Training SVC with parameters: {full_params}") model = SVC(**full_params) @@ -178,7 +189,9 @@ def train_model_svm(X_train_scaled, y_train, X_eval_scaled, y_eval, model_params # --- Optuna Objective --- -def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space, best_score): +def objective( + trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space, best_score +): """ Objective function for Optuna hyperparameter optimization. Uses SCALED data. 
@@ -200,14 +213,10 @@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara param_name, param_config["low"], param_config["high"] ) elif param_config["type"] == "categorical": - params[param_name] = trial.suggest_categorical( - param_name, param_config["choices"] - ) + params[param_name] = trial.suggest_categorical(param_name, param_config["choices"]) # --- Preprocessing --- (Scale the data) # Train model and get metrics - model, metrics = train_model_svm( - X_train_scaled, y_train, X_eval_scaled, y_eval, params - ) + model, metrics = train_model_svm(X_train_scaled, y_train, X_eval_scaled, y_eval, params) recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) @@ -326,22 +335,22 @@ def callback(study, trial): sampler=sampler, ) logger.info( - f"Starting Optuna batch {batch+1}/{num_batches} (Sampler: {type(sampler).__name__})" + f"Starting Optuna batch {batch + 1}/{num_batches} (Sampler: {type(sampler).__name__})" ) study.optimize( - objective_func, - n_trials=batch_size, - show_progress_bar=True, + objective_func, + n_trials=batch_size, + show_progress_bar=True, callbacks=[callback], - n_jobs=12 # Reduced from 8 - with narrowed hyperparameter space, fewer parallel jobs can be more efficient + n_jobs=12, # Reduced from 8 - with narrowed hyperparameter space, fewer parallel jobs can be more efficient ) # Update overall best score from the study instance after batch if study.best_trial and study.best_value > best_score: best_score = study.best_value best_params_from_hpo = study.best_params - logger.info(f"Batch {batch+1} completed. New overall best score: {best_score:.4f}") + logger.info(f"Batch {batch + 1} completed. 
New overall best score: {best_score:.4f}") logger.info("Optuna optimization finished for SVM.") if best_params_from_hpo: @@ -370,16 +379,14 @@ def log_top_trials_svm(trials_list, title="Top SVM Trials"): score_str = f"{score:.4f}" if score is not None else "N/A" prec_str = f"{precision:.4f}" if precision is not None else "N/A" rec_str = f"{recall:.4f}" if recall is not None else "N/A" - logger.info(f"| {i+1} | {number} | {score_str} | {prec_str} | {rec_str} | {params} |") + logger.info(f"| {i + 1} | {number} | {score_str} | {prec_str} | {rec_str} | {params} |") # --- MLflow Logging --- def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): """Logs parameters, metrics, scaler, and model to MLflow.""" logger.info("Logging results to MLflow for SVM...") - with mlflow.start_run( - run_name=f"svm_fixed_params_{datetime.now().strftime('%Y%m%d_%H%M')}" - ): + with mlflow.start_run(run_name=f"svm_fixed_params_{datetime.now().strftime('%Y%m%d_%H%M')}"): run_id = mlflow.active_run().info.run_id # Log combined final parameters used for the model @@ -397,7 +404,7 @@ def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): pickle.dump(fitted_scaler, f) mlflow.log_artifact(scaler_path) logger.info(f"Scaler artifact logged as {scaler_path}") - os.remove(scaler_path) # Clean up local scaler file + os.remove(scaler_path) # Clean up local scaler file except Exception as e: logger.error(f"Failed to save or log scaler artifact: {e}") @@ -408,7 +415,7 @@ def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): try: # Ensure dtypes are float for numeric cols num_cols = input_example.select_dtypes(include=np.number).columns - input_example[num_cols] = input_example[num_cols].astype('float64') + input_example[num_cols] = input_example[num_cols].astype("float64") logger.info("Created input_example from DataFrame for signature.") model_prediction = model.predict(input_example) @@ -417,7 +424,9 @@ def log_to_mlflow_svm(model, 
metrics, params, fitted_scaler, input_example): except Exception as sig_err: logger.error(f"Error inferring model signature: {sig_err}") - logger.error(f"Input example details:\n{input_example.head()}\n{input_example.dtypes}") + logger.error( + f"Input example details:\n{input_example.head()}\n{input_example.dtypes}" + ) # Define registered model name model_name_suffix = datetime.now().strftime("%Y%m%d_%H%M") @@ -435,6 +444,7 @@ def log_to_mlflow_svm(model, metrics, params, fitted_scaler, input_example): logger.info(f"Scikit-learn SVM model logged successfully as {registered_model_name}.") mlflow.end_run() + # --- Main Hypertuning Orchestration --- def hypertune_svm(X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name: str): """ @@ -464,6 +474,7 @@ def hypertune_svm(X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_n logger.error(f"Error in hypertune_svm: {str(e)}") return None, None + # --- (Optional) Precision Target Training --- def train_with_precision_target_svm( X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name: str @@ -483,17 +494,17 @@ def train_with_precision_target_svm( # Define fixed parameters (Update these based on prior tuning or best guess) fixed_params = { - 'C': 0.6175310980516687, - 'cache_size': 5466, - 'class_weight': 'balanced', - 'coef0': 0.4568065814626598, - 'degree': 6, - 'gamma': 0.00020587086243364596, - 'kernel': 'rbf', - 'probability': True, - 'random_state': 19, - 'tol': 9.218236481865033e-05, - 'verbose': False + "C": 0.6175310980516687, + "cache_size": 5466, + "class_weight": "balanced", + "coef0": 0.4568065814626598, + "degree": 6, + "gamma": 0.00020587086243364596, + "kernel": "rbf", + "probability": True, + "random_state": 19, + "tol": 9.218236481865033e-05, + "verbose": False, } model_params = base_params.copy() model_params.update(fixed_params) @@ -503,9 +514,13 @@ def train_with_precision_target_svm( logger.info("Combining scaled training and test sets...") if isinstance(X_train_scaled, 
pd.DataFrame) and isinstance(X_test_scaled, pd.DataFrame): X_train_combined_scaled = pd.concat([X_train_scaled, X_test_scaled], ignore_index=True) - else: # Assume numpy arrays + else: # Assume numpy arrays X_train_combined_scaled = np.vstack((X_train_scaled, X_test_scaled)) - y_train_combined = pd.concat([y_train, y_test], ignore_index=True) if isinstance(y_train, pd.Series) else np.concatenate((y_train, y_test)) + y_train_combined = ( + pd.concat([y_train, y_test], ignore_index=True) + if isinstance(y_train, pd.Series) + else np.concatenate((y_train, y_test)) + ) logger.info(f"Combined scaled training set shape: {X_train_combined_scaled.shape}") # --- Create and Train Model --- @@ -520,11 +535,19 @@ def train_with_precision_target_svm( } ) - model, metrics = train_model_svm(X_train_combined_scaled, y_train_combined, X_eval_scaled, y_eval, model_params) + model, metrics = train_model_svm( + X_train_combined_scaled, y_train_combined, X_eval_scaled, y_eval, model_params + ) best_threshold = metrics["threshold"] logger.info(f"Final evaluation metrics (SVM Fixed Params): {metrics}") compute_permutation_importance( - model, X_eval, X_eval_scaled, y_eval, threshold=best_threshold, n_repeats=3, number_of_features=100 + model, + X_eval, + X_eval_scaled, + y_eval, + threshold=best_threshold, + n_repeats=3, + number_of_features=100, ) return model, metrics @@ -534,9 +557,10 @@ def train_with_precision_target_svm( mlflow.end_run("FAILED") return None, None + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, X_val_scaled: np.ndarray, y_val: np.ndarray, threshold: float = 0.3, @@ -557,35 +581,37 @@ def compute_permutation_importance( DataFrame with columns: ['feature', 'importance'] (mean drop in metric), sorted descending. 
""" feature_names = X_val.columns.tolist() - y_val_np = y_val.values if hasattr(y_val, 'values') else y_val - + y_val_np = y_val.values if hasattr(y_val, "values") else y_val + # Compute baseline metric probs = model.predict_proba(X_val_scaled)[:, 1] preds = (probs >= threshold).astype(int) - + # Calculate baseline precision baseline = np.sum((y_val_np == 1) & (preds == 1)) / (np.sum(preds == 1)) logger.info(f"Baseline metric: {baseline:.4f}") - + importances = [] for idx, feat in enumerate(feature_names): drops = [] for i in range(n_repeats): - logger.info(f"Shuffling feature: {feat} - Repeat: {i+1}") + logger.info(f"Shuffling feature: {feat} - Repeat: {i + 1}") X_shuffled = X_val_scaled.copy() X_shuffled[:, idx] = np.random.permutation(X_shuffled[:, idx]) probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) - + # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) - + mean_drop = np.mean(drops) importances.append((feat, mean_drop)) logger.debug(f"Feature: {feat}, Mean drop: {mean_drop:.4f}") - + # Sort by importance descending importances.sort(key=lambda x: x[1], reverse=True) df_importance = pd.DataFrame(importances, columns=["feature", "importance"]) @@ -593,6 +619,7 @@ def compute_permutation_importance( logger.info(df_importance.head(number_of_features).to_string(index=False)) return df_importance + # --- Main Execution Block --- if __name__ == "__main__": try: @@ -605,12 +632,11 @@ def compute_permutation_importance( features = import_selected_features_ensemble_new(model_type="svm") logger.info(f"Using 'svm' specific feature set with {len(features)} features.") - X_train = prepare_data(X_train, features) X_test = prepare_data(X_test, features) X_eval 
= prepare_data(X_eval, features) try: - with open('src/models/scalers/scaler_svm.pkl', 'rb') as f: + with open("src/models/scalers/scaler_svm.pkl", "rb") as f: scaler = pickle.load(f) X_train_scaled = scaler.transform(X_train) X_test_scaled = scaler.transform(X_test) @@ -620,14 +646,12 @@ def compute_permutation_importance( scaler = StandardScaler() logger.info("Created new SVM scaler") scaler.fit(X_train) - with open('src/models/scalers/scaler_svm.pkl', 'wb') as f: + with open("src/models/scalers/scaler_svm.pkl", "wb") as f: pickle.dump(scaler, f) X_train_scaled = scaler.transform(X_train) X_test_scaled = scaler.transform(X_test) X_eval_scaled = scaler.transform(X_eval) - - - + # Log data shapes logger.info(f"Training data shape after selection: {X_train.shape}") logger.info(f"Testing data shape after selection: {X_test.shape}") @@ -637,7 +661,7 @@ def compute_permutation_importance( ) # --- Choose Mode: hypertune or fixed params --- - mode = "hypertune" + mode = "hypertune" best_model_params = None final_metrics = None @@ -655,7 +679,7 @@ def compute_permutation_importance( X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name ) if final_model: - best_model_params = final_model.get_params() # Get params from trained model + best_model_params = final_model.get_params() # Get params from trained model else: logger.error(f"Invalid mode selected: {mode}") @@ -663,4 +687,4 @@ def compute_permutation_importance( logger.error(f"Error in main execution: {str(e)}") finally: gc.collect() # Force garbage collection - logger.info("--- SVM Main execution finished. ---") \ No newline at end of file + logger.info("--- SVM Main execution finished. 
---") diff --git a/src/models/StackedEnsemble/base/neural/mlp_model.py b/src/models/StackedEnsemble/base/neural/mlp_model.py index 5255d9c..9b78b44 100644 --- a/src/models/StackedEnsemble/base/neural/mlp_model.py +++ b/src/models/StackedEnsemble/base/neural/mlp_model.py @@ -18,11 +18,12 @@ os.environ["TF_INTRA_OP_PARALLELISM_THREADS"] = NUM_THREADS os.environ["TF_INTER_OP_PARALLELISM_THREADS"] = NUM_THREADS os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Force CPU usage -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # Reduce TensorFlow logging verbosity +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # Reduce TensorFlow logging verbosity # Optional: Set process priority to high (Windows only) try: import psutil + p = psutil.Process(os.getpid()) p.nice(psutil.HIGH_PRIORITY_CLASS) except Exception: @@ -57,37 +58,38 @@ random.seed(random_seed) np.random.seed(random_seed) tf.random.set_seed(random_seed) -os.environ['PYTHONHASHSEED'] = str(random_seed) +os.environ["PYTHONHASHSEED"] = str(random_seed) # Configure Git executable path if available git_executable = os.environ.get("GIT_PYTHON_GIT_EXECUTABLE") if git_executable and os.path.exists(git_executable): import git + git.refresh(git_executable) mlflow_tracking = setup_mlflow_tracking(experiment_name) # Global settings -min_recall = 0.30 # Minimum acceptable recall -n_trials = 10000 # Fewer trials for MLP due to longer training times +min_recall = 0.30 # Minimum acceptable recall +n_trials = 10000 # Fewer trials for MLP due to longer training times pip_requirements = [ f"tensorflow=={tf.__version__}", - "scikit-learn", - f"mlflow=={mlflow.__version__}" + "scikit-learn", + f"mlflow=={mlflow.__version__}", ] scaler = None # Global scaler object # Define base configurations -base_params = { - 'verbose': 0, - 'metrics': ['accuracy', 'AUC'] -} +base_params = {"verbose": 0, "metrics": ["accuracy", "AUC"]} + + # Define the Wrapper Class class KerasMLPWrapper(BaseEstimator, ClassifierMixin): """ A wrapper for a fitted Keras Sequential model 
to provide a scikit-learn compatible predict_proba method. """ + def __init__(self, model): # Check if the model is a fitted Keras model # Keras models might not have model.built immediately after loading, check for weights @@ -131,89 +133,40 @@ def predict_proba(self, X): # Add necessary methods for sklearn compatibility if needed further def get_params(self, deep=True): - return {'model': self.model} + return {"model": self.model} def set_params(self, **params): - if 'model' in params: - self.model = params['model'] + if "model" in params: + self.model = params["model"] return self + def load_hyperparameter_space(): """ Define hyperparameter space for MLP tuning. - + Returns: dict: Hyperparameter space configuration. """ hyperparameter_space = { - 'learning_rate': { - 'type': 'float', - 'low': 1e-5, - 'high': 5e-2, - 'log': True - }, - 'hidden_layers': { - 'type': 'int', - 'low': 1, - 'high': 6 - }, - 'neurons_per_layer': { - 'type': 'int', - 'low': 32, - 'high': 1024, - 'step': 16 - }, - 'dropout_rate': { - 'type': 'float', - 'low': 0.001, - 'high': 0.7, - 'step': 0.001 - }, - 'activation': { - 'type': 'categorical', - 'choices': ['relu', 'elu', 'tanh'] - }, - 'l1_regularization': { - 'type': 'float', - 'low': 0.00001, - 'high': 1e-2, - 'log': True - }, - 'l2_regularization': { - 'type': 'float', - 'low': 0.00001, - 'high': 1e-2, - 'log': True - }, - 'batch_size': { - 'type': 'int', - 'low': 64, - 'high': 2048, - 'step': 16 - }, - 'epochs': { - 'type': 'int', - 'low': 50, - 'high': 250, - 'step': 5 - }, - 'patience': { - 'type': 'int', - 'low': 5, - 'high': 50 - }, - 'class_weight_multiplier': { - 'type': 'float', - 'low': 0.5, - 'high': 3.0, - 'step': 0.05 - } + "learning_rate": {"type": "float", "low": 1e-5, "high": 5e-2, "log": True}, + "hidden_layers": {"type": "int", "low": 1, "high": 6}, + "neurons_per_layer": {"type": "int", "low": 32, "high": 1024, "step": 16}, + "dropout_rate": {"type": "float", "low": 0.001, "high": 0.7, "step": 0.001}, + "activation": 
{"type": "categorical", "choices": ["relu", "elu", "tanh"]}, + "l1_regularization": {"type": "float", "low": 0.00001, "high": 1e-2, "log": True}, + "l2_regularization": {"type": "float", "low": 0.00001, "high": 1e-2, "log": True}, + "batch_size": {"type": "int", "low": 64, "high": 2048, "step": 16}, + "epochs": {"type": "int", "low": 50, "high": 250, "step": 5}, + "patience": {"type": "int", "low": 5, "high": 50}, + "class_weight_multiplier": {"type": "float", "low": 0.5, "high": 3.0, "step": 0.05}, } return hyperparameter_space + def preprocess_data(X_train, X_test, X_eval=None): try: - with open('src/models/scalers/scaler_mlp.pkl', 'rb') as f: + with open("src/models/scalers/scaler_mlp.pkl", "rb") as f: scaler = pickle.load(f) logger.info("Loaded existing MLP scaler") X_train_scaled = scaler.transform(X_train) @@ -226,94 +179,99 @@ def preprocess_data(X_train, X_test, X_eval=None): X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) X_eval_scaled = scaler.transform(X_eval) - with open('src/models/scalers/scaler_mlp.pkl', 'wb') as f: + with open("src/models/scalers/scaler_mlp.pkl", "wb") as f: pickle.dump(scaler, f) - + return X_train_scaled, X_test_scaled, X_eval_scaled, scaler + def create_model(model_params): """ Create and compile a Keras MLP model based on provided hyperparameters. - + Args: model_params (dict): Hyperparameters for model configuration. - + Returns: keras.Model: Compiled MLP model. 
""" try: - input_dim = model_params.pop('input_dim') - hidden_layers = model_params.pop('hidden_layers', 2) - neurons_per_layer = model_params.pop('neurons_per_layer', 128) - dropout_rate = model_params.pop('dropout_rate', 0.2) - activation = model_params.pop('activation', 'relu') - l1_reg = model_params.pop('l1_regularization', 0.0) - l2_reg = model_params.pop('l2_regularization', 0.0) - learning_rate = model_params.pop('learning_rate', 0.001) - + input_dim = model_params.pop("input_dim") + hidden_layers = model_params.pop("hidden_layers", 2) + neurons_per_layer = model_params.pop("neurons_per_layer", 128) + dropout_rate = model_params.pop("dropout_rate", 0.2) + activation = model_params.pop("activation", "relu") + l1_reg = model_params.pop("l1_regularization", 0.0) + l2_reg = model_params.pop("l2_regularization", 0.0) + learning_rate = model_params.pop("learning_rate", 0.001) + model = keras.Sequential() model.add(layers.InputLayer(shape=(input_dim,))) - + # Add hidden layers for _ in range(hidden_layers): - model.add(layers.Dense( - neurons_per_layer, - activation=activation, - kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg) - )) + model.add( + layers.Dense( + neurons_per_layer, + activation=activation, + kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg), + ) + ) model.add(layers.BatchNormalization()) model.add(layers.Dropout(dropout_rate)) - + # Output layer for binary classification - model.add(layers.Dense(1, activation='sigmoid')) + model.add(layers.Dense(1, activation="sigmoid")) optimizer = keras.optimizers.Adam(learning_rate=learning_rate) model.compile( optimizer=optimizer, - loss='binary_crossentropy', - metrics=['accuracy', keras.metrics.AUC(name='auc')], - jit_compile=True # Enable XLA compilation for faster CPU execution + loss="binary_crossentropy", + metrics=["accuracy", keras.metrics.AUC(name="auc")], + jit_compile=True, # Enable XLA compilation for faster CPU execution ) return model except Exception as e: 
logger.error(f"Error creating MLP model: {str(e)}") raise + def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): """ Train the MLP model, wrap it, optimize threshold, return raw Keras model and metrics. """ try: # Set the input dimension based on training data - model_params['input_dim'] = X_train.shape[1] + model_params["input_dim"] = X_train.shape[1] keras_model = create_model(model_params.copy()) # Compute class weights neg_count = np.sum(y_train == 0) pos_count = np.sum(y_train == 1) class_weight = {0: 1.0, 1: (neg_count / pos_count) if pos_count > 0 else 1.0} - class_weight[1] *= model_params.get('class_weight_multiplier', 1.0) + class_weight[1] *= model_params.get("class_weight_multiplier", 1.0) # Early stopping callback early_stop = callbacks.EarlyStopping( - monitor='val_auc', # Monitor validation AUC - mode='max', - patience=model_params.get('patience', 20), + monitor="val_auc", # Monitor validation AUC + mode="max", + patience=model_params.get("patience", 20), restore_best_weights=True, - verbose=0 + verbose=0, ) logger.info("Starting Keras model fitting...") # For CPU training, moderate batch sizes work better - batch_size = model_params.get('batch_size', 32) - + batch_size = model_params.get("batch_size", 32) + keras_model.fit( - X_train, y_train, + X_train, + y_train, validation_data=(X_test, y_test), - epochs=model_params.get('epochs', 100), + epochs=model_params.get("epochs", 100), batch_size=batch_size, class_weight=class_weight, callbacks=[early_stop], - verbose=0 + verbose=0, ) logger.info("Keras model fitting finished.") @@ -322,7 +280,9 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): wrapped_model = KerasMLPWrapper(keras_model) # Optimize threshold using the wrapped model - best_threshold, threshold_metrics = optimize_threshold(wrapped_model, X_eval, y_eval, min_recall) + best_threshold, threshold_metrics = optimize_threshold( + wrapped_model, X_eval, y_eval, min_recall + ) # Return 
the original Keras model and the combined metrics return keras_model, threshold_metrics @@ -330,7 +290,10 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): logger.error(f"Error training MLP model: {str(e)}") raise -def optimize_hyperparameters(X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space): + +def optimize_hyperparameters( + X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space +): """ Run hyperparameter optimization using Optuna. """ @@ -346,26 +309,33 @@ def objective(trial): try: params = {} for param_name, param_config in hyperparameter_space.items(): - if param_config['type'] == 'float': + if param_config["type"] == "float": params[param_name] = trial.suggest_float( - param_name, param_config['low'], param_config['high'], - log=param_config.get('log', False), step=param_config.get('step') + param_name, + param_config["low"], + param_config["high"], + log=param_config.get("log", False), + step=param_config.get("step"), ) - elif param_config['type'] == 'int': + elif param_config["type"] == "int": params[param_name] = trial.suggest_int( - param_name, param_config['low'], param_config['high'] + param_name, param_config["low"], param_config["high"] ) - elif param_config['type'] == 'categorical': + elif param_config["type"] == "categorical": params[param_name] = trial.suggest_categorical( - param_name, param_config['choices'] + param_name, param_config["choices"] ) # Train model - train_model now returns raw Keras model and metrics dict - model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params.copy()) + model, metrics = train_model( + X_train, y_train, X_test, y_test, X_eval, y_eval, params.copy() + ) - precision = metrics.get('precision', 0.0) - recall = metrics.get('recall', 0.0) - score = precision if recall >= min_recall else 0.0 # Optimize for precision - logger.info(f" Trial {trial.number}: Score={score:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, 
AUC={metrics.get('auc', 0.0):.4f}") + precision = metrics.get("precision", 0.0) + recall = metrics.get("recall", 0.0) + score = precision if recall >= min_recall else 0.0 # Optimize for precision + logger.info( + f" Trial {trial.number}: Score={score:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, AUC={metrics.get('auc', 0.0):.4f}" + ) for metric_name, metric_value in metrics.items(): # Ensure serializable for Optuna @@ -375,7 +345,7 @@ def objective(trial): trial.set_user_attr(metric_name, metric_value.item()) else: trial.set_user_attr(metric_name, str(metric_value)) - + if score > 0.35 and score > best_score: logger.info(f"Trial {trial.number} completed with score {score:.4f}") log_to_mlflow(model, metrics, params, experiment_name, scaler) @@ -426,7 +396,7 @@ def callback(study, trial): sampler = optuna.samplers.RandomSampler(seed=random_seed) study = optuna.create_study( study_name=study_name, - direction='maximize', + direction="maximize", storage=storage_url, load_if_exists=True, sampler=sampler, @@ -443,58 +413,81 @@ def callback(study, trial): best_params.update(base_params) return best_params + def hypertune_mlp(experiment_name): """ Run hyperparameter tuning and final training for the MLP model with MLflow tracking. 
""" try: - X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data(X_train, X_test, X_eval) + X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data( + X_train, X_test, X_eval + ) hyperparameter_space = load_hyperparameter_space() - best_params = optimize_hyperparameters(X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, hyperparameter_space) + best_params = optimize_hyperparameters( + X_train_scaled, + y_train, + X_test_scaled, + y_test, + X_eval_scaled, + y_eval, + hyperparameter_space, + ) logger.info("Training final MLP model with best hyperparameters") - model, metrics = train_model(X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, best_params.copy()) + model, metrics = train_model( + X_train_scaled, + y_train, + X_test_scaled, + y_test, + X_eval_scaled, + y_eval, + best_params.copy(), + ) log_to_mlflow(model, metrics, best_params, experiment_name, scaler) return best_params, metrics except Exception as e: logger.error(f"Error during hypertuning: {str(e)}") return None, None + def log_to_mlflow(model, metrics, params, experiment_name, scaler): """ Log the final MLP model, its metrics, and parameters to MLflow. - + Returns: str: Run ID. 
""" global X_eval try: mlflow.set_experiment(experiment_name) - with mlflow.start_run(run_name=f"mlp_final_{datetime.now().strftime('%Y%m%d_%H%M')}") as run: + with mlflow.start_run( + run_name=f"mlp_final_{datetime.now().strftime('%Y%m%d_%H%M')}" + ) as run: for param_name, param_value in params.items(): mlflow.log_param(param_name, param_value) for metric_name, metric_value in metrics.items(): mlflow.log_metric(metric_name, metric_value) - + # Wrap the fitted Keras model wrapped_model = KerasMLPWrapper(model) - scaler_path = 'src/models/scalers/scaler_mlp.pkl' + scaler_path = "src/models/scalers/scaler_mlp.pkl" mlflow.log_artifact(scaler_path, artifact_path="scaler") # Create input example input_example = X_eval.iloc[:5].copy() # Identify and convert integer columns to float64 to prevent schema enforcement errors - if hasattr(input_example, 'dtypes'): + if hasattr(input_example, "dtypes"): for col in input_example.columns: - if input_example[col].dtype.kind == 'i': - logger.info(f"Converting integer column '{col}' to float64 to handle potential missing values") - input_example[col] = input_example[col].astype('float64') - + if input_example[col].dtype.kind == "i": + logger.info( + f"Converting integer column '{col}' to float64 to handle potential missing values" + ) + input_example[col] = input_example[col].astype("float64") + signature = None if input_example is not None: try: # Use wrapped model for predict_proba signature signature = mlflow.models.infer_signature( - input_example, - wrapped_model.predict_proba(input_example) + input_example, wrapped_model.predict_proba(input_example) ) except Exception as sig_err: logger.error(f"Failed to infer signature: {sig_err}") @@ -505,7 +498,7 @@ def log_to_mlflow(model, metrics, params, experiment_name, scaler): artifact_path="model", pip_requirements=pip_requirements, registered_model_name=f"mlp_{datetime.now().strftime('%Y%m%d_%H%M')}", - signature=signature + signature=signature, ) run_id = run.info.run_id 
logger.info(f"MLflow run ID: {run_id}") @@ -515,6 +508,7 @@ def log_to_mlflow(model, metrics, params, experiment_name, scaler): logger.error(f"Error logging to MLflow: {str(e)}") return None + def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval): """ Train MLP model with focus on precision target. @@ -532,23 +526,29 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval logger.info("Training model with precision target") params = base_params.copy() # Inherits base MLP parameters # Specific parameters for this training run with advanced scheduling - params.update({ - "learning_rate": 2.291034155900042e-05, - "hidden_layers": 1, - "neurons_per_layer": 778, - "dropout_rate": 0.69, - "activation": "elu", - "l1_regularization": 1.2245823592413548e-06, - "l2_regularization": 7.52365321620833e-05, - "batch_size": 3621, - "epochs": 112, - "patience": 30, - "class_weight_multiplier": 2.42, - }) - X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data(X_train, X_test, X_eval) + params.update( + { + "learning_rate": 2.291034155900042e-05, + "hidden_layers": 1, + "neurons_per_layer": 778, + "dropout_rate": 0.69, + "activation": "elu", + "l1_regularization": 1.2245823592413548e-06, + "l2_regularization": 7.52365321620833e-05, + "batch_size": 3621, + "epochs": 112, + "patience": 30, + "class_weight_multiplier": 2.42, + } + ) + X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data( + X_train, X_test, X_eval + ) # Train final model with best parameters logger.info("Training final model with best parameters") - model, metrics = train_model(X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, params) + model, metrics = train_model( + X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, params + ) # Log to MLflow # log_to_mlflow(model, metrics, params, experiment_name, scaler) top_features = compute_permutation_importance(model, X_eval, y_eval) @@ -558,6 
+558,7 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval logger.error(f"Error during MLflow artifact logging: {str(e)}") return mlflow.active_run().info.run_id if mlflow.active_run() else None + def select_top_features_mlp(model: keras.Sequential, X: pd.DataFrame, n_features=60): """ Selects the top N features based on MLP feature importances. @@ -571,16 +572,17 @@ def select_top_features_mlp(model: keras.Sequential, X: pd.DataFrame, n_features A list of the names of the top N features. """ # get weight matrix of first Dense layer: shape (n_inputs, n_neurons) - W = model.layers[0].get_weights()[0] + W = model.layers[0].get_weights()[0] # sum abs(weights) across neurons → one score per input feature scores = np.abs(W).sum(axis=1) - df = pd.DataFrame({'Feature': X.columns, 'Importance': scores}) - df = df.sort_values('Importance', ascending=False) - return df['Feature'].head(n_features).tolist() + df = pd.DataFrame({"Feature": X.columns, "Importance": scores}) + df = df.sort_values("Importance", ascending=False) + return df["Feature"].head(n_features).tolist() + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, y_val: np.ndarray, threshold: float = 0.3, n_repeats: int = 10, @@ -606,60 +608,60 @@ def compute_permutation_importance( else: # If X_val is already numpy array, we need feature names from somewhere else raise ValueError("X_val must be a pandas DataFrame to extract feature names") - - y_val_np = y_val.values if hasattr(y_val, 'values') else y_val + + y_val_np = y_val.values if hasattr(y_val, "values") else y_val wrapped_model = KerasMLPWrapper(model) # Convert to numpy and ensure correct shape if y_val_np.ndim == 2 and y_val_np.shape[1] == 1: y_val_np = y_val_np.ravel() - + # Compute baseline metric probs = wrapped_model.predict_proba(X_val)[:, 1] preds = (probs >= threshold).astype(int) - + # Calculate baseline precision baseline = np.sum((y_val_np == 1) & (preds == 1)) / (np.sum(preds 
== 1)) logger.info(f"Baseline precision: {baseline:.4f}") - + importances = [] - for feat_idx, feat in enumerate(feature_names): + for _feat_idx, feat in enumerate(feature_names): drops = [] for i in range(n_repeats): - logger.info(f"Shuffling feature: {feat} - Repeat: {i+1}") + logger.info(f"Shuffling feature: {feat} - Repeat: {i + 1}") X_shuffled = X_val_scaled.copy() # Use column name since X_val_scaled is a DataFrame X_shuffled[feat] = np.random.permutation(X_shuffled[feat]) - + # Get predictions with shuffled feature probs_shuffled = wrapped_model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) - + # Calculate precision with shuffled feature - handle division by zero true_positives = np.sum((y_val_np == 1) & (preds_shuffled == 1)) predicted_positives = np.sum(preds_shuffled == 1) - + if predicted_positives > 0: precision = true_positives / predicted_positives else: precision = 0.0 # No positive predictions means precision is 0 - + drop = baseline - precision drops.append(drop) - + mean_drop = np.mean(drops) importances.append((feat, mean_drop)) logger.debug(f"Feature: {feat}, Mean importance drop: {mean_drop:.4f}") - + # Sort by importance descending importances.sort(key=lambda x: x[1], reverse=True) df_importance = pd.DataFrame(importances, columns=["feature", "importance"]) - + # Log top features logger.info("Top features by permutation importance:") logger.info(df_importance.head(number_of_features).to_string(index=False)) - + return df_importance - + except Exception as e: logger.error(f"Error computing permutation importance: {str(e)}") raise @@ -693,5 +695,6 @@ def main(): except Exception as e: logger.error(f"Error in main execution: {str(e)}") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/models/StackedEnsemble/base/neural/mlp_model_25.py b/src/models/StackedEnsemble/base/neural/mlp_model_25.py index 8d498d5..47f071f 100644 --- 
a/src/models/StackedEnsemble/base/neural/mlp_model_25.py +++ b/src/models/StackedEnsemble/base/neural/mlp_model_25.py @@ -18,12 +18,13 @@ os.environ["TF_INTRA_OP_PARALLELISM_THREADS"] = NUM_THREADS os.environ["TF_INTER_OP_PARALLELISM_THREADS"] = NUM_THREADS os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Force CPU usage -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Change from "2" to "3" +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Change from "2" to "3" os.environ["XLA_FLAGS"] = "--xla_hlo_profile=false" # Disable XLA logging # Optional: Set process priority to high (Windows only) try: import psutil + p = psutil.Process(os.getpid()) p.nice(psutil.HIGH_PRIORITY_CLASS) except Exception: @@ -56,37 +57,38 @@ random.seed(random_seed) np.random.seed(random_seed) tf.random.set_seed(random_seed) -os.environ['PYTHONHASHSEED'] = str(random_seed) +os.environ["PYTHONHASHSEED"] = str(random_seed) # Configure Git executable path if available git_executable = os.environ.get("GIT_PYTHON_GIT_EXECUTABLE") if git_executable and os.path.exists(git_executable): import git + git.refresh(git_executable) mlflow_tracking = setup_mlflow_tracking(experiment_name) # Global settings -min_recall = 0.25 # Minimum acceptable recall -n_trials = 10000 # Fewer trials for MLP due to longer training times +min_recall = 0.25 # Minimum acceptable recall +n_trials = 10000 # Fewer trials for MLP due to longer training times pip_requirements = [ f"tensorflow=={tf.__version__}", - "scikit-learn", - f"mlflow=={mlflow.__version__}" + "scikit-learn", + f"mlflow=={mlflow.__version__}", ] scaler = None # Global scaler object # Define base configurations -base_params = { - 'verbose': 0, - 'metrics': ['accuracy', 'AUC'] -} +base_params = {"verbose": 0, "metrics": ["accuracy", "AUC"]} + + # Define the Wrapper Class class KerasMLPWrapper(BaseEstimator, ClassifierMixin): """ A wrapper for a fitted Keras Sequential model to provide a scikit-learn compatible predict_proba method. 
""" + def __init__(self, model): # Check if the model is a fitted Keras model # Keras models might not have model.built immediately after loading, check for weights @@ -130,89 +132,40 @@ def predict_proba(self, X): # Add necessary methods for sklearn compatibility if needed further def get_params(self, deep=True): - return {'model': self.model} + return {"model": self.model} def set_params(self, **params): - if 'model' in params: - self.model = params['model'] + if "model" in params: + self.model = params["model"] return self + def load_hyperparameter_space(): """ Define hyperparameter space for MLP tuning. - + Returns: dict: Hyperparameter space configuration. """ hyperparameter_space = { - 'learning_rate': { - 'type': 'float', - 'low': 1e-5, - 'high': 5e-2, - 'log': True - }, - 'hidden_layers': { - 'type': 'int', - 'low': 1, - 'high': 6 - }, - 'neurons_per_layer': { - 'type': 'int', - 'low': 32, - 'high': 1024, - 'step': 16 - }, - 'dropout_rate': { - 'type': 'float', - 'low': 0.59, - 'high': 0.8, - 'step': 0.001 - }, - 'activation': { - 'type': 'categorical', - 'choices': ['elu', 'tanh'] - }, - 'l1_regularization': { - 'type': 'float', - 'low': 1e-6, - 'high': 5e-4, - 'log': True - }, - 'l2_regularization': { - 'type': 'float', - 'low': 1e-6, - 'high': 5e-3, - 'log': True - }, - 'batch_size': { - 'type': 'int', - 'low': 512, - 'high': 4096, - 'step': 32 - }, - 'epochs': { - 'type': 'int', - 'low': 50, - 'high': 250, - 'step': 5 - }, - 'patience': { - 'type': 'int', - 'low': 5, - 'high': 50 - }, - 'class_weight_multiplier': { - 'type': 'float', - 'low': 1.0, - 'high': 2.5, - 'step': 0.01 - } + "learning_rate": {"type": "float", "low": 1e-5, "high": 5e-2, "log": True}, + "hidden_layers": {"type": "int", "low": 1, "high": 6}, + "neurons_per_layer": {"type": "int", "low": 32, "high": 1024, "step": 16}, + "dropout_rate": {"type": "float", "low": 0.59, "high": 0.8, "step": 0.001}, + "activation": {"type": "categorical", "choices": ["elu", "tanh"]}, + 
"l1_regularization": {"type": "float", "low": 1e-6, "high": 5e-4, "log": True}, + "l2_regularization": {"type": "float", "low": 1e-6, "high": 5e-3, "log": True}, + "batch_size": {"type": "int", "low": 512, "high": 4096, "step": 32}, + "epochs": {"type": "int", "low": 50, "high": 250, "step": 5}, + "patience": {"type": "int", "low": 5, "high": 50}, + "class_weight_multiplier": {"type": "float", "low": 1.0, "high": 2.5, "step": 0.01}, } return hyperparameter_space + def preprocess_data(X_train, X_test, X_eval=None): try: - with open('src/models/scalers/scaler_mlp.pkl', 'rb') as f: + with open("src/models/scalers/scaler_mlp.pkl", "rb") as f: scaler = pickle.load(f) logger.info("Loaded existing MLP scaler") X_train_scaled = scaler.transform(X_train) @@ -225,94 +178,99 @@ def preprocess_data(X_train, X_test, X_eval=None): X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) X_eval_scaled = scaler.transform(X_eval) - with open('src/models/scalers/scaler_mlp.pkl', 'wb') as f: + with open("src/models/scalers/scaler_mlp.pkl", "wb") as f: pickle.dump(scaler, f) - + return X_train_scaled, X_test_scaled, X_eval_scaled, scaler + def create_model(model_params): """ Create and compile a Keras MLP model based on provided hyperparameters. - + Args: model_params (dict): Hyperparameters for model configuration. - + Returns: keras.Model: Compiled MLP model. 
""" try: - input_dim = model_params.pop('input_dim') - hidden_layers = model_params.pop('hidden_layers', 2) - neurons_per_layer = model_params.pop('neurons_per_layer', 128) - dropout_rate = model_params.pop('dropout_rate', 0.2) - activation = model_params.pop('activation', 'relu') - l1_reg = model_params.pop('l1_regularization', 0.0) - l2_reg = model_params.pop('l2_regularization', 0.0) - learning_rate = model_params.pop('learning_rate', 0.001) - + input_dim = model_params.pop("input_dim") + hidden_layers = model_params.pop("hidden_layers", 2) + neurons_per_layer = model_params.pop("neurons_per_layer", 128) + dropout_rate = model_params.pop("dropout_rate", 0.2) + activation = model_params.pop("activation", "relu") + l1_reg = model_params.pop("l1_regularization", 0.0) + l2_reg = model_params.pop("l2_regularization", 0.0) + learning_rate = model_params.pop("learning_rate", 0.001) + model = keras.Sequential() model.add(layers.InputLayer(shape=(input_dim,))) - + # Add hidden layers for _ in range(hidden_layers): - model.add(layers.Dense( - neurons_per_layer, - activation=activation, - kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg) - )) + model.add( + layers.Dense( + neurons_per_layer, + activation=activation, + kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg), + ) + ) model.add(layers.BatchNormalization()) model.add(layers.Dropout(dropout_rate)) - + # Output layer for binary classification - model.add(layers.Dense(1, activation='sigmoid')) + model.add(layers.Dense(1, activation="sigmoid")) optimizer = keras.optimizers.Adam(learning_rate=learning_rate) model.compile( optimizer=optimizer, - loss='binary_crossentropy', - metrics=['accuracy', keras.metrics.AUC(name='auc')], - jit_compile=True # Enable XLA compilation for faster CPU execution + loss="binary_crossentropy", + metrics=["accuracy", keras.metrics.AUC(name="auc")], + jit_compile=True, # Enable XLA compilation for faster CPU execution ) return model except Exception as e: 
logger.error(f"Error creating MLP model: {str(e)}") raise + def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): """ Train the MLP model, wrap it, optimize threshold, return raw Keras model and metrics. """ try: # Set the input dimension based on training data - model_params['input_dim'] = X_train.shape[1] + model_params["input_dim"] = X_train.shape[1] keras_model = create_model(model_params.copy()) # Compute class weights neg_count = np.sum(y_train == 0) pos_count = np.sum(y_train == 1) class_weight = {0: 1.0, 1: (neg_count / pos_count) if pos_count > 0 else 1.0} - class_weight[1] *= model_params.get('class_weight_multiplier', 1.0) + class_weight[1] *= model_params.get("class_weight_multiplier", 1.0) # Early stopping callback early_stop = callbacks.EarlyStopping( - monitor='val_auc', # Monitor validation AUC - mode='max', - patience=model_params.get('patience', 20), + monitor="val_auc", # Monitor validation AUC + mode="max", + patience=model_params.get("patience", 20), restore_best_weights=True, - verbose=0 + verbose=0, ) logger.info("Starting Keras model fitting...") # For CPU training, moderate batch sizes work better - batch_size = model_params.get('batch_size', 32) - + batch_size = model_params.get("batch_size", 32) + keras_model.fit( - X_train, y_train, + X_train, + y_train, validation_data=(X_test, y_test), - epochs=model_params.get('epochs', 100), + epochs=model_params.get("epochs", 100), batch_size=batch_size, class_weight=class_weight, callbacks=[early_stop], - verbose=0 + verbose=0, ) logger.info("Keras model fitting finished.") @@ -321,7 +279,9 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): wrapped_model = KerasMLPWrapper(keras_model) # Optimize threshold using the wrapped model - best_threshold, threshold_metrics = optimize_threshold(wrapped_model, X_eval, y_eval, min_recall) + best_threshold, threshold_metrics = optimize_threshold( + wrapped_model, X_eval, y_eval, min_recall + ) # Return 
the original Keras model and the combined metrics return keras_model, threshold_metrics, wrapped_model @@ -329,7 +289,10 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): logger.error(f"Error training MLP model: {str(e)}") raise -def optimize_hyperparameters(X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space): + +def optimize_hyperparameters( + X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space +): """ Run hyperparameter optimization using Optuna. """ @@ -345,26 +308,33 @@ def objective(trial): try: params = {} for param_name, param_config in hyperparameter_space.items(): - if param_config['type'] == 'float': + if param_config["type"] == "float": params[param_name] = trial.suggest_float( - param_name, param_config['low'], param_config['high'], - log=param_config.get('log', False), step=param_config.get('step') + param_name, + param_config["low"], + param_config["high"], + log=param_config.get("log", False), + step=param_config.get("step"), ) - elif param_config['type'] == 'int': + elif param_config["type"] == "int": params[param_name] = trial.suggest_int( - param_name, param_config['low'], param_config['high'] + param_name, param_config["low"], param_config["high"] ) - elif param_config['type'] == 'categorical': + elif param_config["type"] == "categorical": params[param_name] = trial.suggest_categorical( - param_name, param_config['choices'] + param_name, param_config["choices"] ) # Train model - train_model now returns raw Keras model and metrics dict - model, metrics, wrapped_model = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params.copy()) + model, metrics, wrapped_model = train_model( + X_train, y_train, X_test, y_test, X_eval, y_eval, params.copy() + ) - precision = metrics.get('precision', 0.0) - recall = metrics.get('recall', 0.0) - score = precision if recall >= min_recall else 0.0 # Optimize for precision - logger.info(f" Trial {trial.number}: Score={score:.4f}, 
Precision={precision:.4f}, Recall={recall:.4f}, AUC={metrics.get('auc', 0.0):.4f}") + precision = metrics.get("precision", 0.0) + recall = metrics.get("recall", 0.0) + score = precision if recall >= min_recall else 0.0 # Optimize for precision + logger.info( + f" Trial {trial.number}: Score={score:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, AUC={metrics.get('auc', 0.0):.4f}" + ) for metric_name, metric_value in metrics.items(): # Ensure serializable for Optuna @@ -374,7 +344,7 @@ def objective(trial): trial.set_user_attr(metric_name, metric_value.item()) else: trial.set_user_attr(metric_name, str(metric_value)) - + if score > 0.30 and score > best_score: logger.info(f"Trial {trial.number} completed with score {score:.4f}") log_to_mlflow(model, metrics, params, experiment_name, scaler, X_eval) @@ -425,7 +395,7 @@ def callback(study, trial): sampler = optuna.samplers.RandomSampler(seed=random_seed) study = optuna.create_study( study_name=study_name, - direction='maximize', + direction="maximize", storage=storage_url, load_if_exists=True, sampler=sampler, @@ -434,15 +404,17 @@ def callback(study, trial): if batch > 0: # Skip feature reduction for the first batch features_to_remove = min(1, X_train.shape[1] - 20) if features_to_remove > 0: - logger.info(f"Batch {batch + 1}: Removing {features_to_remove} features from position 0") + logger.info( + f"Batch {batch + 1}: Removing {features_to_remove} features from position 0" + ) logger.info(f"Features before removal: {X_train.shape[1]}") - + # Remove features from numpy arrays by slicing (remove first features_to_remove columns) X_train = X_train[:, features_to_remove:] X_test = X_test[:, features_to_remove:] if X_eval is not None: X_eval = X_eval[:, features_to_remove:] - + logger.info(f"Features after removal: {X_train.shape[1]}") try: study.optimize(objective, n_trials=batch_size, callbacks=[callback], n_jobs=3) @@ -455,57 +427,80 @@ def callback(study, trial): best_params.update(base_params) return 
best_params + def hypertune_mlp(experiment_name): """ Run hyperparameter tuning and final training for the MLP model with MLflow tracking. """ try: - X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data(X_train, X_test, X_eval) + X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data( + X_train, X_test, X_eval + ) hyperparameter_space = load_hyperparameter_space() - best_params = optimize_hyperparameters(X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, hyperparameter_space) + best_params = optimize_hyperparameters( + X_train_scaled, + y_train, + X_test_scaled, + y_test, + X_eval_scaled, + y_eval, + hyperparameter_space, + ) logger.info("Training final MLP model with best hyperparameters") - model, metrics = train_model(X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, best_params.copy()) - log_to_mlflow(model, metrics, best_params, experiment_name, scaler) + model, metrics = train_model( + X_train_scaled, + y_train, + X_test_scaled, + y_test, + X_eval_scaled, + y_eval, + best_params.copy(), + ) + log_to_mlflow(model, metrics, best_params, experiment_name, scaler, X_eval) return best_params, metrics except Exception as e: logger.error(f"Error during hypertuning: {str(e)}") return None, None + def log_to_mlflow(model, metrics, params, experiment_name, scaler, X_eval): """ Log the final MLP model, its metrics, and parameters to MLflow. - + Returns: str: Run ID. 
""" try: mlflow.set_experiment(experiment_name) - with mlflow.start_run(run_name=f"mlp_final_{datetime.now().strftime('%Y%m%d_%H%M')}") as run: + with mlflow.start_run( + run_name=f"mlp_final_{datetime.now().strftime('%Y%m%d_%H%M')}" + ) as run: for param_name, param_value in params.items(): mlflow.log_param(param_name, param_value) for metric_name, metric_value in metrics.items(): mlflow.log_metric(metric_name, metric_value) - + # Wrap the fitted Keras model wrapped_model = KerasMLPWrapper(model) - scaler_path = 'src/models/scalers/scaler_mlp.pkl' + scaler_path = "src/models/scalers/scaler_mlp.pkl" mlflow.log_artifact(scaler_path, artifact_path="scaler") # Create input example input_example = X_eval.iloc[:5].copy() # Identify and convert integer columns to float64 to prevent schema enforcement errors - if hasattr(input_example, 'dtypes'): + if hasattr(input_example, "dtypes"): for col in input_example.columns: - if input_example[col].dtype.kind == 'i': - logger.info(f"Converting integer column '{col}' to float64 to handle potential missing values") - input_example[col] = input_example[col].astype('float64') - + if input_example[col].dtype.kind == "i": + logger.info( + f"Converting integer column '{col}' to float64 to handle potential missing values" + ) + input_example[col] = input_example[col].astype("float64") + signature = None if input_example is not None: try: # Use wrapped model for predict_proba signature signature = mlflow.models.infer_signature( - input_example, - wrapped_model.predict_proba(input_example) + input_example, wrapped_model.predict_proba(input_example) ) except Exception as sig_err: logger.error(f"Failed to infer signature: {sig_err}") @@ -516,7 +511,7 @@ def log_to_mlflow(model, metrics, params, experiment_name, scaler, X_eval): artifact_path="model", pip_requirements=pip_requirements, registered_model_name=f"mlp_{datetime.now().strftime('%Y%m%d_%H%M')}", - signature=signature + signature=signature, ) run_id = run.info.run_id 
logger.info(f"MLflow run ID: {run_id}") @@ -526,6 +521,7 @@ def log_to_mlflow(model, metrics, params, experiment_name, scaler, X_eval): logger.error(f"Error logging to MLflow: {str(e)}") return None + def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval): """ Train MLP model with focus on precision target. @@ -543,35 +539,44 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval logger.info("Training model with precision target") params = base_params.copy() # Inherits base MLP parameters # Specific parameters for this training run with advanced scheduling - params.update({ - "learning_rate": 2.291034155900042e-05, - "hidden_layers": 1, - "neurons_per_layer": 778, - "dropout_rate": 0.69, - "activation": "elu", - "l1_regularization": 1.2245823592413548e-06, - "l2_regularization": 7.52365321620833e-05, - "batch_size": 3621, - "epochs": 112, - "patience": 30, - "class_weight_multiplier": 2.42, - }) - X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data(X_train, X_test, X_eval) + params.update( + { + "learning_rate": 2.291034155900042e-05, + "hidden_layers": 1, + "neurons_per_layer": 778, + "dropout_rate": 0.69, + "activation": "elu", + "l1_regularization": 1.2245823592413548e-06, + "l2_regularization": 7.52365321620833e-05, + "batch_size": 3621, + "epochs": 112, + "patience": 30, + "class_weight_multiplier": 2.42, + } + ) + X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data( + X_train, X_test, X_eval + ) # Train final model with best parameters logger.info("Training final model with best parameters") - model, metrics, wrapped_model = train_model(X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, params) + model, metrics, wrapped_model = train_model( + X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, params + ) # Log to MLflow # log_to_mlflow(model, metrics, params, experiment_name, scaler) - top_features = 
compute_permutation_importance(wrapped_model, X_eval, X_eval_scaled, y_eval, metrics["threshold"]) + top_features = compute_permutation_importance( + wrapped_model, X_eval, X_eval_scaled, y_eval, metrics["threshold"] + ) logger.info(f"Top features: {top_features}") return model, metrics, params except Exception as e: logger.error(f"Error during MLflow artifact logging: {str(e)}") return mlflow.active_run().info.run_id if mlflow.active_run() else None + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, X_val_scaled: np.ndarray, y_val: np.ndarray, threshold: float = 0.3, @@ -592,35 +597,37 @@ def compute_permutation_importance( DataFrame with columns: ['feature', 'importance'] (mean drop in metric), sorted descending. """ feature_names = X_val.columns.tolist() - y_val_np = y_val.values if hasattr(y_val, 'values') else y_val - + y_val_np = y_val.values if hasattr(y_val, "values") else y_val + # Compute baseline metric probs = model.predict_proba(X_val_scaled)[:, 1] preds = (probs >= threshold).astype(int) - + # Calculate baseline precision baseline = np.sum((y_val_np == 1) & (preds == 1)) / (np.sum(preds == 1)) logger.info(f"Baseline metric: {baseline:.4f}") - + importances = [] for idx, feat in enumerate(feature_names): drops = [] for i in range(n_repeats): - logger.info(f"Shuffling feature: {feat} ({idx}) - Repeat: {i+1}") + logger.info(f"Shuffling feature: {feat} ({idx}) - Repeat: {i + 1}") X_shuffled = X_val_scaled.copy() X_shuffled[:, idx] = np.random.permutation(X_shuffled[:, idx]) probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) - + # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) - + mean_drop = 
np.mean(drops) importances.append((feat, mean_drop)) logger.debug(f"Feature: {feat}, Mean drop: {mean_drop:.4f}") - + # Sort by importance descending importances.sort(key=lambda x: x[1], reverse=True) df_importance = pd.DataFrame(importances, columns=["feature", "importance"]) @@ -628,13 +635,16 @@ def compute_permutation_importance( logger.info(df_importance.head(number_of_features).to_string(index=False)) return df_importance -def hypertune_with_feature_importance(X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50): + +def hypertune_with_feature_importance( + X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50 +): """ Perform hyperparameter optimization with Optuna while tracking feature importances. After optimization, compute SHAP feature importances for the best model. Args: X_train (pd.DataFrame): Training features - y_train (pd.Series): Training labels + y_train (pd.Series): Training labels X_test (pd.DataFrame): Test features y_test (pd.Series): Test labels n_trials (int): Number of optimization trials @@ -647,9 +657,10 @@ def hypertune_with_feature_importance(X_train, y_train, X_test, y_test, X_eval, hyperparameter_space = load_hyperparameter_space() X_train_scaled, X_test_scaled, X_eval_scaled, scaler = preprocess_data(X_train, X_test, X_eval) best_model = None - best_score = -float('inf') + best_score = -float("inf") best_metrics = None best_wrapped_model = None + def objective(trial): nonlocal best_model, best_score, best_metrics, best_wrapped_model params = base_params.copy() @@ -684,11 +695,13 @@ def objective(trial): param_name, param_config["low"], param_config["high"] ) # Train model - model, metrics, wrapped_model = train_model(X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, params) + model, metrics, wrapped_model = train_model( + X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, params + ) # Track best model - precision = metrics.get('precision', 0.0) - recall = 
metrics.get('recall', 0.0) - threshold = metrics.get('threshold', 0.5) + precision = metrics.get("precision", 0.0) + recall = metrics.get("recall", 0.0) + threshold = metrics.get("threshold", 0.5) score = precision if recall >= min_recall else 0.0 if score > best_score: best_score = score @@ -698,7 +711,7 @@ def objective(trial): # Compute permutation importance for this trial importances = [] feature_names = X_eval.columns.tolist() - y_val_np = y_eval.values if hasattr(y_eval, 'values') else y_eval + y_val_np = y_eval.values if hasattr(y_eval, "values") else y_eval probs = wrapped_model.predict_proba(X_eval_scaled)[:, 1] preds = (probs >= threshold).astype(int) baseline = np.sum((y_val_np == 1) & (preds == 1)) / (np.sum(preds == 1)) @@ -707,161 +720,175 @@ def objective(trial): X_shuffled[:, idx] = np.random.permutation(X_shuffled[:, idx]) probs_shuffled = wrapped_model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + ) drop = baseline - precision importances.append(drop) feature_importances_trials.append(importances) return score + # Create and run study - study = optuna.create_study(direction='maximize') + study = optuna.create_study(direction="maximize") study.optimize(objective, n_trials=n_trials) - + # Aggregate importances importances_array = np.array(feature_importances_trials) # shape: (n_trials, n_features) mean_importances = np.mean(importances_array, axis=0) - importance_df = pd.DataFrame({ - 'feature': X_eval.columns, - 'mean_importance': mean_importances - }).sort_values('mean_importance', ascending=False) - logger.info('Top features by average permutation importance across trials:') + importance_df = pd.DataFrame( + {"feature": X_eval.columns, "mean_importance": mean_importances} + ).sort_values("mean_importance", 
ascending=False) + logger.info("Top features by average permutation importance across trials:") for _idx, row in importance_df.head(100).iterrows(): - logger.info(f' {row.feature}: {row.mean_importance:.6f}') + logger.info(f" {row.feature}: {row.mean_importance:.6f}") - return study.best_params, importance_df + def mlp_staged_selection(X, y, X_eval, y_eval, target_features=100): """Multi-stage MLP feature selection with different objectives""" - + logger.info(f"Starting MLP staged selection with {X.shape[1]} initial features") - + # Stage 1: Quick filter with simple MLP logger.info("Stage 1: Quick filter with simple MLP") X_scaled, X_eval_scaled, _X_eval_scaled, scaler_stage1 = preprocess_data(X, X_eval, X_eval) - + # Create simple MLP for quick filtering - mlp_fast = keras.Sequential([ - layers.InputLayer(shape=(X.shape[1],)), - layers.Dense(64, activation='relu'), - layers.Dropout(0.3), - layers.Dense(32, activation='relu'), - layers.Dropout(0.3), - layers.Dense(1, activation='sigmoid') - ]) - + mlp_fast = keras.Sequential( + [ + layers.InputLayer(shape=(X.shape[1],)), + layers.Dense(64, activation="relu"), + layers.Dropout(0.3), + layers.Dense(32, activation="relu"), + layers.Dropout(0.3), + layers.Dense(1, activation="sigmoid"), + ] + ) + mlp_fast.compile( optimizer=keras.optimizers.Adam(learning_rate=0.01), - loss='binary_crossentropy', - metrics=['accuracy', 'precision', 'recall'] + loss="binary_crossentropy", + metrics=["accuracy", "precision", "recall"], ) - + # Train quick model mlp_fast.fit( - X_scaled, y, + X_scaled, + y, validation_data=(X_eval_scaled, y_eval), epochs=50, batch_size=512, verbose=0, - callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)] + callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)], ) - + # Calculate permutation importance for stage 1 baseline_score = mlp_fast.evaluate(X_eval_scaled, y_eval, verbose=0)[1] # accuracy stage1_importances = [] - + for i in range(X.shape[1]): 
X_eval_permuted = X_eval_scaled.copy() X_eval_permuted[:, i] = np.random.permutation(X_eval_permuted[:, i]) permuted_score = mlp_fast.evaluate(X_eval_permuted, y_eval, verbose=0)[1] importance = baseline_score - permuted_score stage1_importances.append(importance) - + stage1_importances = np.array(stage1_importances) stage1_features = X.columns[np.argsort(stage1_importances)[-200:]].tolist() - + logger.info(f"Stage 1: Selected {len(stage1_features)} features") - + # Stage 2: Refined selection with cross-validation logger.info("Stage 2: Refined selection with cross-validation") X_stage1 = X[stage1_features] X_eval_stage1 = X_eval[stage1_features] - + # Cross-validation feature importance cv_scores = [] cv_importances = [] - + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) for fold, (train_idx, val_idx) in enumerate(skf.split(X_stage1, y)): logger.info(f"Processing fold {fold + 1}/5") - + X_train_cv, X_val_cv = X_stage1.iloc[train_idx], X_stage1.iloc[val_idx] y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx] - + # Scale data for this fold scaler_cv = RobustScaler() X_train_cv_scaled = scaler_cv.fit_transform(X_train_cv) X_val_cv_scaled = scaler_cv.transform(X_val_cv) X_eval_stage1_scaled = scaler_cv.transform(X_eval_stage1) - + # Create refined MLP - mlp_refined = keras.Sequential([ - layers.InputLayer(shape=(X_stage1.shape[1],)), - layers.Dense(128, activation='elu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)), - layers.Dropout(0.4), - layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)), - layers.Dropout(0.4), - layers.Dense(32, activation='elu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)), - layers.Dropout(0.3), - layers.Dense(1, activation='sigmoid') - ]) - + mlp_refined = keras.Sequential( + [ + layers.InputLayer(shape=(X_stage1.shape[1],)), + layers.Dense( + 128, activation="elu", kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4) + ), + layers.Dropout(0.4), + 
layers.Dense( + 64, activation="elu", kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4) + ), + layers.Dropout(0.4), + layers.Dense( + 32, activation="elu", kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4) + ), + layers.Dropout(0.3), + layers.Dense(1, activation="sigmoid"), + ] + ) + mlp_refined.compile( optimizer=keras.optimizers.Adam(learning_rate=0.001), - loss='binary_crossentropy', - metrics=['accuracy', 'precision', 'recall'] + loss="binary_crossentropy", + metrics=["accuracy", "precision", "recall"], ) - + # Train refined model mlp_refined.fit( - X_train_cv_scaled, y_train_cv, + X_train_cv_scaled, + y_train_cv, validation_data=(X_val_cv_scaled, y_val_cv), epochs=100, batch_size=256, verbose=0, - callbacks=[keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True)] + callbacks=[keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True)], ) - + # Calculate permutation importance for this fold baseline_score_cv = mlp_refined.evaluate(X_eval_stage1_scaled, y_eval, verbose=0)[1] fold_importances = [] - + for i in range(X_stage1.shape[1]): X_eval_permuted = X_eval_stage1_scaled.copy() X_eval_permuted[:, i] = np.random.permutation(X_eval_permuted[:, i]) permuted_score = mlp_refined.evaluate(X_eval_permuted, y_eval, verbose=0)[1] importance = baseline_score_cv - permuted_score fold_importances.append(importance) - + cv_importances.append(fold_importances) cv_scores.append(baseline_score_cv) - + # Clear memory del mlp_refined keras.backend.clear_session() - + # Average importance across folds avg_importance = np.mean(cv_importances, axis=0) stage2_features = [stage1_features[i] for i in np.argsort(avg_importance)[-target_features:]] - + # Log average importances for the selected features selected_indices = np.argsort(avg_importance)[-target_features:] selected_importances = avg_importance[selected_indices] feature_importance_pairs = list(zip(stage2_features, selected_importances)) - + logger.info(f"Stage 2: Selected 
{len(stage2_features)} features: {stage2_features}") logger.info(f"Feature-importance pairs: {feature_importance_pairs}") logger.info(f"CV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}") - + return stage2_features, avg_importance @@ -896,7 +923,9 @@ def main(): y_combined = pd.concat([y_train, y_test]) X_eval_combined = pd.concat([X_eval, X_test]) y_eval_combined = pd.concat([y_eval, y_test]) - mlp_staged_selection(X_combined, y_combined, X_eval_combined, y_eval_combined, target_features=80) + mlp_staged_selection( + X_combined, y_combined, X_eval_combined, y_eval_combined, target_features=80 + ) # # Optional seed-based fine-tuning for improved precision # train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval) @@ -904,5 +933,6 @@ def main(): except Exception as e: logger.error(f"Error in main execution: {str(e)}") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/models/StackedEnsemble/base/neural/pytorch_hypertuner.py b/src/models/StackedEnsemble/base/neural/pytorch_hypertuner.py index 58802b0..f5b44d5 100644 --- a/src/models/StackedEnsemble/base/neural/pytorch_hypertuner.py +++ b/src/models/StackedEnsemble/base/neural/pytorch_hypertuner.py @@ -69,7 +69,7 @@ torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True # Check if PyTorch version supports torch.compile - if hasattr(torch, 'compile'): + if hasattr(torch, "compile"): logger.info("torch.compile is available - will use it for performance optimization") USE_TORCH_COMPILE = True else: @@ -78,6 +78,7 @@ else: USE_TORCH_COMPILE = False + # Define a placeholder if the actual model file doesn't exist yet class PytorchModel(nn.Module): def __init__(self, input_dim, **kwargs): @@ -91,16 +92,16 @@ def __init__(self, input_dim, **kwargs): act_fn = nn.LeakyReLU() elif activation_name == "SiLU": act_fn = nn.SiLU() - else: # Default to ReLU + else: # Default to ReLU act_fn = nn.ReLU() layers = [nn.Linear(input_dim, 
hidden_units), act_fn, nn.Dropout(dropout_rate)] for _ in range(num_layers - 1): layers.extend([nn.Linear(hidden_units, hidden_units), act_fn, nn.Dropout(dropout_rate)]) - layers.append(nn.Linear(hidden_units, 1)) # Output layer for binary classification + layers.append(nn.Linear(hidden_units, 1)) # Output layer for binary classification self.network = nn.Sequential(*layers) # Add attributes to store scaler and device for predict_proba - self.scaler_ = None + self.scaler_ = None self.device_ = None def forward(self, x): @@ -113,19 +114,21 @@ def predict_proba(self, X, batch_size=1024): Returns probabilities for both classes (0 and 1) in shape (N, 2). """ if self.scaler_ is None or self.device_ is None: - raise ValueError("Scaler and Device must be set on the model before calling predict_proba.") - - self.network.eval() # Set model to evaluation mode + raise ValueError( + "Scaler and Device must be set on the model before calling predict_proba." + ) + + self.network.eval() # Set model to evaluation mode all_probs_class1 = [] - + # Ensure X is DataFrame or Array that scaler expects X_scaled = self.scaler_.transform(X) - + X_tensor = torch.tensor(X_scaled, dtype=torch.float32) dataset = TensorDataset(X_tensor) dataloader = TorchDataLoader( - dataset, - batch_size=32, + dataset, + batch_size=32, num_workers=1, # pin_memory=True, # Enables faster CPU to GPU transfers # persistent_workers=True # Keeps workers alive between epochs @@ -137,15 +140,16 @@ def predict_proba(self, X, batch_size=1024): outputs = self.network(batch_X) probs = torch.sigmoid(outputs).cpu().numpy() all_probs_class1.append(probs) - + # Concatenate probabilities for class 1 - probs_class1 = np.concatenate(all_probs_class1) # Shape (N, 1) + probs_class1 = np.concatenate(all_probs_class1) # Shape (N, 1) # Calculate probabilities for class 0 - probs_class0 = 1.0 - probs_class1 # Shape (N, 1) - + probs_class0 = 1.0 - probs_class1 # Shape (N, 1) + # Stack them horizontally to get shape (N, 2) return 
np.hstack((probs_class0, probs_class1)) + # Global settings MIN_RECALL = 0.30 # Minimum acceptable recall N_TRIALS = 10000 # Number of hyperparameter optimization trials (adjust as needed) @@ -164,9 +168,8 @@ def predict_proba(self, X, batch_size=1024): ] logger.info(f"Defined pip requirements: {pip_requirements}") -base_params = { - "random_state": SEED -} +base_params = {"random_state": SEED} + # --- Phase 2: Hyperparameter Space and Model Handling --- def load_hyperparameter_space(): @@ -183,30 +186,24 @@ def load_hyperparameter_space(): "high": 1e-2, "log": True, }, - "optimizer": { - "type": "categorical", - "choices": ["Adam", "AdamW", "SGD"] - }, + "optimizer": {"type": "categorical", "choices": ["Adam", "AdamW", "SGD"]}, "weight_decay": { "type": "float", "low": 1e-7, "high": 1e-3, "log": True, }, - "batch_size": { - "type": "categorical", - "choices": [64, 128, 256, 512] - }, + "batch_size": {"type": "categorical", "choices": [64, 128, 256, 512]}, "num_epochs": { "type": "int", - "low": 50, # Min epochs - "high": 300, # Max epochs (can be higher, depends on early stopping) + "low": 50, # Min epochs + "high": 300, # Max epochs (can be higher, depends on early stopping) "step": 10, }, "num_layers": { - "type": "int", + "type": "int", "low": 1, - "high": 6 # Example range for custom NN + "high": 6, # Example range for custom NN }, "hidden_units": { "type": "int", @@ -214,10 +211,7 @@ def load_hyperparameter_space(): "high": 256, "step": 32, }, - "activation_fn": { - "type": "categorical", - "choices": ["ReLU", "LeakyReLU", "SiLU"] - }, + "activation_fn": {"type": "categorical", "choices": ["ReLU", "LeakyReLU", "SiLU"]}, "dropout_rate": { "type": "float", "low": 0.1, @@ -253,19 +247,19 @@ def create_pytorch_model(model_params, input_dim, device, scaler): architecture_params = { "num_layers": model_params.get("num_layers"), "hidden_units": model_params.get("hidden_units"), - "activation_fn": model_params.get("activation_fn"), - "dropout_rate": 
model_params.get("dropout_rate") + "activation_fn": model_params.get("activation_fn"), + "dropout_rate": model_params.get("dropout_rate"), } # Remove None values if PytorchModel handles defaults architecture_params = {k: v for k, v in architecture_params.items() if v is not None} - + model = PytorchModel(input_dim=input_dim, **architecture_params) model.to(device) - + # Attach scaler and device to the model instance model.scaler_ = scaler model.device_ = device - + # logger.info(f"Created PyTorch model with params: {architecture_params}") logger.info(f"Model placed on device: {device}, scaler attached.") return model @@ -282,8 +276,8 @@ def train_pytorch_model( y_val, model_params, device, - scaler, # Pass the fitted scaler - trial=None, # Optional: Pass the Optuna trial for pruning + scaler, # Pass the fitted scaler + trial=None, # Optional: Pass the Optuna trial for pruning ): """ Trains the PyTorch model, handles validation, early stopping, and threshold tuning. @@ -311,28 +305,34 @@ def train_pytorch_model( X_val_scaled = scaler.transform(X_val) # No need to scale X_eval here, will be handled by pytorch_predict_proba X_train_tensor = torch.tensor(X_combined_scaled, dtype=torch.float32) - y_train_tensor = torch.tensor(y_combined.values, dtype=torch.float32).unsqueeze(1) # Ensure (N, 1) + y_train_tensor = torch.tensor(y_combined.values, dtype=torch.float32).unsqueeze( + 1 + ) # Ensure (N, 1) X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32) - y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1) # Ensure (N, 1) - + y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1) # Ensure (N, 1) + train_dataset = TensorDataset(X_train_tensor, y_train_tensor) val_dataset = TensorDataset(X_val_tensor, y_val_tensor) - + train_loader = TorchDataLoader(train_dataset, batch_size=batch_size, shuffle=True) val_loader = TorchDataLoader(val_dataset, batch_size=batch_size, shuffle=False) - logger.info(f"Data loaders created. 
Train batches: {len(train_loader)}, Val batches: {len(val_loader)}") + logger.info( + f"Data loaders created. Train batches: {len(train_loader)}, Val batches: {len(val_loader)}" + ) # --- Initialization --- - criterion = nn.BCEWithLogitsLoss() # Numerically stable + criterion = nn.BCEWithLogitsLoss() # Numerically stable optimizer_name = model_params.get("optimizer", "AdamW") lr = model_params["learning_rate"] weight_decay = model_params.get("weight_decay", 0) - + if optimizer_name == "Adam": optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay) elif optimizer_name == "SGD": - optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9) # Add momentum for SGD - else: # Default to AdamW + optimizer = optim.SGD( + model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9 + ) # Add momentum for SGD + else: # Default to AdamW optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay) logger.info(f"Using optimizer: {optimizer_name}") @@ -342,7 +342,7 @@ def train_pytorch_model( patience_counter = 0 best_model_state_dict = None epochs = model_params["num_epochs"] - + # --- Training Loop --- for epoch in range(epochs): # Training Phase @@ -350,7 +350,7 @@ def train_pytorch_model( epoch_train_loss = 0.0 for batch_X, batch_y in train_loader: batch_X, batch_y = batch_X.to(device), batch_y.to(device) - + optimizer.zero_grad() outputs = model(batch_X) loss = criterion(outputs, batch_y) @@ -369,59 +369,69 @@ def train_pytorch_model( loss = criterion(outputs, batch_y) epoch_val_loss += loss.item() avg_val_loss = epoch_val_loss / len(val_loader) - - logger.debug(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}") + + logger.debug( + f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}" + ) # Early Stopping if avg_val_loss < best_val_loss: best_val_loss = avg_val_loss patience_counter = 0 - best_model_state_dict = 
model.state_dict() # Save best model state - logger.debug(f"Epoch {epoch+1}: New best validation loss: {best_val_loss:.4f}") + best_model_state_dict = model.state_dict() # Save best model state + logger.debug(f"Epoch {epoch + 1}: New best validation loss: {best_val_loss:.4f}") else: patience_counter += 1 - logger.debug(f"Epoch {epoch+1}: No improvement in validation loss. Patience: {patience_counter}/{patience}") + logger.debug( + f"Epoch {epoch + 1}: No improvement in validation loss. Patience: {patience_counter}/{patience}" + ) if patience_counter >= patience: - logger.info(f"Early stopping triggered at epoch {epoch+1}.") + logger.info(f"Early stopping triggered at epoch {epoch + 1}.") break - + # --- Post-Training --- if best_model_state_dict: model.load_state_dict(best_model_state_dict) - logger.info(f"Loaded best model state from epoch with validation loss: {best_val_loss:.4f}") + logger.info( + f"Loaded best model state from epoch with validation loss: {best_val_loss:.4f}" + ) else: - logger.warning("No best model state found (early stopping might not have been triggered or training was too short).") + logger.warning( + "No best model state found (early stopping might not have been triggered or training was too short)." 
+ ) # Threshold Optimization using validation data (X_val, y_val) logger.info("Optimizing threshold on validation data...") - y_val_np = y_val.values # optimize_threshold expects numpy array - + y_val_np = y_val.values # optimize_threshold expects numpy array + # Call optimize_threshold with the model, X, and y # Assumes optimize_threshold will internally call model.predict_proba(X_val) try: - best_threshold, metrics = optimize_threshold( - model, # Pass the model object - X_val, # Pass validation features - y_val_np, # Pass validation true labels + best_threshold, metrics = optimize_threshold( + model, # Pass the model object + X_val, # Pass validation features + y_val_np, # Pass validation true labels min_recall=MIN_RECALL, # Removed pre-calculated probs and other args ) except TypeError as te: # This error might still occur if optimize_threshold itself has issues logger.error(f"TypeError calling optimize_threshold: {te}") - logger.error("Ensure optimize_threshold in hypertuner_utils.py expects (model, X, y, min_recall).") - return model, {} # Return empty metrics on failure - + logger.error( + "Ensure optimize_threshold in hypertuner_utils.py expects (model, X, y, min_recall)." + ) + return model, {} # Return empty metrics on failure + logger.info(f"Threshold optimization complete. 
Best Threshold: {best_threshold:.4f}") logger.info(f"Metrics at threshold: {metrics}") return model, metrics except optuna.TrialPruned: - raise # Re-raise prune exception for Optuna to handle + raise # Re-raise prune exception for Optuna to handle except Exception as e: logger.error(f"Error during PyTorch model training: {str(e)}") # Return None or raise to indicate failure - return None, {} # Return empty metrics on failure + return None, {} # Return empty metrics on failure # --- Phase 4: Optuna Integration --- @@ -448,17 +458,19 @@ def objective( for param_name, config in hyperparameter_space.items(): if config["type"] == "float": params[param_name] = trial.suggest_float( - param_name, config["low"], config["high"], - log=config.get("log", False), step=config.get("step") + param_name, + config["low"], + config["high"], + log=config.get("log", False), + step=config.get("step"), ) elif config["type"] == "int": params[param_name] = trial.suggest_int( - param_name, config["low"], config["high"], - step=config.get("step", 1) + param_name, config["low"], config["high"], step=config.get("step", 1) ) elif config["type"] == "categorical": params[param_name] = trial.suggest_categorical(param_name, config["choices"]) - + logger.info(f"Trial {trial.number}: Suggested params: {params}") # Create model @@ -468,19 +480,22 @@ def objective( # train_pytorch_model handles potential TrialPruned exception model, metrics = train_pytorch_model( model, - X_train, y_train, - X_test, y_test, - X_val, y_val, - params, - device, - scaler, - trial=trial # Pass trial for pruning + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, + params, + device, + scaler, + trial=trial, # Pass trial for pruning ) - + # Check if training failed if not metrics: logger.warning(f"Trial {trial.number}: Training failed or returned no metrics.") - return 0.0 # Return low score for failed trials + return 0.0 # Return low score for failed trials # Calculate score (e.g., precision constrained by recall) 
recall = metrics.get("recall", 0.0) @@ -490,10 +505,12 @@ def objective( # Log metrics as user attributes for the trial for metric_name, metric_value in metrics.items(): trial.set_user_attr(metric_name, metric_value) - trial.set_user_attr("score", score) # Log the final score too + trial.set_user_attr("score", score) # Log the final score too + + logger.info( + f"Trial {trial.number}: Score: {score:.4f} (Precision: {precision:.4f}, Recall: {recall:.4f})" + ) - logger.info(f"Trial {trial.number}: Score: {score:.4f} (Precision: {precision:.4f}, Recall: {recall:.4f})") - if score > 0.35: log_to_mlflow_pytorch( model, @@ -509,21 +526,24 @@ def objective( except optuna.TrialPruned as e: logger.info(f"Trial {trial.number} pruned during training.") - raise e # Re-raise for Optuna + raise e # Re-raise for Optuna except Exception as e: logger.error(f"Trial {trial.number} failed unexpectedly: {str(e)}") return 0.0 # Return low score for other failed trials def optimize_hyperparameters( - X_train, y_train, - X_test, y_test, - X_val, y_val, + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, hyperparameter_space, input_dim, device, scaler, - n_trials=N_TRIALS # Use global constant + n_trials=N_TRIALS, # Use global constant ): """ Orchestrates the hyperparameter optimization process using Optuna. @@ -540,29 +560,32 @@ def optimize_hyperparameters( dict: The best hyperparameters found by Optuna. 
""" logger.info(f"Starting hyperparameter optimization with {n_trials} trials...") - + # Define the objective function with fixed arguments using lambda objective_func = lambda trial: objective( - trial, - X_train, y_train, - X_test, y_test, - X_val, y_val, - hyperparameter_space, - input_dim, - device, - scaler + trial, + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, + hyperparameter_space, + input_dim, + device, + scaler, ) # Set up Optuna study # Consider adding persistent storage like the XGBoost version if needed storage_url = "sqlite:///optuna_pytorch.db" - sampler = optuna.samplers.RandomSampler(seed=SEED) # Example sampler + sampler = optuna.samplers.RandomSampler(seed=SEED) # Example sampler study = optuna.create_study( direction="maximize", sampler=sampler, study_name=experiment_name, storage=storage_url, - load_if_exists=True + load_if_exists=True, ) # Add a callback to log progress (optional, can be simpler than XGBoost version) @@ -579,27 +602,33 @@ def log_callback(study, trial): # Keep track of top N trials top_trials_overall.append((trial.number, current_score, trial.params)) top_trials_overall.sort(key=lambda x: x[1], reverse=True) - top_trials_overall[:] = top_trials_overall[:10] # Keep top 10 + top_trials_overall[:] = top_trials_overall[:10] # Keep top 10 logger.info(f"Best Score: {best_score_overall}") # Log top trials periodically (e.g., every 10 trials) if trial.number % 10 == 0: logger.info(f"--- Top Trials (after Trial {trial.number}) ---") header = "| Rank | Trial # | Score | Params |" - sep = "|------|---------|--------|--------|" + sep = "|------|---------|--------|--------|" logger.info(header) logger.info(sep) for i, (t_num, t_score, t_params) in enumerate(top_trials_overall): - params_str = json.dumps(t_params, sort_keys=True, default=lambda x: f"{x:.4g}" if isinstance(x, float) else x) # Compact params - logger.info(f"| {i+1:<4} | {t_num:<7} | {t_score:.4f} | {params_str} |") + params_str = json.dumps( + t_params, + 
sort_keys=True, + default=lambda x: f"{x:.4g}" if isinstance(x, float) else x, + ) # Compact params + logger.info(f"| {i + 1:<4} | {t_num:<7} | {t_score:.4f} | {params_str} |") logger.info(f"Current Best Score Overall: {best_score_overall:.4f}") # Run the optimization try: - study.optimize(objective_func, n_trials=n_trials, callbacks=[log_callback], show_progress_bar=True) + study.optimize( + objective_func, n_trials=n_trials, callbacks=[log_callback], show_progress_bar=True + ) except KeyboardInterrupt: logger.warning("Optimization stopped manually via KeyboardInterrupt.") - - # --- Post-Optimization --- + + # --- Post-Optimization --- logger.info("Hyperparameter optimization finished.") if not study.trials: logger.warning("No trials completed successfully.") @@ -613,9 +642,9 @@ def log_to_mlflow_pytorch( model, metrics, params, - experiment_name, - X_eval, # For input example - scaler, # Log the scaler as well + experiment_name, + X_eval, # For input example + scaler, # Log the scaler as well pip_requirements, run_name_prefix="pytorch_final", ): @@ -635,17 +664,17 @@ def log_to_mlflow_pytorch( """ try: mlflow.set_experiment(experiment_name) - + with mlflow.start_run( run_name=f"{run_name_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" ) as run: run_id = run.info.run_id logger.info(f"Starting MLflow run: {run_id}") - + # Log parameters (consider filtering if needed) mlflow.log_params(params) logger.info(f"Logged parameters: {params}") - + # Log metrics mlflow.log_metrics(metrics) logger.info(f"Logged metrics: {metrics}") @@ -656,28 +685,28 @@ def log_to_mlflow_pytorch( logger.info("Logged scaler artifact.") # Log the PyTorch model - # --- Signature Inference --- - input_example_df = X_eval.iloc[:5].copy() # Get DataFrame slice first + # --- Signature Inference --- + input_example_df = X_eval.iloc[:5].copy() # Get DataFrame slice first # Scale this specific example for prediction input_example_np_scaled = scaler.transform(input_example_df).astype(np.float32) 
input_example_tensor = torch.tensor(input_example_np_scaled, dtype=torch.float32) - + # Infer signature (optional but recommended) try: # Ensure model is on CPU for signature inference - original_device = next(model.parameters()).device # Store original device - model.to('cpu') + original_device = next(model.parameters()).device # Store original device + model.to("cpu") # Get predictions for the example with torch.no_grad(): # Use the tensor created from the scaled DataFrame example - predictions = model(input_example_tensor.cpu()) + predictions = model(input_example_tensor.cpu()) # Move model back to original device - model.to(original_device) + model.to(original_device) # Infer signature using the DataFrame input and numpy output signature = mlflow.models.infer_signature( - input_example_df, # Pass DataFrame with names - predictions.cpu().numpy() + input_example_df, # Pass DataFrame with names + predictions.cpu().numpy(), # Removed input_names argument ) logger.info("Successfully inferred model signature.") @@ -687,7 +716,7 @@ def log_to_mlflow_pytorch( # --- Model Logging --- # Still use numpy for input_example in log_model if required by mlflow - input_example_for_log = input_example_np_scaled + input_example_for_log = input_example_np_scaled # Log model using mlflow.pytorch with signature only model_info = mlflow.pytorch.log_model( @@ -695,13 +724,15 @@ def log_to_mlflow_pytorch( artifact_path="model", signature=signature, pip_requirements=pip_requirements, - input_example=input_example_for_log, # Use numpy array here for input_example param - registered_model_name=f"pytorch_{datetime.now().strftime('%Y%m%d_%H%M')}" # Optional registration + input_example=input_example_for_log, # Use numpy array here for input_example param + registered_model_name=f"pytorch_{datetime.now().strftime('%Y%m%d_%H%M')}", # Optional registration ) # Validate serving input try: - serving_input = mlflow.models.convert_input_example_to_serving_input(input_example_df) + serving_input = 
mlflow.models.convert_input_example_to_serving_input( + input_example_df + ) mlflow.models.validate_serving_input(model_info.model_uri, serving_input) logger.info("Successfully validated model serving input using DataFrame example.") except Exception as val_e: @@ -717,13 +748,16 @@ def log_to_mlflow_pytorch( def hypertune_pytorch( - X_train, y_train, - X_test, y_test, - X_val, y_val, - experiment_name: str, + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, + experiment_name: str, input_dim: int, device: torch.device, - scaler: StandardScaler + scaler: StandardScaler, ): """ Main orchestration function for hyperparameter tuning and final model training. @@ -745,13 +779,16 @@ def hypertune_pytorch( # Run hyperparameter optimization logger.info("Initiating Optuna hyperparameter search...") best_hpo_params = optimize_hyperparameters( - X_train, y_train, - X_test, y_test, - X_val, y_val, + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, hyperparameter_space, input_dim, device, - scaler + scaler, ) if not best_hpo_params: @@ -760,35 +797,40 @@ def hypertune_pytorch( return None, None logger.info(f"Best HPO parameters identified: {best_hpo_params}") - mlflow.log_params({f"best_{k}": v for k,v in best_hpo_params.items()}) # Log best HPO params to HPO run + mlflow.log_params( + {f"best_{k}": v for k, v in best_hpo_params.items()} + ) # Log best HPO params to HPO run # --- Train Final Model with Best Parameters --- logger.info("Training final model using best hyperparameters...") final_model = create_pytorch_model(best_hpo_params, input_dim, device, scaler) - + # Train the final model (without passing Optuna trial) final_model, final_metrics = train_pytorch_model( final_model, - X_train, y_train, - X_test, y_test, - X_val, y_val, + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, best_hpo_params, device, scaler, - trial=None # Not an Optuna trial run + trial=None, # Not an Optuna trial run ) - + # --- Log Final Model Separately --- 
logger.info("Logging the final trained model and artifacts to MLflow...") log_to_mlflow_pytorch( model=final_model, metrics=final_metrics, params=best_hpo_params, - experiment_name=experiment_name, + experiment_name=experiment_name, X_eval=X_val, scaler=scaler, pip_requirements=pip_requirements, - run_name_prefix="pytorch_best_model" + run_name_prefix="pytorch_best_model", ) return best_hpo_params, final_metrics @@ -803,10 +845,13 @@ def hypertune_pytorch( def train_with_precision_target_pytorch( - X_train, y_train, - X_test, y_test, - X_val, y_val, - experiment_name: str, + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, + experiment_name: str, input_dim: int, device: torch.device, scaler: StandardScaler, @@ -831,7 +876,7 @@ def train_with_precision_target_pytorch( logger.warning( "Training model with hardcoded parameters - Update these values with actual best params." ) - + # --- Define hardcoded best parameters here --- fixed_params = { "learning_rate": 0.00012540204133324443, @@ -843,7 +888,7 @@ def train_with_precision_target_pytorch( "hidden_units": 224, "activation_fn": "ReLU", "dropout_rate": 0.6, - "early_stopping_patience": 40 + "early_stopping_patience": 40, } logger.info(f"Using hardcoded parameters: {fixed_params}") @@ -853,13 +898,16 @@ def train_with_precision_target_pytorch( # Train the model model, metrics = train_pytorch_model( model, - X_train, y_train, - X_test, y_test, - X_val, y_val, - fixed_params, # Use the hardcoded dict + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, + fixed_params, # Use the hardcoded dict device, scaler, - trial=None # Not an Optuna trial + trial=None, # Not an Optuna trial ) if not metrics: @@ -870,14 +918,14 @@ def train_with_precision_target_pytorch( log_to_mlflow_pytorch( model=model, metrics=metrics, - params=fixed_params, # Log the hardcoded params used + params=fixed_params, # Log the hardcoded params used experiment_name=experiment_name, X_eval=X_val, scaler=scaler, 
pip_requirements=pip_requirements, - run_name_prefix="pytorch_fixed_params" + run_name_prefix="pytorch_fixed_params", ) - + logger.info("Training with fixed parameters completed successfully.") return model, metrics @@ -893,23 +941,23 @@ def main(): """ try: logger.info("Starting PyTorch Model HPO script...") - + # Load data using the shared DataLoader - dataloader = DataLoader() + dataloader = DataLoader() X_train, y_train, X_test, y_test, X_val, y_val = dataloader.load_data() - + if X_train is None: logger.error("Data loading failed. Exiting.") return # Select features (using a placeholder type for PyTorch) - features = import_selected_features_ensemble(model_type="xgb") + features = import_selected_features_ensemble(model_type="xgb") if not features: logger.warning("No features selected for pytorch_model. Using all columns.") features = X_train.columns.tolist() # Ensure target column is not in features if it exists initially - if 'target' in features: - features.remove('target') + if "target" in features: + features.remove("target") X_train = prepare_data(X_train, features) X_test = prepare_data(X_test, features) @@ -918,34 +966,41 @@ def main(): logger.info(f"Selected {input_dim} features.") # Log data shapes and target means - logger.info(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}") - logger.info(f"Target mean - Train: {y_train.mean():.3f}, Val: {y_val.mean():.3f}, Test: {y_test.mean():.3f}") + logger.info( + f"Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}" + ) + logger.info( + f"Target mean - Train: {y_train.mean():.3f}, Val: {y_val.mean():.3f}, Test: {y_test.mean():.3f}" + ) # Fit the scaler ONLY on training data scaler_path = "src/models/scalers/scaler_pytorch.pkl" if os.path.exists(scaler_path): logger.info(f"Loading existing scaler from {scaler_path}") - with open(scaler_path, 'rb') as f: + with open(scaler_path, "rb") as f: scaler = pickle.load(f) else: logger.info("Creating 
new RobustScaler") scaler = RobustScaler() scaler.fit(X_train) logger.info("RobustScaler fitted on training data.") - with open(scaler_path, 'wb') as f: + with open(scaler_path, "wb") as f: pickle.dump(scaler, f) best_params = None final_metrics = None # Run Hyperparameter Optimization and Final Model Training best_params, final_metrics = hypertune_pytorch( - X_train, y_train, - X_test, y_test, - X_val, y_val, - experiment_name, + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, + experiment_name, input_dim, device, - scaler + scaler, ) if best_params and final_metrics: @@ -956,18 +1011,26 @@ def main(): logger.error("HPO process failed or did not complete successfully.") logger.info("Running training with fixed best parameters...") train_with_precision_target_pytorch( - X_train, y_train, X_test, y_test, X_val, y_val, - experiment_name, input_dim, device, scaler + X_train, + y_train, + X_test, + y_test, + X_val, + y_val, + experiment_name, + input_dim, + device, + scaler, ) except Exception as e: logger.error(f"Error in main execution: {str(e)}") finally: - gc.collect() # Force garbage collection + gc.collect() # Force garbage collection logger.info("--- PyTorch HPO script finished. 
---") if __name__ == "__main__": main() -# --- End of File --- \ No newline at end of file +# --- End of File --- diff --git a/src/models/StackedEnsemble/base/neural/pytorch_hypertuner_20.py b/src/models/StackedEnsemble/base/neural/pytorch_hypertuner_20.py index 12e64e0..f06f698 100644 --- a/src/models/StackedEnsemble/base/neural/pytorch_hypertuner_20.py +++ b/src/models/StackedEnsemble/base/neural/pytorch_hypertuner_20.py @@ -25,7 +25,7 @@ # Add sklearn imports for staged feature selection from sklearn.model_selection import StratifiedKFold -from sklearn.preprocessing import RobustScaler, StandardScaler +from sklearn.preprocessing import RobustScaler from torch.utils.data import DataLoader as TorchDataLoader from torch.utils.data import TensorDataset @@ -60,19 +60,20 @@ torch.manual_seed(SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED) - torch.backends.cudnn.benchmark = True + torch.backends.cudnn.benchmark = True torch.backends.cudnn.deterministic = False # Better performance, less deterministic # Enable TF32 for better performance on Ampere GPUs (RTX 30xx and newer) torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True # Check if PyTorch version supports torch.compile - if hasattr(torch, 'compile'): + if hasattr(torch, "compile"): USE_TORCH_COMPILE = True else: USE_TORCH_COMPILE = False else: USE_TORCH_COMPILE = False + # Define a placeholder if the actual model file doesn't exist yet class PytorchModel(nn.Module): def __init__(self, input_dim, **kwargs): @@ -86,16 +87,16 @@ def __init__(self, input_dim, **kwargs): act_fn = nn.LeakyReLU() elif activation_name == "SiLU": act_fn = nn.SiLU() - else: # Default to ReLU + else: # Default to ReLU act_fn = nn.ReLU() layers = [nn.Linear(input_dim, hidden_units), act_fn, nn.Dropout(dropout_rate)] for _ in range(num_layers - 1): layers.extend([nn.Linear(hidden_units, hidden_units), act_fn, nn.Dropout(dropout_rate)]) - layers.append(nn.Linear(hidden_units, 1)) # Output layer 
for binary classification + layers.append(nn.Linear(hidden_units, 1)) # Output layer for binary classification self.network = nn.Sequential(*layers) # Add attributes to store scaler and device for predict_proba - self.scaler_ = None + self.scaler_ = None self.device_ = None def forward(self, x): @@ -108,64 +109,74 @@ def predict_proba(self, X, batch_size=1024): Returns probabilities for both classes (0 and 1) in shape (N, 2). """ import time + start_time = time.time() - + if self.scaler_ is None or self.device_ is None: - raise ValueError("Scaler and Device must be set on the model before calling predict_proba.") - + raise ValueError( + "Scaler and Device must be set on the model before calling predict_proba." + ) + self.network.eval() - + # Timing: Data scaling scale_start = time.time() - X_scaled = self.scaler_.transform(X) + x_scaled = self.scaler_.transform(X) scale_time = time.time() - scale_start - + # Timing: Tensor conversion tensor_start = time.time() - X_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=self.device_) + x_tensor = torch.tensor(x_scaled, dtype=torch.float32, device=self.device_) tensor_time = time.time() - tensor_start - + # Timing: Model inference inference_start = time.time() - n_samples = X_tensor.shape[0] + n_samples = x_tensor.shape[0] all_probs = [] - + with torch.no_grad(): # Process in proper batches for i in range(0, n_samples, batch_size): batch_end = min(i + batch_size, n_samples) - batch_X = X_tensor[i:batch_end] - + batch_x = x_tensor[i:batch_end] + # Forward pass - outputs = self.network(batch_X) + outputs = self.network(batch_x) # Apply sigmoid and keep on GPU until all batches processed probs = torch.sigmoid(outputs) all_probs.append(probs) - + # Concatenate all results on GPU, then move to CPU once all_probs_tensor = torch.cat(all_probs, dim=0) probs_class1 = all_probs_tensor.cpu().numpy() # Shape (N, 1) inference_time = time.time() - inference_start - + # Timing: Result formatting format_start = time.time() 
probs_class0 = 1.0 - probs_class1 # Shape (N, 1) result = np.hstack((probs_class0, probs_class1)) format_time = time.time() - format_start - + total_time = time.time() - start_time - + # Performance logging (only log if slow) if total_time > 1.0: # Log if prediction takes more than 1 second print(f"PERFORMANCE: predict_proba took {total_time:.3f}s for {n_samples} samples") - print(f" - Scaling: {scale_time:.3f}s ({scale_time/total_time*100:.1f}%)") - print(f" - Tensor conversion: {tensor_time:.3f}s ({tensor_time/total_time*100:.1f}%)") - print(f" - Inference: {inference_time:.3f}s ({inference_time/total_time*100:.1f}%)") - print(f" - Formatting: {format_time:.3f}s ({format_time/total_time*100:.1f}%)") - print(f" - Effective batch size: {batch_size}, Batches: {(n_samples + batch_size - 1) // batch_size}") - + print(f" - Scaling: {scale_time:.3f}s ({scale_time / total_time * 100:.1f}%)") + print( + f" - Tensor conversion: {tensor_time:.3f}s ({tensor_time / total_time * 100:.1f}%)" + ) + print( + f" - Inference: {inference_time:.3f}s ({inference_time / total_time * 100:.1f}%)" + ) + print(f" - Formatting: {format_time:.3f}s ({format_time / total_time * 100:.1f}%)") + print( + f" - Effective batch size: {batch_size}, Batches: {(n_samples + batch_size - 1) // batch_size}" + ) + return result + # Global settings MIN_RECALL = 0.20 # Minimum acceptable recall N_TRIALS = 10000 # Number of hyperparameter optimization trials (adjust as needed) @@ -182,12 +193,14 @@ def predict_proba(self, X, batch_size=1024): f"numpy=={np.__version__}", ] -base_params = { - "random_state": SEED -} +base_params = {"random_state": SEED} + +# Constants for repeated strings +SCALER_PATH_PYTORCH = "src/models/scalers/scaler_pytorch.pkl" global logger, experiment_name + # --- Phase 2: Hyperparameter Space and Model Handling --- def load_hyperparameter_space(): """ @@ -203,30 +216,24 @@ def load_hyperparameter_space(): "high": 1e-2, "log": True, }, - "optimizer": { - "type": "categorical", - 
"choices": ["Adam", "AdamW", "SGD"] - }, + "optimizer": {"type": "categorical", "choices": ["Adam", "AdamW", "SGD"]}, "weight_decay": { "type": "float", "low": 1e-7, "high": 1e-3, "log": True, }, - "batch_size": { - "type": "categorical", - "choices": [64, 128, 256, 512, 1024, 2048] - }, + "batch_size": {"type": "categorical", "choices": [64, 128, 256, 512, 1024, 2048]}, "num_epochs": { "type": "int", - "low": 50, # Min epochs - "high": 300, # Max epochs (can be higher, depends on early stopping) + "low": 50, # Min epochs + "high": 300, # Max epochs (can be higher, depends on early stopping) "step": 10, }, "num_layers": { - "type": "int", + "type": "int", "low": 1, - "high": 6 # Example range for custom NN + "high": 6, # Example range for custom NN }, "hidden_units": { "type": "int", @@ -234,10 +241,7 @@ def load_hyperparameter_space(): "high": 256, "step": 32, }, - "activation_fn": { - "type": "categorical", - "choices": ["ReLU", "LeakyReLU", "SiLU"] - }, + "activation_fn": {"type": "categorical", "choices": ["ReLU", "LeakyReLU", "SiLU"]}, "dropout_rate": { "type": "float", "low": 0.1, @@ -254,24 +258,26 @@ def load_hyperparameter_space(): logger.info("Hyperparameter space loaded.") return hyperparameter_space -def preprocess_data(X_train, X_test, X_eval=None): + +def preprocess_data(x_train, x_test, x_eval=None): try: - with open('src/models/scalers/scaler_pytorch.pkl', 'rb') as f: + with open(SCALER_PATH_PYTORCH, "rb") as f: scaler = pickle.load(f) logger.info("Loaded existing PyTorch scaler") - X_train_scaled = scaler.transform(X_train) - X_test_scaled = scaler.transform(X_test) - X_eval_scaled = scaler.transform(X_eval) + x_train_scaled = scaler.transform(x_train) + x_test_scaled = scaler.transform(x_test) + x_eval_scaled = scaler.transform(x_eval) except Exception as e: logger.error(f"Error loading PyTorch scaler: {str(e)}") scaler = RobustScaler() logger.info("Created new PyTorch scaler") - X_train_scaled = scaler.fit_transform(X_train) - X_test_scaled = 
scaler.transform(X_test) - X_eval_scaled = scaler.transform(X_eval) - with open('src/models/scalers/scaler_pytorch.pkl', 'wb') as f: + x_train_scaled = scaler.fit_transform(x_train) + x_test_scaled = scaler.transform(x_test) + x_eval_scaled = scaler.transform(x_eval) + with open(SCALER_PATH_PYTORCH, "wb") as f: pickle.dump(scaler, f) - return X_train_scaled, X_test_scaled, X_eval_scaled, scaler + return x_train_scaled, x_test_scaled, x_eval_scaled, scaler + def create_pytorch_model(model_params, input_dim, device, scaler): """ @@ -281,7 +287,7 @@ def create_pytorch_model(model_params, input_dim, device, scaler): model_params (dict): Hyperparameters suggested by Optuna. input_dim (int): Number of input features. device (torch.device): The device (CPU or CUDA) to run the model on. - scaler (StandardScaler): The fitted scaler instance. + scaler (RobustScaler): The fitted scaler instance. Returns: PytorchModel: Configured and device-placed PyTorch model instance. """ @@ -291,48 +297,49 @@ def create_pytorch_model(model_params, input_dim, device, scaler): architecture_params = { "num_layers": model_params.get("num_layers"), "hidden_units": model_params.get("hidden_units"), - "activation_fn": model_params.get("activation_fn"), - "dropout_rate": model_params.get("dropout_rate") + "activation_fn": model_params.get("activation_fn"), + "dropout_rate": model_params.get("dropout_rate"), } # Remove None values if PytorchModel handles defaults architecture_params = {k: v for k, v in architecture_params.items() if v is not None} - + model = PytorchModel(input_dim=input_dim, **architecture_params) model.to(device) - + # Attach scaler and device to the model instance model.scaler_ = scaler model.device_ = device - + # logger.info(f"Created PyTorch model with params: {architecture_params}") logger.info(f"Model placed on device: {device}, scaler attached.") return model # --- Phase 3: Core Training Logic --- +# Cognitive complexity is high due to comprehensive ML training logic (data 
prep, training loop, validation, early stopping) def train_pytorch_model( model, - X_train, + x_train, y_train, - X_test, + x_test, y_test, - X_val, + x_val, y_val, model_params, device, - scaler, # Pass the fitted scaler - trial=None, # Optional: Pass the Optuna trial for pruning + scaler, # Pass the fitted scaler + trial=None, # Optional: Pass the Optuna trial for pruning ): """ Trains the PyTorch model, handles validation, early stopping, and threshold tuning. Args: model (torch.nn.Module): The PyTorch model instance. - X_train, y_train: Training data and labels (numpy/pandas). - X_val, y_val: Validation data and labels (numpy/pandas). - X_eval, y_eval: Evaluation data for final threshold tuning (numpy/pandas). + x_train, y_train: Training data and labels (numpy/pandas). + x_val, y_val: Validation data and labels (numpy/pandas). + x_eval, y_eval: Evaluation data for final threshold tuning (numpy/pandas). model_params (dict): Hyperparameters including batch_size, epochs, optimizer, etc. device (torch.device): CPU or CUDA device. - scaler (StandardScaler): The fitted scaler instance. + scaler (RobustScaler): The fitted scaler instance. trial (optuna.Trial, optional): Optuna trial for pruning. Defaults to None. 
Returns: tuple: (trained_model, metrics_dict) @@ -342,35 +349,47 @@ def train_pytorch_model( try: # --- Data Preparation --- batch_size = model_params["batch_size"] - X_combined = pd.concat([X_train, X_test], axis=0) + x_combined = pd.concat([x_train, x_test], axis=0) y_combined = pd.concat([y_train, y_test], axis=0) - X_combined_scaled = scaler.transform(X_combined) - X_val_scaled = scaler.transform(X_val) + x_combined_scaled = scaler.transform(x_combined) + x_val_scaled = scaler.transform(x_val) # No need to scale X_eval here, will be handled by pytorch_predict_proba - X_train_tensor = torch.tensor(X_combined_scaled, dtype=torch.float32) - y_train_tensor = torch.tensor(y_combined.values, dtype=torch.float32).unsqueeze(1) # Ensure (N, 1) - X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32) - y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1) # Ensure (N, 1) - - train_dataset = TensorDataset(X_train_tensor, y_train_tensor) - val_dataset = TensorDataset(X_val_tensor, y_val_tensor) - - train_loader = TorchDataLoader(train_dataset, batch_size=batch_size, shuffle=True) - val_loader = TorchDataLoader(val_dataset, batch_size=batch_size, shuffle=False) - logger.info(f"Data loaders created. 
Train batches: {len(train_loader)}, Val batches: {len(val_loader)}") + x_train_tensor = torch.tensor(x_combined_scaled, dtype=torch.float32) + # Fix ndarray attribute access - safely get values + y_combined_values = ( + y_combined.values if hasattr(y_combined, "values") else np.asarray(y_combined) + ) + y_train_tensor = torch.tensor(y_combined_values, dtype=torch.float32).unsqueeze( + 1 + ) # Ensure (N, 1) + x_val_tensor = torch.tensor(x_val_scaled, dtype=torch.float32) + # Fix ndarray attribute access - safely get values + y_val_values = y_val.values if hasattr(y_val, "values") else np.asarray(y_val) + y_val_tensor = torch.tensor(y_val_values, dtype=torch.float32).unsqueeze(1) # Ensure (N, 1) + + train_dataset = TensorDataset(x_train_tensor, y_train_tensor) + val_dataset = TensorDataset(x_val_tensor, y_val_tensor) + + train_loader = TorchDataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0) + val_loader = TorchDataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0) + logger.info( + f"Data loaders created. 
Train batches: {len(train_loader)}, Val batches: {len(val_loader)}" + ) # --- Initialization --- - criterion = nn.BCEWithLogitsLoss() # Numerically stable + criterion = nn.BCEWithLogitsLoss() # Numerically stable optimizer_name = model_params.get("optimizer", "AdamW") lr = model_params["learning_rate"] weight_decay = model_params.get("weight_decay", 0) - + if optimizer_name == "Adam": optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay) elif optimizer_name == "SGD": - optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9) # Add momentum for SGD - else: # Default to AdamW + optimizer = optim.SGD( + model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9 + ) # Add momentum for SGD + else: # Default to AdamW optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay) logger.info(f"Using optimizer: {optimizer_name}") @@ -380,17 +399,17 @@ def train_pytorch_model( patience_counter = 0 best_model_state_dict = None epochs = model_params["num_epochs"] - + # --- Training Loop --- for epoch in range(epochs): # Training Phase model.train() epoch_train_loss = 0.0 - for batch_X, batch_y in train_loader: - batch_X, batch_y = batch_X.to(device), batch_y.to(device) - + for batch_x, batch_y in train_loader: + batch_x, batch_y = batch_x.to(device), batch_y.to(device) + optimizer.zero_grad() - outputs = model(batch_X) + outputs = model(batch_x) loss = criterion(outputs, batch_y) loss.backward() optimizer.step() @@ -401,76 +420,90 @@ def train_pytorch_model( model.eval() epoch_val_loss = 0.0 with torch.no_grad(): - for batch_X, batch_y in val_loader: - batch_X, batch_y = batch_X.to(device), batch_y.to(device) - outputs = model(batch_X) + for batch_x, batch_y in val_loader: + batch_x, batch_y = batch_x.to(device), batch_y.to(device) + outputs = model(batch_x) loss = criterion(outputs, batch_y) epoch_val_loss += loss.item() avg_val_loss = epoch_val_loss / len(val_loader) - - logger.debug(f"Epoch 
{epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}") + + logger.debug( + f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}" + ) # Early Stopping if avg_val_loss < best_val_loss: best_val_loss = avg_val_loss patience_counter = 0 - best_model_state_dict = model.state_dict() # Save best model state - logger.debug(f"Epoch {epoch+1}: New best validation loss: {best_val_loss:.4f}") + best_model_state_dict = model.state_dict() # Save best model state + logger.debug(f"Epoch {epoch + 1}: New best validation loss: {best_val_loss:.4f}") else: patience_counter += 1 - logger.debug(f"Epoch {epoch+1}: No improvement in validation loss. Patience: {patience_counter}/{patience}") + logger.debug( + f"Epoch {epoch + 1}: No improvement in validation loss. Patience: {patience_counter}/{patience}" + ) if patience_counter >= patience: - logger.info(f"Early stopping triggered at epoch {epoch+1}.") + logger.info(f"Early stopping triggered at epoch {epoch + 1}.") break - + # --- Post-Training --- if best_model_state_dict: model.load_state_dict(best_model_state_dict) - logger.info(f"Loaded best model state from epoch with validation loss: {best_val_loss:.4f}") + logger.info( + f"Loaded best model state from epoch with validation loss: {best_val_loss:.4f}" + ) else: - logger.warning("No best model state found (early stopping might not have been triggered or training was too short).") + logger.warning( + "No best model state found (early stopping might not have been triggered or training was too short)." 
+ ) - # Threshold Optimization using validation data (X_val, y_val) + # Threshold Optimization using validation data (x_val, y_val) logger.info("Optimizing threshold on validation data...") - y_val_np = y_val.values # optimize_threshold expects numpy array - + # Fix ndarray attribute access - safely get values + y_val_np = ( + y_val.values if hasattr(y_val, "values") else np.asarray(y_val) + ) # optimize_threshold expects numpy array + # Call optimize_threshold with the model, X, and y - # Assumes optimize_threshold will internally call model.predict_proba(X_val) + # Assumes optimize_threshold will internally call model.predict_proba(x_val) try: - best_threshold, metrics = optimize_threshold( - model, # Pass the model object - X_val, # Pass validation features - y_val_np, # Pass validation true labels + best_threshold, metrics = optimize_threshold( + model, # Pass the model object + x_val, # Pass validation features + y_val_np, # Pass validation true labels min_recall=MIN_RECALL, # Removed pre-calculated probs and other args ) except TypeError as te: # This error might still occur if optimize_threshold itself has issues logger.error(f"TypeError calling optimize_threshold: {te}") - logger.error("Ensure optimize_threshold in hypertuner_utils.py expects (model, X, y, min_recall).") - return model, {} # Return empty metrics on failure - + logger.error( + "Ensure optimize_threshold in hypertuner_utils.py expects (model, X, y, min_recall)." + ) + return model, {} # Return empty metrics on failure + logger.info(f"Threshold optimization complete. 
Best Threshold: {best_threshold:.4f}") logger.info(f"Metrics at threshold: {metrics}") return model, metrics except optuna.TrialPruned: - raise # Re-raise prune exception for Optuna to handle + raise # Re-raise prune exception for Optuna to handle except Exception as e: logger.error(f"Error during PyTorch model training: {str(e)}") # Return None or raise to indicate failure - return None, {} # Return empty metrics on failure + return None, {} # Return empty metrics on failure # --- Phase 4: Optuna Integration --- +# Cognitive complexity is high due to Optuna hyperparameter optimization logic # Define the objective function with fixed arguments using lambda def objective( trial, - X_train, + x_train, y_train, - X_test, + x_test, y_test, - X_val, + x_val, y_val, hyperparameter_space, input_dim, @@ -484,24 +517,26 @@ def objective( for param_name, config in hyperparameter_space.items(): if config["type"] == "float": params[param_name] = trial.suggest_float( - param_name, config["low"], config["high"], - log=config.get("log", False), step=config.get("step") + param_name, + config["low"], + config["high"], + log=config.get("log", False), + step=config.get("step"), ) elif config["type"] == "int": params[param_name] = trial.suggest_int( - param_name, config["low"], config["high"], - step=config.get("step", 1) + param_name, config["low"], config["high"], step=config.get("step", 1) ) elif config["type"] == "categorical": choices = config["choices"] if not isinstance(choices, (list, tuple)): logger.error(f"Invalid choices for {param_name}: {choices}") if param_name == "batch_size": - choices = [128] # Default batch size + choices = [128] # Default batch size else: - choices = ["Adam"] # Default optimizer + choices = ["Adam"] # Default optimizer params[param_name] = trial.suggest_categorical(param_name, choices) - + logger.info(f"Trial {trial.number}: Suggested params: {params}") # Create model @@ -511,19 +546,22 @@ def objective( # train_pytorch_model handles potential 
TrialPruned exception model, metrics = train_pytorch_model( model, - X_train, y_train, - X_test, y_test, - X_val, y_val, - params, - device, - scaler, - trial=trial # Pass trial for pruning + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + params, + device, + scaler, + trial=trial, # Pass trial for pruning ) - + # Check if training failed if not metrics: logger.warning(f"Trial {trial.number}: Training failed or returned no metrics.") - return 0.0 # Return low score for failed trials + return 0.0 # Return low score for failed trials # Calculate score (e.g., precision constrained by recall) recall = metrics.get("recall", 0.0) @@ -533,17 +571,19 @@ def objective( # Log metrics as user attributes for the trial for metric_name, metric_value in metrics.items(): trial.set_user_attr(metric_name, metric_value) - trial.set_user_attr("score", score) # Log the final score too + trial.set_user_attr("score", score) # Log the final score too + + logger.info( + f"Trial {trial.number}: Score: {score:.4f} (Precision: {precision:.4f}, Recall: {recall:.4f})" + ) - logger.info(f"Trial {trial.number}: Score: {score:.4f} (Precision: {precision:.4f}, Recall: {recall:.4f})") - if score > 0.31 and score > best_score_overall: log_to_mlflow_pytorch( model, metrics, params, experiment_name, - X_val, + x_val, scaler, pip_requirements, run_name_prefix=f"pytorch_trial_{trial.number}", @@ -551,32 +591,35 @@ def objective( return score except optuna.TrialPruned as e: logger.info(f"Trial {trial.number} pruned during training.") - raise e # Re-raise for Optuna + raise e # Re-raise for Optuna except Exception as e: logger.error(f"Trial {trial.number} failed unexpectedly: {str(e)}") return 0.0 # Return low score for other failed trials def optimize_hyperparameters( - X_train, y_train, - X_test, y_test, - X_val, y_val, + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, hyperparameter_space, input_dim, device, scaler, - n_trials=N_TRIALS # Use global constant + n_trials=N_TRIALS, # 
Use global constant ): """ Orchestrates the hyperparameter optimization process using Optuna. Args: - X_train, y_train: Training data. - X_test, y_test: Test data. - X_val, y_val: Validation data. + x_train, y_train: Training data. + x_test, y_test: Test data. + x_val, y_val: Validation data. hyperparameter_space (dict): The search space definition. input_dim (int): Number of input features. device (torch.device): Device for training. - scaler (StandardScaler): Fitted scaler. + scaler (RobustScaler): Fitted scaler. n_trials (int): Number of Optuna trials to run. Returns: dict: The best hyperparameters found by Optuna. @@ -590,27 +633,30 @@ def optimize_hyperparameters( def objective_func(trial): nonlocal best_score_overall return objective( - trial, - X_train, y_train, - X_test, y_test, - X_val, y_val, - hyperparameter_space, - input_dim, - device, + trial, + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + hyperparameter_space, + input_dim, + device, scaler, - best_score_overall + best_score_overall, ) # Set up Optuna study # Consider adding persistent storage like the XGBoost version if needed storage_url = "sqlite:///optuna_pytorch.db" - sampler = optuna.samplers.RandomSampler(seed=SEED) # Example sampler + sampler = optuna.samplers.RandomSampler(seed=SEED) # Example sampler study = optuna.create_study( direction="maximize", sampler=sampler, study_name=experiment_name, storage=storage_url, - load_if_exists=True + load_if_exists=True, ) def log_callback(study, trial): @@ -623,18 +669,22 @@ def log_callback(study, trial): # Keep track of top N trials top_trials_overall.append((trial.number, current_score, trial.params)) top_trials_overall.sort(key=lambda x: x[1], reverse=True) - top_trials_overall[:] = top_trials_overall[:10] # Keep top 10 + top_trials_overall[:] = top_trials_overall[:10] # Keep top 10 logger.info(f"Best Score: {best_score_overall}") # Log top trials periodically (e.g., every 10 trials) if trial.number % 10 == 0: logger.info(f"--- Top 
Trials (after Trial {trial.number}) ---") header = "| Rank | Trial # | Score | Params |" - sep = "|------|---------|--------|--------|" + sep = "|------|---------|--------|--------|" logger.info(header) logger.info(sep) for i, (t_num, t_score, t_params) in enumerate(top_trials_overall): - params_str = json.dumps(t_params, sort_keys=True, default=lambda x: f"{x:.4g}" if isinstance(x, float) else x) # Compact params - logger.info(f"| {i+1:<4} | {t_num:<7} | {t_score:.4f} | {params_str} |") + params_str = json.dumps( + t_params, + sort_keys=True, + default=lambda x: f"{x:.4g}" if isinstance(x, float) else x, + ) # Compact params + logger.info(f"| {i + 1:<4} | {t_num:<7} | {t_score:.4f} | {params_str} |") logger.info(f"Current Best Score Overall: {best_score_overall:.4f}") # Run the optimization @@ -642,8 +692,8 @@ def log_callback(study, trial): study.optimize(objective_func, n_trials=n_trials, callbacks=[log_callback], n_jobs=6) except KeyboardInterrupt: logger.warning("Optimization stopped manually via KeyboardInterrupt.") - - # --- Post-Optimization --- + + # --- Post-Optimization --- logger.info("Hyperparameter optimization finished.") if not study.trials: logger.warning("No trials completed successfully.") @@ -657,9 +707,9 @@ def log_to_mlflow_pytorch( model, metrics, params, - experiment_name, - X_eval, # For input example - scaler, # Log the scaler as well + experiment_name, + x_eval, # For input example + scaler, # Log the scaler as well pip_requirements, run_name_prefix="pytorch_final", ): @@ -670,8 +720,8 @@ def log_to_mlflow_pytorch( metrics (dict): Model evaluation metrics (precision, recall, threshold, etc.). params (dict): Hyperparameters used for the final model. experiment_name (str): Target MLflow experiment. - X_eval (pd.DataFrame or np.ndarray): Evaluation data to create input example. - scaler (StandardScaler): The fitted scaler instance to log. + x_eval (pd.DataFrame or np.ndarray): Evaluation data to create input example. 
+ scaler (RobustScaler): The fitted scaler instance to log. pip_requirements (list): List of pip requirements strings. run_name_prefix (str): Prefix for the MLflow run name. Returns: @@ -679,49 +729,51 @@ def log_to_mlflow_pytorch( """ try: mlflow.set_experiment(experiment_name) - + with mlflow.start_run( run_name=f"{run_name_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" ) as run: run_id = run.info.run_id logger.info(f"Starting MLflow run: {run_id}") - + # Log parameters (consider filtering if needed) mlflow.log_params(params) logger.info(f"Logged parameters: {params}") - + # Log metrics mlflow.log_metrics(metrics) logger.info(f"Logged metrics: {metrics}") # Log the scaler - scaler_path = "src/models/scalers/scaler_pytorch.pkl" + scaler_path = SCALER_PATH_PYTORCH mlflow.log_artifact(scaler_path, artifact_path="scaler") logger.info("Logged scaler artifact.") # Log the PyTorch model - # --- Signature Inference --- - input_example_df = X_eval.iloc[:5].copy() # Get DataFrame slice first + # --- Signature Inference --- + input_example_df = x_eval.iloc[:5].copy() # Get DataFrame slice first # Scale this specific example for prediction input_example_np_scaled = scaler.transform(input_example_df).astype(np.float32) input_example_tensor = torch.tensor(input_example_np_scaled, dtype=torch.float32) - + # Infer signature (optional but recommended) try: # Ensure model is on CPU for signature inference - original_device = next(model.parameters()).device # Store original device - model.to('cpu') + original_device = next(model.parameters()).device # Store original device + model.to("cpu") # Get predictions for the example with torch.no_grad(): # Use the tensor created from the scaled DataFrame example - predictions = model(input_example_tensor.cpu()) + predictions = model(input_example_tensor.cpu()) # Move model back to original device - model.to(original_device) + model.to(original_device) # Infer signature using the DataFrame input and numpy output - signature = 
mlflow.models.infer_signature( - input_example_df, # Pass DataFrame with names - predictions.cpu().numpy() + from mlflow.models.signature import infer_signature + + signature = infer_signature( + input_example_df, # Pass DataFrame with names + predictions.cpu().numpy(), # Removed input_names argument ) logger.info("Successfully inferred model signature.") @@ -731,23 +783,26 @@ def log_to_mlflow_pytorch( # --- Model Logging --- # Still use numpy for input_example in log_model if required by mlflow - input_example_for_log = input_example_np_scaled + input_example_for_log = input_example_np_scaled # Log model using mlflow.pytorch with signature only - model_info = mlflow.pytorch.log_model( + mlflow.pytorch.log_model( pytorch_model=model, artifact_path="model", signature=signature, pip_requirements=pip_requirements, - input_example=input_example_for_log, # Use numpy array here for input_example param - registered_model_name=f"pytorch_{datetime.now().strftime('%Y%m%d_%H%M')}" # Optional registration + input_example=input_example_for_log, # Use numpy array here for input_example param + registered_model_name=f"pytorch_{datetime.now().strftime('%Y%m%d_%H%M')}", # Optional registration ) - # Validate serving input + # Validate serving input (simplified for MLflow 2.x compatibility) try: - serving_input = mlflow.models.convert_input_example_to_serving_input(input_example_df) - mlflow.models.validate_serving_input(model_info.model_uri, serving_input) - logger.info("Successfully validated model serving input using DataFrame example.") + # Basic validation - just check that the model can make predictions + test_input = input_example_for_log[:1] # Take first sample + model(torch.tensor(test_input, dtype=torch.float32).to(model.device)) + logger.info( + "Successfully validated model serving input with basic prediction test." 
+ ) except Exception as val_e: logger.warning(f"Failed to validate serving input: {val_e}") @@ -761,24 +816,27 @@ def log_to_mlflow_pytorch( def hypertune_pytorch( - X_train, y_train, - X_test, y_test, - X_val, y_val, - experiment_name: str, + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + experiment_name: str, input_dim: int, device: torch.device, - scaler: StandardScaler + scaler: RobustScaler, ): """ Main orchestration function for hyperparameter tuning and final model training. Args: - X_train, y_train: Training data. - X_val, y_val: Validation data. - X_eval, y_eval: Evaluation data. + x_train, y_train: Training data. + x_val, y_val: Validation data. + x_eval, y_eval: Evaluation data. experiment_name (str): MLflow experiment name. input_dim (int): Number of input features. device (torch.device): Device for training. - scaler (StandardScaler): Fitted scaler. + scaler (RobustScaler): Fitted scaler. Returns: tuple: (best_params, final_metrics) or (None, None) on failure. """ @@ -789,13 +847,16 @@ def hypertune_pytorch( # Run hyperparameter optimization logger.info("Initiating Optuna hyperparameter search...") best_hpo_params = optimize_hyperparameters( - X_train, y_train, - X_test, y_test, - X_val, y_val, + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, hyperparameter_space, input_dim, device, - scaler + scaler, ) if not best_hpo_params: @@ -804,35 +865,40 @@ def hypertune_pytorch( return None, None logger.info(f"Best HPO parameters identified: {best_hpo_params}") - mlflow.log_params({f"best_{k}": v for k,v in best_hpo_params.items()}) # Log best HPO params to HPO run + mlflow.log_params( + {f"best_{k}": v for k, v in best_hpo_params.items()} + ) # Log best HPO params to HPO run # --- Train Final Model with Best Parameters --- logger.info("Training final model using best hyperparameters...") final_model = create_pytorch_model(best_hpo_params, input_dim, device, scaler) - + # Train the final model (without passing Optuna trial) final_model, 
final_metrics = train_pytorch_model( final_model, - X_train, y_train, - X_test, y_test, - X_val, y_val, + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, best_hpo_params, device, scaler, - trial=None # Not an Optuna trial run + trial=None, # Not an Optuna trial run ) - + # --- Log Final Model Separately --- logger.info("Logging the final trained model and artifacts to MLflow...") log_to_mlflow_pytorch( model=final_model, metrics=final_metrics, params=best_hpo_params, - experiment_name=experiment_name, - X_eval=X_val, + experiment_name=experiment_name, + x_eval=x_val, scaler=scaler, pip_requirements=pip_requirements, - run_name_prefix=experiment_name + "_model" + run_name_prefix=experiment_name + "_model", ) return best_hpo_params, final_metrics @@ -847,25 +913,28 @@ def hypertune_pytorch( def train_with_precision_target_pytorch( - X_train, y_train, - X_test, y_test, - X_val, y_val, - experiment_name: str, + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + experiment_name: str, input_dim: int, device: torch.device, - scaler: StandardScaler, + scaler: RobustScaler, ): """ Trains a single model using fixed, hardcoded hyperparameters. Mirrors the structure of the XGBoost version but uses PyTorch logic. Args: - X_train, y_train: Training data. - X_val, y_val: Validation data. - X_eval, y_eval: Evaluation data. + x_train, y_train: Training data. + x_val, y_val: Validation data. + x_eval, y_eval: Evaluation data. experiment_name (str): MLflow experiment name. input_dim (int): Number of input features. device (torch.device): Device for training. - scaler (StandardScaler): Fitted scaler. + scaler (RobustScaler): Fitted scaler. Returns: tuple: (trained_model, metrics) or (None, None) on failure. 
""" @@ -873,7 +942,7 @@ def train_with_precision_target_pytorch( # --- Define hardcoded best parameters here --- fixed_params = { "learning_rate": 0.00020272553087508855, - "optimizer": "Adam", + "optimizer": "Adam", "weight_decay": 0.0005416238713396992, "batch_size": 512, "num_epochs": 300, @@ -881,7 +950,7 @@ def train_with_precision_target_pytorch( "hidden_units": 96, "activation_fn": "LeakyReLU", "dropout_rate": 0.56, - "early_stopping_patience": 20 + "early_stopping_patience": 20, } logger.info(f"Using hardcoded parameters: {fixed_params}") @@ -891,13 +960,16 @@ def train_with_precision_target_pytorch( # Train the model model, metrics = train_pytorch_model( model, - X_train, y_train, - X_test, y_test, - X_val, y_val, - fixed_params, # Use the hardcoded dict + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + fixed_params, # Use the hardcoded dict device, scaler, - trial=None # Not an Optuna trial + trial=None, # Not an Optuna trial ) if not metrics: @@ -910,17 +982,17 @@ def train_with_precision_target_pytorch( # metrics=metrics, # params=fixed_params, # Log the hardcoded params used # experiment_name=experiment_name, - # X_eval=X_val, + # x_eval=x_val, # scaler=scaler, # pip_requirements=pip_requirements, # run_name_prefix="pytorch_fixed_params" # ) compute_permutation_importance( - model=model, - X_val=X_val, - y_val=y_val, - metric=metrics["precision"], - threshold=metrics["threshold"] + model, + x_val, + y_val, + metrics["precision"], + metrics["threshold"], ) logger.info("Training with fixed parameters completed successfully.") return model, metrics @@ -929,9 +1001,10 @@ def train_with_precision_target_pytorch( logger.error(f"Error in fixed parameter training: {str(e)}") return None, None + def compute_permutation_importance( model, - X_val: pd.DataFrame, + x_val: pd.DataFrame, y_val: np.ndarray, metric, threshold: float, @@ -942,7 +1015,7 @@ def compute_permutation_importance( Compute permutation feature importance for a given metric and 
threshold. Args: model: Trained model with predict_proba(X) method. - X_val: Validation features (DataFrame). + x_val: Validation features (DataFrame). y_val: Validation labels (array-like). metric: Metric function (e.g., sklearn.metrics.precision_score). threshold: Threshold for positive class prediction. @@ -952,10 +1025,16 @@ def compute_permutation_importance( DataFrame with columns: ['feature', 'importance'] (mean drop in metric), sorted descending. """ try: - feature_names = X_val.columns.tolist() - y_val_np = y_val.values if hasattr(y_val, 'values') else y_val + feature_names = x_val.columns.tolist() + # Convert y_val to numpy array safely + if hasattr(y_val, "values"): + y_val_np = np.asarray(y_val.values) # type: ignore + elif isinstance(y_val, np.ndarray): + y_val_np = y_val + else: + y_val_np = np.array(y_val) # Compute baseline metric - probs = model.predict_proba(X_val)[:, 1] + probs = model.predict_proba(x_val)[:, 1] preds = (probs >= threshold).astype(int) # Fix: metric is being passed as a float value instead of a function # We'll calculate precision directly since that's what was passed in @@ -966,13 +1045,23 @@ def compute_permutation_importance( drops = [] for i in range(n_repeats): feat_idx = feature_names.index(feat) + 1 - logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i+1}") - X_shuffled = X_val.copy() - X_shuffled[feat] = np.random.permutation(X_shuffled[feat].values) - probs_shuffled = model.predict_proba(X_shuffled)[:, 1] + logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i + 1}") + x_shuffled = x_val.copy() + # Safely get values from the column + feat_values = ( + x_shuffled[feat].values + if hasattr(x_shuffled[feat], "values") + else x_shuffled[feat] + ) + # Use numpy.random.Generator for better performance + rng = np.random.default_rng(random_state) + x_shuffled[feat] = rng.permutation(np.asarray(feat_values)) + probs_shuffled = model.predict_proba(x_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= 
threshold).astype(int) # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) mean_drop = np.mean(drops) @@ -980,44 +1069,34 @@ def compute_permutation_importance( logger.debug(f"Feature: {feat}, Mean drop: {mean_drop:.4f}") # Sort by importance descending importances.sort(key=lambda x: x[1], reverse=True) - df_importance = pd.DataFrame(importances, columns=["feature", "importance"]) + columns_list = ["feature", "importance"] + df_importance = pd.DataFrame(data=importances, columns=columns_list) # type: ignore logger.info("Top features by permutation importance:") logger.info(df_importance.head(100).to_string(index=False)) return df_importance except Exception as e: logger.error(f"Error in compute_permutation_importance: {str(e)}") - return None + # Return empty DataFrame with correct columns instead of None + empty_columns = ["feature", "importance"] + return pd.DataFrame(data=[], columns=empty_columns) # type: ignore + -def pytorch_staged_selection(X, y, X_eval, y_eval, target_features=80, device=None, scaler=None): +def _perform_stage1_selection(x, y, x_eval, y_eval, device, scaler): """ - Multi-stage PyTorch neural network feature selection with different objectives. - Args: - X (pd.DataFrame): Training features - y (pd.Series): Training labels - X_eval (pd.DataFrame): Evaluation features - y_eval (pd.Series): Evaluation labels - target_features (int): Number of final features to select - device (torch.device): Device for training (defaults to global device) - scaler (sklearn scaler): Fitted scaler (will create new if None) - + Perform Stage 1 feature selection with simple architecture. 
+ Returns: - tuple: (selected_features_list, final_importance_scores) + tuple: (stage1_features, stage1_scaler) or (None, None) if failed """ - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - logger.info(f"Starting PyTorch staged selection with {X.shape[1]} initial features") - - # Stage 1: Quick filter with simple architecture and high learning rate logger.info("Stage 1: Quick filter with simple architecture") - + # Prepare scaler for Stage 1 if scaler is None: stage1_scaler = RobustScaler() - stage1_scaler.fit(X) # Just fit the scaler, don't transform + stage1_scaler.fit(x) else: stage1_scaler = scaler - + # Stage 1 model: Simple and fast stage1_params = { "num_layers": 2, @@ -1026,202 +1105,331 @@ def pytorch_staged_selection(X, y, X_eval, y_eval, target_features=80, device=No "dropout_rate": 0.3, "learning_rate": 0.01, "batch_size": 512, - "num_epochs": 100, + "num_epochs": 100, "early_stopping_patience": 15, "optimizer": "Adam", - "weight_decay": 1e-4 + "weight_decay": 1e-4, } - - input_dim = X.shape[1] + + input_dim = x.shape[1] stage1_model = create_pytorch_model(stage1_params, input_dim, device, stage1_scaler) - + # Train Stage 1 model stage1_model, stage1_metrics = train_pytorch_model( - stage1_model, X, y, X, y, X_eval, y_eval, # Use same data for train/test in stage 1 - stage1_params, device, stage1_scaler, trial=None + stage1_model, x, y, x, y, x_eval, y_eval, stage1_params, device, stage1_scaler, trial=None ) - + if not stage1_metrics: logger.error("Stage 1 training failed") - return X.columns.tolist()[:target_features], np.ones(target_features) - + return None, None + # Compute permutation importance for Stage 1 logger.info("Computing Stage 1 permutation importance...") stage1_importance = compute_permutation_importance( model=stage1_model, - X_val=X_eval, - y_val=y_eval.values if hasattr(y_eval, 'values') else y_eval, + x_val=x_eval, + # Fix ndarray attribute access - safely get values + 
y_val=y_eval.values if hasattr(y_eval, "values") else np.asarray(y_eval), metric=stage1_metrics["precision"], threshold=stage1_metrics["threshold"], - n_repeats=5 # Fewer repeats for speed in stage 1 + n_repeats=5, ) logger.info("Stage 1 importance: permutation complete") - + # Select top 200 features from Stage 1 - stage1_features = stage1_importance.head(200)['feature'].tolist() + stage1_features = stage1_importance.head(200)["feature"].tolist() logger.info(f"Stage 1: Selected {len(stage1_features)} features") - - # Stage 2: Refined selection with cross-validation + + return stage1_features, stage1_scaler + + +def _perform_stage2_selection(x, y, x_eval, y_eval, stage1_features, device, target_features=80): + """ + Perform Stage 2 feature selection with cross-validation. + + Returns: + tuple: (stage2_features, final_importance_scores, cv_precision_score) or fallback values + """ logger.info("Stage 2: Refined selection with cross-validation") - X_stage1 = X[stage1_features] - X_eval_stage1 = X_eval[stage1_features] - + + x_stage1 = x[stage1_features] + x_eval_stage1 = x_eval[stage1_features] + # Stage 2 model: More complex and thorough stage2_params = { "num_layers": 3, "hidden_units": 128, "activation_fn": "LeakyReLU", "dropout_rate": 0.4, - "learning_rate": 0.001, # Lower learning rate for refined training + "learning_rate": 0.001, "batch_size": 256, "num_epochs": 200, "early_stopping_patience": 25, "optimizer": "AdamW", - "weight_decay": 1e-3 + "weight_decay": 1e-3, } - + # Cross-validation feature importance cv_importances = [] cv_scores = [] - + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED) - - for fold, (train_idx, val_idx) in enumerate(skf.split(X_stage1, y)): + + for fold, (train_idx, val_idx) in enumerate(skf.split(x_stage1, y)): logger.info(f"Processing fold {fold + 1}/5") - - X_train_cv = X_stage1.iloc[train_idx] - X_val_cv = X_stage1.iloc[val_idx] + + x_train_cv = x_stage1.iloc[train_idx] + x_val_cv = x_stage1.iloc[val_idx] 
y_train_cv = y.iloc[train_idx] y_val_cv = y.iloc[val_idx] - - # Create fold-specific scaler - fold_scaler = RobustScaler() - fold_scaler.fit(X_train_cv) # Just fit the scaler, don't transform - - # Create and train model for this fold - input_dim_stage2 = len(stage1_features) - fold_model = create_pytorch_model(stage2_params, input_dim_stage2, device, fold_scaler) - - fold_model, fold_metrics = train_pytorch_model( - fold_model, X_train_cv, y_train_cv, X_val_cv, y_val_cv, - X_eval_stage1, y_eval, stage2_params, device, fold_scaler, trial=None + + fold_result = _process_cv_fold( + x_train_cv, + y_train_cv, + x_val_cv, + y_val_cv, + x_eval_stage1, + y_eval, + stage2_params, + device, + stage1_features, + fold, ) - - if fold_metrics: - # Compute permutation importance for this fold - fold_importance = compute_permutation_importance( - model=fold_model, - X_val=X_eval_stage1, - y_val=y_eval.values if hasattr(y_eval, 'values') else y_eval, - metric=fold_metrics["precision"], - threshold=fold_metrics["threshold"], - n_repeats=10 - ) - - # Store importance scores in the same order as stage1_features - importance_dict = dict(zip(fold_importance['feature'], fold_importance['importance'])) - fold_importance_scores = [importance_dict.get(feat, 0.0) for feat in stage1_features] - cv_importances.append(fold_importance_scores) - - # Calculate validation score - y_eval_pred_proba = fold_model.predict_proba(X_eval_stage1)[:, 1] - y_eval_pred = (y_eval_pred_proba >= fold_metrics["threshold"]).astype(int) - y_eval_np = y_eval.values if hasattr(y_eval, 'values') else y_eval - - # Calculate precision for this fold - if np.sum(y_eval_pred) > 0: - fold_precision = precision_score(y_eval_np, y_eval_pred) - cv_scores.append(fold_precision) - else: - cv_scores.append(0.0) - else: - logger.warning(f"Fold {fold + 1} training failed, using zero importance") - cv_importances.append([0.0] * len(stage1_features)) - cv_scores.append(0.0) - - # Clean up GPU memory - del fold_model - 
torch.cuda.empty_cache() if torch.cuda.is_available() else None - + cv_importances.append(fold_result[0]) + cv_scores.append(fold_result[1]) + # Average importance across folds if cv_importances: - avg_importance = np.mean(cv_importances, axis=0) - - # Select top features based on average importance - feature_importance_pairs = list(zip(stage1_features, avg_importance)) - feature_importance_pairs.sort(key=lambda x: x[1], reverse=True) - - stage2_features = [feat for feat, _ in feature_importance_pairs[:target_features]] - final_importance_scores = [imp for _, imp in feature_importance_pairs[:target_features]] - CV_Precision_Score = np.mean(cv_scores) - logger.info(f"Stage 2: Selected {len(stage2_features)} features") - - return stage2_features, final_importance_scores, CV_Precision_Score + return _aggregate_cv_results(cv_importances, cv_scores, stage1_features, target_features) else: logger.error("No valid cross-validation results obtained") - return stage1_features[:target_features], np.ones(target_features) + return _get_fallback_selection(stage1_features, target_features) + + +def _process_cv_fold( + x_train_cv, + y_train_cv, + x_val_cv, + y_val_cv, + x_eval_stage1, + y_eval, + stage2_params, + device, + stage1_features, + fold, +): + """ + Process a single cross-validation fold. 
+ + Returns: + tuple: (fold_importance_scores, fold_precision) + """ + # Create fold-specific scaler + fold_scaler = RobustScaler() + fold_scaler.fit(x_train_cv) + + # Create and train model for this fold + input_dim_stage2 = len(stage1_features) + fold_model = create_pytorch_model(stage2_params, input_dim_stage2, device, fold_scaler) + + fold_model, fold_metrics = train_pytorch_model( + fold_model, + x_train_cv, + y_train_cv, + x_val_cv, + y_val_cv, + x_eval_stage1, + y_eval, + stage2_params, + device, + fold_scaler, + trial=None, + ) + + if fold_metrics: + fold_importance_scores, fold_precision = _compute_fold_importance( + fold_model, x_eval_stage1, y_eval, fold_metrics, stage1_features + ) + else: + logger.warning(f"Fold {fold + 1} training failed, using zero importance") + fold_importance_scores = [0.0] * len(stage1_features) + fold_precision = 0.0 + + # Clean up GPU memory + del fold_model + torch.cuda.empty_cache() if torch.cuda.is_available() else None + + return fold_importance_scores, fold_precision + + +def _compute_fold_importance(fold_model, x_eval_stage1, y_eval, fold_metrics, stage1_features): + """ + Compute permutation importance for a fold. 
+ + Returns: + tuple: (fold_importance_scores, fold_precision) + """ + # Compute permutation importance for this fold + fold_importance = compute_permutation_importance( + model=fold_model, + x_val=x_eval_stage1, + # Fix ndarray attribute access - safely get values + y_val=y_eval.values if hasattr(y_eval, "values") else np.asarray(y_eval), + metric=fold_metrics["precision"], + threshold=fold_metrics["threshold"], + n_repeats=10, + ) + + # Store importance scores in the same order as stage1_features + importance_dict = dict(zip(fold_importance["feature"], fold_importance["importance"])) + fold_importance_scores = [importance_dict.get(feat, 0.0) for feat in stage1_features] + + # Calculate validation score + y_eval_pred_proba = fold_model.predict_proba(x_eval_stage1)[:, 1] + y_eval_pred = (y_eval_pred_proba >= fold_metrics["threshold"]).astype(int) + # Fix ndarray attribute access - safely get values + y_eval_np = y_eval.values if hasattr(y_eval, "values") else np.asarray(y_eval) + + # Calculate precision for this fold + if np.sum(y_eval_pred) > 0: + fold_precision = precision_score(y_eval_np, y_eval_pred) + else: + fold_precision = 0.0 + + return fold_importance_scores, fold_precision + + +def _aggregate_cv_results(cv_importances, cv_scores, stage1_features, target_features): + """ + Aggregate cross-validation results and select final features. 
+ + Returns: + tuple: (stage2_features, final_importance_scores, cv_precision_score) + """ + avg_importance = np.mean(cv_importances, axis=0) + + # Select top features based on average importance + feature_importance_pairs = list(zip(stage1_features, avg_importance)) + feature_importance_pairs.sort(key=lambda x: x[1], reverse=True) + + stage2_features = [feat for feat, _ in feature_importance_pairs[:target_features]] + final_importance_scores = [imp for _, imp in feature_importance_pairs[:target_features]] + cv_precision_score = np.mean(cv_scores) + logger.info(f"Stage 2: Selected {len(stage2_features)} features") + + return stage2_features, final_importance_scores, cv_precision_score + + +def _get_fallback_selection(stage1_features, target_features): + """ + Get fallback feature selection when cross-validation fails. + + Returns: + tuple: (fallback_features, fallback_scores, cv_precision_score) + """ + return stage1_features[:target_features], np.ones(target_features), 0.0 + + +def pytorch_staged_selection(x, y, x_eval, y_eval, target_features=80, device=None, scaler=None): + """ + Multi-stage PyTorch neural network feature selection with different objectives. 
+ Args: + x (pd.DataFrame): Training features + y (pd.Series): Training labels + x_eval (pd.DataFrame): Evaluation features + y_eval (pd.Series): Evaluation labels + target_features (int): Number of final features to select + device (torch.device): Device for training (defaults to global device) + scaler (sklearn scaler): Fitted scaler (will create new if None) + + Returns: + tuple: (selected_features_list, final_importance_scores) + """ + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + logger.info(f"Starting PyTorch staged selection with {x.shape[1]} initial features") + + # Stage 1: Quick filter with simple architecture and high learning rate + stage1_features, _ = _perform_stage1_selection(x, y, x_eval, y_eval, device, scaler) + + if stage1_features is None: + return x.columns.tolist()[:target_features], np.ones(target_features), 0.0 + + # Stage 2: Refined selection with cross-validation + stage2_features, final_importance_scores, cv_precision_score = _perform_stage2_selection( + x, y, x_eval, y_eval, stage1_features, device, target_features + ) + return stage2_features, final_importance_scores, cv_precision_score + def run_staged_feature_selection_workflow( - X_train, y_train, X_test, y_test, X_val, y_val, - experiment_name: str, target_features: int = 80 + x_train, y_train, x_test, y_test, x_val, y_val, experiment_name: str, target_features: int = 80 ): """ Run the complete staged feature selection workflow and save results. 
- + Args: - X_train, y_train: Training data - X_test, y_test: Test data - X_val, y_val: Validation data + x_train, y_train: Training data + x_test, y_test: Test data + x_val, y_val: Validation data experiment_name (str): Experiment name for file naming target_features (int): Number of features to select - + Returns: tuple: (selected_features, importance_scores, selection_metrics) """ logger.info(f"Starting staged feature selection workflow for {target_features} features") - + # Combine train and test data for feature selection - X_combined = pd.concat([X_train, X_test], axis=0) + x_combined = pd.concat([x_train, x_test], axis=0) y_combined = pd.concat([y_train, y_test], axis=0) - + # Run staged feature selection - selected_features, importance_scores, CV_Precision_Score = pytorch_staged_selection( - X=X_combined, - y=y_combined, - X_eval=X_val, + selected_features, importance_scores, cv_precision_score = pytorch_staged_selection( + x=x_combined, + y=y_combined, + x_eval=x_val, y_eval=y_val, target_features=target_features, device=device, - scaler=None # Let the function create its own scaler + scaler=None, # Let the function create its own scaler ) logger.info(f"Selected features: {selected_features}") - logger.info(f"CV Precision Score: {CV_Precision_Score}") + logger.info(f"CV Precision Score: {cv_precision_score}") # Save as JSON for programmatic use feature_dict = { "selected_features": selected_features, "importance_scores": importance_scores, "metadata": { "target_features": target_features, - "original_feature_count": X_combined.shape[1], + "original_feature_count": x_combined.shape[1], "selection_timestamp": datetime.now().isoformat(), "selection_method": "pytorch_staged_selection", - "feature_reduction_ratio": len(selected_features) / X_combined.shape[1], + "feature_reduction_ratio": len(selected_features) / x_combined.shape[1], "mean_importance_score": float(np.mean(importance_scores)), "std_importance_score": float(np.std(importance_scores)), - 
"CV_Precision_Score": CV_Precision_Score - } + "cv_precision_score": cv_precision_score, + }, } - + feature_json_path = f"selected_features_pytorch_{target_features}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" - with open(feature_json_path, 'w') as f: - json.dump(feature_dict, f, indent=2, default=lambda x: float(x) if isinstance(x, np.floating) else x) - + with open(feature_json_path, "w") as f: + json.dump( + feature_dict, + f, + indent=2, + default=lambda x: float(x) if isinstance(x, np.floating) else x, + ) + logger.info("Feature selection results saved to:") logger.info(f" - JSON file: {feature_json_path}") - logger.info(f"Selected {len(selected_features)} features out of {X_combined.shape[1]} original features") - logger.info(f"Feature reduction ratio: {len(selected_features) / X_combined.shape[1]:.3f}") - - return selected_features, importance_scores, CV_Precision_Score + logger.info( + f"Selected {len(selected_features)} features out of {x_combined.shape[1]} original features" + ) + logger.info(f"Feature reduction ratio: {len(selected_features) / x_combined.shape[1]:.3f}") + + return selected_features, importance_scores, cv_precision_score def main(): @@ -1235,76 +1443,102 @@ def main(): # Setup MLflow tracking setup_mlflow_tracking(experiment_name) - + try: logger.info("Starting PyTorch Model HPO script...") - + # Load data using the shared DataLoader - dataloader = DataLoader() - X_train, y_train, X_test, y_test, X_val, y_val = dataloader.load_data() - - if X_train is None: + dataloader = DataLoader() + x_train, y_train, x_test, y_test, x_val, y_val = dataloader.load_data() + + if x_train is None: logger.error("Data loading failed. Exiting.") return # Select features using the existing method model_type = "pytorch" - features = import_selected_features_ensemble_new(model_type=model_type) + features = import_selected_features_ensemble_new(model_type=model_type) if not features: logger.warning("No features selected for pytorch_model. 
Using all columns.") - features = X_train.columns.tolist() + features = x_train.columns.tolist() # Ensure target column is not in features if it exists initially - if 'target' in features: - features.remove('target') - - X_train = prepare_data(X_train, features) - X_test = prepare_data(X_test, features) - X_val = prepare_data(X_val, features) + if "target" in features: + features.remove("target") + # Ensure features is always a list of strings + if not isinstance(features, list): + features = list(features) if hasattr(features, "__iter__") else [str(features)] + + x_train = prepare_data(x_train, features) + x_test = prepare_data(x_test, features) + x_val = prepare_data(x_val, features) input_dim = len(features) logger.info(f"Final feature set: {input_dim} features.") - X_train_scaled, X_test_scaled, X_val_scaled, scaler = preprocess_data(X_train, X_test, X_val) + _, _, _, scaler = preprocess_data( + x_train, x_test, x_val + ) # Configuration: Set to True to run staged feature selection RUN_TYPE = "tuning" # Change this to enable/disable staged feature selection - + if RUN_TYPE == "staged_selection": TARGET_FEATURES = 100 # Number of features to select logger.info("=== RUNNING STAGED FEATURE SELECTION ===") - + # Run staged feature selection workflow - selected_features, importance_scores, CV_Precision_Score = run_staged_feature_selection_workflow( - X_train, y_train, X_test, y_test, X_val, y_val, - experiment_name, target_features=TARGET_FEATURES + selected_features, _, cv_precision_score = ( + run_staged_feature_selection_workflow( + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + experiment_name, + target_features=TARGET_FEATURES, + ) ) - + # Use selected features for the rest of the pipeline features = selected_features - logger.info(f"Using {len(features)} features from staged selection, features: {features}") - logger.info(f"CV Precision Score: {CV_Precision_Score:.4f}") + logger.info( + f"Using {len(features)} features from staged selection, 
features: {features}" + ) + logger.info(f"CV Precision Score: {cv_precision_score:.4f}") elif RUN_TYPE == "tuning": # Run Hyperparameter Optimization and Final Model Training - best_params, final_metrics = hypertune_pytorch( - X_train, y_train, - X_test, y_test, - X_val, y_val, - experiment_name, + _, _ = hypertune_pytorch( + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + experiment_name, input_dim, device, - scaler + scaler, ) elif RUN_TYPE == "training": train_with_precision_target_pytorch( - X_train, y_train, X_test, y_test, X_val, y_val, - experiment_name, input_dim, device, scaler - ) - + x_train, + y_train, + x_test, + y_test, + x_val, + y_val, + experiment_name, + input_dim, + device, + scaler, + ) + except Exception as e: logger.error(f"Error in main execution: {str(e)}") finally: - gc.collect() # Force garbage collection + gc.collect() # Force garbage collection logger.info("--- PyTorch HPO script finished. ---") if __name__ == "__main__": main() -# --- End of File --- \ No newline at end of file +# --- End of File --- diff --git a/src/models/StackedEnsemble/base/neural/specialized_fnn.py b/src/models/StackedEnsemble/base/neural/specialized_fnn.py index 7fbc4ba..650fc75 100644 --- a/src/models/StackedEnsemble/base/neural/specialized_fnn.py +++ b/src/models/StackedEnsemble/base/neural/specialized_fnn.py @@ -15,22 +15,22 @@ import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F class FeatureInteractionLayer(nn.Module): """ Learns pairwise interactions between features. - + This is particularly useful for soccer data where the interaction between features (e.g., home attack vs away defense) is important. 
""" + def __init__(self, input_dim, interaction_factors=8): super().__init__() self.interaction_factors = interaction_factors # Create factorized interaction parameters (for efficiency) self.factors = nn.Parameter(torch.randn(input_dim, interaction_factors)) - + def forward(self, x): # Project features to a latent interaction space latent_factors = torch.matmul(x, self.factors) # (batch_size, factors) @@ -43,9 +43,10 @@ def forward(self, x): class ResidualBlock(nn.Module): """ Residual block with batch normalization and dropout. - + Helps with gradient flow in deeper networks and improves training stability. """ + def __init__(self, dim, dropout_rate=0.3): super().__init__() self.block = nn.Sequential( @@ -54,9 +55,9 @@ def __init__(self, dim, dropout_rate=0.3): nn.ReLU(), nn.Dropout(dropout_rate), nn.Linear(dim, dim), - nn.BatchNorm1d(dim) + nn.BatchNorm1d(dim), ) - + def forward(self, x): return x + self.block(x) # Skip connection @@ -64,13 +65,14 @@ def forward(self, x): class TemperatureScaling(nn.Module): """ Learns to calibrate model probabilities through temperature scaling. - + This improves probability estimates, which is crucial for betting applications. """ + def __init__(self, init_temp=1.0): super().__init__() self.temperature = nn.Parameter(torch.ones(1) * init_temp) - + def forward(self, logits): return logits / self.temperature @@ -78,36 +80,37 @@ def forward(self, logits): class SoccerFeatureProcessor(nn.Module): """ Processes soccer-specific features with domain knowledge. 
- + Groups input features into: - Home team features - Away team features - Match context features """ + def __init__(self, input_dim, team_feature_pct=0.4): super().__init__() # Determine feature splits (approximate based on percentage) self.home_features = int(input_dim * team_feature_pct) self.away_features = int(input_dim * team_feature_pct) self.match_features = input_dim - self.home_features - self.away_features - + # Normalize each feature group separately self.home_norm = nn.BatchNorm1d(self.home_features) self.away_norm = nn.BatchNorm1d(self.away_features) self.match_norm = nn.BatchNorm1d(self.match_features) if self.match_features > 0 else None - + def forward(self, x): # Split features - home_x = x[:, :self.home_features] - away_x = x[:, self.home_features:self.home_features+self.away_features] - + home_x = x[:, : self.home_features] + away_x = x[:, self.home_features : self.home_features + self.away_features] + # Apply normalization home_x = self.home_norm(home_x) away_x = self.away_norm(away_x) - + # Process match features if they exist if self.match_features > 0: - match_x = x[:, self.home_features+self.away_features:] + match_x = x[:, self.home_features + self.away_features :] match_x = self.match_norm(match_x) # Concatenate all features return torch.cat([home_x, away_x, match_x], dim=1) @@ -119,16 +122,17 @@ def forward(self, x): class SpecializedFNN(nn.Module): """ Specialized Feed-Forward Neural Network optimized for soccer prediction. - + Incorporates multiple enhancements: - Soccer-specific feature processing - Batch normalization for training stability - Feature interaction modeling - Residual connections - Calibrated probability outputs - + Compatible with the existing PyTorch hypertuner workflow. 
""" + def __init__(self, input_dim, **kwargs): super().__init__() # Extract parameters with sensible defaults @@ -138,7 +142,7 @@ def __init__(self, input_dim, **kwargs): team_feature_pct = kwargs.get("team_feature_pct", 0.4) # 40% home, 40% away, 20% match use_residual = kwargs.get("use_residual", True if num_layers > 1 else False) use_interactions = kwargs.get("use_interactions", True) - + # Map activation function names to PyTorch classes activation_name = kwargs.get("activation_fn", "ReLU") if activation_name == "LeakyReLU": @@ -147,10 +151,10 @@ def __init__(self, input_dim, **kwargs): self.act_fn = nn.SiLU() else: # Default to ReLU self.act_fn = nn.ReLU() - + # Feature processor component self.feature_processor = SoccerFeatureProcessor(input_dim, team_feature_pct) - + # Feature interaction component (optional) self.use_interactions = use_interactions if use_interactions: @@ -159,16 +163,16 @@ def __init__(self, input_dim, **kwargs): main_input_dim = input_dim + 1 # +1 for interaction output else: main_input_dim = input_dim - + # Main network layers layers = [] - + # Input layer layers.append(nn.Linear(main_input_dim, hidden_units)) layers.append(nn.BatchNorm1d(hidden_units)) layers.append(self.act_fn) layers.append(nn.Dropout(dropout_rate)) - + # Hidden layers with optional residual connections for _ in range(num_layers - 1): if use_residual: @@ -178,24 +182,24 @@ def __init__(self, input_dim, **kwargs): layers.append(nn.BatchNorm1d(hidden_units)) layers.append(self.act_fn) layers.append(nn.Dropout(dropout_rate)) - + # Output layer (logits) layers.append(nn.Linear(hidden_units, 1)) - + # Combine all layers self.main_network = nn.Sequential(*layers) - + # Temperature scaling for probability calibration self.temperature_scaling = TemperatureScaling() - + # Add attributes to store scaler and device for predict_proba (required for hypertuner compatibility) - self.scaler_ = None + self.scaler_ = None self.device_ = None def forward(self, x): # Process features 
with domain knowledge processed_x = self.feature_processor(x) - + # Add interaction features if enabled if self.use_interactions: interactions = self.interaction_layer(x) @@ -203,47 +207,49 @@ def forward(self, x): logits = self.main_network(x_with_interactions) else: logits = self.main_network(processed_x) - + # Apply temperature scaling to logits calibrated_logits = self.temperature_scaling(logits) - + return calibrated_logits - + def predict_proba(self, X, batch_size=128): """ Predict probabilities, mimicking scikit-learn interface. Requires scaler_ and device_ attributes to be set. Returns probabilities for both classes (0 and 1) in shape (N, 2). - + This method maintains compatibility with the existing hypertuner workflow. """ if self.scaler_ is None or self.device_ is None: - raise ValueError("Scaler and Device must be set on the model before calling predict_proba.") - + raise ValueError( + "Scaler and Device must be set on the model before calling predict_proba." + ) + # Set model to evaluation mode self.eval() all_probs_class1 = [] - + # Ensure X is DataFrame or Array that scaler expects X_scaled = self.scaler_.transform(X) - + # Convert to PyTorch tensors X_tensor = torch.tensor(X_scaled, dtype=torch.float32) - + # Process in batches to avoid memory issues with torch.no_grad(): for i in range(0, len(X_tensor), batch_size): - batch_X = X_tensor[i:i+batch_size].to(self.device_) + batch_X = X_tensor[i : i + batch_size].to(self.device_) outputs = self(batch_X) probs = torch.sigmoid(outputs).cpu().numpy() all_probs_class1.append(probs) - + # Concatenate probabilities for class 1 probs_class1 = np.concatenate(all_probs_class1).flatten() - + # Calculate probabilities for class 0 probs_class0 = 1.0 - probs_class1 - + # Stack horizontally to get shape (N, 2) return np.column_stack((probs_class0, probs_class1)) @@ -252,15 +258,15 @@ def predict_proba(self, X, batch_size=128): def create_specialized_fnn(model_params, input_dim, device, scaler): """ Create and 
configure the specialized FNN model instance. - + This function matches the interface expected by the hypertuner workflow. - + Args: model_params (dict): Hyperparameters suggested by Optuna. input_dim (int): Number of input features. device (torch.device): The device (CPU or CUDA) to run the model on. scaler (StandardScaler): The fitted scaler instance. - + Returns: SpecializedFNN: Configured and device-placed FNN model instance. """ @@ -268,24 +274,24 @@ def create_specialized_fnn(model_params, input_dim, device, scaler): architecture_params = { "num_layers": model_params.get("num_layers"), "hidden_units": model_params.get("hidden_units"), - "activation_fn": model_params.get("activation_fn"), + "activation_fn": model_params.get("activation_fn"), "dropout_rate": model_params.get("dropout_rate"), "team_feature_pct": model_params.get("team_feature_pct", 0.4), "use_residual": model_params.get("use_residual", True), - "use_interactions": model_params.get("use_interactions", True) + "use_interactions": model_params.get("use_interactions", True), } - + # Remove None values architecture_params = {k: v for k, v in architecture_params.items() if v is not None} - + # Create model instance model = SpecializedFNN(input_dim=input_dim, **architecture_params) model.to(device) - + # Attach scaler and device to the model instance (for predict_proba) model.scaler_ = scaler model.device_ = device - + return model @@ -293,33 +299,22 @@ def create_specialized_fnn(model_params, input_dim, device, scaler): def add_specialized_hyperparameters(hyperparameter_space): """ Add specialized hyperparameters for the FNN model. - + Call this function to extend the existing hyperparameter space before passing it to Optuna for optimization. - + Args: hyperparameter_space (dict): Existing hyperparameter space. - + Returns: dict: Extended hyperparameter space with FNN-specific parameters. 
""" specialized_params = { - "team_feature_pct": { - "type": "float", - "low": 0.3, - "high": 0.5, - "step": 0.05 - }, - "use_residual": { - "type": "categorical", - "choices": [True, False] - }, - "use_interactions": { - "type": "categorical", - "choices": [True, False] - } + "team_feature_pct": {"type": "float", "low": 0.3, "high": 0.5, "step": 0.05}, + "use_residual": {"type": "categorical", "choices": [True, False]}, + "use_interactions": {"type": "categorical", "choices": [True, False]}, } - + # Add specialized parameters to the existing space hyperparameter_space.update(specialized_params) - return hyperparameter_space \ No newline at end of file + return hyperparameter_space diff --git a/src/models/StackedEnsemble/base/neural/specialized_fnn_integration.py b/src/models/StackedEnsemble/base/neural/specialized_fnn_integration.py index aff68d2..eea075d 100644 --- a/src/models/StackedEnsemble/base/neural/specialized_fnn_integration.py +++ b/src/models/StackedEnsemble/base/neural/specialized_fnn_integration.py @@ -1,22 +1,21 @@ """ Integration Script for Specialized FNN with PyTorch Hypertuner -This script demonstrates how to use the Specialized FNN model with the +This script demonstrates how to use the Specialized FNN model with the existing PyTorch hypertuner workflow for soccer prediction. 
Usage: python specialized_fnn_integration.py """ + # These are the key imports we'll override from src.models.StackedEnsemble.base.neural.pytorch_hypertuner import ( - create_pytorch_model, load_hyperparameter_space, ) from src.models.StackedEnsemble.base.neural.pytorch_hypertuner import main as original_main # Import the specialized FNN components from src.models.StackedEnsemble.base.neural.specialized_fnn import ( - SpecializedFNN, add_specialized_hyperparameters, create_specialized_fnn, ) @@ -25,6 +24,7 @@ # Setup logger logger = ExperimentLogger("specialized_fnn_experiment") + def patched_create_pytorch_model(*args, **kwargs): """ Replace the standard model creation with our specialized FNN. @@ -32,52 +32,57 @@ def patched_create_pytorch_model(*args, **kwargs): logger.info("Creating specialized FNN model instead of default PyTorch model") return create_specialized_fnn(*args, **kwargs) + def patched_load_hyperparameter_space(): """ Load the standard hyperparameter space and extend it with FNN-specific parameters. """ # Get the original hyperparameter space original_space = load_hyperparameter_space() - + # Add our specialized FNN hyperparameters enhanced_space = add_specialized_hyperparameters(original_space) - + logger.info("Enhanced hyperparameter space with specialized FNN parameters") return enhanced_space + def main(): """ Main execution function that uses the specialized FNN with the existing workflow. 
""" logger.info("Starting specialized FNN experiment") - + # Temporarily patch functions to use our specialized model import src.models.StackedEnsemble.base.neural.pytorch_hypertuner as hypertuner - + # Save original functions and variables original_create_model = hypertuner.create_pytorch_model original_load_space = hypertuner.load_hyperparameter_space original_experiment_name = hypertuner.experiment_name - + try: # Patch with our specialized versions hypertuner.create_pytorch_model = patched_create_pytorch_model hypertuner.load_hyperparameter_space = patched_load_hyperparameter_space - + # Set custom experiment name # hypertuner.experiment_name = "specialized_fnn_experiment" - + # Now run the original main function which will use our specialized components - logger.info(f"Running hypertuner with specialized FNN model using experiment name: {hypertuner.experiment_name}") + logger.info( + f"Running hypertuner with specialized FNN model using experiment name: {hypertuner.experiment_name}" + ) original_main() - + finally: # Restore original functions and variables hypertuner.create_pytorch_model = original_create_model hypertuner.load_hyperparameter_space = original_load_space hypertuner.experiment_name = original_experiment_name - + logger.info("Specialized FNN experiment completed") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/models/StackedEnsemble/base/neural/specialized_fnn_integration_20.py b/src/models/StackedEnsemble/base/neural/specialized_fnn_integration_20.py index 45cd90b..93f7d34 100644 --- a/src/models/StackedEnsemble/base/neural/specialized_fnn_integration_20.py +++ b/src/models/StackedEnsemble/base/neural/specialized_fnn_integration_20.py @@ -1,22 +1,21 @@ """ Integration Script for Specialized FNN with PyTorch Hypertuner -This script demonstrates how to use the Specialized FNN model with the +This script demonstrates how to use the Specialized FNN model with the existing PyTorch hypertuner workflow for 
soccer prediction. Usage: python specialized_fnn_integration.py """ + # These are the key imports we'll override from src.models.StackedEnsemble.base.neural.pytorch_hypertuner_20 import ( - create_pytorch_model, load_hyperparameter_space, ) from src.models.StackedEnsemble.base.neural.pytorch_hypertuner_20 import main as original_main # Import the specialized FNN components from src.models.StackedEnsemble.base.neural.specialized_fnn import ( - SpecializedFNN, add_specialized_hyperparameters, create_specialized_fnn, ) @@ -25,6 +24,7 @@ # Setup logger logger = ExperimentLogger("specialized_fnn_experiment_20") + def patched_create_pytorch_model(*args, **kwargs): """ Replace the standard model creation with our specialized FNN. @@ -32,26 +32,28 @@ def patched_create_pytorch_model(*args, **kwargs): logger.info("Creating specialized FNN model instead of default PyTorch model") return create_specialized_fnn(*args, **kwargs) + def patched_load_hyperparameter_space(): """ Load the standard hyperparameter space and extend it with FNN-specific parameters. """ # Get the original hyperparameter space original_space = load_hyperparameter_space() - + # Add our specialized FNN hyperparameters enhanced_space = add_specialized_hyperparameters(original_space) - + logger.info("Enhanced hyperparameter space with specialized FNN parameters") return enhanced_space + def main(): """ Main execution function that uses the specialized FNN with the existing workflow. 
""" # Temporarily patch functions to use our specialized model import src.models.StackedEnsemble.base.neural.pytorch_hypertuner as hypertuner - + global logger, experiment_name experiment_name = "specialized_fnn_experiment_20" logger.info("Starting specialized FNN experiment") @@ -59,26 +61,29 @@ def main(): original_create_model = hypertuner.create_pytorch_model original_load_space = hypertuner.load_hyperparameter_space original_experiment_name = hypertuner.experiment_name - + try: # Patch with our specialized versions hypertuner.create_pytorch_model = patched_create_pytorch_model hypertuner.load_hyperparameter_space = patched_load_hyperparameter_space - + # Set custom experiment name hypertuner.experiment_name = experiment_name - + # Now run the original main function which will use our specialized components - logger.info(f"Running hypertuner with specialized FNN model using experiment name: {hypertuner.experiment_name}") + logger.info( + f"Running hypertuner with specialized FNN model using experiment name: {hypertuner.experiment_name}" + ) original_main() - + finally: # Restore original functions and variables hypertuner.create_pytorch_model = original_create_model hypertuner.load_hyperparameter_space = original_load_space hypertuner.experiment_name = original_experiment_name - + logger.info("Specialized FNN experiment completed") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/models/StackedEnsemble/base/neural/tabnet_model.py b/src/models/StackedEnsemble/base/neural/tabnet_model.py index bcbb46f..59fb98f 100644 --- a/src/models/StackedEnsemble/base/neural/tabnet_model.py +++ b/src/models/StackedEnsemble/base/neural/tabnet_model.py @@ -53,7 +53,7 @@ base_params = { "optimizer_fn": optim.Adam, # Use Adam as default optimizer # "mask_type": "sparsemax", - "eval_metric": ["auc", "logloss"], # Default metrics, custom one passed in + "eval_metric": ["auc", "logloss"], # Default metrics, custom one passed in 
"fit_weights": 1, "verbose": 0, "seed": 19, @@ -79,14 +79,14 @@ # Verify CUDA availability if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED) - torch.backends.cudnn.benchmark = True - torch.backends.cudnn.deterministic = False + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = False torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True gpu_name = torch.cuda.get_device_name(0) - gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3) + gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3) # Check if PyTorch version supports torch.compile - if hasattr(torch, 'compile'): + if hasattr(torch, "compile"): logger.info("torch.compile is available - will use it for performance optimization") USE_TORCH_COMPILE = True else: @@ -95,12 +95,15 @@ logger.info(f"CUDA is available! Found {torch.cuda.device_count()} GPU(s).") logger.info(f"Using GPU: {gpu_name} with {gpu_memory:.2f} GB memory") logger.info(f"CUDA Version: {torch.version.cuda}") - logger.info(f"PyTorch CUDA capabilities: TF32={torch.backends.cuda.matmul.allow_tf32}, cuDNN benchmark={torch.backends.cudnn.benchmark}") + logger.info( + f"PyTorch CUDA capabilities: TF32={torch.backends.cuda.matmul.allow_tf32}, cuDNN benchmark={torch.backends.cudnn.benchmark}" + ) else: logger.warning("CUDA is NOT available. TabNet will run on CPU.") USE_TORCH_COMPILE = False base_params["device_name"] = "cpu" + def load_hyperparameter_space(): """ Define hyperparameter space for TabNet tuning. 
@@ -135,14 +138,17 @@ def load_hyperparameter_space(): "scheduler_min_lr": {"type": "float", "low": 1e-6, "high": 1e-4, "log": True}, "scheduler_pct_start": {"type": "float", "low": 0.1, "high": 0.5, "step": 0.05}, "scheduler_div_factor": {"type": "float", "low": 10.0, "high": 40.0, "step": 0.5}, - "scheduler_final_div_factor": {"type": "float", "low": 1000.0, "high": 10000.0, "step": 100.0}, - "mask_type": { - "type": "categorical", - "choices": ["sparsemax", "entmax"] + "scheduler_final_div_factor": { + "type": "float", + "low": 1000.0, + "high": 10000.0, + "step": 100.0, }, + "mask_type": {"type": "categorical", "choices": ["sparsemax", "entmax"]}, } return hyperparameter_space + # Create a custom metric that heavily weights precision class PrecisionFocusedMetric(Metric): def __init__(self, beta=0.5): @@ -161,25 +167,27 @@ def __call__(self, y_true, y_score): # Convert back to 1D: take the argmax along the class axis (axis=1) y_true_flat = np.argmax(y_true, axis=1) elif y_true_type == "binary": - y_true_flat = y_true.astype(int) # Ensure integer type + y_true_flat = y_true.astype(int) # Ensure integer type else: # Handle unexpected types or raise an error - logger.warning(f"Unexpected y_true type '{y_true_type}' in PrecisionFocusedMetric. Attempting to flatten.") + logger.warning( + f"Unexpected y_true type '{y_true_type}' in PrecisionFocusedMetric. Attempting to flatten." 
+ ) try: - y_true_flat = y_true.astype(int).ravel() # General attempt to flatten + y_true_flat = y_true.astype(int).ravel() # General attempt to flatten except Exception as e: logger.error(f"Could not convert y_true to 1D array: {e}") - return 0.0 # Return 0 score if conversion fails + return 0.0 # Return 0 score if conversion fails # Ensure y_score handling is robust # Check if y_score has 2 columns (expected for binary probabilities) if y_score.ndim == 2 and y_score.shape[1] == 2: - pred = (y_score[:, 1] > 0.5).astype(int) # Use probability of positive class - elif y_score.ndim == 1: # If y_score is already 1D predictions/scores - pred = (y_score > 0.5).astype(int) # Threshold directly + pred = (y_score[:, 1] > 0.5).astype(int) # Use probability of positive class + elif y_score.ndim == 1: # If y_score is already 1D predictions/scores + pred = (y_score > 0.5).astype(int) # Threshold directly else: logger.error(f"Unexpected y_score shape {y_score.shape} in PrecisionFocusedMetric.") - return 0.0 # Return 0 score if y_score format is wrong + return 0.0 # Return 0 score if y_score format is wrong # Calculate precision and recall safely try: @@ -189,9 +197,13 @@ def __call__(self, y_true, y_score): recall = recall_score(y_true_flat, pred, zero_division=0) except ValueError as e: logger.error(f"Error calculating scores in PrecisionFocusedMetric: {e}") - logger.error(f"y_true_flat sample: {y_true_flat[:5]}, shape: {y_true_flat.shape}, type: {type_of_target(y_true_flat)}") - logger.error(f"pred sample: {pred[:5]}, shape: {pred.shape}, type: {type_of_target(pred)}") - return 0.0 # Return 0 score if scikit-learn metric fails + logger.error( + f"y_true_flat sample: {y_true_flat[:5]}, shape: {y_true_flat.shape}, type: {type_of_target(y_true_flat)}" + ) + logger.error( + f"pred sample: {pred[:5]}, shape: {pred.shape}, type: {type_of_target(pred)}" + ) + return 0.0 # Return 0 score if scikit-learn metric fails # If recall below threshold, return 0 if recall < min_recall: @@ 
-203,6 +215,7 @@ def __call__(self, y_true, y_score): ) return f_beta + class TabNetSklearnWrapper(BaseEstimator): """ A scikit-learn compatible wrapper for TabNet that makes it compatible with MLflow's sklearn flavor. @@ -241,6 +254,7 @@ def predict_proba(self, X): data = X return self.model.predict_proba(data) + class TabNetWrapper(mlflow.pyfunc.PythonModel): def __init__(self, model): self.model = model @@ -265,6 +279,7 @@ def predict_proba(self, model_input): data = model_input return self.model.predict_proba(data) + def create_model(model_params): """ Create and configure TabNet model instance based on provided parameters. @@ -275,14 +290,36 @@ def create_model(model_params): params = base_params.copy() # Define valid constructor args and config keys valid_constructor_args = { - "n_d", "n_a", "n_steps", "gamma", "lambda_sparse", "optimizer_fn", - "optimizer_params", "scheduler_fn", "scheduler_params", "mask_type", - "n_independent", "n_shared", "epsilon", "momentum", "device_name", - "seed", "verbose", "cat_idxs", "cat_dims", "cat_emb_dim" + "n_d", + "n_a", + "n_steps", + "gamma", + "lambda_sparse", + "optimizer_fn", + "optimizer_params", + "scheduler_fn", + "scheduler_params", + "mask_type", + "n_independent", + "n_shared", + "epsilon", + "momentum", + "device_name", + "seed", + "verbose", + "cat_idxs", + "cat_dims", + "cat_emb_dim", + } + config_keys = { + "learning_rate", + "weight_decay", + "scheduler_type", + "scheduler_min_lr", + "scheduler_patience", + "scheduler_factor", + "scheduler_div_factor", } - config_keys = {"learning_rate", "weight_decay", "scheduler_type", - "scheduler_min_lr", "scheduler_patience", - "scheduler_factor", "scheduler_div_factor"} # Extract constructor and config params from input model_params constructor_params = {k: v for k, v in model_params.items() if k in valid_constructor_args} @@ -290,21 +327,23 @@ def create_model(model_params): # Always use Adam optimizer for GPU optimization params["optimizer_fn"] = optim.Adam - + # 
Update base params with constructor params params.update(constructor_params) # Configure optimizer with GPU-optimized settings lr = config_params.get("learning_rate", 0.01) weight_decay = config_params.get("weight_decay", 1e-5) - if "optimizer_params" not in params: + if "optimizer_params" not in params: params["optimizer_params"] = {} params["optimizer_params"]["lr"] = lr params["optimizer_params"]["weight_decay"] = weight_decay # Add optimizer settings that can improve GPU performance - params["optimizer_params"]["eps"] = config_params.get("eps", 1e-7) # Improves numerical stability + params["optimizer_params"]["eps"] = config_params.get( + "eps", 1e-7 + ) # Improves numerical stability params["optimizer_params"]["amsgrad"] = True # Can improve convergence on GPU - + # Configure scheduler scheduler_type = config_params.get("scheduler_type", "none") scheduler_params_config = {} @@ -332,26 +371,31 @@ def create_model(model_params): params.pop("scheduler_params", None) # Ensure only valid args are passed to constructor - final_params = {k:v for k,v in params.items() if k in valid_constructor_args or k in ["optimizer_params", "scheduler_fn", "scheduler_params"]} + final_params = { + k: v + for k, v in params.items() + if k in valid_constructor_args + or k in ["optimizer_params", "scheduler_fn", "scheduler_params"] + } if "verbose" in final_params: final_params["verbose"] = int(final_params["verbose"]) # Instantiate model (without loss_fn argument) model = TabNetClassifier(**final_params) - + # Apply torch.compile if available and using GPU (for PyTorch 2.0+) if USE_TORCH_COMPILE and torch.cuda.is_available(): try: # We need to compile specific parts of the model # TabNetClassifier is complex, so we compile only the network component - if hasattr(model, 'network') and hasattr(torch, 'compile'): + if hasattr(model, "network") and hasattr(torch, "compile"): logger.info("Applying torch.compile to TabNet network for GPU acceleration") # Apply compilation with 
'reduce-overhead' mode which is good for GPU performance model.network = torch.compile(model.network, mode="reduce-overhead") logger.info("Successfully applied torch.compile to TabNet network") except Exception as e: logger.warning(f"Could not apply torch.compile: {str(e)}") - + return model except Exception as e: logger.error(f"Error creating TabNet model: {str(e)}") @@ -359,6 +403,7 @@ def create_model(model_params): logger.error(traceback.format_exc()) raise + def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): """ Train a TabNet model with early stopping. @@ -374,16 +419,21 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): max_epochs = int(model_params.get("max_epochs", 50)) patience_to_use = int(model_params.get("patience", 10)) virtual_batch_size_to_use = int(model_params.get("virtual_batch_size", 128)) - fit_weights_value = int(model_params.get("fit_weights", 1)) # Default to 1 (unbalanced) if not found + fit_weights_value = int( + model_params.get("fit_weights", 1) + ) # Default to 1 (unbalanced) if not found # Update OneCycleLR scheduler params if needed if model_params.get("scheduler_type") == "onecycle": # Combine X_train and X_test for step calculation as they are used together in fit - total_samples = (len(X_train) if hasattr(X_train, "__len__") else X_train.shape[0]) + \ - (len(X_test) if hasattr(X_test, "__len__") else X_test.shape[0]) - steps_per_epoch = total_samples // batch_size_to_use + (1 if total_samples % batch_size_to_use != 0 else 0) + total_samples = (len(X_train) if hasattr(X_train, "__len__") else X_train.shape[0]) + ( + len(X_test) if hasattr(X_test, "__len__") else X_test.shape[0] + ) + steps_per_epoch = total_samples // batch_size_to_use + ( + 1 if total_samples % batch_size_to_use != 0 else 0 + ) total_steps = steps_per_epoch * max_epochs - if hasattr(model, 'scheduler_params') and isinstance(model.scheduler_params, dict): + if hasattr(model, "scheduler_params") and 
isinstance(model.scheduler_params, dict): lr = model_params.get("learning_rate", 0.01) model.scheduler_params["max_lr"] = lr model.scheduler_params["total_steps"] = total_steps @@ -425,18 +475,16 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): "patience": patience_to_use, "batch_size": batch_size_to_use, "virtual_batch_size": virtual_batch_size_to_use, - "weights": fit_weights_value, # Pass the sampled weight value here + "weights": fit_weights_value, # Pass the sampled weight value here "drop_last": False, } # Fit the model - logger.info(f"Starting model.fit with epochs={max_epochs}, patience={patience_to_use}, batch_size={batch_size_to_use}, weights={fit_weights_value}") - model.fit( - X_combined, - y_combined, - **fit_params + logger.info( + f"Starting model.fit with epochs={max_epochs}, patience={patience_to_use}, batch_size={batch_size_to_use}, weights={fit_weights_value}" ) - + model.fit(X_combined, y_combined, **fit_params) + # Log peak GPU memory usage during training if torch.cuda.is_available(): peak_mem = torch.cuda.max_memory_allocated() / (1024**2) @@ -458,6 +506,7 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): logger.error(f"Parameters during failed training: {model_params}") raise + def optimize_hyperparameters( X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space ): @@ -477,7 +526,9 @@ def objective(trial): try: # --- Sample other parameters --- # Sample scheduler type needed for conditional params - scheduler_type = trial.suggest_categorical("scheduler_type", hyperparameter_space["scheduler_type"]["choices"]) + scheduler_type = trial.suggest_categorical( + "scheduler_type", hyperparameter_space["scheduler_type"]["choices"] + ) current_params["scheduler_type"] = scheduler_type # Iterate over the rest of the hyperparameter space @@ -487,40 +538,77 @@ def objective(trial): continue # Conditional suggestion for scheduler params is_relevant_scheduler_param = False 
- if scheduler_type == "plateau" and param_name in ["scheduler_patience", "scheduler_factor", "scheduler_min_lr"]: + if scheduler_type == "plateau" and param_name in [ + "scheduler_patience", + "scheduler_factor", + "scheduler_min_lr", + ]: is_relevant_scheduler_param = True - elif scheduler_type == "cosine" and param_name in ["scheduler_t_max", "scheduler_min_lr"]: + elif scheduler_type == "cosine" and param_name in [ + "scheduler_t_max", + "scheduler_min_lr", + ]: is_relevant_scheduler_param = True elif scheduler_type == "onecycle" and param_name == "scheduler_div_factor": is_relevant_scheduler_param = True - elif param_name not in ["scheduler_patience", "scheduler_factor", "scheduler_min_lr", "scheduler_t_max", "scheduler_div_factor"]: + elif param_name not in [ + "scheduler_patience", + "scheduler_factor", + "scheduler_min_lr", + "scheduler_t_max", + "scheduler_div_factor", + ]: # Not a scheduler-specific param, suggest normally is_relevant_scheduler_param = True if is_relevant_scheduler_param: # Suggest parameter if param_config["type"] == "float": if "step" in param_config: - current_params[param_name] = trial.suggest_float(param_name, param_config["low"], param_config["high"], step=param_config["step"], log=param_config.get("log", False)) + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + log=param_config.get("log", False), + ) else: - current_params[param_name] = trial.suggest_float(param_name, param_config["low"], param_config["high"], log=param_config.get("log", False)) + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + log=param_config.get("log", False), + ) elif param_config["type"] == "int": if "step" in param_config: - current_params[param_name] = trial.suggest_int(param_name, param_config["low"], param_config["high"], step=param_config["step"]) + current_params[param_name] = trial.suggest_int( + 
param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + ) else: - current_params[param_name] = trial.suggest_int(param_name, param_config["low"], param_config["high"]) + current_params[param_name] = trial.suggest_int( + param_name, param_config["low"], param_config["high"] + ) elif param_config["type"] == "categorical": # Only suggest if not scheduler_type (already handled) if param_name != "scheduler_type": choices = param_config.get("choices", []) if isinstance(choices, list) and choices: - current_params[param_name] = trial.suggest_categorical(param_name, choices) + current_params[param_name] = trial.suggest_categorical( + param_name, choices + ) else: - logger.warning(f"Skipping categorical param '{param_name}' due to invalid/empty choices.") + logger.warning( + f"Skipping categorical param '{param_name}' due to invalid/empty choices." + ) # Train model and get metrics - model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, current_params) + model, metrics = train_model( + X_train, y_train, X_test, y_test, X_eval, y_eval, current_params + ) # Store model reference for callback but don't try to serialize it - setattr(trial, 'model', model) # noqa: B010 + setattr(trial, "model", model) # noqa: B010 # Log metrics to trial attributes for metric_name, metric_value in metrics.items(): trial.set_user_attr(metric_name, metric_value) @@ -533,14 +621,16 @@ def objective(trial): serializable_params[k] = v else: serializable_params[k] = str(v) - trial.set_user_attr('params', serializable_params) + trial.set_user_attr("params", serializable_params) # Scoring logic recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) score = precision if recall >= min_recall else 0.0 # Log trial results - logger.info(f" Trial {trial.number}: Score={score:.4f}, Precision={precision:.4f}, Recall={recall:.4f}") + logger.info( + f" Trial {trial.number}: Score={score:.4f}, Precision={precision:.4f}, 
Recall={recall:.4f}" + ) for metric_name, metric_value in metrics.items(): # Serialize for Optuna if isinstance(metric_value, (int, float, str, bool)) or metric_value is None: @@ -557,18 +647,20 @@ def objective(trial): if score > best_score: best_score = score best_params = current_params.copy() - logger.info(f" >>> New best score in this run: {best_score:.4f} (Trial {trial.number})") + logger.info( + f" >>> New best score in this run: {best_score:.4f} (Trial {trial.number})" + ) return score except optuna.TrialPruned: logger.info(f"Trial {trial.number} pruned.") - raise # Re-raise to signal Optuna + raise # Re-raise to signal Optuna except Exception as e: logger.error(f"Trial {trial.number} failed.") logger.error(f"Failed trial parameters: {current_params}") logger.error(f"Error: {str(e)}") logger.error(traceback.format_exc()) - return 0.0 # Return low score for failed trials + return 0.0 # Return low score for failed trials def callback(study, trial): nonlocal best_score, best_params, top_trials @@ -621,22 +713,26 @@ def callback(study, trial): ) for batch in range(num_batches): if batch > 0: - features_to_remove = min(1, X_train.shape[1] - 10) # Ensure we don't go below 10 features + features_to_remove = min( + 1, X_train.shape[1] - 10 + ) # Ensure we don't go below 10 features if features_to_remove > 0: # Always remove the first x features features_to_drop = X_train.columns[:features_to_remove].tolist() - logger.info(f"Batch {batch + 1}: Removing {features_to_remove} features: {features_to_drop}") + logger.info( + f"Batch {batch + 1}: Removing {features_to_remove} features: {features_to_drop}" + ) logger.info(f"Features before removal: {X_train.shape[1]}") - + # Remove features from all datasets X_train = X_train.drop(columns=features_to_drop) X_test = X_test.drop(columns=features_to_drop) if X_eval is not None: X_eval = X_eval.drop(columns=features_to_drop) - + logger.info(f"Features after removal: {X_train.shape[1]}") try: - study.optimize(objective, 
n_trials=batch_size, callbacks=[lambda study, trial: callback(study, trial, experiment_name, X_eval)], n_jobs=3) + study.optimize(objective, n_trials=batch_size, callbacks=[callback], n_jobs=3) except KeyboardInterrupt: logger.warning("Optimization interrupted by user.") break @@ -644,6 +740,7 @@ def callback(study, trial): logger.info(f"Best parameters selected: {best_params}") return best_params + def hypertune_tabnet(experiment_name: str, X_train, y_train, X_test, y_test, X_eval, y_eval): """ Main hypertuning function for TabNet with MLflow tracking. @@ -655,31 +752,37 @@ def hypertune_tabnet(experiment_name: str, X_train, y_train, X_test, y_test, X_e # === Run Optimization === best_params_found = optimize_hyperparameters( - X_train, y_train, X_test, y_test, X_eval, y_eval, # Pass actual data - hyperparameter_space=hyperparameter_space + X_train, + y_train, + X_test, + y_test, + X_eval, + y_eval, # Pass actual data + hyperparameter_space=hyperparameter_space, ) if not best_params_found: logger.error("Hyperparameter optimization failed to find best parameters.") return None, None - logger.info(f"Hyperparameter optimization completed. Best parameters found: {best_params_found}") + logger.info( + f"Hyperparameter optimization completed. 
Best parameters found: {best_params_found}" + ) # === Train Final Model with Best Params === logger.info("Training final TabNet model with best parameters found...") # Pass data correctly final_model, final_metrics = train_model( - X_train, y_train, - X_test, y_test, - X_eval, y_eval, - best_params_found + X_train, y_train, X_test, y_test, X_eval, y_eval, best_params_found ) logger.info("Final model trained successfully.") logger.info(f"Final Metrics: {final_metrics}") - global X_eval_orig_df # Need original DataFrame for signature - log_run_id = log_to_mlflow(final_model, final_metrics, best_params_found, experiment_name, X_eval_orig_df) + global X_eval_orig_df # Need original DataFrame for signature + log_run_id = log_to_mlflow( + final_model, final_metrics, best_params_found, experiment_name, X_eval_orig_df + ) logger.info(f"Final model and metrics logged to MLflow run_id: {log_run_id}") # Return the best parameters and the metrics from the model trained with those params @@ -690,12 +793,15 @@ def hypertune_tabnet(experiment_name: str, X_train, y_train, X_test, y_test, X_e logger.error(traceback.format_exc()) return None, None + def log_to_mlflow(model, metrics, params, experiment_name, X_eval_df_for_sig): """Logs model, metrics, params to MLflow.""" try: # Set up MLflow tracking mlflow.set_experiment(experiment_name) - with mlflow.start_run(run_name=f"tabnet_final_train_{datetime.now().strftime('%Y%m%d_%H%M')}", nested=True) as run: + with mlflow.start_run( + run_name=f"tabnet_final_train_{datetime.now().strftime('%Y%m%d_%H%M')}", nested=True + ) as run: mlflow.log_params(params) mlflow.set_tags({"final_model_training": True}) mlflow.log_metrics(metrics) @@ -709,7 +815,7 @@ def log_to_mlflow(model, metrics, params, experiment_name, X_eval_df_for_sig): input_example = X_eval_df_for_sig.iloc[:5].copy() # Ensure dtypes are float for numeric cols num_cols = input_example.select_dtypes(include=np.number).columns - input_example[num_cols] = 
input_example[num_cols].astype('float64') + input_example[num_cols] = input_example[num_cols].astype("float64") logger.info("Created input_example from DataFrame for signature.") # Wrap model for prediction @@ -717,33 +823,43 @@ def log_to_mlflow(model, metrics, params, experiment_name, X_eval_df_for_sig): try: logger.info("Inferring model signature...") # Ensure model is fitted - if not hasattr(sklearn_wrapper.model, 'network'): - raise ValueError("Model inside wrapper doesn't seem fitted (no network attribute).") + if not hasattr(sklearn_wrapper.model, "network"): + raise ValueError( + "Model inside wrapper doesn't seem fitted (no network attribute)." + ) prediction_output = sklearn_wrapper.predict_proba(input_example) signature = mlflow.models.infer_signature(input_example, prediction_output) logger.info("Signature inferred successfully.") except Exception as sig_err: - logger.error(f"Failed to infer signature: {sig_err}. Logging model without signature.") + logger.error( + f"Failed to infer signature: {sig_err}. Logging model without signature." + ) logger.error(traceback.format_exc()) signature = None else: - logger.warning(f"Cannot create input example for MLflow signature from X_eval of type {type(X_eval_df_for_sig)}.)") + logger.warning( + f"Cannot create input example for MLflow signature from X_eval of type {type(X_eval_df_for_sig)}.)" + ) # --- Log Model --- # Re-wrap model just before logging to be safe sklearn_wrapper_for_log = TabNetSklearnWrapper(model=model) model_reg_name = f"tabnet_final_{datetime.now().strftime('%Y%m%d_%H%M')}" try: - logger.info(f"Logging model with mlflow.sklearn.log_model (signature={'present' if signature else 'absent'})...") + logger.info( + f"Logging model with mlflow.sklearn.log_model (signature={'present' if signature else 'absent'})..." 
+ ) model_info = mlflow.sklearn.log_model( sk_model=sklearn_wrapper_for_log, artifact_path="model_sklearn", signature=signature, registered_model_name=model_reg_name, - input_example=input_example if signature else None + input_example=input_example if signature else None, ) run_id = run.info.run_id - logger.info(f"Final model logged to MLflow (sklearn flavor): {model_info.model_uri}") + logger.info( + f"Final model logged to MLflow (sklearn flavor): {model_info.model_uri}" + ) logger.info(f"Registered as: {model_reg_name}") logger.info(f"Run ID: {run_id}") mlflow.end_run() @@ -752,13 +868,16 @@ def log_to_mlflow(model, metrics, params, experiment_name, X_eval_df_for_sig): except Exception as log_model_err: logger.error(f"mlflow.sklearn.log_model failed: {log_model_err}") logger.error(traceback.format_exc()) - return active_run_id # Return run_id even if model logging had issues + return active_run_id # Return run_id even if model logging had issues except Exception as e: logger.error(f"Error in log_to_mlflow: {str(e)}") logger.error(traceback.format_exc()) return None -def select_top_features_tabnet(model: TabNetClassifier, X_features: pd.DataFrame, n_features: int = 60) -> list[str]: + +def select_top_features_tabnet( + model: TabNetClassifier, X_features: pd.DataFrame, n_features: int = 60 +) -> list[str]: """ Selects the top N features based on TabNet feature importances. @@ -770,8 +889,10 @@ def select_top_features_tabnet(model: TabNetClassifier, X_features: pd.DataFrame Returns: A list of the names of the top N features. """ - if not hasattr(model, 'feature_importances_'): - raise ValueError("The provided model has not been trained yet or does not support feature importances.") + if not hasattr(model, "feature_importances_"): + raise ValueError( + "The provided model has not been trained yet or does not support feature importances." 
+ ) importances = model.feature_importances_ feature_names = X_features.columns @@ -779,15 +900,16 @@ def select_top_features_tabnet(model: TabNetClassifier, X_features: pd.DataFrame if len(importances) != len(feature_names): raise ValueError("Mismatch between the number of feature importances and feature names.") - feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}) - feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False) + feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}) + feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False) - top_features = feature_importance_df['Feature'].head(n_features).tolist() + top_features = feature_importance_df["Feature"].head(n_features).tolist() logger.info(f"Selected top {n_features} features based on TabNet importance.") - logger.info(f"Top features: {top_features}") # Log the selected features for visibility + logger.info(f"Top features: {top_features}") # Log the selected features for visibility return top_features + def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval): """ Train TabNet model with focus on precision target. 
@@ -810,7 +932,7 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval "batch_size": 128, "device_name": "cuda", "eps": 9.848795981241588e-06, - "eval_metric": ['auc', 'logloss'], + "eval_metric": ["auc", "logloss"], "fit_weights": 1, "gamma": 1.8, "lambda_sparse": 1.483050364931367e-06, @@ -840,16 +962,17 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval # Log to MLflow # log_to_mlflow(model, metrics, params, experiment_name, X_eval) # Select top features - top_features = compute_permutation_importance(model, X_eval, y_eval, metrics['threshold']) + top_features = compute_permutation_importance(model, X_eval, y_eval, metrics["threshold"]) return model, metrics, top_features except Exception as e: logger.error(f"Error during MLflow artifact logging: {str(e)}") logger.error(traceback.format_exc()) return mlflow.active_run().info.run_id if mlflow.active_run() else None + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, y_val: np.ndarray, threshold: float = 0.3, n_repeats: int = 10, @@ -871,82 +994,90 @@ def compute_permutation_importance( feature_names = X_val.columns.tolist() if isinstance(X_val, pd.DataFrame): X_val = X_val.values - y_val_np = y_val.values if hasattr(y_val, 'values') else y_val - + y_val_np = y_val.values if hasattr(y_val, "values") else y_val + # Convert to numpy and ensure correct shape if y_val_np.ndim == 2 and y_val_np.shape[1] == 1: y_val_np = y_val_np.ravel() - + # Compute baseline metric probs = model.predict_proba(X_val)[:, 1] preds = (probs >= threshold).astype(int) - + # Calculate baseline precision baseline = np.sum((y_val_np == 1) & (preds == 1)) / (np.sum(preds == 1)) logger.info(f"Baseline precision: {baseline:.4f}") - + importances = [] for feat_idx, feat in enumerate(feature_names): drops = [] for i in range(n_repeats): - logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i+1}") + logger.info(f"Shuffling feature: 
{feat} ({feat_idx}) - Repeat: {i + 1}") X_shuffled = X_val.copy() # Use column index since X_val is numpy array X_shuffled[:, feat_idx] = np.random.permutation(X_val[:, feat_idx]) - + # Get predictions with shuffled feature probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) - + # Calculate precision with shuffled feature - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1) + 1e-7) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + 1e-7 + ) drop = baseline - precision drops.append(drop) - + mean_drop = np.mean(drops) importances.append((feat, mean_drop)) logger.debug(f"Feature: {feat}, Mean importance drop: {mean_drop:.4f}") - + # Sort by importance descending importances.sort(key=lambda x: x[1], reverse=True) df_importance = pd.DataFrame(importances, columns=["feature", "importance"]) - + # Log top features logger.info("Top features by permutation importance:") logger.info(df_importance.head(number_of_features).to_string(index=False)) - + return df_importance - + except Exception as e: logger.error(f"Error computing permutation importance: {str(e)}") logger.error(traceback.format_exc()) raise -def hypertune_with_feature_importance(X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50): + +def hypertune_with_feature_importance( + X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50 +): """ Perform hyperparameter optimization with Optuna while tracking feature importances. 
- + Args: X_train (pd.DataFrame): Training features - y_train (pd.Series): Training labels + y_train (pd.Series): Training labels X_test (pd.DataFrame): Test features y_test (pd.Series): Test labels n_trials (int): Number of optimization trials - + Returns: tuple: (best_params, feature_importance_df) """ logger.info(f"Starting hyperparameter optimization with {n_trials} trials") - + # Store feature importances across trials feature_importances = [] - + hyperparameter_space = load_hyperparameter_space() + def objective(trial): current_params = {} current_params.update(base_params) # Sample scheduler type needed for conditional params - scheduler_type = trial.suggest_categorical("scheduler_type", hyperparameter_space["scheduler_type"]["choices"]) + scheduler_type = trial.suggest_categorical( + "scheduler_type", hyperparameter_space["scheduler_type"]["choices"] + ) current_params["scheduler_type"] = scheduler_type # Iterate over the rest of the hyperparameter space @@ -956,13 +1087,26 @@ def objective(trial): continue # Conditional suggestion for scheduler params is_relevant_scheduler_param = False - if scheduler_type == "plateau" and param_name in ["scheduler_patience", "scheduler_factor", "scheduler_min_lr"]: + if scheduler_type == "plateau" and param_name in [ + "scheduler_patience", + "scheduler_factor", + "scheduler_min_lr", + ]: is_relevant_scheduler_param = True - elif scheduler_type == "cosine" and param_name in ["scheduler_t_max", "scheduler_min_lr"]: + elif scheduler_type == "cosine" and param_name in [ + "scheduler_t_max", + "scheduler_min_lr", + ]: is_relevant_scheduler_param = True elif scheduler_type == "onecycle" and param_name == "scheduler_div_factor": is_relevant_scheduler_param = True - elif param_name not in ["scheduler_patience", "scheduler_factor", "scheduler_min_lr", "scheduler_t_max", "scheduler_div_factor"]: + elif param_name not in [ + "scheduler_patience", + "scheduler_factor", + "scheduler_min_lr", + "scheduler_t_max", + 
"scheduler_div_factor", + ]: # Not a scheduler-specific param, suggest normally is_relevant_scheduler_param = True @@ -970,65 +1114,89 @@ def objective(trial): # Suggest parameter if param_config["type"] == "float": if "step" in param_config: - current_params[param_name] = trial.suggest_float(param_name, param_config["low"], param_config["high"], step=param_config["step"], log=param_config.get("log", False)) + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + log=param_config.get("log", False), + ) else: - current_params[param_name] = trial.suggest_float(param_name, param_config["low"], param_config["high"], log=param_config.get("log", False)) + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + log=param_config.get("log", False), + ) elif param_config["type"] == "int": if "step" in param_config: - current_params[param_name] = trial.suggest_int(param_name, param_config["low"], param_config["high"], step=param_config["step"]) + current_params[param_name] = trial.suggest_int( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + ) else: - current_params[param_name] = trial.suggest_int(param_name, param_config["low"], param_config["high"]) + current_params[param_name] = trial.suggest_int( + param_name, param_config["low"], param_config["high"] + ) elif param_config["type"] == "categorical": # Only suggest if not scheduler_type (already handled) if param_name != "scheduler_type": choices = param_config.get("choices", []) if isinstance(choices, list) and choices: - current_params[param_name] = trial.suggest_categorical(param_name, choices) + current_params[param_name] = trial.suggest_categorical( + param_name, choices + ) else: - logger.warning(f"Skipping categorical param '{param_name}' due to invalid/empty choices.") + logger.warning( + f"Skipping categorical param '{param_name}' 
due to invalid/empty choices." + ) # Train model and get metrics - model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, current_params) - + model, metrics = train_model( + X_train, y_train, X_test, y_test, X_eval, y_eval, current_params + ) + # Store feature importances for this trial importance_dict = dict(zip(X_train.columns, model.feature_importances_)) feature_importances.append(importance_dict) - - return metrics['precision'] - + + return metrics["precision"] + # Create and run study - study = optuna.create_study(direction='maximize') + study = optuna.create_study(direction="maximize") study.optimize(objective, n_trials=n_trials, n_jobs=4) - + # Calculate average feature importance across all trials avg_importances = {} for feature in X_train.columns: importance_values = [trial_imp[feature] for trial_imp in feature_importances] avg_importances[feature] = np.mean(importance_values) - + # Create DataFrame and sort by importance - importance_df = pd.DataFrame({ - 'feature': list(avg_importances.keys()), - 'importance': list(avg_importances.values()) - }) - importance_df = importance_df.sort_values('importance', ascending=False) - + importance_df = pd.DataFrame( + {"feature": list(avg_importances.keys()), "importance": list(avg_importances.values())} + ) + importance_df = importance_df.sort_values("importance", ascending=False) + # Get top 100 features top_100_features = importance_df.head(100) - + logger.info("Top 100 features by average importance across trials:") for idx, row in top_100_features.iterrows(): logger.info(f"{row['feature']}: {row['importance']:.4f} id: {idx}") - + return study.best_params, importance_df + def tabnet_feature_selection_pipeline(X, y, X_eval, y_eval, target_range=(50, 70)): """Complete feature selection pipeline optimized for TabNet""" logger.info(f"Starting TabNet feature selection pipeline with {X.shape[1]} initial features") - + # Stage 1: Quick filter methods (260 -> ~100) logger.info("Stage 1: Applying 
mutual information and F-test filters") - + # Mutual information for non-linear relationships mi_scores = mutual_info_classif(X, y, random_state=42) mi_top = np.argsort(mi_scores)[-100:] @@ -1045,29 +1213,33 @@ def tabnet_feature_selection_pipeline(X, y, X_eval, y_eval, target_range=(50, 70 # Stage 2: TabNet-based importance (100 -> ~80) logger.info("Stage 2: Using TabNet for feature importance ranking") - + tabnet_selector = TabNetClassifier( - n_d=64, n_a=64, n_steps=5, + n_d=64, + n_a=64, + n_steps=5, lambda_sparse=1e-3, optimizer_params=dict(lr=2e-2), verbose=0, - device_name=device + device_name=device, ) X_train, X_val = X_filtered, X_eval_filtered y_train, y_val = y, y_eval # Convert to numpy arrays for TabNet - X_train_np = X_train.values if hasattr(X_train, 'values') else X_train - X_val_np = X_val.values if hasattr(X_val, 'values') else X_val - y_train_np = y_train.values.ravel() if hasattr(y_train, 'values') else np.array(y_train).ravel() - y_val_np = y_val.values.ravel() if hasattr(y_val, 'values') else np.array(y_val).ravel() + X_train_np = X_train.values if hasattr(X_train, "values") else X_train + X_val_np = X_val.values if hasattr(X_val, "values") else X_val + y_train_np = y_train.values.ravel() if hasattr(y_train, "values") else np.array(y_train).ravel() + y_val_np = y_val.values.ravel() if hasattr(y_val, "values") else np.array(y_val).ravel() tabnet_selector.fit( - X_train_np, y_train_np, + X_train_np, + y_train_np, eval_set=[(X_val_np, y_val_np)], - eval_metric=['auc'], - max_epochs=100, patience=15, + eval_metric=["auc"], + max_epochs=100, + patience=15, ) # Get importance and select top features @@ -1079,7 +1251,7 @@ def tabnet_feature_selection_pipeline(X, y, X_eval, y_eval, target_range=(50, 70 # Stage 3: Fine-tuned sequential selection (80 -> 50-70) logger.info("Stage 3: Sequential feature selection for optimal subset") - + X_stage2 = X[stage2_features] final_features, scores = tabnet_sequential_selection( X_stage2, y, 
target_features=target_range[1] @@ -1101,35 +1273,37 @@ def tabnet_feature_selection_pipeline(X, y, X_eval, y_eval, target_range=(50, 70 return final_selected, scores[:optimal_count] + def tabnet_sequential_selection(X, y, target_features=70): """Sequential forward selection using TabNet for feature evaluation""" - - + logger.info(f"Starting sequential selection to find top {target_features} features") - + selected_features = [] remaining_features = list(X.columns) scores = [] - + for i in range(min(target_features, len(remaining_features))): best_score = -1 best_feature = None - - logger.info(f"Sequential selection iteration {i+1}/{target_features}") - + + logger.info(f"Sequential selection iteration {i + 1}/{target_features}") + for feature in remaining_features: current_features = selected_features + [feature] X_subset = X[current_features] - + # Quick TabNet evaluation tabnet_eval = TabNetClassifier( - n_d=32, n_a=32, n_steps=3, + n_d=32, + n_a=32, + n_steps=3, lambda_sparse=1e-3, optimizer_params=dict(lr=2e-2), verbose=0, - device_name=device + device_name=device, ) - + try: # Use cross-validation for robust evaluation # fit_params = { @@ -1138,55 +1312,58 @@ def tabnet_sequential_selection(X, y, target_features=70): # "eval_metric": ['auc'] # } cv_scores = cross_val_score( - tabnet_eval, X_subset.values, y, - cv=3, scoring='precision', n_jobs=3 + tabnet_eval, X_subset.values, y, cv=3, scoring="precision", n_jobs=3 ) score = np.mean(cv_scores) - + if score > best_score: best_score = score best_feature = feature - + except Exception as e: logger.warning(f"Error evaluating feature {feature}: {str(e)}") continue - + if best_feature is not None: selected_features.append(best_feature) remaining_features.remove(best_feature) scores.append(best_score) - logger.info(f"Selected feature {i+1}: {best_feature} (score: {best_score:.4f})") + logger.info(f"Selected feature {i + 1}: {best_feature} (score: {best_score:.4f})") else: - logger.warning(f"No valid feature found in 
iteration {i+1}") + logger.warning(f"No valid feature found in iteration {i + 1}") break - + logger.info(f"Sequential selection completed with {len(selected_features)} features") return selected_features, scores + def tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_features=100): """Multi-stage TabNet feature selection with different objectives""" - + logger.info(f"Starting TabNet staged selection with {X.shape[1]} initial features") - + # Stage 1: Quick filter with simplified TabNet logger.info("Stage 1: Quick filter with simplified TabNet") tabnet_fast = TabNetClassifier( - n_d=16, n_a=16, n_steps=2, + n_d=16, + n_a=16, + n_steps=2, lambda_sparse=1e-3, optimizer_params=dict(lr=5e-2), verbose=0, - device_name=device + device_name=device, ) # Fit with early stopping tabnet_fast.fit( - X.values, y, + X.values, + y, eval_set=[(X_test.values, y_test)], max_epochs=50, patience=10, - eval_metric=['auc'] + eval_metric=["auc"], ) - + # Get feature importances stage1_importance = tabnet_fast.feature_importances_ stage1_features = X.columns[np.argsort(stage1_importance)[-200:]].tolist() @@ -1199,11 +1376,13 @@ def tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_feature X_eval_stage1 = X_eval[stage1_features] tabnet_refined = TabNetClassifier( - n_d=32, n_a=32, n_steps=3, + n_d=32, + n_a=32, + n_steps=3, lambda_sparse=1e-3, optimizer_params=dict(lr=2e-2), verbose=0, - device_name=device + device_name=device, ) # Cross-validation feature importance @@ -1217,11 +1396,12 @@ def tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_feature try: tabnet_refined.fit( - X_train_cv.values, y_train_cv, + X_train_cv.values, + y_train_cv, eval_set=[(X_val_cv.values, y_val_cv)], max_epochs=100, patience=15, - eval_metric=['auc'] + eval_metric=["auc"], ) cv_importances.append(tabnet_refined.feature_importances_) @@ -1229,7 +1409,7 @@ def tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_feature y_pred_proba = 
tabnet_refined.predict_proba(X_eval_stage1.values)[:, 1] val_score = roc_auc_score(y_eval, y_pred_proba) cv_scores.append(val_score) - + except Exception as e: logger.warning(f"Error in CV fold: {str(e)}") continue @@ -1247,37 +1427,41 @@ def tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_feature return stage2_features, avg_importance + def improved_tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_features=120): """Enhanced TabNet feature selection with optimized hyperparameters""" - + logger.info(f"Starting IMPROVED TabNet staged selection with {X.shape[1]} initial features") - + # IMPROVEMENT 1: Better Stage 1 configuration logger.info("Stage 1: Enhanced quick filter with optimized TabNet") tabnet_fast = TabNetClassifier( - n_d=32, n_a=32, n_steps=4, # Increased capacity + n_d=32, + n_a=32, + n_steps=4, # Increased capacity gamma=1.5, # Feature selection strength lambda_sparse=1e-4, # Reduced sparsity for more features optimizer_params=dict(lr=2e-2, weight_decay=1e-5), scheduler_params=dict(step_size=20, gamma=0.8), # Learning rate scheduling - mask_type='entmax', # Better feature selection + mask_type="entmax", # Better feature selection verbose=0, device_name=device, - seed=42 # Reproducibility + seed=42, # Reproducibility ) # IMPROVEMENT 2: Better training configuration tabnet_fast.fit( - X.values, y, + X.values, + y, eval_set=[(X_test.values, y_test)], max_epochs=100, # Increased epochs - patience=20, # More patience - batch_size=1024, # Larger batch size + patience=20, # More patience + batch_size=1024, # Larger batch size virtual_batch_size=256, eval_metric=["auc", "logloss"], - drop_last=False + drop_last=False, ) - + # Select more features in stage 1 stage1_importance = tabnet_fast.feature_importances_ stage1_features = X.columns[np.argsort(stage1_importance)[-250:]].tolist() # Increased from 200 @@ -1297,35 +1481,38 @@ def improved_tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, targe skf = 
StratifiedKFold(n_splits=5, shuffle=True, random_state=42) for fold, (train_idx, val_idx) in enumerate(skf.split(X_stage1, y)): logger.info(f"Processing fold {fold + 1}/5") - + X_train_cv, X_val_cv = X_stage1.iloc[train_idx], X_stage1.iloc[val_idx] y_train_cv, y_val_cv = y[train_idx], y[val_idx] try: # Reset model for each fold tabnet_fold = TabNetClassifier( - n_d=64, n_a=64, n_steps=5, + n_d=64, + n_a=64, + n_steps=5, gamma=1.3, lambda_sparse=5e-5, optimizer_params=dict(lr=1e-2, weight_decay=1e-5), scheduler_params=dict(step_size=30, gamma=0.9), - mask_type='entmax', + mask_type="entmax", verbose=0, device_name=device, - seed=42 + fold # Different seed per fold + seed=42 + fold, # Different seed per fold ) - + tabnet_fold.fit( - X_train_cv.values, y_train_cv, + X_train_cv.values, + y_train_cv, eval_set=[(X_val_cv.values, y_val_cv)], max_epochs=150, # More epochs for refined training patience=25, batch_size=512, virtual_batch_size=128, - eval_metric=['auc', 'logloss'], - drop_last=False + eval_metric=["auc", "logloss"], + drop_last=False, ) - + cv_importances.append(tabnet_fold.feature_importances_) # Evaluate on validation fold @@ -1333,15 +1520,17 @@ def improved_tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, targe fold_score = roc_auc_score(y_eval, y_pred_proba) cv_scores.append(fold_score) successful_folds += 1 - + logger.info(f"Fold {fold + 1} AUC: {fold_score:.4f}") - + except Exception as e: logger.warning(f"Error in CV fold {fold + 1}: {str(e)}") continue if successful_folds < 3: - logger.error(f"Only {successful_folds} successful CV folds, falling back to stage 1 features") + logger.error( + f"Only {successful_folds} successful CV folds, falling back to stage 1 features" + ) return stage1_features[:target_features], stage1_importance # Average importance across successful folds @@ -1351,13 +1540,14 @@ def improved_tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, targe selected_indices = 
np.argsort(avg_importance)[-target_features:] selected_importances = avg_importance[selected_indices] feature_importance_pairs = list(zip(stage2_features, selected_importances)) - + logger.info(f"Stage 2: Selected {len(stage2_features)} features: {stage2_features}") logger.info(f"Feature-importance pairs: {feature_importance_pairs}") logger.info(f"CV Score: {CV_score:.4f}") return stage2_features, avg_importance + def main(): """ Main execution function for TabNet hypertuning including loss function. @@ -1365,12 +1555,14 @@ def main(): try: logger.info("Starting TabNet model hypertuning (tuning fit_weights)") setup_mlflow_tracking(experiment_name) - global X_eval_orig_df # Used in log_to_mlflow + global X_eval_orig_df # Used in log_to_mlflow # Load data dataloader = DataLoader() - X_train_orig, y_train_orig, X_test_orig, y_test_orig, X_eval_orig_df, y_eval_orig = dataloader.load_data() - X_eval_orig_df = X_eval_orig_df.copy() # Store original for signature + X_train_orig, y_train_orig, X_test_orig, y_test_orig, X_eval_orig_df, y_eval_orig = ( + dataloader.load_data() + ) + X_eval_orig_df = X_eval_orig_df.copy() # Store original for signature # Select features features = import_selected_features_ensemble_new(model_type="tabnet") @@ -1380,9 +1572,21 @@ def main(): X_eval_df = X_eval_orig_df[features] # Assign labels (ensure 1D numpy) - y_train = y_train_orig.values.ravel() if hasattr(y_train_orig, 'values') else np.array(y_train_orig).ravel() - y_test = y_test_orig.values.ravel() if hasattr(y_test_orig, 'values') else np.array(y_test_orig).ravel() - y_eval = y_eval_orig.values.ravel() if hasattr(y_eval_orig, 'values') else np.array(y_eval_orig).ravel() + y_train = ( + y_train_orig.values.ravel() + if hasattr(y_train_orig, "values") + else np.array(y_train_orig).ravel() + ) + y_test = ( + y_test_orig.values.ravel() + if hasattr(y_test_orig, "values") + else np.array(y_test_orig).ravel() + ) + y_eval = ( + y_eval_orig.values.ravel() + if hasattr(y_eval_orig, "values") 
+ else np.array(y_eval_orig).ravel() + ) # Convert features to float64 X_train = X_train.astype("float64") @@ -1396,15 +1600,13 @@ def main(): # === Run Hypertuning === best_params_final, best_metrics_final = hypertune_tabnet( - experiment_name, - X_train, y_train, - X_test, y_test, - X_eval, y_eval + experiment_name, X_train, y_train, X_test, y_test, X_eval, y_eval ) # === Run Feature Selection === - final_selected, scores = improved_tabnet_staged_selection(X_train, y_train, X_test, y_test, X_eval, y_eval, target_features=100) - + final_selected, scores = improved_tabnet_staged_selection( + X_train, y_train, X_test, y_test, X_eval, y_eval, target_features=100 + ) # train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval) @@ -1414,4 +1616,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/src/models/StackedEnsemble/base/neural/tabnet_model_outliers.py b/src/models/StackedEnsemble/base/neural/tabnet_model_outliers.py new file mode 100644 index 0000000..8ebbb9e --- /dev/null +++ b/src/models/StackedEnsemble/base/neural/tabnet_model_outliers.py @@ -0,0 +1,1909 @@ +import os +import pickle +import random +import traceback +import warnings +from datetime import datetime + +import mlflow +import mlflow.pyfunc +import numpy as np +import optuna +import pandas as pd +import torch +import torch.optim as optim +from pytorch_tabnet.metrics import Metric +from pytorch_tabnet.tab_model import TabNetClassifier +from sklearn.base import BaseEstimator +from sklearn.feature_selection import f_classif, mutual_info_classif +from sklearn.metrics import precision_score, recall_score, roc_auc_score +from sklearn.model_selection import StratifiedKFold, cross_val_score +from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler + +# from sklearn.preprocessing import QuantileTransformer +from sklearn.utils.multiclass import type_of_target +from torch.amp import GradScaler +from torch.optim.lr_scheduler import OneCycleLR, 
ReduceLROnPlateau + +# Logger and shared utilities +from src.utils.logger import ExperimentLogger +from src.utils.outlier_detection import analyze_outlier_impact, remove_outliers_isolation_forest + +experiment_name = "tabnet_soccer_prediction" +logger = ExperimentLogger(experiment_name=experiment_name) + +# Import shared utility functions +from src.models.StackedEnsemble.shared.data_loader_new import DataLoader +from src.models.StackedEnsemble.shared.hypertuner_utils import optimize_threshold +from src.utils.create_evaluation_set import ( + import_selected_features_ensemble_new, + setup_mlflow_tracking, +) + +# Filter specific TabNet weight-related warnings +warnings.filterwarnings( + "ignore", message=".*imbalanced.*|.*weight.*|.*class_weight.*", category=UserWarning +) +warnings.filterwarnings("ignore", message=".*sample_weight.*", category=UserWarning) + +# Global settings +min_recall = 0.30 +# You can adjust n_trials if needed +n_trials = 20000 + +# Scaling configuration +SCALING_METHOD = "standard" # Options: "standard", "robust", "minmax" +SCALER_SAVE_PATH = "src/models/scalers/scaler_tabnet.pkl" + +# Then modify your base_params to include the custom metrics +device = "cuda" if torch.cuda.is_available() else "cpu" +base_params = { + "optimizer_fn": optim.Adam, # Use Adam as default optimizer + # "mask_type": "sparsemax", + "eval_metric": ["auc", "logloss"], # Default metrics, custom one passed in + "fit_weights": 1, + "verbose": 0, + "seed": 19, + "device_name": device, +} + +# Set fixed seed and hash seed for determinism +SEED = 19 +os.environ["PYTHONHASHSEED"] = str(SEED) +random.seed(SEED) +np.random.seed(SEED) + +# Restrict parallel threads across various libraries +os.environ["OMP_NUM_THREADS"] = "4" +os.environ["MKL_NUM_THREADS"] = "4" +os.environ["OPENBLAS_NUM_THREADS"] = "4" +os.environ["NUMEXPR_NUM_THREADS"] = "4" +os.environ["VECLIB_MAXIMUM_THREADS"] = "4" + +# Create gradient scaler for mixed precision training +scaler = GradScaler() if 
def get_scaler(scaling_method=SCALING_METHOD):
    """
    Build a fresh, unfitted scaler for the requested scaling method.

    Args:
        scaling_method (str): One of "standard", "robust" or "minmax".

    Returns:
        A new StandardScaler, RobustScaler (quantile range 5-95) or
        MinMaxScaler (feature range -1..1). Any other value logs a warning
        and yields a StandardScaler.
    """
    # Ordered (name, factory) pairs; the factories keep construction lazy so
    # every call hands back a brand-new instance.
    scaler_factories = (
        ("standard", StandardScaler),
        ("robust", lambda: RobustScaler(quantile_range=(5, 95))),
        ("minmax", lambda: MinMaxScaler(feature_range=(-1, 1))),
    )
    for method_name, build_scaler in scaler_factories:
        if scaling_method == method_name:
            return build_scaler()

    logger.warning(f"Unknown scaling method '{scaling_method}', defaulting to StandardScaler")
    return StandardScaler()
def preprocess_data_with_scaling(
    X_train, X_test=None, X_eval=None, scaling_method=SCALING_METHOD, force_retrain=False
):
    """
    Scale the train/test/eval feature splits with one shared scaler for TabNet.

    The scaler is obtained from ``load_or_create_scaler`` (called with
    ``X_train``, ``scaling_method`` and ``force_retrain``) and applied to every
    split that is provided. Each split is unwrapped on its own, so DataFrames
    and plain arrays may be mixed freely across the three inputs.

    Args:
        X_train: Training features (DataFrame or array-like).
        X_test: Test features (optional).
        X_eval: Evaluation features (optional).
        scaling_method (str): Type of scaling to apply.
        force_retrain (bool): Force retraining of scaler.

    Returns:
        tuple: (X_train_scaled, X_test_scaled, X_eval_scaled, scaler). The
        test/eval entries are None when the corresponding input is None.
    """
    logger.info(f"Preprocessing data with {scaling_method} scaling for TabNet")

    # Load or create scaler
    scaler, is_new = load_or_create_scaler(X_train, scaling_method, force_retrain)

    def _unwrap(features):
        """Return the underlying array of a DataFrame, anything else as-is."""
        return features.values if isinstance(features, pd.DataFrame) else features

    def _scale_optional(features):
        """Transform an optional split; None stays None."""
        return None if features is None else scaler.transform(_unwrap(features))

    # Previously the type of X_train alone decided whether `.values` was taken
    # on X_test / X_eval, which crashed when the splits had different types.
    X_train_scaled = scaler.transform(_unwrap(X_train))
    X_test_scaled = _scale_optional(X_test)
    X_eval_scaled = _scale_optional(X_eval)

    # Log scaling statistics only for a freshly fitted scaler
    if is_new:
        logger.info("=== Scaling Statistics ===")
        if hasattr(scaler, "mean_"):
            logger.info(
                f"Feature means: min={scaler.mean_.min():.4f}, max={scaler.mean_.max():.4f}"
            )
        if hasattr(scaler, "scale_"):
            logger.info(
                f"Feature scales: min={scaler.scale_.min():.4f}, max={scaler.scale_.max():.4f}"
            )
        elif hasattr(scaler, "data_range_"):
            logger.info(
                f"Feature ranges: min={scaler.data_range_.min():.4f}, max={scaler.data_range_.max():.4f}"
            )

        logger.info(
            f"Scaled data range - Train: [{X_train_scaled.min():.4f}, {X_train_scaled.max():.4f}]"
        )
        logger.info("===========================")

    return X_train_scaled, X_test_scaled, X_eval_scaled, scaler
class TabNetSklearnWrapper(BaseEstimator):
    """
    A scikit-learn compatible wrapper for TabNet that makes it compatible with
    MLflow's sklearn flavor.

    The wrapped ``model`` is created lazily in ``fit`` from the keyword
    arguments given to the constructor when no model was supplied. All three
    entry points accept either array-likes or pandas objects for ``X``.
    """

    def __init__(self, model=None, **kwargs):
        self.model = model
        self.kwargs = kwargs

    @staticmethod
    def _as_array(data):
        """Return ``data.values`` for pandas-like inputs, otherwise ``data`` itself."""
        return data.values if hasattr(data, "values") else data

    def fit(self, X, y):
        """
        Fit the wrapped TabNet model and return ``self``.

        ``X`` is unwrapped the same way ``predict``/``predict_proba`` do, so a
        DataFrame that works for prediction also works for fitting.
        ``y`` is forwarded unchanged.
        """
        if self.model is None:
            self.model = TabNetClassifier(**self.kwargs)
        # NOTE(review): pytorch-tabnet is understood to reject pandas inputs in
        # fit ("apply X.values") - confirm against the installed version.
        self.model.fit(self._as_array(X), y)
        return self

    def predict(self, X):
        """
        Return class predictions from the wrapped model.
        """
        return self.model.predict(self._as_array(X))

    def predict_proba(self, X):
        """
        Return per-class probability estimates from the wrapped model.
        """
        return self.model.predict_proba(self._as_array(X))
+ """ + try: + # Start with base parameters and update with model_params + params = base_params.copy() + # Define valid constructor args and config keys + valid_constructor_args = { + "n_d", + "n_a", + "n_steps", + "gamma", + "lambda_sparse", + "optimizer_fn", + "optimizer_params", + "scheduler_fn", + "scheduler_params", + "mask_type", + "n_independent", + "n_shared", + "epsilon", + "momentum", + "device_name", + "seed", + "verbose", + "cat_idxs", + "cat_dims", + "cat_emb_dim", + } + config_keys = { + "learning_rate", + "weight_decay", + "scheduler_type", + "scheduler_min_lr", + "scheduler_patience", + "scheduler_factor", + "scheduler_div_factor", + } + + # Extract constructor and config params from input model_params + constructor_params = {k: v for k, v in model_params.items() if k in valid_constructor_args} + config_params = {k: v for k, v in model_params.items() if k in config_keys} + + # Always use Adam optimizer for GPU optimization + params["optimizer_fn"] = optim.Adam + + # Update base params with constructor params + params.update(constructor_params) + + # Configure optimizer with GPU-optimized settings + lr = config_params.get("learning_rate", 0.01) + weight_decay = config_params.get("weight_decay", 1e-5) + if "optimizer_params" not in params: + params["optimizer_params"] = {} + params["optimizer_params"]["lr"] = lr + params["optimizer_params"]["weight_decay"] = weight_decay + # Add optimizer settings that can improve GPU performance + params["optimizer_params"]["eps"] = config_params.get( + "eps", 1e-7 + ) # Improves numerical stability + params["optimizer_params"]["amsgrad"] = True # Can improve convergence on GPU + + # Configure scheduler + scheduler_type = config_params.get("scheduler_type", "none") + scheduler_params_config = {} + if scheduler_type == "plateau": + scheduler_fn = ReduceLROnPlateau + scheduler_params_config = { + "patience": config_params.get("scheduler_patience", 5), + "factor": config_params.get("scheduler_factor", 0.1), + "min_lr": 
def _tabnet_feature_array(features):
    """Return features as a float64 numpy array (DataFrames are unwrapped first)."""
    if isinstance(features, pd.DataFrame):
        features = features.values
    return np.asarray(features).astype("float64")


def _tabnet_label_vector(labels):
    """Return labels as a numpy array, flattening an (n, 1) column to 1-D."""
    if isinstance(labels, (pd.Series, pd.DataFrame)):
        labels = labels.values
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()
    return labels


def train_model(
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
    X_eval_scaled,
    y_eval,
    model_params,
    scaler=None,
):
    """
    Train a TabNet model with early stopping on already-scaled features.

    The train and test splits are concatenated and used for fitting, while the
    eval split drives early stopping and the threshold search. Class imbalance
    is controlled through ``fit(weights=...)`` using ``model_params["fit_weights"]``.
    No scaling is performed here; callers must pass scaled data.

    Args:
        X_train_scaled, X_test_scaled, X_eval_scaled: Scaled feature splits
            (DataFrame or array-like).
        y_train, y_test, y_eval: Matching labels (Series, DataFrame or array-like).
        model_params (dict): Model and fit parameters. ``batch_size``,
            ``max_epochs``, ``patience``, ``virtual_batch_size``, ``fit_weights``,
            ``scheduler_type`` and ``learning_rate`` are read here; the whole
            dict is also handed to ``create_model``.
        scaler: Optional fitted feature scaler that produced the scaled inputs.
            It is stored on the model as ``_scaler`` so ``log_to_mlflow`` can
            bundle it. Defaults to None (no scaler attached).

    Returns:
        tuple: (model, metrics) where ``metrics`` is the dict returned by
        ``optimize_threshold`` extended with ``fit_weights_used`` and, on CUDA,
        ``peak_gpu_memory_mb``.

    Raises:
        Exception: Any failure is logged with its traceback and re-raised.
    """
    try:
        # Create the model
        model = create_model(model_params)

        # Get fit parameters from model_params
        batch_size_to_use = int(model_params.get("batch_size", 1024))
        max_epochs = int(model_params.get("max_epochs", 50))
        patience_to_use = int(model_params.get("patience", 10))
        virtual_batch_size_to_use = int(model_params.get("virtual_batch_size", 128))
        # Default to 1 if not found
        fit_weights_value = int(model_params.get("fit_weights", 1))

        # Convert data to numpy arrays and ensure correct dtypes
        X_train_scaled = _tabnet_feature_array(X_train_scaled)
        X_test_scaled = _tabnet_feature_array(X_test_scaled)
        X_eval_scaled = _tabnet_feature_array(X_eval_scaled)
        y_train = _tabnet_label_vector(y_train)
        y_test = _tabnet_label_vector(y_test)
        y_eval = _tabnet_label_vector(y_eval)

        # Update OneCycleLR scheduler params if needed
        if model_params.get("scheduler_type") == "onecycle":
            # Train and test rows are fitted together, so both count towards the steps
            total_samples = len(X_train_scaled) + len(X_test_scaled)
            steps_per_epoch = -(-total_samples // batch_size_to_use)  # ceiling division
            total_steps = steps_per_epoch * max_epochs
            if hasattr(model, "scheduler_params") and isinstance(model.scheduler_params, dict):
                lr = model_params.get("learning_rate", 0.01)
                model.scheduler_params["max_lr"] = lr
                model.scheduler_params["total_steps"] = total_steps
                logger.info(f"Updated OneCycleLR params: max_lr={lr}, total_steps={total_steps}")
            else:
                logger.warning("OneCycleLR selected but model.scheduler_params not found/dict.")

        # Combine training and testing data (already scaled)
        X_combined = np.concatenate([X_train_scaled, X_test_scaled], axis=0)
        y_combined = np.concatenate([y_train, y_test], axis=0)

        fit_params = {
            "eval_set": [(X_eval_scaled, y_eval)],
            "eval_metric": [PrecisionFocusedMetric],
            "max_epochs": max_epochs,
            "patience": patience_to_use,
            "batch_size": batch_size_to_use,
            "virtual_batch_size": virtual_batch_size_to_use,
            "weights": fit_weights_value,  # Pass the sampled weight value here
            "drop_last": False,
        }

        cuda_available = torch.cuda.is_available()
        if cuda_available:
            # Start from a clean counter so the reported peak belongs to this
            # fit only, not to earlier trials in the same process.
            torch.cuda.reset_peak_memory_stats()

        # Fit the model
        logger.info(
            f"Starting model.fit with epochs={max_epochs}, patience={patience_to_use}, batch_size={batch_size_to_use}, weights={fit_weights_value}"
        )
        model.fit(X_combined, y_combined, **fit_params)

        # Log peak GPU memory usage during training
        peak_mem = None
        if cuda_available:
            peak_mem = torch.cuda.max_memory_allocated() / (1024**2)
            logger.info(f"Peak GPU memory during training: {peak_mem:.2f} MB")

        # Optimize threshold using shared utility (with scaled eval data)
        _, metrics = optimize_threshold(model, X_eval_scaled, y_eval, min_recall=min_recall)

        # Add fit_weights used to metrics dict for logging
        metrics["fit_weights_used"] = fit_weights_value
        if peak_mem is not None:
            metrics["peak_gpu_memory_mb"] = peak_mem

        # Attach the *feature* scaler passed by the caller. The parameter
        # deliberately shadows the module-level `scaler`, which is a torch
        # GradScaler (or None) and has no `transform` method.
        model._scaler = scaler

        return model, metrics
    except Exception as e:
        logger.error(f"Error training TabNet model: {str(e)}")
        logger.error(traceback.format_exc())
        logger.error(f"Parameters during failed training: {model_params}")
        raise
is_relevant_scheduler_param = True + elif scheduler_type == "cosine" and param_name in [ + "scheduler_t_max", + "scheduler_min_lr", + ]: + is_relevant_scheduler_param = True + elif scheduler_type == "onecycle" and param_name == "scheduler_div_factor": + is_relevant_scheduler_param = True + elif param_name not in [ + "scheduler_patience", + "scheduler_factor", + "scheduler_min_lr", + "scheduler_t_max", + "scheduler_div_factor", + ]: + # Not a scheduler-specific param, suggest normally + is_relevant_scheduler_param = True + if is_relevant_scheduler_param: + # Suggest parameter + if param_config["type"] == "float": + if "step" in param_config: + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + log=param_config.get("log", False), + ) + else: + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + log=param_config.get("log", False), + ) + elif param_config["type"] == "int": + if "step" in param_config: + current_params[param_name] = trial.suggest_int( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + ) + else: + current_params[param_name] = trial.suggest_int( + param_name, param_config["low"], param_config["high"] + ) + elif param_config["type"] == "categorical": + # Only suggest if not scheduler_type (already handled) + if param_name != "scheduler_type": + choices = param_config.get("choices", []) + if isinstance(choices, list) and choices: + current_params[param_name] = trial.suggest_categorical( + param_name, choices + ) + else: + logger.warning( + f"Skipping categorical param '{param_name}' due to invalid/empty choices." 
+ ) + + # Train model and get metrics + model, metrics = train_model( + X_train_scaled, + y_train, + X_test_scaled, + y_test, + X_eval_scaled, + y_eval, + current_params, + ) + # Store model reference for callback but don't try to serialize it + setattr(trial, "model", model) # noqa: B010 + # Log metrics to trial attributes + for metric_name, metric_value in metrics.items(): + trial.set_user_attr(metric_name, metric_value) + # Convert params to JSON serializable format + serializable_params = {} + for k, v in current_params.items(): + if isinstance(v, np.generic): + serializable_params[k] = v.item() + elif isinstance(v, (int, float, str, bool)) or v is None: + serializable_params[k] = v + else: + serializable_params[k] = str(v) + trial.set_user_attr("params", serializable_params) + # Scoring logic + recall = metrics.get("recall", 0.0) + precision = metrics.get("precision", 0.0) + score = precision if recall >= min_recall else 0.0 + + # Log trial results + logger.info( + f" Trial {trial.number}: Score={score:.4f}, Precision={precision:.4f}, Recall={recall:.4f}" + ) + for metric_name, metric_value in metrics.items(): + # Serialize for Optuna + if isinstance(metric_value, (int, float, str, bool)) or metric_value is None: + trial.set_user_attr(metric_name, metric_value) + elif isinstance(metric_value, np.generic): + trial.set_user_attr(metric_name, metric_value.item()) + else: + trial.set_user_attr(metric_name, str(metric_value)) + + if score >= 0.30 and score > best_score: + logger.info(f"Trial {trial.number} completed with score {score:.4f}") + X_eval_orig_df = X_eval.copy() + log_to_mlflow(model, metrics, current_params, experiment_name, X_eval_orig_df) + + # Update best score and params FOR THIS RUN + if score > best_score: + best_score = score + best_params = current_params.copy() + logger.info( + f" >>> New best score in this run: {best_score:.4f} (Trial {trial.number})" + ) + + return score + except optuna.TrialPruned: + logger.info(f"Trial {trial.number} 
pruned.") + raise # Re-raise to signal Optuna + except Exception as e: + logger.error(f"Trial {trial.number} failed.") + logger.error(f"Failed trial parameters: {current_params}") + logger.error(f"Error: {str(e)}") + logger.error(traceback.format_exc()) + return 0.0 # Return low score for failed trials + + def callback(study, trial, experiment_name, X_eval): + nonlocal best_score, best_params, top_trials + logger.info(f"Current best score in this batch: {best_score:.4f}") + if trial.value > best_score: + best_score = trial.value + best_params = trial.params + logger.info(f"New best score found in trial {trial.number}: {best_score:.4f}") + current_run = (trial.value, trial.params, trial.number) + top_trials.append(current_run) + top_trials.sort(key=lambda x: x[0], reverse=True) + top_trials[:] = top_trials[:10] + if trial.number % 9 == 0: + table_rows = [ + f"| {i + 1} | {rec[2]} | {rec[0]:.4f} | {rec[1]} |" + for i, rec in enumerate(top_trials) + ] + logger.info("Top trials in current batch:") + for row in table_rows: + logger.info(row) + if trial.number % 100 == 0 and global_top_trials: + table_rows = [ + f"| {i + 1} | {rec[2]} | {rec[0]:.4f} | {rec[1]} |" + for i, rec in enumerate(global_top_trials[:10]) + ] + logger.info("Global top trials:") + for row in table_rows: + logger.info(row) + return best_score + + # --- Optuna Study Execution --- + storage_url = "sqlite:///optuna_tabnet.db" + study_name = "tabnet_optimization" + total_trials = n_trials + batch_size = 1000 + num_batches = total_trials // batch_size + if total_trials % batch_size != 0: + num_batches += 1 + + logger.info(f"Starting Optuna study '{study_name}' with {total_trials} trials.") + sampler = optuna.samplers.TPESampler(seed=SEED) + pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=5, interval_steps=1) + study = optuna.create_study( + study_name=study_name, + direction="maximize", + storage=storage_url, + load_if_exists=True, + sampler=sampler, + pruner=pruner, + ) + for _ 
def hypertune_tabnet(experiment_name: str, X_train, y_train, X_test, y_test, X_eval, y_eval):
    """
    Main hypertuning function for TabNet with MLflow tracking.

    Runs ``optimize_hyperparameters`` on the given splits, retrains a final
    model with the best parameters on scaled data, and logs it to MLflow.

    Args:
        experiment_name: MLflow experiment to log the final model under.
        X_train, y_train: Training features and labels (unscaled).
        X_test, y_test: Test features and labels (unscaled).
        X_eval, y_eval: Evaluation features and labels (unscaled).

    Returns:
        tuple: (best_params, final_metrics), or (None, None) when the search
        finds no parameters or any step raises (the error is logged).
    """
    try:
        hyperparameter_space = load_hyperparameter_space()
        logger.info("Starting hyperparameter optimization for TabNet (tuning fit_weights)")

        # === Run Optimization ===
        best_params_found = optimize_hyperparameters(
            X_train,
            y_train,
            X_test,
            y_test,
            X_eval,
            y_eval,
            hyperparameter_space=hyperparameter_space,
        )

        if not best_params_found:
            logger.error("Hyperparameter optimization failed to find best parameters.")
            return None, None

        logger.info(
            f"Hyperparameter optimization completed. Best parameters found: {best_params_found}"
        )

        # === Train Final Model with Best Params ===
        logger.info("Training final TabNet model with best parameters found...")
        # The search ran on scaled features, so the final fit must too;
        # train_model itself does not scale anything.
        X_train_scaled, X_test_scaled, X_eval_scaled, fitted_scaler = preprocess_data_with_scaling(
            X_train, X_test, X_eval, scaling_method=SCALING_METHOD
        )
        final_model, final_metrics = train_model(
            X_train_scaled, y_train, X_test_scaled, y_test, X_eval_scaled, y_eval, best_params_found
        )
        # Attach the feature scaler so log_to_mlflow can bundle it with the model.
        final_model._scaler = fitted_scaler

        logger.info("Final model trained successfully.")
        logger.info(f"Final Metrics: {final_metrics}")

        # Prefer the module-level original DataFrame when a caller has set it;
        # otherwise fall back to the unscaled X_eval argument instead of
        # failing with NameError.
        signature_frame = globals().get("X_eval_orig_df")
        if signature_frame is None:
            signature_frame = X_eval
        log_run_id = log_to_mlflow(
            final_model, final_metrics, best_params_found, experiment_name, signature_frame
        )
        logger.info(f"Final model and metrics logged to MLflow run_id: {log_run_id}")

        # Return the best parameters and the metrics from the model trained with those params
        return best_params_found, final_metrics

    except Exception as e:
        logger.error(f"Error in TabNet hypertuning process: {str(e)}")
        logger.error(traceback.format_exc())
        return None, None
= TabNetSklearnWrapper(model=model) + try: + logger.info("Inferring model signature...") + # Ensure model is fitted + if not hasattr(sklearn_wrapper.model, "network"): + raise ValueError( + "Model inside wrapper doesn't seem fitted (no network attribute)." + ) + prediction_output = sklearn_wrapper.predict_proba(input_example) + signature = mlflow.models.infer_signature(input_example, prediction_output) + logger.info("Signature inferred successfully.") + except Exception as sig_err: + logger.error( + f"Failed to infer signature: {sig_err}. Logging model without signature." + ) + logger.error(traceback.format_exc()) + signature = None + else: + logger.warning( + f"Cannot create input example for MLflow signature from X_eval of type {type(X_eval_df_for_sig)}.)" + ) + + # --- Log Model --- + # Create wrapper with scaler for end-to-end preprocessing + scaler = getattr(model, "_scaler", None) + if scaler is not None: + sklearn_wrapper_for_log = TabNetSklearnWrapperWithScaler(model=model, scaler=scaler) + logger.info("Using TabNet wrapper with integrated scaler for MLflow logging") + else: + sklearn_wrapper_for_log = TabNetSklearnWrapper(model=model) + logger.warning("No scaler found, using standard TabNet wrapper") + + model_reg_name = f"tabnet_final_{datetime.now().strftime('%Y%m%d_%H%M')}" + try: + logger.info( + f"Logging model with mlflow.sklearn.log_model (signature={'present' if signature else 'absent'})..." 
+ ) + model_info = mlflow.sklearn.log_model( + sk_model=sklearn_wrapper_for_log, + artifact_path="model_sklearn", + signature=signature, + registered_model_name=model_reg_name, + input_example=input_example if signature else None, + ) + run_id = run.info.run_id + logger.info( + f"Final model logged to MLflow (sklearn flavor): {model_info.model_uri}" + ) + logger.info(f"Registered as: {model_reg_name}") + logger.info(f"Run ID: {run_id}") + mlflow.end_run() + return run_id + + except Exception as log_model_err: + logger.error(f"mlflow.sklearn.log_model failed: {log_model_err}") + logger.error(traceback.format_exc()) + return active_run_id # Return run_id even if model logging had issues + except Exception as e: + logger.error(f"Error in log_to_mlflow: {str(e)}") + logger.error(traceback.format_exc()) + return None + + +def select_top_features_tabnet( + model: TabNetClassifier, X_features: pd.DataFrame, n_features: int = 60 +) -> list[str]: + """ + Selects the top N features based on TabNet feature importances. + + Args: + model: Trained TabNetClassifier model. + X_features: DataFrame containing the features used for training (to get names). + n_features: The number of top features to select. + + Returns: + A list of the names of the top N features. + """ + if not hasattr(model, "feature_importances_"): + raise ValueError( + "The provided model has not been trained yet or does not support feature importances." 
+ ) + + importances = model.feature_importances_ + feature_names = X_features.columns + + if len(importances) != len(feature_names): + raise ValueError("Mismatch between the number of feature importances and feature names.") + + feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}) + feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False) + + top_features = feature_importance_df["Feature"].head(n_features).tolist() + logger.info(f"Selected top {n_features} features based on TabNet importance.") + logger.info(f"Top features: {top_features}") # Log the selected features for visibility + + return top_features + + +def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval): + """ + Train TabNet model with focus on precision target. + Args: + X_train: Training features + y_train: Training labels + X_test: Testing features + y_test: Testing labels + X_eval: Evaluation features + y_eval: Evaluation labels + Returns: + tuple: (best_model, best_metrics) + """ + try: + logger.info("Training model with precision target") + params = base_params.copy() # Inherits 'device_name': 'cuda' + # Specific parameters for this training run with advanced scheduling + params.update( + { + "batch_size": 128, + "device_name": "cuda", + "eps": 9.848795981241588e-06, + "eval_metric": ["auc", "logloss"], + "fit_weights": 1, + "gamma": 1.8, + "lambda_sparse": 1.483050364931367e-06, + "learning_rate": 0.006693216892855157, + "mask_type": "entmax", + "max_epochs": 130, + "momentum": 0.895, + "n_a": 69, + "n_d": 91, + "n_independent": 3, + "n_shared": 5, + "n_steps": 4, + "optimizer_fn": torch.optim.Adam, + "patience": 28, + "scheduler_final_div_factor": 5200.0, + "scheduler_pct_start": 0.30000000000000004, + "scheduler_type": "none", + "seed": 19, + "verbose": 0, + "virtual_batch_size": 896, + "weight_decay": 0.000166528804138707, + } + ) + # Train final model with best parameters + 
def _precision_from_predictions(y_true, y_pred) -> float:
    """Return precision of binary predictions, or 0.0 when nothing is predicted positive."""
    predicted_positive = np.sum(y_pred == 1)
    if predicted_positive == 0:
        return 0.0
    true_positive = np.sum((y_true == 1) & (y_pred == 1))
    return float(true_positive / predicted_positive)


def compute_permutation_importance(
    model,
    X_val: pd.DataFrame,
    y_val: np.ndarray,
    threshold: float = 0.3,
    n_repeats: int = 10,
    number_of_features: int = 100,
    random_state=None,
) -> pd.DataFrame:
    """
    Compute permutation feature importance, measured as the drop in precision.

    Each feature column is shuffled ``n_repeats`` times; the importance is the
    mean difference between the baseline precision and the precision obtained
    with that column shuffled.

    Args:
        model: Trained model exposing ``predict_proba(X)`` that returns one
            column per class (column 1 is used as the positive class).
        X_val: Validation features. A DataFrame supplies the feature names; any
            other 2-D array-like is accepted and gets ``feature_<i>`` names.
        y_val: Validation labels (array-like or pandas object).
        threshold: Probability at or above which a sample is predicted positive.
        n_repeats: Number of shuffles per feature (must be >= 1).
        number_of_features: Number of top rows written to the log.
        random_state: Optional seed for reproducible shuffles. When ``None``
            the global NumPy random state is used, as before.

    Returns:
        DataFrame with columns ``['feature', 'importance']`` sorted by
        importance in descending order.

    Raises:
        ValueError: If ``n_repeats`` is smaller than 1.
        Exception: Any error is logged with its traceback and re-raised.
    """
    try:
        if n_repeats < 1:
            raise ValueError(f"n_repeats must be >= 1, got {n_repeats}")

        # Resolve names before converting, so array input does not crash on `.columns`.
        if isinstance(X_val, pd.DataFrame):
            feature_names = X_val.columns.tolist()
            X_base = X_val.values
        else:
            X_base = np.asarray(X_val)
            feature_names = [f"feature_{idx}" for idx in range(X_base.shape[1])]

        y_val_np = np.asarray(y_val.values if hasattr(y_val, "values") else y_val)
        # Flatten a single-column label matrix so element-wise comparisons line up.
        if y_val_np.ndim == 2 and y_val_np.shape[1] == 1:
            y_val_np = y_val_np.ravel()

        # Both objects expose `.permutation`; the module keeps the legacy global state.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Baseline and shuffled precision use the same formula, so a feature the
        # model ignores scores exactly 0 and an empty positive set gives no NaN.
        probs = model.predict_proba(X_base)[:, 1]
        preds = (probs >= threshold).astype(int)
        baseline = _precision_from_predictions(y_val_np, preds)
        logger.info(f"Baseline precision: {baseline:.4f}")

        # One working copy is reused; each column is restored after its shuffles.
        X_work = X_base.copy()
        importances = []
        for feat_idx, feat in enumerate(feature_names):
            original_column = X_base[:, feat_idx]
            drops = []
            for repeat in range(n_repeats):
                logger.debug(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {repeat + 1}")
                X_work[:, feat_idx] = rng.permutation(original_column)

                probs_shuffled = model.predict_proba(X_work)[:, 1]
                preds_shuffled = (probs_shuffled >= threshold).astype(int)
                drops.append(baseline - _precision_from_predictions(y_val_np, preds_shuffled))

            X_work[:, feat_idx] = original_column
            mean_drop = float(np.mean(drops))
            importances.append((feat, mean_drop))
            logger.debug(f"Feature: {feat}, Mean importance drop: {mean_drop:.4f}")

        # Sort by importance descending (list.sort is stable, so ties keep column order)
        importances.sort(key=lambda pair: pair[1], reverse=True)
        df_importance = pd.DataFrame(importances, columns=["feature", "importance"])

        logger.info("Top features by permutation importance:")
        logger.info(df_importance.head(number_of_features).to_string(index=False))

        return df_importance

    except Exception as e:
        logger.error(f"Error computing permutation importance: {str(e)}")
        logger.error(traceback.format_exc())
        raise
+ + Args: + X_train (pd.DataFrame): Training features + y_train (pd.Series): Training labels + X_test (pd.DataFrame): Test features + y_test (pd.Series): Test labels + n_trials (int): Number of optimization trials + + Returns: + tuple: (best_params, feature_importance_df) + """ + logger.info(f"Starting hyperparameter optimization with {n_trials} trials") + + # Store feature importances across trials + feature_importances = [] + + hyperparameter_space = load_hyperparameter_space() + + def objective(trial): + current_params = {} + current_params.update(base_params) + # Sample scheduler type needed for conditional params + scheduler_type = trial.suggest_categorical( + "scheduler_type", hyperparameter_space["scheduler_type"]["choices"] + ) + current_params["scheduler_type"] = scheduler_type + + # Iterate over the rest of the hyperparameter space + for param_name, param_config in hyperparameter_space.items(): + # Skip params already handled or handled conditionally + if param_name in ["fit_weights", "scheduler_type"]: + continue + # Conditional suggestion for scheduler params + is_relevant_scheduler_param = False + if scheduler_type == "plateau" and param_name in [ + "scheduler_patience", + "scheduler_factor", + "scheduler_min_lr", + ]: + is_relevant_scheduler_param = True + elif scheduler_type == "cosine" and param_name in [ + "scheduler_t_max", + "scheduler_min_lr", + ]: + is_relevant_scheduler_param = True + elif scheduler_type == "onecycle" and param_name == "scheduler_div_factor": + is_relevant_scheduler_param = True + elif param_name not in [ + "scheduler_patience", + "scheduler_factor", + "scheduler_min_lr", + "scheduler_t_max", + "scheduler_div_factor", + ]: + # Not a scheduler-specific param, suggest normally + is_relevant_scheduler_param = True + + if is_relevant_scheduler_param: + # Suggest parameter + if param_config["type"] == "float": + if "step" in param_config: + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + 
param_config["high"], + step=param_config["step"], + log=param_config.get("log", False), + ) + else: + current_params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + log=param_config.get("log", False), + ) + elif param_config["type"] == "int": + if "step" in param_config: + current_params[param_name] = trial.suggest_int( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + ) + else: + current_params[param_name] = trial.suggest_int( + param_name, param_config["low"], param_config["high"] + ) + elif param_config["type"] == "categorical": + # Only suggest if not scheduler_type (already handled) + if param_name != "scheduler_type": + choices = param_config.get("choices", []) + if isinstance(choices, list) and choices: + current_params[param_name] = trial.suggest_categorical( + param_name, choices + ) + else: + logger.warning( + f"Skipping categorical param '{param_name}' due to invalid/empty choices." + ) + + # Train model and get metrics + model, metrics = train_model( + X_train, y_train, X_test, y_test, X_eval, y_eval, current_params + ) + + # Store feature importances for this trial + importance_dict = dict(zip(X_train.columns, model.feature_importances_)) + feature_importances.append(importance_dict) + + return metrics["precision"] + + # Create and run study + study = optuna.create_study(direction="maximize") + study.optimize(objective, n_trials=n_trials, n_jobs=4) + + # Calculate average feature importance across all trials + avg_importances = {} + for feature in X_train.columns: + importance_values = [trial_imp[feature] for trial_imp in feature_importances] + avg_importances[feature] = np.mean(importance_values) + + # Create DataFrame and sort by importance + importance_df = pd.DataFrame( + {"feature": list(avg_importances.keys()), "importance": list(avg_importances.values())} + ) + importance_df = importance_df.sort_values("importance", ascending=False) + + # Get top 100 
def tabnet_feature_selection_pipeline(X, y, X_eval, y_eval, target_range=(50, 70)):
    """
    Run the three-stage feature selection pipeline used for TabNet.

    Stage 1 keeps the union of the 100 best columns by mutual information and
    the 100 best by ANOVA F-score. Stage 2 fits a TabNet on the scaled stage-1
    columns and keeps the 80 with the highest importance. Stage 3 runs
    ``tabnet_sequential_selection`` and truncates its result at the first score
    plateau inside ``target_range``.

    Args:
        X: Training features (DataFrame).
        y: Training labels.
        X_eval: Evaluation features with the same columns as ``X``.
        y_eval: Evaluation labels.
        target_range: ``(min_features, max_features)`` bounds for the final set.

    Returns:
        tuple: ``(final_selected, scores)`` - the selected feature names and the
        sequential-selection scores for those features.
    """
    logger.info(f"Starting TabNet feature selection pipeline with {X.shape[1]} initial features")
    min_features, max_features = target_range

    # Stage 1: Quick filter methods (260 -> ~100)
    logger.info("Stage 1: Applying mutual information and F-test filters")

    # Mutual information for non-linear relationships
    mi_scores = mutual_info_classif(X, y, random_state=42)
    mi_top = np.argsort(mi_scores)[-100:]

    # F-test for linear relationships. Constant columns produce NaN scores and
    # argsort places NaN last, which would rank them as the "best" features,
    # so push them to the bottom instead.
    f_scores, _ = f_classif(X, y)
    f_scores = np.where(np.isnan(f_scores), -np.inf, f_scores)
    f_top = np.argsort(f_scores)[-100:]

    # Union of both filters, kept in the original column order: iterating a set
    # of strings gives a run-dependent order, which would change the column
    # layout fed to TabNet between runs.
    stage1_keep = set(X.columns[mi_top]) | set(X.columns[f_top])
    initial_features = [column for column in X.columns if column in stage1_keep]
    X_filtered = X[initial_features]
    X_eval_filtered = X_eval[initial_features]
    logger.info(f"Stage 1: Reduced to {len(initial_features)} features")

    # Stage 2: TabNet-based importance (100 -> ~80)
    logger.info("Stage 2: Using TabNet for feature importance ranking")

    tabnet_selector = TabNetClassifier(
        n_d=64,
        n_a=64,
        n_steps=5,
        lambda_sparse=1e-3,
        optimizer_params=dict(lr=2e-2),
        verbose=0,
        device_name=device,
    )

    # Apply scaling for TabNet feature selection (extra return values are unused here)
    X_train_np, X_val_np, _, _ = preprocess_data_with_scaling(
        X_filtered, X_eval_filtered, scaling_method=SCALING_METHOD
    )
    y_train_np = y.values.ravel() if hasattr(y, "values") else np.array(y).ravel()
    y_val_np = y_eval.values.ravel() if hasattr(y_eval, "values") else np.array(y_eval).ravel()

    tabnet_selector.fit(
        X_train_np,
        y_train_np,
        eval_set=[(X_val_np, y_val_np)],
        eval_metric=["auc"],
        max_epochs=100,
        patience=15,
    )

    # Get importance and select top features (ascending order: most important last)
    importance = tabnet_selector.feature_importances_
    importance_idx = np.argsort(importance)[-80:]
    stage2_features = [initial_features[i] for i in importance_idx]

    logger.info(f"Stage 2: Reduced to {len(stage2_features)} features: {stage2_features}")

    # Stage 3: Fine-tuned sequential selection (80 -> 50-70)
    logger.info("Stage 3: Sequential feature selection for optimal subset")

    X_stage2 = X[stage2_features]
    final_features, scores = tabnet_sequential_selection(
        X_stage2, y, target_features=max_features
    )

    # Select optimal number based on score plateau. A plateau is a step whose
    # score gain falls in the lowest 20% of all gains; take the first one at or
    # beyond the minimum size. Looking only at the very first plateau would
    # nearly always land below the minimum and fall through to the maximum.
    optimal_count = max_features
    if len(scores) >= 2:  # np.diff needs two scores to produce a gain
        score_diffs = np.diff(scores)
        plateau_points = np.where(score_diffs < np.percentile(score_diffs, 20))[0]
        eligible = plateau_points[plateau_points >= min_features]
        if len(eligible) > 0:
            optimal_count = min(int(eligible[0]) + 1, max_features)

    final_selected = final_features[:optimal_count]

    logger.info(f"Stage 3: Final selection of {len(final_selected)} features")
    logger.info("Feature selection pipeline completed successfully")

    return final_selected, scores[:optimal_count]
def tabnet_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_features=100):
    """
    Select features with a two-stage TabNet importance ranking.

    Stage 1 fits a small TabNet on every column and keeps the 200 columns with
    the highest importance. Stage 2 fits a larger TabNet on each of five
    stratified folds of those columns, averages the per-fold importances and
    keeps the ``target_features`` best columns.

    Args:
        X: Training features (DataFrame).
        y: Training labels.
        X_test: Features used as the early-stopping set in stage 1.
        y_test: Labels for ``X_test``.
        X_eval: Features every fold model is scored on (ROC AUC).
        y_eval: Labels for ``X_eval``.
        target_features: Number of features to return.

    Returns:
        tuple: ``(features, importance)``. ``features`` is ordered from least
        to most important. ``importance`` is the fold-averaged importance over
        the stage-1 columns, or - when every fold failed - the stage-1
        importance over all columns of ``X``.
    """
    logger.info(f"Starting TabNet staged selection with {X.shape[1]} initial features")

    # Stage 1: Quick filter with simplified TabNet
    logger.info("Stage 1: Quick filter with simplified TabNet")
    tabnet_fast = TabNetClassifier(
        n_d=16,
        n_a=16,
        n_steps=2,
        lambda_sparse=1e-3,
        optimizer_params=dict(lr=5e-2),
        verbose=0,
        device_name=device,
    )

    # Fit with early stopping
    tabnet_fast.fit(
        X.values,
        y,
        eval_set=[(X_test.values, y_test)],
        max_epochs=50,
        patience=10,
        eval_metric=["auc"],
    )

    # argsort is ascending, so the most important column is the LAST entry.
    stage1_importance = tabnet_fast.feature_importances_
    stage1_features = X.columns[np.argsort(stage1_importance)[-200:]].tolist()

    logger.info(f"Stage 1: Selected {len(stage1_features)} features")

    # Stage 2: Refined selection with cross-validation
    logger.info("Stage 2: Refined selection with cross-validation")
    X_stage1 = X[stage1_features]
    X_eval_stage1 = X_eval[stage1_features]
    # The fold indices are positional; index a plain array so a pandas Series
    # with a non-default index is not looked up by label.
    y_values = y.values if hasattr(y, "values") else np.asarray(y)

    cv_scores = []
    cv_importances = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in skf.split(X_stage1, y):
        X_train_cv, X_val_cv = X_stage1.iloc[train_idx], X_stage1.iloc[val_idx]
        y_train_cv, y_val_cv = y_values[train_idx], y_values[val_idx]

        try:
            # Fresh model per fold so no state is carried between folds.
            tabnet_refined = TabNetClassifier(
                n_d=32,
                n_a=32,
                n_steps=3,
                lambda_sparse=1e-3,
                optimizer_params=dict(lr=2e-2),
                verbose=0,
                device_name=device,
            )
            tabnet_refined.fit(
                X_train_cv.values,
                y_train_cv,
                eval_set=[(X_val_cv.values, y_val_cv)],
                max_epochs=100,
                patience=15,
                eval_metric=["auc"],
            )
            fold_importance = tabnet_refined.feature_importances_

            # Evaluate on eval set
            y_pred_proba = tabnet_refined.predict_proba(X_eval_stage1.values)[:, 1]
            val_score = roc_auc_score(y_eval, y_pred_proba)
        except Exception as e:
            logger.warning(f"Error in CV fold: {str(e)}")
            continue

        # Record a fold only once both training and scoring succeeded, so the
        # importances and the scores always describe the same folds.
        cv_importances.append(fold_importance)
        cv_scores.append(val_score)

    if not cv_importances:
        logger.error("No successful CV folds, falling back to stage 1 features")
        # Take the tail: the list is ascending, so the head holds the weakest
        # of the stage-1 features.
        return stage1_features[-target_features:], stage1_importance

    # Average importance across folds
    avg_importance = np.mean(cv_importances, axis=0)
    stage2_features = [stage1_features[i] for i in np.argsort(avg_importance)[-target_features:]]

    logger.info(f"Stage 2: Selected {len(stage2_features)} features: {stage2_features}")
    logger.info(f"CV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

    return stage2_features, avg_importance
IMPROVEMENT 1: Better Stage 1 configuration + logger.info("Stage 1: Enhanced quick filter with optimized TabNet") + tabnet_fast = TabNetClassifier( + n_d=32, + n_a=32, + n_steps=4, # Increased capacity + gamma=1.5, # Feature selection strength + lambda_sparse=1e-4, # Reduced sparsity for more features + optimizer_params=dict(lr=2e-2, weight_decay=1e-5), + scheduler_params=dict(step_size=20, gamma=0.8), # Learning rate scheduling + mask_type="entmax", # Better feature selection + verbose=0, + device_name=device, + seed=42, # Reproducibility + ) + + # IMPROVEMENT 2: Better training configuration + tabnet_fast.fit( + X.values, + y, + eval_set=[(X_test.values, y_test)], + max_epochs=100, # Increased epochs + patience=20, # More patience + batch_size=1024, # Larger batch size + virtual_batch_size=256, + eval_metric=["auc", "logloss"], + drop_last=False, + ) + + # Select more features in stage 1 + stage1_importance = tabnet_fast.feature_importances_ + stage1_features = X.columns[np.argsort(stage1_importance)[-250:]].tolist() # Increased from 200 + + logger.info(f"Stage 1: Selected {len(stage1_features)} features") + + # IMPROVEMENT 3: Enhanced Stage 2 with better architecture + logger.info("Stage 2: Enhanced refined selection with optimized cross-validation") + X_stage1 = X[stage1_features] + X_eval_stage1 = X_eval[stage1_features] + + # IMPROVEMENT 4: Enhanced cross-validation with better evaluation + cv_scores = [] + cv_importances = [] + successful_folds = 0 + + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) + for fold, (train_idx, val_idx) in enumerate(skf.split(X_stage1, y)): + logger.info(f"Processing fold {fold + 1}/5") + + X_train_cv, X_val_cv = X_stage1.iloc[train_idx], X_stage1.iloc[val_idx] + y_train_cv, y_val_cv = y[train_idx], y[val_idx] + + try: + # Reset model for each fold + tabnet_fold = TabNetClassifier( + n_d=64, + n_a=64, + n_steps=5, + gamma=1.3, + lambda_sparse=5e-5, + optimizer_params=dict(lr=1e-2, weight_decay=1e-5), + 
def apply_outlier_removal(X_train, y_train, X_test, y_test, X_eval, y_eval):
    """
    Drop Isolation Forest outliers from the training split and log the effect.

    Only the training data is filtered; the test and evaluation splits are
    passed through untouched.

    Args:
        X_train: Training features
        y_train: Training labels
        X_test: Test features
        y_test: Test labels
        X_eval: Evaluation features
        y_eval: Evaluation labels

    Returns:
        Tuple ``(X_train_clean, y_train_clean, X_test, y_test, X_eval, y_eval)``
    """
    logger.info("Applying Isolation Forest outlier removal to training data")

    # Snapshot of the unfiltered training split, used for the before/after comparison
    features_before, labels_before = X_train.copy(), y_train.copy()

    logger.info("Preprocessing data with persistent scaler for outlier detection")

    features_after, labels_after = remove_outliers_isolation_forest(
        X_train=X_train,
        y_train=y_train,
        contamination=0.05,
        random_state=42,
        logger=logger,
    )
    result = (features_after, labels_after, X_test, y_test, X_eval, y_eval)

    # Nothing was dropped: report it and skip the impact analysis
    if not len(features_after) < len(features_before):
        logger.info("No outliers were detected/removed")
        return result

    impact = analyze_outlier_impact(
        X_before=features_before,
        y_before=labels_before,
        X_after=features_after,
        y_after=labels_after,
        logger=logger,
    )

    # Structured summary of what the removal changed
    logger.info("=== Outlier Removal Impact Analysis ===")
    logger.info(f"Samples before: {impact['samples_before']:,}")
    logger.info(f"Samples after: {impact['samples_after']:,}")
    logger.info(f"Samples removed: {impact['samples_removed']:,}")
    logger.info(f"Removal percentage: {impact['removal_percentage']:.2f}%")
    logger.info(f"Positive class rate before: {impact['positive_rate_before']:.4f}")
    logger.info(f"Positive class rate after: {impact['positive_rate_after']:.4f}")
    logger.info(f"Class distribution change: {impact['class_distribution_change']:.4f}")
    logger.info("=========================================")

    return result
# Create a custom metric that heavily weights precision
class PrecisionFocusedMetric(Metric):
    """
    F-beta evaluation metric that favours precision and enforces a recall floor.

    Scores are thresholded into labels, precision and recall are computed, and
    the F-beta value is returned. The result is ``0.0`` when recall is below
    the floor or when the inputs cannot be interpreted.

    Args:
        beta: F-beta weight; values below 1 favour precision over recall.
        min_recall: Recall floor. ``None`` (default) uses the module-level
            ``min_recall`` setting, which is the previous behaviour.
        threshold: Score above which a sample is labelled positive.
    """

    def __init__(self, beta=0.5, min_recall=None, threshold=0.5):
        self._name = "precision_focused"
        self._maximize = True
        self.beta = beta
        self.min_recall = min_recall
        self.threshold = threshold

    @staticmethod
    def _flatten_target(y_true):
        """Return ``y_true`` as a 1-D integer array, or ``None`` if that is impossible."""
        y_true_type = type_of_target(y_true)
        if y_true_type == "multilabel-indicator":
            # Assuming binary classification represented as one-hot:
            # take the argmax along the class axis.
            return np.argmax(y_true, axis=1)
        if y_true_type == "binary":
            # ravel() also covers single-column (n, 1) targets
            return y_true.astype(int).ravel()

        logger.warning(
            f"Unexpected y_true type '{y_true_type}' in PrecisionFocusedMetric. Attempting to flatten."
        )
        try:
            return y_true.astype(int).ravel()  # General attempt to flatten
        except Exception as e:
            logger.error(f"Could not convert y_true to 1D array: {e}")
            return None

    def _to_labels(self, y_score):
        """Threshold scores into 0/1 labels, or return ``None`` for an unsupported shape."""
        if y_score.ndim == 2 and y_score.shape[1] == 2:
            # Two columns: use the probability of the positive class
            return (y_score[:, 1] > self.threshold).astype(int)
        if y_score.ndim == 1 or (y_score.ndim == 2 and y_score.shape[1] == 1):
            # Already a single score per sample
            return (y_score.ravel() > self.threshold).astype(int)
        logger.error(f"Unexpected y_score shape {y_score.shape} in PrecisionFocusedMetric.")
        return None

    def __call__(self, y_true, y_score):
        """F-beta score with beta < 1 to favor precision over recall"""
        y_true = np.asarray(y_true)
        y_score = np.asarray(y_score)

        y_true_flat = self._flatten_target(y_true)
        if y_true_flat is None:
            return 0.0  # Return 0 score if conversion fails

        pred = self._to_labels(y_score)
        if pred is None:
            return 0.0  # Return 0 score if y_score format is wrong

        # Calculate precision and recall safely
        try:
            precision = precision_score(y_true_flat, pred, zero_division=0)
            recall = recall_score(y_true_flat, pred, zero_division=0)
        except ValueError as e:
            logger.error(f"Error calculating scores in PrecisionFocusedMetric: {e}")
            logger.error(
                f"y_true_flat sample: {y_true_flat[:5]}, shape: {y_true_flat.shape}, type: {type_of_target(y_true_flat)}"
            )
            logger.error(
                f"pred sample: {pred[:5]}, shape: {pred.shape}, type: {type_of_target(pred)}"
            )
            return 0.0  # Return 0 score if scikit-learn metric fails

        # Below the recall floor the configuration is rejected outright.
        # Here `min_recall` is the module-level setting, used when no explicit
        # floor was given to the constructor.
        recall_floor = self.min_recall if self.min_recall is not None else min_recall
        if recall < recall_floor:
            return 0.0

        # F-beta with beta < 1 favors precision; the epsilon guards 0/0
        beta_sq = self.beta**2
        return (1 + beta_sq) * (precision * recall) / (beta_sq * precision + recall + 1e-8)
training") + + # --- Hyperparameter Optimization with Feature Importance --- + # best_params, importance_df = hypertune_with_feature_importance( + # X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50 + # ) + + # === Run Hypertuning === + best_params_final, best_metrics_final = hypertune_tabnet( + experiment_name, X_train, y_train, X_test, y_test, X_eval, y_eval + ) + + # === Run Feature Selection === + final_selected, scores = improved_tabnet_staged_selection( + X_train, y_train, X_test, y_test, X_eval, y_eval, target_features=80 + ) + # final_selected, scores = tabnet_feature_selection_pipeline(X_train, y_train, X_eval, y_eval) + + # train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval) + + except Exception as e: + logger.error(f"Error in main execution: {str(e)}") + + +if __name__ == "__main__": + main() diff --git a/src/models/StackedEnsemble/base/tree_based/catboost_model.py b/src/models/StackedEnsemble/base/tree_based/catboost_model.py index 7ffe7e9..4e23054 100644 --- a/src/models/StackedEnsemble/base/tree_based/catboost_model.py +++ b/src/models/StackedEnsemble/base/tree_based/catboost_model.py @@ -8,7 +8,6 @@ The implementation focuses on high precision while maintaining a minimum recall threshold. 
""" -import gc import os import random import time @@ -19,7 +18,6 @@ import numpy as np import optuna import pandas as pd -import torch from catboost import Pool from sklearn.feature_selection import RFECV from sklearn.model_selection import StratifiedKFold @@ -81,11 +79,29 @@ def load_hyperparameter_space(): "learning_rate": {"type": "float", "low": 0.060, "high": 0.25, "log": False, "step": 0.005}, "depth": {"type": "int", "low": 5, "high": 12, "log": False, "step": 1}, "min_data_in_leaf": {"type": "int", "low": 50, "high": 500, "log": False, "step": 10}, - "colsample_bylevel": {"type": "float", "low": 0.50, "high": 0.75, "log": False, "step": 0.01}, + "colsample_bylevel": { + "type": "float", + "low": 0.50, + "high": 0.75, + "log": False, + "step": 0.01, + }, "subsample": {"type": "float", "low": 0.40, "high": 0.75, "log": False, "step": 0.005}, - "bagging_temperature": {"type": "float", "low": 0.5, "high": 10.0, "log": False, "step": 0.05}, + "bagging_temperature": { + "type": "float", + "low": 0.5, + "high": 10.0, + "log": False, + "step": 0.05, + }, "reg_lambda": {"type": "float", "low": 2.0, "high": 20.0, "log": False, "step": 0.1}, - "leaf_estimation_iterations": {"type": "int", "low": 2, "high": 14, "log": False, "step": 1}, + "leaf_estimation_iterations": { + "type": "int", + "low": 2, + "high": 14, + "log": False, + "step": 1, + }, "early_stopping_rounds": {"type": "int", "low": 50, "high": 1500, "log": False, "step": 10}, "scale_pos_weight": {"type": "float", "low": 1.8, "high": 3.5, "log": False, "step": 0.01}, "max_bin": {"type": "int", "low": 32, "high": 256, "log": False, "step": 16}, @@ -140,7 +156,7 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): # Create Pool objects for CatBoost train_pool = Pool(X_combined, y_combined) eval_pool = Pool(X_eval, y_eval) - + # Extract early stopping rounds early_stopping_rounds = model_params.pop("early_stopping_rounds", 100) @@ -236,7 +252,7 @@ def objective(trial): if score > 
0.36 and score > best_score: log_to_mlflow(model, metrics, params, experiment_name) - + return score except Exception as e: @@ -310,7 +326,9 @@ def callback(study, trial): logger.info( f"Starting batch {batch + 1}/{num_batches} with new sampler (seed={random_seed})" ) - study.optimize(objective, n_trials=batch_size, show_progress_bar=True, callbacks=[callback], n_jobs=1) + study.optimize( + objective, n_trials=batch_size, show_progress_bar=True, callbacks=[callback], n_jobs=1 + ) # Merge current batch's top trials with global_top_trials for trial_record in top_trials: @@ -364,9 +382,7 @@ def hypertune_catboost(): # Train final model with best parameters logger.info("Training final model with best parameters") - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, best_params - ) + model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, best_params) return best_params, metrics @@ -482,7 +498,7 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval # Train final model with best parameters logger.info("Training final model with best parameters") model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) - compute_permutation_importance(model, X_eval, y_eval, metrics['threshold']) + compute_permutation_importance(model, X_eval, y_eval, metrics["threshold"]) return model, metrics @@ -490,9 +506,10 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval logger.error(f"Error in precision-focused training: {str(e)}") return None, None + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, y_val: np.ndarray, threshold: float = 0.3, n_repeats: int = 50, @@ -525,13 +542,15 @@ def compute_permutation_importance( drops = [] for i in range(n_repeats): feat_idx = feature_names.index(feat) + 1 - logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i+1}") + logger.info(f"Shuffling feature: {feat} 
({feat_idx}) - Repeat: {i + 1}") X_shuffled = X_val.copy() X_shuffled[feat] = np.random.permutation(X_shuffled[feat].values) probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) mean_drop = np.mean(drops) @@ -544,7 +563,10 @@ def compute_permutation_importance( logger.info(df_importance.head(number_of_features).to_string(index=False)) return df_importance -def select_features_rfecv(X, y, logger, min_features=150, step=1, scoring='roc_auc', random_state=19): + +def select_features_rfecv( + X, y, logger, min_features=150, step=1, scoring="roc_auc", random_state=19 +): """ Perform RFECV-based feature selection using CatBoost. Args: @@ -558,7 +580,9 @@ def select_features_rfecv(X, y, logger, min_features=150, step=1, scoring='roc_a Returns: tuple: (List[str], pd.DataFrame) """ - logger.info(f"Starting RFECV feature selection with min_features={min_features}, step={step}, scoring={scoring}") + logger.info( + f"Starting RFECV feature selection with min_features={min_features}, step={step}, scoring={scoring}" + ) params = base_params.copy() params.update( { @@ -591,39 +615,42 @@ def select_features_rfecv(X, y, logger, min_features=150, step=1, scoring='roc_a scoring=scoring, min_features_to_select=min_features, n_jobs=-1, - verbose=2 + verbose=2, ) selector.fit(X, y) selected_features = X.columns[selector.support_].tolist() importances = selector.estimator_.feature_importances_ - feature_importance_df = pd.DataFrame({ - 'feature': selected_features, - 'importance': importances - }).sort_values('importance', ascending=False) + feature_importance_df = pd.DataFrame( + {"feature": selected_features, "importance": 
importances} + ).sort_values("importance", ascending=False) logger.info(f"RFECV selected {len(selected_features)} features:") - for feat, imp in zip(feature_importance_df['feature'], feature_importance_df['importance']): + for feat, imp in zip(feature_importance_df["feature"], feature_importance_df["importance"]): logger.info(f" - {feat}: {imp}") return selected_features, feature_importance_df -def hypertune_with_feature_importance(X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50): + +def hypertune_with_feature_importance( + X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50 +): """ Perform hyperparameter optimization with Optuna while tracking feature importances. - + Args: X_train (pd.DataFrame): Training features - y_train (pd.Series): Training labels + y_train (pd.Series): Training labels X_test (pd.DataFrame): Test features y_test (pd.Series): Test labels n_trials (int): Number of optimization trials - + Returns: tuple: (best_params, feature_importance_df) """ logger.info(f"Starting hyperparameter optimization with {n_trials} trials") - + # Store feature importances across trials feature_importances = [] hyperparameter_space = load_hyperparameter_space() + def objective(trial): params = base_params.copy() # Add hyperparameters from config with step size if provided @@ -657,57 +684,55 @@ def objective(trial): param_name, param_config["low"], param_config["high"] ) elif param_config["type"] == "categorical": - params[param_name] = trial.suggest_categorical( - param_name, param_config["choices"] - ) + params[param_name] = trial.suggest_categorical(param_name, param_config["choices"]) - # Train model model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) - + # Store feature importances for this trial importance_dict = dict(zip(X_train.columns, model.feature_importances_)) feature_importances.append(importance_dict) - - return metrics['precision'] - + + return metrics["precision"] + # Create and run study - 
study = optuna.create_study(direction='maximize') + study = optuna.create_study(direction="maximize") study.optimize(objective, n_trials=n_trials) - + # Calculate average feature importance across all trials avg_importances = {} for feature in X_train.columns: importance_values = [trial_imp[feature] for trial_imp in feature_importances] avg_importances[feature] = np.mean(importance_values) - + # Create DataFrame and sort by importance - importance_df = pd.DataFrame({ - 'feature': list(avg_importances.keys()), - 'importance': list(avg_importances.values()) - }) - importance_df = importance_df.sort_values('importance', ascending=False) - + importance_df = pd.DataFrame( + {"feature": list(avg_importances.keys()), "importance": list(avg_importances.values())} + ) + importance_df = importance_df.sort_values("importance", ascending=False) + # Get top 100 features top_100_features = importance_df.head(100) - + logger.info("Top 100 features by average importance across trials:") for idx, row in top_100_features.iterrows(): logger.info(f"{row['feature']}: {row['importance']:.4f} id: {idx}") - + return study.best_params, importance_df + # Calculate class weights based on your data def calculate_class_weights(y): pos_count = np.sum(y == 1) neg_count = np.sum(y == 0) total = len(y) - + weight_pos = total / pos_count weight_neg = total / neg_count - + return [weight_neg, weight_pos] + def main(): """ Main execution function. 
@@ -715,7 +740,7 @@ def main(): try: logger.info("Starting CatBoost model training") global X_train, y_train, X_test, y_test, X_eval, y_eval, class_weights - + # Load data dataloader = DataLoader() X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data() diff --git a/src/models/StackedEnsemble/base/tree_based/feature_selection/lightgbm_boruta.py b/src/models/StackedEnsemble/base/tree_based/feature_selection/lightgbm_boruta.py index 72f972b..11bdfaf 100644 --- a/src/models/StackedEnsemble/base/tree_based/feature_selection/lightgbm_boruta.py +++ b/src/models/StackedEnsemble/base/tree_based/feature_selection/lightgbm_boruta.py @@ -32,16 +32,17 @@ os.environ["MKL_NUM_THREADS"] = "4" os.environ["OPENBLAS_NUM_THREADS"] = "4" + def run_boruta_feature_selection(X_train, y_train, X_test, y_test, X_eval, y_eval, features): """ Run Boruta feature selection using LightGBM and SHAP importance. - + Returns: list: Selected features from Boruta algorithm """ try: logger.info("Starting LightGBM Boruta feature selection") - + logger.info(f"Features: {len(features)}") X_train = prepare_data(X_train, features) X_test = prepare_data(X_test, features) @@ -57,26 +58,26 @@ def run_boruta_feature_selection(X_train, y_train, X_test, y_test, X_eval, y_eva # Define LightGBM model params = { - 'objective': 'binary', - 'metric': ['auc', 'binary_logloss'], - 'learning_rate': 0.153, - 'num_leaves': 31, - 'max_depth': 5, - 'min_child_samples': 430, - 'feature_fraction': 0.88, - 'bagging_fraction': 0.905, - 'min_split_gain': 4.82, - 'lambda_l2': 11.72, - 'lambda_l1': 36.2, - 'scale_pos_weight': 2.36, - 'verbose': -1 + "objective": "binary", + "metric": ["auc", "binary_logloss"], + "learning_rate": 0.153, + "num_leaves": 31, + "max_depth": 5, + "min_child_samples": 430, + "feature_fraction": 0.88, + "bagging_fraction": 0.905, + "min_split_gain": 4.82, + "lambda_l2": 11.72, + "lambda_l1": 36.2, + "scale_pos_weight": 2.36, + "verbose": -1, } lgb_clf = lgb.LGBMClassifier(**params) # 
Run BorutaShap feature_selector = BorutaShap( model=lgb_clf, - importance_measure='shap', # or 'gini' + importance_measure="shap", # or 'gini' classification=True, # pvalue=0.10 ) @@ -85,8 +86,8 @@ def run_boruta_feature_selection(X_train, y_train, X_test, y_test, X_eval, y_eva y=y_train, n_trials=500, # Number of Boruta iterations sample=False, # Set to True for large datasets - train_or_test='train', # Use test set for SHAP values - verbose=True + train_or_test="train", # Use test set for SHAP values + verbose=True, ) # Get selected features @@ -95,15 +96,15 @@ def run_boruta_feature_selection(X_train, y_train, X_test, y_test, X_eval, y_eva logger.info(f"Number of selected features: {len(selected_features)}") # Save results - feature_selector.results_to_csv(filename='feature_importance') - + feature_selector.results_to_csv(filename="feature_importance") + # Optionally, transform your data X_train_selected = feature_selector.transform(X_train) logger.info(f"Transformed training data shape: {X_train_selected.shape}") - + logger.info("Boruta feature selection completed successfully") return selected_features - + except Exception as e: logger.error(f"Error: {e}") logger.error(f"Error type: {type(e)}") @@ -112,11 +113,12 @@ def run_boruta_feature_selection(X_train, y_train, X_test, y_test, X_eval, y_eva logger.error("Exiting the program") return [] + def lightgbm_staged_selection(X, y, X_eval, y_eval, target_features=80): """Multi-stage LightGBM feature selection with different objectives""" - + logger.info(f"Starting LightGBM staged selection with {X.shape[1]} initial features") - eval_metrics = ['auc', 'binary_logloss'] + eval_metrics = ["auc", "binary_logloss"] # Stage 1: Quick filter with high learning rate logger.info("Stage 1: Quick filter with high learning rate") lgb_fast = lgb.LGBMClassifier( @@ -126,7 +128,7 @@ def lightgbm_staged_selection(X, y, X_eval, y_eval, target_features=80): subsample=0.8, colsample_bytree=0.8, random_state=42, - verbose=-1 + 
verbose=-1, ) lgb_fast.fit(X, y, eval_set=[(X_eval, y_eval)], eval_metric=eval_metrics) @@ -155,12 +157,12 @@ def lightgbm_staged_selection(X, y, X_eval, y_eval, target_features=80): path_smooth=0.0, reg_alpha=0.0, reg_lambda=0.0, - objective='binary', - metric=['aucpr', 'binary_logloss'], - device='cpu', + objective="binary", + metric=["aucpr", "binary_logloss"], + device="cpu", n_jobs=8, random_state=19, - verbose=-1 + verbose=-1, ) # Cross-validation feature importance @@ -172,7 +174,13 @@ def lightgbm_staged_selection(X, y, X_eval, y_eval, target_features=80): X_train_cv, X_val_cv = X_stage1.iloc[train_idx], X_stage1.iloc[val_idx] y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx] - lgb_refined.fit(X_train_cv, y_train_cv, eval_set=[(X_val_cv, y_val_cv)], eval_metric=eval_metrics, callbacks=[lgb.early_stopping(stopping_rounds=200)]) + lgb_refined.fit( + X_train_cv, + y_train_cv, + eval_set=[(X_val_cv, y_val_cv)], + eval_metric=eval_metrics, + callbacks=[lgb.early_stopping(stopping_rounds=200)], + ) cv_importances.append(lgb_refined.feature_importances_) val_score = lgb_refined.score(X_eval_stage1, y_eval) @@ -181,21 +189,22 @@ def lightgbm_staged_selection(X, y, X_eval, y_eval, target_features=80): # Average importance across folds avg_importance = np.mean(cv_importances, axis=0) stage2_features = [stage1_features[i] for i in np.argsort(avg_importance)[-target_features:]] - + # Log average importances for the selected features selected_indices = np.argsort(avg_importance)[-target_features:] selected_importances = avg_importance[selected_indices] feature_importance_pairs = list(zip(stage2_features, selected_importances)) - + logger.info(f"Stage 2: Selected {len(stage2_features)} features: {stage2_features}") logger.info(f"Feature-importance pairs: {feature_importance_pairs}") logger.info(f"CV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}") return stage2_features, avg_importance + def xgboost_staged_selection(X, y, X_eval, y_eval, 
target_features=80): """Multi-stage XGBoost feature selection with different objectives""" - + logger.info(f"Starting XGBoost staged selection with {X.shape[1]} initial features") eval_metrics = ["aucpr", "error", "logloss"] # Stage 1: Quick filter with high learning rate @@ -207,7 +216,7 @@ def xgboost_staged_selection(X, y, X_eval, y_eval, target_features=80): subsample=0.8, colsample_bytree=0.8, random_state=42, - eval_metric=eval_metrics + eval_metric=eval_metrics, ) xgb_fast.fit(X, y, eval_set=[(X_eval, y_eval)], verbose=False) @@ -234,7 +243,7 @@ def xgboost_staged_selection(X, y, X_eval, y_eval, target_features=80): min_child_weight=1, scale_pos_weight=1.0, random_state=42, - eval_metric=eval_metrics + eval_metric=eval_metrics, ) # Cross-validation feature importance @@ -260,98 +269,107 @@ def xgboost_staged_selection(X, y, X_eval, y_eval, target_features=80): selected_indices = np.argsort(avg_importance)[-target_features:] selected_importances = avg_importance[selected_indices] feature_importance_pairs = list(zip(stage2_features, selected_importances)) - + logger.info(f"Stage 2: Selected {len(stage2_features)} features: {stage2_features}") logger.info(f"Feature-importance pairs: {feature_importance_pairs}") logger.info(f"CV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}") return stage2_features, avg_importance + def ensemble_gbm_selection(X, y, X_test, y_test, X_eval, y_eval, target_features=80): """ Combine XGBoost and LightGBM for robust feature selection. 
""" logger.info("Starting ensemble GBM feature selection") - + # Get features from both models - xgb_features, _ = xgboost_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_features + 20) - lgb_features, _ = lightgbm_staged_selection(X, y, X_test, y_test, X_eval, y_eval, target_features + 20) - + xgb_features, _ = xgboost_staged_selection( + X, y, X_test, y_test, X_eval, y_eval, target_features + 20 + ) + lgb_features, _ = lightgbm_staged_selection( + X, y, X_test, y_test, X_eval, y_eval, target_features + 20 + ) + # Feature voting system feature_votes = {} - + # XGBoost votes (weighted by rank) for i, feature in enumerate(xgb_features): weight = (len(xgb_features) - i) / len(xgb_features) feature_votes[feature] = feature_votes.get(feature, 0) + weight - + # LightGBM votes (weighted by rank) for i, feature in enumerate(lgb_features): weight = (len(lgb_features) - i) / len(lgb_features) feature_votes[feature] = feature_votes.get(feature, 0) + weight - + # Select top voted features sorted_features = sorted(feature_votes.items(), key=lambda x: x[1], reverse=True) final_features = [feature for feature, _ in sorted_features[:target_features]] - + # Validation with both models X_selected = X[final_features] - + # XGBoost validation - xgb_val = xgb.XGBClassifier(n_estimators=200, random_state=42, eval_metric='logloss') + xgb_val = xgb.XGBClassifier(n_estimators=200, random_state=42, eval_metric="logloss") try: - xgb_scores = cross_val_score(xgb_val, X_selected, y, cv=5, scoring='roc_auc') + xgb_scores = cross_val_score(xgb_val, X_selected, y, cv=5, scoring="roc_auc") except AttributeError as e: if "__sklearn_tags__" in str(e): - logger.warning("XGBoost compatibility issue with scikit-learn. Using manual cross-validation.") - + logger.warning( + "XGBoost compatibility issue with scikit-learn. Using manual cross-validation." 
+ ) + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) xgb_scores_list = [] - + for train_idx, val_idx in skf.split(X_selected, y): X_train_cv, X_val_cv = X_selected.iloc[train_idx], X_selected.iloc[val_idx] y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx] - + xgb_val.fit(X_train_cv, y_train_cv) y_pred_proba = xgb_val.predict_proba(X_val_cv)[:, 1] score = roc_auc_score(y_val_cv, y_pred_proba) xgb_scores_list.append(score) - + xgb_scores = np.array(xgb_scores_list) else: raise e - + # LightGBM validation lgb_val = lgb.LGBMClassifier(n_estimators=200, random_state=42, verbose=-1) try: - lgb_scores = cross_val_score(lgb_val, X_selected, y, cv=5, scoring='roc_auc') + lgb_scores = cross_val_score(lgb_val, X_selected, y, cv=5, scoring="roc_auc") except AttributeError as e: if "__sklearn_tags__" in str(e): - logger.warning("LightGBM compatibility issue with scikit-learn. Using manual cross-validation.") + logger.warning( + "LightGBM compatibility issue with scikit-learn. Using manual cross-validation." 
+ ) skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) lgb_scores_list = [] - + for train_idx, val_idx in skf.split(X_selected, y): X_train_cv, X_val_cv = X_selected.iloc[train_idx], X_selected.iloc[val_idx] y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx] - + lgb_val.fit(X_train_cv, y_train_cv) y_pred_proba = lgb_val.predict_proba(X_val_cv)[:, 1] score = roc_auc_score(y_val_cv, y_pred_proba) lgb_scores_list.append(score) - + lgb_scores = np.array(lgb_scores_list) else: raise e - + logger.info(f"Selected {len(final_features)} features: {final_features}") logger.info(f"XGBoost CV AUC: {xgb_scores.mean():.4f} ± {xgb_scores.std():.4f}") logger.info(f"LightGBM CV AUC: {lgb_scores.mean():.4f} ± {lgb_scores.std():.4f}") - + return final_features, { - 'xgb_scores': xgb_scores, - 'lgb_scores': lgb_scores, - 'feature_votes': feature_votes + "xgb_scores": xgb_scores, + "lgb_scores": lgb_scores, + "feature_votes": feature_votes, } @@ -376,10 +394,12 @@ def main(): X_combined = pd.concat([X_train, X_test], axis=0, ignore_index=True) y_combined = pd.concat([y_train, y_test], axis=0, ignore_index=True) logger.info(f"Combined dataset shape: {X_combined.shape}") - stage1_features, avg_importance = xgboost_staged_selection(X_combined, y_combined, X_eval, y_eval, target_features=150) - stage2_features, avg_importance = lightgbm_staged_selection(X_combined, y_combined, X_eval, y_eval, target_features=150) - - + stage1_features, avg_importance = xgboost_staged_selection( + X_combined, y_combined, X_eval, y_eval, target_features=150 + ) + stage2_features, avg_importance = lightgbm_staged_selection( + X_combined, y_combined, X_eval, y_eval, target_features=150 + ) if __name__ == "__main__": diff --git a/src/models/StackedEnsemble/base/tree_based/feature_selection/random_forest_features.py b/src/models/StackedEnsemble/base/tree_based/feature_selection/random_forest_features.py index 9b63a62..bb5c934 100644 --- 
a/src/models/StackedEnsemble/base/tree_based/feature_selection/random_forest_features.py +++ b/src/models/StackedEnsemble/base/tree_based/feature_selection/random_forest_features.py @@ -74,35 +74,35 @@ def load_hyperparameter_space_for_hpo(): hyperparameter_space = { "n_estimators": { "type": "int", - "low": 800, # Focus on range of top performers + "low": 800, # Focus on range of top performers "high": 1300, # Cover the successful range - "step": 10, # Larger step to save computation + "step": 10, # Larger step to save computation }, "max_depth": { - "type": "categorical", - "choices": [6, 7, 8, 9, 18, 19, 20, 21], + "type": "categorical", + "choices": [6, 7, 8, 9, 18, 19, 20, 21], }, "min_samples_split": { "type": "int", - "low": 30, - "high": 80, - "step": 2, + "low": 30, + "high": 80, + "step": 2, }, "min_samples_leaf": { "type": "int", - "low": 16, - "high": 70, - "step": 2, + "low": 16, + "high": 70, + "step": 2, }, "max_features": { - "type": "categorical", - "choices": [0.22, 0.24, 0.26, 0.52, 0.70, 0.74, 0.84, 0.88, 0.98, 1.0], + "type": "categorical", + "choices": [0.22, 0.24, 0.26, 0.52, 0.70, 0.74, 0.84, 0.88, 0.98, 1.0], }, "class_weight": { "type": "float", - "low": 1.6, - "high": 3.5, - "step": 0.05, + "low": 1.6, + "high": 3.5, + "step": 0.05, }, } return hyperparameter_space @@ -308,7 +308,9 @@ def callback(study, trial): logger.info( f"Starting batch {batch + 1}/{num_batches} with new sampler (seed={random_seed})" ) - study.optimize(objective, n_trials=batch_size, show_progress_bar=True, callbacks=[callback], n_jobs=8) + study.optimize( + objective, n_trials=batch_size, show_progress_bar=True, callbacks=[callback], n_jobs=8 + ) # Merge current batch's top trials with global_top_trials for trial_record in top_trials: @@ -357,9 +359,7 @@ def hypertune_random_forest(experiment_name: str): # Train final model with best parameters logger.info("Training final model with best parameters") - model, metrics = train_model( - X_train, y_train, X_test, 
y_test, X_eval, y_eval, best_params - ) + model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, best_params) return best_params, metrics @@ -467,7 +467,9 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval return None, None -def select_top_features_rf(model: RandomForestClassifier, X_features: pd.DataFrame, n_features: int = 40) -> list[str]: +def select_top_features_rf( + model: RandomForestClassifier, X_features: pd.DataFrame, n_features: int = 40 +) -> list[str]: """ Selects the top N features based on Random Forest feature importances. @@ -479,8 +481,10 @@ def select_top_features_rf(model: RandomForestClassifier, X_features: pd.DataFra Returns: A list of the names of the top N features. """ - if not hasattr(model, 'feature_importances_'): - raise ValueError("The provided model has not been trained yet or does not support feature importances.") + if not hasattr(model, "feature_importances_"): + raise ValueError( + "The provided model has not been trained yet or does not support feature importances." 
+ ) importances = model.feature_importances_ feature_names = X_features.columns @@ -488,18 +492,19 @@ def select_top_features_rf(model: RandomForestClassifier, X_features: pd.DataFra if len(importances) != len(feature_names): raise ValueError("Mismatch between the number of feature importances and feature names.") - feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}) - feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False) + feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}) + feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False) - top_features = feature_importance_df['Feature'].head(n_features).tolist() + top_features = feature_importance_df["Feature"].head(n_features).tolist() logger.info(f"Selected top {n_features} features based on RF importance.") - logger.info(f"Top features: {top_features}") # Log the selected features for visibility + logger.info(f"Top features: {top_features}") # Log the selected features for visibility return top_features + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, y_val: np.ndarray, threshold: float = 0.3, n_repeats: int = 20, @@ -532,13 +537,15 @@ def compute_permutation_importance( drops = [] for i in range(n_repeats): feat_idx = feature_names.index(feat) + 1 - logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i+1}") + logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i + 1}") X_shuffled = X_val.copy() X_shuffled[feat] = np.random.permutation(X_shuffled[feat].values) probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + 
np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) mean_drop = np.mean(drops) @@ -551,6 +558,7 @@ def compute_permutation_importance( logger.info(df_importance.head(number_of_features).to_string(index=False)) return df_importance + def compute_sklearn_permutation_importance( model, X_val: pd.DataFrame, @@ -562,7 +570,7 @@ def compute_sklearn_permutation_importance( ) -> pd.DataFrame: """ Compute permutation feature importance using sklearn's built-in function. - + Args: model: Trained model with predict_proba(X) method. X_val: Validation features (DataFrame). @@ -571,32 +579,31 @@ def compute_sklearn_permutation_importance( random_state: Seed for reproducibility. n_jobs: Number of parallel jobs (-1 for all cores). number_of_features: Number of top features to display in logs. - + Returns: DataFrame with columns: ['feature', 'importance_mean', 'importance_std'], sorted descending. """ feature_names = X_val.columns.tolist() - + logger.info(f"Computing permutation importance with {n_repeats} repeats...") - + # Permutation importance - more reliable than built-in perm_importance = permutation_importance( - model, X_val, y_val, - n_repeats=n_repeats, - random_state=random_state, - n_jobs=n_jobs + model, X_val, y_val, n_repeats=n_repeats, random_state=random_state, n_jobs=n_jobs ) - + # Create permutation importance DataFrame - perm_df = pd.DataFrame({ - 'feature': feature_names, - 'importance_mean': perm_importance.importances_mean, - 'importance_std': perm_importance.importances_std - }).sort_values('importance_mean', ascending=False) - + perm_df = pd.DataFrame( + { + "feature": feature_names, + "importance_mean": perm_importance.importances_mean, + "importance_std": perm_importance.importances_std, + } + ).sort_values("importance_mean", ascending=False) + logger.info("Top features by sklearn permutation importance:") logger.info(perm_df.head(number_of_features).to_string(index=False)) - + return perm_df @@ -606,15 +613,15 @@ def 
optimal_feature_selection_pipeline(X, y, target_range=(50, 70)): Returns both the transformed data and the final feature names """ logger.info(f"Starting with {X.shape[1]} features") - + # Convert to DataFrame if it's not already, and get original feature names - if hasattr(X, 'columns'): + if hasattr(X, "columns"): original_features = X.columns.tolist() X_array = X.values else: original_features = [f"feature_{i}" for i in range(X.shape[1])] X_array = X - + # Stage 1: Quick Filter Methods (260+ → ~150) # Remove low-variance features variance_selector = VarianceThreshold(threshold=0.01) @@ -623,7 +630,7 @@ def optimal_feature_selection_pipeline(X, y, target_range=(50, 70)): variance_mask = variance_selector.get_support() features_after_variance = [original_features[i] for i, keep in enumerate(variance_mask) if keep] logger.info(f"After variance filtering: {X_var.shape[1]} features") - + # Remove highly correlated features corr_matrix = np.corrcoef(X_var.T) high_corr_pairs = np.where(np.abs(corr_matrix) > 0.95) @@ -631,13 +638,13 @@ def optimal_feature_selection_pipeline(X, y, target_range=(50, 70)): for i, j in zip(high_corr_pairs[0], high_corr_pairs[1]): if i != j and i not in features_to_remove: features_to_remove.add(j) - + remaining_indices = [i for i in range(X_var.shape[1]) if i not in features_to_remove] X_corr = X_var[:, remaining_indices] # Track feature names after correlation filtering features_after_corr = [features_after_variance[i] for i in remaining_indices] logger.info(f"After correlation filtering: {X_corr.shape[1]} features") - + # Stage 2: Statistical Selection (150 → ~100) k_best = SelectKBest(score_func=f_classif, k=min(100, X_corr.shape[1])) X_stat = k_best.fit_transform(X_corr, y) @@ -648,16 +655,16 @@ def optimal_feature_selection_pipeline(X, y, target_range=(50, 70)): logger.info(f"Features after statistical selection: {features_after_stat}") # Stage 3: Model-based Selection (100 → 50-70) rf = RandomForestClassifier(n_estimators=100, 
max_depth=7, random_state=42, n_jobs=8) - + # Option A: Boruta for all-relevant features (with relaxed parameters) boruta = BorutaPy( - rf, - n_estimators='auto', - verbose=1, + rf, + n_estimators="auto", + verbose=1, random_state=42, alpha=0.3, # More lenient (default is 0.05) - allows more features max_iter=200, # More iterations to find features (default is 100) - perc=70 # Use 90th percentile instead of 100th for shadow features + perc=70, # Use 90th percentile instead of 100th for shadow features ) boruta.fit(X_stat, y) X_boruta = boruta.transform(X_stat) @@ -667,20 +674,22 @@ def optimal_feature_selection_pipeline(X, y, target_range=(50, 70)): logger.info(f"After Boruta selection: {X_boruta.shape[1]} features") logger.info(f"Boruta confirmed features: {sum(boruta.support_)}") logger.info(f"Boruta tentative features: {sum(boruta.support_weak_)}") - + # If Boruta is still too conservative, include tentative features if X_boruta.shape[1] < target_range[0]: # If less than 50 features logger.info("Boruta selected too few features, including tentative features...") # Combine confirmed and tentative features combined_mask = boruta.support_ | boruta.support_weak_ X_boruta_extended = X_stat[:, combined_mask] - features_after_boruta_extended = [features_after_stat[i] for i, keep in enumerate(combined_mask) if keep] + features_after_boruta_extended = [ + features_after_stat[i] for i, keep in enumerate(combined_mask) if keep + ] logger.info(f"After including tentative features: {X_boruta_extended.shape[1]} features") - + # Use the extended set for RFE X_boruta = X_boruta_extended features_after_boruta = features_after_boruta_extended - + # Option B: RFE for exact number target_features = min(target_range[1], max(target_range[0], X_boruta.shape[1])) rfe = RFE(rf, n_features_to_select=target_features, step=1) @@ -688,29 +697,29 @@ def optimal_feature_selection_pipeline(X, y, target_range=(50, 70)): # Track which features survived RFE rfe_mask = rfe.get_support() 
final_feature_names = [features_after_boruta[i] for i, keep in enumerate(rfe_mask) if keep] - + logger.info(f"Final features selected: {X_final.shape[1]}") logger.info(f"Final feature names: {final_feature_names}") - + # Log top features for visibility logger.info(f"All final features: {final_feature_names}") - + return X_final, { - 'final_feature_names': final_feature_names, - 'variance_selector': variance_selector, - 'correlation_indices': remaining_indices, - 'statistical_selector': k_best, - 'boruta_selector': boruta, - 'final_selector': rfe + "final_feature_names": final_feature_names, + "variance_selector": variance_selector, + "correlation_indices": remaining_indices, + "statistical_selector": k_best, + "boruta_selector": boruta, + "final_selector": rfe, } + def random_forest_staged_selection(X, y, X_eval, y_eval, target_features=150): """Multi-stage Random Forest feature selection with different objectives""" - + logger.info(f"Starting Random Forest staged selection with {X.shape[1]} initial features") # Combine training and test data for feature selection - - + logger.info(f"Combined data shape: {X.shape}") # Stage 1: Quick filter with fewer trees logger.info("Stage 1: Quick filter with fewer trees") @@ -719,9 +728,9 @@ def random_forest_staged_selection(X, y, X_eval, y_eval, target_features=150): max_depth=10, min_samples_split=5, min_samples_leaf=2, - max_features='sqrt', + max_features="sqrt", random_state=42, - n_jobs=-1 + n_jobs=-1, ) sample_weight = np.ones(len(y)) / len(y) rf_fast.fit(X, y, sample_weight=sample_weight) @@ -740,20 +749,20 @@ def random_forest_staged_selection(X, y, X_eval, y_eval, target_features=150): max_depth=15, min_samples_split=2, min_samples_leaf=1, - max_features='sqrt', + max_features="sqrt", bootstrap=True, random_state=42, - n_jobs=-1 + n_jobs=-1, ) # Cross-validation feature importance cv_scores = [] cv_importances = [] - val_weight = np.ones(len(y_eval)) / len(y_eval) + np.ones(len(y_eval)) / len(y_eval) skf = 
StratifiedKFold(n_splits=5, shuffle=True, random_state=42) for train_idx, val_idx in skf.split(X_stage1, y): - X_train_cv, X_val_cv = X_stage1.iloc[train_idx], X_stage1.iloc[val_idx] - y_train_cv, y_val_cv = y.iloc[train_idx], y.iloc[val_idx] + X_train_cv, _X_val_cv = X_stage1.iloc[train_idx], X_stage1.iloc[val_idx] + y_train_cv, _y_val_cv = y.iloc[train_idx], y.iloc[val_idx] sample_weight = np.ones(len(y_train_cv)) / len(y_train_cv) rf_refined.fit(X_train_cv, y_train_cv) cv_importances.append(rf_refined.feature_importances_) @@ -797,8 +806,10 @@ def main(): X_combined = pd.concat([X_train, X_test], axis=0, ignore_index=True) y_combined = pd.concat([y_train, y_test], axis=0, ignore_index=True) - stage2_features, avg_importance = random_forest_staged_selection(X_combined, y_combined, X_eval, y_eval, target_features=150) - + stage2_features, avg_importance = random_forest_staged_selection( + X_combined, y_combined, X_eval, y_eval, target_features=150 + ) + except Exception as e: logger.error(f"Error in main execution: {str(e)}") diff --git a/src/models/StackedEnsemble/base/tree_based/feature_selection/xgboost_boruta.py b/src/models/StackedEnsemble/base/tree_based/feature_selection/xgboost_boruta.py index 35c7f9a..fcf6dc8 100644 --- a/src/models/StackedEnsemble/base/tree_based/feature_selection/xgboost_boruta.py +++ b/src/models/StackedEnsemble/base/tree_based/feature_selection/xgboost_boruta.py @@ -2,7 +2,6 @@ import random import numpy as np -import pandas as pd import xgboost as xgb from BorutaShap import BorutaShap @@ -31,7 +30,7 @@ try: logger.info("Starting XGBoost model training") - + # Load data dataloader = DataLoader() X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data() @@ -53,39 +52,39 @@ # Define your XGBoost model params = { "alpha": 55.8, - "colsample_bytree": 0.885, - "eval_metric": ['aucpr', 'error', 'logloss'], + "colsample_bytree": 0.885, + "eval_metric": ["aucpr", "error", "logloss"], "gamma": 4.43, "lambda": 6.94, 
"learning_rate": 0.15, "max_depth": 10, "min_child_weight": 635, "scale_pos_weight": 2.7, - "subsample": 0.795 + "subsample": 0.795, } xgb_clf = xgb.XGBClassifier(**params) # Run BorutaShap feature_selector = BorutaShap( model=xgb_clf, - importance_measure='shap', # or 'gini' + importance_measure="shap", # or 'gini' classification=True, - pvalue=0.10 + pvalue=0.10, ) feature_selector.fit( X=X_train, y=y_train, n_trials=500, # Number of Boruta iterations sample=False, # Set to True for large datasets - train_or_test='train', # Use test set for SHAP values - verbose=True + train_or_test="train", # Use test set for SHAP values + verbose=True, ) # Get selected features selected_features = feature_selector.Subset().columns.tolist() print("Selected features:", selected_features) - feature_selector.results_to_csv(filename='feature_importance') + feature_selector.results_to_csv(filename="feature_importance") # Optionally, transform your data X_train_selected = feature_selector.transform(X_train) except Exception as e: @@ -94,4 +93,3 @@ logger.error("Failed to run XGBoost model training") logger.error("Please check the data and model parameters") logger.error("Exiting the program") - diff --git a/src/models/StackedEnsemble/base/tree_based/lightgbm_model.py b/src/models/StackedEnsemble/base/tree_based/lightgbm_model.py index 0f405b4..891f42d 100644 --- a/src/models/StackedEnsemble/base/tree_based/lightgbm_model.py +++ b/src/models/StackedEnsemble/base/tree_based/lightgbm_model.py @@ -18,6 +18,7 @@ import numpy as np import optuna import pandas as pd +from mlflow.models import infer_signature from src.utils.logger import ExperimentLogger @@ -71,17 +72,41 @@ def load_hyperparameter_space(): dict: Hyperparameter space configuration with narrowed ranges and steps. 
""" hyperparameter_space = { - "learning_rate": {"type": "float", "low": 0.045, "high": 0.18, "log": False, "step": 0.0025}, + "learning_rate": { + "type": "float", + "low": 0.045, + "high": 0.18, + "log": False, + "step": 0.0025, + }, "num_leaves": {"type": "int", "low": 55, "high": 200, "log": False, "step": 5}, "max_depth": {"type": "int", "low": 5, "high": 12, "log": False, "step": 1}, "min_child_samples": {"type": "int", "low": 200, "high": 600, "log": False, "step": 10}, - "feature_fraction": {"type": "float", "low": 0.58, "high": 0.75, "log": False, "step": 0.01}, - "bagging_fraction": {"type": "float", "low": 0.56, "high": 0.75, "log": False, "step": 0.005}, + "feature_fraction": { + "type": "float", + "low": 0.58, + "high": 0.75, + "log": False, + "step": 0.01, + }, + "bagging_fraction": { + "type": "float", + "low": 0.56, + "high": 0.75, + "log": False, + "step": 0.005, + }, "bagging_freq": {"type": "int", "low": 10, "high": 15, "log": False, "step": 1}, "reg_alpha": {"type": "float", "low": 8.0, "high": 20.0, "log": False, "step": 0.1}, "reg_lambda": {"type": "float", "low": 8.0, "high": 20.0, "log": False, "step": 0.1}, "min_split_gain": {"type": "float", "low": 0.12, "high": 0.30, "log": False, "step": 0.005}, - "early_stopping_rounds": {"type": "int", "low": 600, "high": 1200, "log": False, "step": 10}, + "early_stopping_rounds": { + "type": "int", + "low": 600, + "high": 1200, + "log": False, + "step": 10, + }, "path_smooth": {"type": "float", "low": 0.10, "high": 0.60, "log": False, "step": 0.005}, "cat_smooth": {"type": "float", "low": 20.0, "high": 35.0, "log": False, "step": 0.1}, "max_bin": {"type": "int", "low": 200, "high": 700, "log": False, "step": 10}, @@ -111,16 +136,16 @@ def create_model(model_params): raise -def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): +def train_model(x_train, y_train, x_test, y_test, x_eval, y_eval, model_params): """ Train a LightGBM model with early stopping and threshold 
optimization. Updated to match notebook implementation. Args: - X_train: Training features + x_train: Training features y_train: Training labels - X_test: Validation features + x_test: Validation features y_test: Validation labels - X_eval: Evaluation features + x_eval: Evaluation features y_eval: Evaluation labels model_params: Model parameters Returns: @@ -128,11 +153,11 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): """ try: # Combine training and validation data while preserving indexes - X_combined = pd.concat([X_train, X_test], axis=0) + x_combined = pd.concat([x_train, x_test], axis=0) y_combined = pd.concat([y_train, y_test], axis=0) # Reset indexes to ensure proper alignment - X_combined.reset_index(drop=True, inplace=True) + x_combined.reset_index(drop=True, inplace=True) y_combined.reset_index(drop=True, inplace=True) # Extract early stopping rounds if present early_stopping_rounds = model_params.pop("early_stopping_rounds", 100) @@ -141,18 +166,18 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): model = create_model(model_params) # Create eval set for early stopping - eval_set = [(X_eval, y_eval)] + eval_set = [(x_eval, y_eval)] # Fit model with early stopping model.fit( - X_combined, + x_combined, y_combined, eval_set=eval_set, callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)], ) # Get validation predictions - best_threshold, metrics = optimize_threshold(model, X_eval, y_eval, min_recall=min_recall) + _, metrics = optimize_threshold(model, x_eval, y_eval, min_recall=min_recall) return model, metrics @@ -161,56 +186,54 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): raise -def optimize_hyperparameters( - X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space -): - logger.info("Starting hyperparameter optimization") - - if not hyperparameter_space: - hyperparameter_space = load_hyperparameter_space() - - 
best_score = -float("inf") - best_params = {} - # Global list to store best trials across the entire hypertuning process - global_top_trials = [] - top_trials = [] - +def suggest_hyperparameters(trial, hyperparameter_space): + """ + Suggest hyperparameters for a trial based on the hyperparameter space configuration. + """ + params = base_params.copy() + # Add hyperparameters from config with step size if provided + for param_name, param_config in hyperparameter_space.items(): + if param_config["type"] == "float": + if "step" in param_config: + params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + log=param_config.get("log", False), + ) + else: + params[param_name] = trial.suggest_float( + param_name, + param_config["low"], + param_config["high"], + log=param_config.get("log", False), + ) + elif param_config["type"] == "int": + if "step" in param_config: + params[param_name] = trial.suggest_int( + param_name, + param_config["low"], + param_config["high"], + step=param_config["step"], + ) + else: + params[param_name] = trial.suggest_int( + param_name, param_config["low"], param_config["high"] + ) + return params + + +def create_objective_function(x_train, y_train, x_test, y_test, x_eval, y_eval, hyperparameter_space): + """ + Create the objective function for Optuna optimization. 
+ """ def objective(trial): try: - params = base_params.copy() - # Add hyperparameters from config with step size if provided - for param_name, param_config in hyperparameter_space.items(): - if param_config["type"] == "float": - if "step" in param_config: - params[param_name] = trial.suggest_float( - param_name, - param_config["low"], - param_config["high"], - step=param_config["step"], - log=param_config.get("log", False), - ) - else: - params[param_name] = trial.suggest_float( - param_name, - param_config["low"], - param_config["high"], - log=param_config.get("log", False), - ) - elif param_config["type"] == "int": - if "step" in param_config: - params[param_name] = trial.suggest_int( - param_name, - param_config["low"], - param_config["high"], - step=param_config["step"], - ) - else: - params[param_name] = trial.suggest_int( - param_name, param_config["low"], param_config["high"] - ) + params = suggest_hyperparameters(trial, hyperparameter_space) # Train model and get metrics - model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) + _, metrics = train_model(x_train, y_train, x_test, y_test, x_eval, y_eval, params) recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) @@ -226,15 +249,39 @@ def objective(trial): for metric_name, metric_value in metrics.items(): trial.set_user_attr(metric_name, metric_value) - if score > 0.41 and score > best_score: - log_to_mlflow(model, metrics, params, experiment_name) return score except Exception as e: logger.error(f"Trial failed: {str(e)}") return 0.0 - # Callback function defined outside the loop so that its modifications affect the outer scope. + return objective + + +def optimize_hyperparameters( + x_train, y_train, x_test, y_test, x_eval, y_eval, hyperparameter_space +): + """ + Optimize hyperparameters using Optuna with batching strategy. 
+ """ + logger.info("Starting hyperparameter optimization") + + if not hyperparameter_space: + hyperparameter_space = load_hyperparameter_space() + + # Get the objective function + objective = create_objective_function(x_train, y_train, x_test, y_test, x_eval, y_eval, hyperparameter_space) + + # Run optimization with batching + best_params = run_batched_optimization(objective) + + return best_params + + +def create_callback_function(best_score, best_params, top_trials, global_top_trials): + """ + Create the callback function for Optuna optimization. + """ def callback(study, trial): nonlocal best_score, best_params, top_trials, global_top_trials logger.info(f"Current best score in this batch: {best_score:.4f}") @@ -271,13 +318,26 @@ def callback(study, trial): logger.info(table_separator) for row in table_rows: logger.info(row) - return best_score + + return callback + + +def run_batched_optimization(objective): + """ + Run Optuna optimization with batching strategy. + """ + best_score = -float("inf") + best_params = {} + global_top_trials = [] + top_trials = [] + + # Create callback function + callback = create_callback_function(best_score, best_params, top_trials, global_top_trials) # Set persistent storage path using SQLite storage_url = "sqlite:///optuna_lightgbm.db" study_name = "lightgbm_optimization" - # Total trials to conduct - total_trials = n_trials # Example; you can set n_trials accordingly. 
+ total_trials = n_trials batch_size = 1000 num_batches = total_trials // batch_size if total_trials % batch_size != 0: @@ -312,7 +372,7 @@ def callback(study, trial): # After all batches, update best_params (assume the best trial is the first in global_top_trials) if global_top_trials: - best_score, best_params, best_trial_number = global_top_trials[0] + best_score, best_params, _ = global_top_trials[0] else: best_params = {} @@ -328,36 +388,46 @@ def callback(study, trial): return best_params -def hypertune_lightgbm(experiment_name: str): +def hypertune_lightgbm(): """ Main training function with MLflow tracking. Updated name from hypertune_mlp to hypertune_lightgbm to match notebook. Args: - experiment_name (str): Experiment name for MLflow tracking Returns: tuple: (best_params, best_metrics) """ try: + # Load data + dataloader = DataLoader() + x_train_raw, y_train, x_test_raw, y_test, x_eval_raw, y_eval = dataloader.load_data() + features = import_selected_features_ensemble(model_type="lgbm") + + # Ensure features is a list of strings + if not isinstance(features, list): + raise ValueError("Expected features to be a list for lgbm model type") + + x_train = prepare_data(x_train_raw, features) + x_test = prepare_data(x_test_raw, features) + x_eval = prepare_data(x_eval_raw, features) + # Load hyperparameter space hyperparameter_space = load_hyperparameter_space() # Run hyperparameter optimization logger.info("Starting hyperparameter optimization") best_params = optimize_hyperparameters( - X_train, + x_train, y_train, - X_test, + x_test, y_test, - X_eval, + x_eval, y_eval, hyperparameter_space=hyperparameter_space, ) # Train final model with best parameters logger.info("Training final model with best parameters") - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, best_params - ) + _, metrics = train_model(x_train, y_train, x_test, y_test, x_eval, y_eval, best_params) return best_params, metrics @@ -411,7 +481,7 @@ def 
log_to_mlflow(model, metrics, params, experiment_name): X_eval[col] = X_eval[col].astype("float64") # Infer signature with proper handling for integer columns with potential missing values - signature = mlflow.models.infer_signature(input_example, model.predict(input_example)) + signature = infer_signature(input_example, model.predict(input_example)) # Update model registration with signature model_info = mlflow.lightgbm.log_model( @@ -431,15 +501,15 @@ def log_to_mlflow(model, metrics, params, experiment_name): return None -def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval): +def train_with_precision_target(x_train, y_train, x_test, y_test, x_eval, y_eval): """ - Train XGBoost model with focus on precision target. + Train LightGBM model with focus on precision target. Args: - X_train: Training features + x_train: Training features y_train: Training labels - X_test: Testing features + x_test: Testing features y_test: Testing labels - X_eval: Evaluation features + x_eval: Evaluation features y_eval: Evaluation labels Returns: tuple: (best_model, best_metrics) @@ -468,7 +538,7 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval # Train final model with best parameters logger.info("Training final model with best parameters") - model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) + model, metrics = train_model(x_train, y_train, x_test, y_test, x_eval, y_eval, params) # Log to MLflow log_to_mlflow(model, metrics, params, experiment_name) @@ -481,17 +551,24 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval def select_best_feature_combination( - X, y, X_test, y_test, X_eval, y_eval, - num_features=95, num_trials=1000, min_recall=0.2, random_state=19 + x, + y, + x_test, + y_test, + x_eval, + y_eval, + num_features=95, + num_trials=1000, + min_recall=0.2, + random_state=19, ): """ Try multiple random combinations of features, train a model for 
each, and select the best set based on precision (if recall >= min_recall). Args: - X (pd.DataFrame): Training features (all 165 columns) + x (pd.DataFrame): Training features (all 165 columns) y (pd.Series): Training labels - logger: Logger instance - X_test, y_test, X_eval, y_eval: Validation/eval sets (same columns as X) + x_test, y_test, x_eval, y_eval: Validation/eval sets (same columns as x) num_features (int): Number of features to select in each trial num_trials (int): Number of random combinations to try min_recall (float): Minimum recall threshold for score @@ -503,66 +580,71 @@ def select_best_feature_combination( """ rng = np.random.default_rng(random_state) - all_features = list(X_eval.columns) + all_features = list(x.columns) best_score = -1.0 best_features = None best_mask = None model_params = base_params.copy() model_params.update( - { - "learning_rate": 0.1625, - "num_leaves": 75, - "max_depth": 10, - "min_child_samples": 540, - "feature_fraction": 0.61, - "bagging_fraction": 0.645, - "bagging_freq": 15, - "reg_alpha": 19.4, - "reg_lambda": 11.8, - "min_split_gain": 0.17, - "early_stopping_rounds": 670, - "path_smooth": 0.51, - "cat_smooth": 33.8, - "max_bin": 670, - "device": "cpu", - "n_jobs": 8, - "objective": "binary", - "metric": ["aucpr", "binary_logloss"], - "random_state": 19, - "verbose": -1, - } - ) - logger.info(f"Trying {num_trials} random combinations of {num_features} features out of {len(all_features)}...") + { + "learning_rate": 0.1625, + "num_leaves": 75, + "max_depth": 10, + "min_child_samples": 540, + "feature_fraction": 0.61, + "bagging_fraction": 0.645, + "bagging_freq": 15, + "reg_alpha": 19.4, + "reg_lambda": 11.8, + "min_split_gain": 0.17, + "early_stopping_rounds": 670, + "path_smooth": 0.51, + "cat_smooth": 33.8, + "max_bin": 670, + "device": "cpu", + "n_jobs": 8, + "objective": "binary", + "metric": ["aucpr", "binary_logloss"], + "random_state": 19, + "verbose": -1, + } + ) + logger.info( + f"Trying {num_trials} random 
combinations of {num_features} features out of {len(all_features)}..." + ) for trial in range(num_trials): # Randomly select features selected = rng.choice(all_features, size=num_features, replace=False) selected = list(selected) # Subset data - X_train_sel = X[selected] - X_test_sel = X_test[selected] - X_eval_sel = X_eval[selected] + x_train_sel = x[selected] + x_test_sel = x_test[selected] + x_eval_sel = x_eval[selected] # Train model and get metrics try: - model, metrics = train_model( - X_train_sel, y, X_test_sel, y_test, X_eval_sel, y_eval, model_params + _, metrics = train_model( + x_train_sel, y, x_test_sel, y_test, x_eval_sel, y_eval, model_params ) recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) score = precision if recall >= min_recall else 0.0 - + if score > best_score: best_score = score best_features = selected # Create boolean mask for best features best_mask = np.array([f in best_features for f in all_features]) - logger.info(f"Trial {trial+1}/{num_trials}: Score={score:.4f} (Precision={precision:.4f}, Recall={recall:.4f} best_score={best_score:.4f})") + logger.info( + f"Trial {trial + 1}/{num_trials}: Score={score:.4f} (Precision={precision:.4f}, Recall={recall:.4f} best_score={best_score:.4f})" + ) except Exception as e: - logger.error(f"Trial {trial+1} failed: {e}") + logger.error(f"Trial {trial + 1} failed: {e}") logger.info(f"Best score: {best_score:.4f} with {num_features} features: {best_features}") return best_features, best_mask, best_score + def main(): """ Main execution function. 
@@ -570,36 +652,33 @@ def main(): try: logger.info("Starting LightGBM model training") global X_train, y_train, X_test, y_test, X_eval, y_eval - + # Load data dataloader = DataLoader() X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data() features = import_selected_features_ensemble(model_type="lgbm") - - X_train = prepare_data(X_train, features) - X_test = prepare_data(X_test, features) - X_eval = prepare_data(X_eval, features) - - # best_features, best_mask, best_score = select_best_feature_combination(X_train, y_train, X_test, y_test, X_eval, y_eval) + + # Ensure features is a list of strings + if not isinstance(features, list): + raise ValueError("Expected features to be a list for lgbm model type") + + x_train = prepare_data(X_train, features) + x_test = prepare_data(X_test, features) + x_eval = prepare_data(X_eval, features) + # Log data shapes - logger.info(f"Training data shape: {X_train.shape}") - logger.info(f"Testing data shape: {X_test.shape}") - logger.info(f"Evaluation data shape: {X_eval.shape}") + logger.info(f"Training data shape: {x_train.shape}") + logger.info(f"Testing data shape: {x_test.shape}") + logger.info(f"Evaluation data shape: {x_eval.shape}") logger.info( f"Positive class ratio - Train: {y_train.mean():.3f}, Test: {y_test.mean():.3f}, Eval: {y_eval.mean():.3f}" ) logger.info("Starting hyperparameter optimization run") - current_params, current_metrics = hypertune_lightgbm(experiment_name) + current_params, _ = hypertune_lightgbm() logger.info(f"Run completed with parameters: {current_params}") - # Train model with precision target - # best_model, best_metrics = train_with_precision_target( - # X_train, y_train, X_test, y_test, X_eval, y_eval - # ) - # logger.info(f"Best model: {best_model}") - # logger.info(f"Best metrics: {best_metrics}") except Exception as e: logger.error(f"Error in main execution: {str(e)}") diff --git a/src/models/StackedEnsemble/base/tree_based/lightgbm_model_25.py 
b/src/models/StackedEnsemble/base/tree_based/lightgbm_model_25.py index fce0e40..6958d59 100644 --- a/src/models/StackedEnsemble/base/tree_based/lightgbm_model_25.py +++ b/src/models/StackedEnsemble/base/tree_based/lightgbm_model_25.py @@ -21,7 +21,6 @@ from sklearn.feature_selection import RFECV from sklearn.model_selection import StratifiedKFold -from src.models.StackedEnsemble.base.tree_based.xgboost_model_outliers import apply_outlier_removal from src.utils.logger import ExperimentLogger experiment_name = "lightgbm_soccer_prediction_25" @@ -83,19 +82,38 @@ def load_hyperparameter_space(): "num_leaves": {"type": "int", "low": 10, "high": 250, "log": False, "step": 5}, "max_depth": {"type": "int", "low": 7, "high": 13, "log": False, "step": 1}, "min_child_samples": {"type": "int", "low": 100, "high": 1000, "log": False, "step": 10}, - "feature_fraction": {"type": "float", "low": 0.25, "high": 0.80, "log": False, "step": 0.01}, - "bagging_fraction": {"type": "float", "low": 0.25, "high": 0.80, "log": False, "step": 0.005}, + "feature_fraction": { + "type": "float", + "low": 0.25, + "high": 0.80, + "log": False, + "step": 0.01, + }, + "bagging_fraction": { + "type": "float", + "low": 0.25, + "high": 0.80, + "log": False, + "step": 0.005, + }, "bagging_freq": {"type": "int", "low": 30, "high": 100, "log": False, "step": 1}, "reg_alpha": {"type": "float", "low": 8.0, "high": 27.0, "log": False, "step": 0.1}, "reg_lambda": {"type": "float", "low": 1.0, "high": 22.0, "log": False, "step": 0.1}, "min_split_gain": {"type": "float", "low": 0.02, "high": 0.45, "log": False, "step": 0.005}, - "early_stopping_rounds": {"type": "int", "low": 400, "high": 2000, "log": False, "step": 10}, + "early_stopping_rounds": { + "type": "int", + "low": 400, + "high": 2000, + "log": False, + "step": 10, + }, "path_smooth": {"type": "float", "low": 0.02, "high": 0.30, "log": False, "step": 0.005}, "cat_smooth": {"type": "float", "low": 10.0, "high": 50.0, "log": False, "step": 0.1}, 
"max_bin": {"type": "int", "low": 300, "high": 1000, "log": False, "step": 10}, } return hyperparameter_space + def create_model(model_params): """ Create and configure LightGBM model instance. @@ -117,6 +135,7 @@ def create_model(model_params): logger.error(f"Error creating LightGBM model: {str(e)}") raise + def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): """ Train a LightGBM model with early stopping and threshold optimization. @@ -166,6 +185,7 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): logger.error(f"Error training LightGBM model: {str(e)}") raise + def optimize_hyperparameters( X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space ): @@ -292,20 +312,24 @@ def callback(study, trial): for batch in range(num_batches): # Feature reduction: Remove 2 features every batch if batch > 0: # Skip feature reduction for the first batch - features_to_remove = min(1, X_train.shape[1] - 10) # Ensure we don't go below 10 features + features_to_remove = min( + 1, X_train.shape[1] - 10 + ) # Ensure we don't go below 10 features if features_to_remove > 0: # Always remove the first x features features_to_drop = X_train.columns[:features_to_remove].tolist() - - logger.info(f"Batch {batch + 1}: Removing {features_to_remove} features: {features_to_drop}") + + logger.info( + f"Batch {batch + 1}: Removing {features_to_remove} features: {features_to_drop}" + ) logger.info(f"Features before removal: {X_train.shape[1]}") - + # Remove features from all datasets X_train = X_train.drop(columns=features_to_drop) X_test = X_test.drop(columns=features_to_drop) if X_eval is not None: X_eval = X_eval.drop(columns=features_to_drop) - + logger.info(f"Features after removal: {X_train.shape[1]}") # Create a new sampler with a dynamic seed random_seed = int(time.time()) @@ -349,6 +373,7 @@ def callback(study, trial): return best_params + def hypertune_lightgbm(experiment_name: str): """ Main training function 
with MLflow tracking. @@ -376,9 +401,7 @@ def hypertune_lightgbm(experiment_name: str): # Train final model with best parameters logger.info("Training final model with best parameters") - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, best_params - ) + model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, best_params) return best_params, metrics @@ -386,6 +409,7 @@ def hypertune_lightgbm(experiment_name: str): logger.error(f"Error in hyperparameter tuning: {str(e)}") return None, None + def log_to_mlflow(model, metrics, params, experiment_name, X_eval): """ Log trained model, metrics, and parameters to MLflow. @@ -450,6 +474,7 @@ def log_to_mlflow(model, metrics, params, experiment_name, X_eval): logger.error(f"Error logging to MLflow: {str(e)}") return None + def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval): """ Train XGBoost model with focus on precision target. @@ -494,7 +519,7 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval # Train final model with best parameters logger.info("Training final model with best parameters") model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) - compute_permutation_importance(model, X_eval, y_eval, metrics['threshold']) + compute_permutation_importance(model, X_eval, y_eval, metrics["threshold"]) return model, metrics @@ -502,9 +527,10 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval logger.error(f"Error in precision-focused training: {str(e)}") return None, None + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, y_val: np.ndarray, threshold: float = 0.3, n_repeats: int = 50, @@ -537,13 +563,15 @@ def compute_permutation_importance( drops = [] for i in range(n_repeats): feat_idx = feature_names.index(feat) + 1 - logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i+1}") + 
logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i + 1}") X_shuffled = X_val.copy() X_shuffled[feat] = np.random.permutation(X_shuffled[feat].values) probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) mean_drop = np.mean(drops) @@ -556,7 +584,10 @@ def compute_permutation_importance( logger.info(df_importance.head(number_of_features).to_string(index=False)) return df_importance -def select_features_rfecv(X, y, logger, min_features=150, step=1, scoring='roc_auc', random_state=19): + +def select_features_rfecv( + X, y, logger, min_features=150, step=1, scoring="roc_auc", random_state=19 +): """ Perform RFECV-based feature selection using LightGBM. 
Args: @@ -570,7 +601,9 @@ def select_features_rfecv(X, y, logger, min_features=150, step=1, scoring='roc_a Returns: tuple: (List[str], pd.DataFrame) """ - logger.info(f"Starting RFECV feature selection with min_features={min_features}, step={step}, scoring={scoring}") + logger.info( + f"Starting RFECV feature selection with min_features={min_features}, step={step}, scoring={scoring}" + ) params = base_params.copy() params.update( { @@ -604,39 +637,42 @@ def select_features_rfecv(X, y, logger, min_features=150, step=1, scoring='roc_a scoring=scoring, min_features_to_select=min_features, n_jobs=-1, - verbose=2 + verbose=2, ) selector.fit(X, y) selected_features = X.columns[selector.support_].tolist() importances = selector.estimator_.feature_importances_ - feature_importance_df = pd.DataFrame({ - 'feature': selected_features, - 'importance': importances - }).sort_values('importance', ascending=False) + feature_importance_df = pd.DataFrame( + {"feature": selected_features, "importance": importances} + ).sort_values("importance", ascending=False) logger.info(f"RFECV selected {len(selected_features)} features:") - for feat, imp in zip(feature_importance_df['feature'], feature_importance_df['importance']): + for feat, imp in zip(feature_importance_df["feature"], feature_importance_df["importance"]): logger.info(f" - {feat}: {imp}") return selected_features, feature_importance_df -def hypertune_with_feature_importance(X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50): + +def hypertune_with_feature_importance( + X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50 +): """ Perform hyperparameter optimization with Optuna while tracking feature importances. 
- + Args: X_train (pd.DataFrame): Training features - y_train (pd.Series): Training labels + y_train (pd.Series): Training labels X_test (pd.DataFrame): Test features y_test (pd.Series): Test labels n_trials (int): Number of optimization trials - + Returns: tuple: (best_params, feature_importance_df) """ logger.info(f"Starting hyperparameter optimization with {n_trials} trials") - + # Store feature importances across trials feature_importances = [] hyperparameter_space = load_hyperparameter_space() + def objective(trial): params = base_params.copy() # Add hyperparameters from config with step size if provided @@ -670,40 +706,38 @@ def objective(trial): param_name, param_config["low"], param_config["high"] ) - # Train model model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) - + # Store feature importances for this trial importance_dict = dict(zip(X_train.columns, model.feature_importances_)) feature_importances.append(importance_dict) - - return metrics['precision'] - + + return metrics["precision"] + # Create and run study - study = optuna.create_study(direction='maximize') + study = optuna.create_study(direction="maximize") study.optimize(objective, n_trials=n_trials) - + # Calculate average feature importance across all trials avg_importances = {} for feature in X_train.columns: importance_values = [trial_imp[feature] for trial_imp in feature_importances] avg_importances[feature] = np.mean(importance_values) - + # Create DataFrame and sort by importance - importance_df = pd.DataFrame({ - 'feature': list(avg_importances.keys()), - 'importance': list(avg_importances.values()) - }) - importance_df = importance_df.sort_values('importance', ascending=False) - + importance_df = pd.DataFrame( + {"feature": list(avg_importances.keys()), "importance": list(avg_importances.values())} + ) + importance_df = importance_df.sort_values("importance", ascending=False) + # Get top 100 features top_100_features = importance_df.head(100) - + 
logger.info("Top 100 features by average importance across trials:") for idx, row in top_100_features.iterrows(): logger.info(f"{row['feature']}: {row['importance']:.4f} id: {idx}") - + return study.best_params, importance_df @@ -714,7 +748,7 @@ def main(): try: logger.info("Starting LightGBM model training") global X_train, y_train, X_test, y_test, X_eval, y_eval - + # Load data dataloader = DataLoader() X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data() @@ -739,10 +773,10 @@ def main(): # ) # --- Feature Selection with RFECV --- - # selected_features, feature_importance_df = select_features_rfecv(X_eval, y_eval, logger, - # min_features=150, - # step=1, - # scoring='roc_auc', + # selected_features, feature_importance_df = select_features_rfecv(X_eval, y_eval, logger, + # min_features=150, + # step=1, + # scoring='roc_auc', # random_state=SEED) # print(feature_importance_df) diff --git a/src/models/StackedEnsemble/base/tree_based/random_forest_30.py b/src/models/StackedEnsemble/base/tree_based/random_forest_30.py index 38e8912..9bf1340 100644 --- a/src/models/StackedEnsemble/base/tree_based/random_forest_30.py +++ b/src/models/StackedEnsemble/base/tree_based/random_forest_30.py @@ -65,37 +65,37 @@ def load_hyperparameter_space_for_hpo(): hyperparameter_space = { "n_estimators": { "type": "int", - "low": 500, # Focus on range of top performers + "low": 500, # Focus on range of top performers "high": 1300, # Cover the successful range - "step": 10, # Larger step to save computation + "step": 10, # Larger step to save computation }, "max_depth": { - "type": "int", + "type": "int", "low": 15, "high": 25, "step": 1, }, "min_samples_split": { "type": "int", - "low": 8, - "high": 30, - "step": 1, + "low": 8, + "high": 30, + "step": 1, }, "min_samples_leaf": { "type": "int", - "low": 10, - "high": 50, - "step": 1, + "low": 10, + "high": 50, + "step": 1, }, # "max_features": { - # "type": "categorical", - # "choices": [0.22, 0.24, 0.26, 0.52, 0.70, 
0.74, 0.84, 0.88, 0.98, 1.0], + # "type": "categorical", + # "choices": [0.22, 0.24, 0.26, 0.52, 0.70, 0.74, 0.84, 0.88, 0.98, 1.0], # }, "class_weight": { "type": "float", - "low": 1.8, - "high": 2.9, - "step": 0.02, + "low": 1.8, + "high": 2.9, + "step": 0.02, }, } return hyperparameter_space @@ -289,20 +289,24 @@ def callback(study, trial): # Loop over batches, resetting the sampler each time for batch in range(num_batches): if batch > 0: # Skip feature reduction for the first batch - features_to_remove = min(1, X_train.shape[1] - 10) # Ensure we don't go below 10 features + features_to_remove = min( + 1, X_train.shape[1] - 10 + ) # Ensure we don't go below 10 features if features_to_remove > 0: # Always remove the first x features features_to_drop = X_train.columns[:features_to_remove].tolist() - - logger.info(f"Batch {batch + 1}: Removing {features_to_remove} features: {features_to_drop}") + + logger.info( + f"Batch {batch + 1}: Removing {features_to_remove} features: {features_to_drop}" + ) logger.info(f"Features before removal: {X_train.shape[1]}") - + # Remove features from all datasets X_train = X_train.drop(columns=features_to_drop) X_test = X_test.drop(columns=features_to_drop) if X_eval is not None: X_eval = X_eval.drop(columns=features_to_drop) - + logger.info(f"Features after removal: {X_train.shape[1]}") # Create a new sampler with a dynamic seed random_seed = int(time.time()) @@ -320,7 +324,9 @@ def callback(study, trial): logger.info( f"Starting batch {batch + 1}/{num_batches} with new sampler (seed={random_seed})" ) - study.optimize(objective, n_trials=batch_size, show_progress_bar=True, callbacks=[callback], n_jobs=4) + study.optimize( + objective, n_trials=batch_size, show_progress_bar=True, callbacks=[callback], n_jobs=4 + ) # Merge current batch's top trials with global_top_trials for trial_record in top_trials: @@ -369,9 +375,7 @@ def hypertune_random_forest(experiment_name: str): # Train final model with best parameters logger.info("Training 
final model with best parameters") - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, best_params - ) + model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, best_params) return best_params, metrics @@ -479,7 +483,9 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval return None, None -def select_top_features_rf(model: RandomForestClassifier, X_features: pd.DataFrame, n_features: int = 40) -> list[str]: +def select_top_features_rf( + model: RandomForestClassifier, X_features: pd.DataFrame, n_features: int = 40 +) -> list[str]: """ Selects the top N features based on Random Forest feature importances. @@ -491,8 +497,10 @@ def select_top_features_rf(model: RandomForestClassifier, X_features: pd.DataFra Returns: A list of the names of the top N features. """ - if not hasattr(model, 'feature_importances_'): - raise ValueError("The provided model has not been trained yet or does not support feature importances.") + if not hasattr(model, "feature_importances_"): + raise ValueError( + "The provided model has not been trained yet or does not support feature importances." 
+ ) importances = model.feature_importances_ feature_names = X_features.columns @@ -500,18 +508,19 @@ def select_top_features_rf(model: RandomForestClassifier, X_features: pd.DataFra if len(importances) != len(feature_names): raise ValueError("Mismatch between the number of feature importances and feature names.") - feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}) - feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False) + feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}) + feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False) - top_features = feature_importance_df['Feature'].head(n_features).tolist() + top_features = feature_importance_df["Feature"].head(n_features).tolist() logger.info(f"Selected top {n_features} features based on RF importance.") - logger.info(f"Top features: {top_features}") # Log the selected features for visibility + logger.info(f"Top features: {top_features}") # Log the selected features for visibility return top_features + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, y_val: np.ndarray, threshold: float = 0.3, n_repeats: int = 20, @@ -544,13 +553,15 @@ def compute_permutation_importance( drops = [] for i in range(n_repeats): feat_idx = feature_names.index(feat) + 1 - logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i+1}") + logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i + 1}") X_shuffled = X_val.copy() X_shuffled[feat] = np.random.permutation(X_shuffled[feat].values) probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / (np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + 
np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) mean_drop = np.mean(drops) diff --git a/src/models/StackedEnsemble/base/tree_based/xgboost_model.py b/src/models/StackedEnsemble/base/tree_based/xgboost_model.py index 1641256..3a5e8f4 100644 --- a/src/models/StackedEnsemble/base/tree_based/xgboost_model.py +++ b/src/models/StackedEnsemble/base/tree_based/xgboost_model.py @@ -19,6 +19,7 @@ import optuna import sklearn import xgboost as xgb +from mlflow.models import infer_signature from optuna.integration import XGBoostPruningCallback # Import own modules @@ -90,62 +91,62 @@ def load_hyperparameter_space(): hyperparameter_space = { "early_stopping_rounds": { "type": "int", - "low": 400, # Slightly below min - "high": 2000, # Slightly above max + "low": 400, # Slightly below min + "high": 2000, # Slightly above max "step": 10, }, "learning_rate": { "type": "float", - "low": 0.038, # Slightly below min - "high": 0.17, # Slightly above max + "low": 0.038, # Slightly below min + "high": 0.17, # Slightly above max "step": 0.001, }, "max_depth": { "type": "int", - "low": 5, # At min - "high": 16, # Slightly above max + "low": 5, # At min + "high": 16, # Slightly above max "step": 1, }, "min_child_weight": { "type": "int", - "low": 320, # Slightly below min - "high": 700, # Slightly above max + "low": 320, # Slightly below min + "high": 700, # Slightly above max "step": 10, }, "colsample_bytree": { "type": "float", - "low": 0.61, # Slightly below min + "low": 0.61, # Slightly below min "high": 0.97, # Slightly above max "step": 0.005, }, "subsample": { "type": "float", - "low": 0.60, # Slightly below min + "low": 0.60, # Slightly below min "high": 0.94, # Slightly above max "step": 0.005, }, "gamma": { "type": "float", - "low": 0.20, # Slightly below min - "high": 5.1, # Slightly above max + "low": 0.20, # Slightly below min + "high": 5.1, # Slightly above max "step": 0.01, }, "lambda": { "type": "float", - "low": 2.8, # Slightly below min + 
"low": 2.8, # Slightly below min "high": 17.0, # Slightly above max "step": 0.01, }, "alpha": { "type": "float", - "low": 24.0, # Slightly below min + "low": 24.0, # Slightly below min "high": 58.0, # Slightly above max "step": 0.1, }, "scale_pos_weight": { "type": "float", - "low": 1.5, # Slightly below min - "high": 3.2, # Slightly above max + "low": 1.5, # Slightly below min + "high": 3.2, # Slightly above max "step": 0.01, }, } @@ -171,18 +172,18 @@ def create_model(model_params): return model -def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): +def train_model(x_train, y_train, x_test, y_test, x_eval, y_eval, model_params): """ Train a XGBoost model with early stopping and threshold optimization. Uses DataFrame/Array for fitting (required by XGBClassifier wrapper) - Requires X_eval, y_eval for threshold optimization and eval_set. + Requires x_eval, y_eval for threshold optimization and eval_set. Early stopping is handled by the model constructor. Args: - X_train (pd.DataFrame): Training features + x_train (pd.DataFrame): Training features y_train: Training labels (needed by create_model if scale_pos_weight calculation required, though currently static) - X_test: Validation features (currently unused) + x_test: Validation features (currently unused) y_test: Validation labels (currently unused) - X_eval (pd.DataFrame): Evaluation features for eval_set and threshold optimization + x_eval (pd.DataFrame): Evaluation features for eval_set and threshold optimization y_eval (pd.Series): Evaluation labels for threshold optimization model_params (dict): Model parameters Returns: @@ -194,11 +195,11 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): # Create eval set for early stopping using DMatrix # Use DataFrame/Array for eval_set as required by fit when using wrapper - eval_set = [(X_test, y_test)] + eval_set = [(x_test, y_test)] # Fit model with early stopping model.fit( - X=X_train, + X=x_train, 
y=y_train, eval_set=eval_set, verbose=False, @@ -206,7 +207,7 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): ) # Get validation predictions using the DataFrame for optimize_threshold - best_threshold, metrics = optimize_threshold(model, X_eval, y_eval, min_recall=min_recall) + _, metrics = optimize_threshold(model, x_eval, y_eval, min_recall=min_recall) return model, metrics @@ -216,7 +217,7 @@ def train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, model_params): def optimize_hyperparameters( - X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space + x_train, y_train, x_test, y_test, x_eval, y_eval, hyperparameter_space ): """ Optimize hyperparameters using Optuna, passing DataFrames. @@ -234,7 +235,7 @@ def optimize_hyperparameters( # Pass necessary data to the objective function def objective_func(trial): return objective( - trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space + trial, x_train, y_train, x_test, y_test, x_eval, y_eval, hyperparameter_space ) # Callback function defined outside the loop so that its modifications affect the outer scope. 
@@ -274,7 +275,6 @@ def callback(study, trial): logger.info(table_separator) for row in table_rows: logger.info(row) - return best_score # Set persistent storage path using SQLite storage_url = "sqlite:///optuna_xgboost.db" @@ -317,7 +317,7 @@ def callback(study, trial): # After all batches, update best_params (assume the best trial is the first in global_top_trials) if global_top_trials: - best_score, best_params_from_hpo, best_trial_number = global_top_trials[0] + best_score, best_params_from_hpo, _ = global_top_trials[0] else: best_params_from_hpo = {} # Initialize if no trials were successful @@ -343,7 +343,7 @@ def callback(study, trial): # Objective function now needs to accept the data explicitly -def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space): +def objective(trial, x_train, y_train, x_test, y_test, x_eval, y_eval, hyperparameter_space): try: params = base_params.copy() # Extract early_stopping_rounds separately @@ -392,9 +392,7 @@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara # Pruning Callback - Monitor AUC PR on eval set (default name 'validation_0') XGBoostPruningCallback(trial, "validation_0-aucpr") # Train model and get metrics using DataFrames - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, params - ) + model, metrics = train_model(x_train, y_train, x_test, y_test, x_eval, y_eval, params) recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) @@ -404,8 +402,6 @@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara # Pruning: report the score back to Optuna trial.report(score, step=model.best_iteration if hasattr(model, "best_iteration") else 0) - # if trial.should_prune(): - # raise optuna.TrialPruned() logger.info(f"Trial {trial.number}:") logger.info(f" Score: {score:.4f} (Precision: {precision:.4f}, Recall: {recall:.4f})") logger.info(f" Threshold: {threshold:.4f}") @@ -413,9 +409,9 
@@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara for metric_name, metric_value in metrics.items(): trial.set_user_attr(metric_name, metric_value) - + if score > 0.37: - log_to_mlflow(model, metrics, params, experiment_name, X_eval) + log_to_mlflow(model, metrics, params, experiment_name, x_eval) return score except optuna.TrialPruned: @@ -426,33 +422,32 @@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara return 0.0 # Return low score for failed trials -def hypertune_xgboost(X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name: str): +def hypertune_xgboost(x_train, y_train, x_test, y_test, x_eval, y_eval, experiment_name: str): """ Main training function with MLflow tracking using DataFrames. Args: - X_train (pd.DataFrame): Training features + x_train (pd.DataFrame): Training features y_train: Training labels - X_test: Validation features + x_test: Validation features y_test: Validation labels - X_eval (pd.DataFrame): Evaluation features for signature/logging + x_eval (pd.DataFrame): Evaluation features for signature/logging y_eval (pd.Series): Evaluation labels experiment_name (str): Experiment name for MLflow tracking Returns: tuple: (best_params, best_metrics) """ try: - # Load hyperparameter space hyperparameter_space = load_hyperparameter_space() # Run hyperparameter optimization logger.info("Starting hyperparameter optimization") best_params = optimize_hyperparameters( - X_train, + x_train, y_train, - X_test, + x_test, y_test, - X_eval, + x_eval, y_eval, hyperparameter_space=hyperparameter_space, ) @@ -462,12 +457,12 @@ def hypertune_xgboost(X_train, y_train, X_test, y_test, X_eval, y_eval, experime # best_params already contains the combined HPO and base params, including early_stopping_rounds final_train_params = best_params - model, metrics = train_model( - X_train, + _, metrics = train_model( + x_train, y_train, - X_test, + x_test, y_test, - X_eval, # Pass X_eval for threshold 
optimization within train_model + x_eval, # Pass x_eval for threshold optimization within train_model y_eval, final_train_params, # Pass the full best parameters including early stopping ) @@ -479,16 +474,16 @@ def hypertune_xgboost(X_train, y_train, X_test, y_test, X_eval, y_eval, experime return None, None -def log_to_mlflow(model, metrics, params, experiment_name, X_eval): +def log_to_mlflow(model, metrics, params, experiment_name, x_eval): """ Log trained model, metrics, and parameters to MLflow. - Requires X_eval DataFrame for signature generation. + Requires x_eval DataFrame for signature generation. Args: model: Trained XGBoost model metrics: Model evaluation metrics params: Model parameters experiment_name: Experiment name - X_eval (pd.DataFrame): Evaluation features for signature generation + x_eval (pd.DataFrame): Evaluation features for signature generation Returns: str: Run ID """ @@ -511,8 +506,8 @@ def log_to_mlflow(model, metrics, params, experiment_name, X_eval): # Log metrics mlflow.log_metrics(metrics) - # Create input example using the DataFrame X_eval - input_example = X_eval.iloc[:5].copy() + # Create input example using the DataFrame x_eval + input_example = x_eval.iloc[:5].copy() # Identify and convert integer columns to float64 if hasattr(input_example, "dtypes"): @@ -522,7 +517,7 @@ def log_to_mlflow(model, metrics, params, experiment_name, X_eval): input_example[col] = input_example[col].astype("float64") # Infer signature - signature = mlflow.models.infer_signature(input_example, model.predict(input_example)) + signature = infer_signature(input_example, model.predict(input_example)) # Update model registration with signature model_info = mlflow.xgboost.log_model( @@ -542,15 +537,15 @@ def log_to_mlflow(model, metrics, params, experiment_name, X_eval): return None -def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval): +def train_with_precision_target(x_train, y_train, x_test, y_test, x_eval, y_eval): """ Train 
XGBoost model with focus on precision target using DataFrames. Args: - X_train (pd.DataFrame): Training features + x_train (pd.DataFrame): Training features y_train: Training labels - X_test: Validation features + x_test: Validation features y_test: Validation labels - X_eval (pd.DataFrame): Evaluation features for logging/thresholding + x_eval (pd.DataFrame): Evaluation features for logging/thresholding y_eval (pd.Series): Evaluation labels Returns: tuple: (best_model, best_metrics) @@ -575,31 +570,38 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval } ) # Train final model with specific parameters - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, params - ) + model, metrics = train_model(x_train, y_train, x_test, y_test, x_eval, y_eval, params) - # Log to MLflow using the DataFrame X_eval for signature - # log_to_mlflow(model, metrics, params, experiment_name, X_eval) - top_features = select_top_features_xgb(model, X_eval) + # Log to MLflow using the DataFrame x_eval for signature + # log_to_mlflow(model, metrics, params, experiment_name, x_eval) + top_features = select_top_features_xgb(model, x_eval) logger.info(f"Top features: {top_features}") return model, metrics except Exception as e: logger.error(f"Error in precision-focused training: {str(e)}") return None, None + def select_best_feature_combination( - X, y, X_test, y_test, X_eval, y_eval, - num_features=70, num_trials=500, min_recall=0.3, random_state=19 + x, + y, + x_test, + y_test, + x_eval, + y_eval, + num_features=70, + num_trials=500, + min_recall=0.3, + random_state=19, ): """ Try multiple random combinations of features, train a model for each, and select the best set based on precision (if recall >= min_recall). 
Args: - X (pd.DataFrame): Training features (all 165 columns) + x (pd.DataFrame): Training features (all 165 columns) y (pd.Series): Training labels logger: Logger instance - X_test, y_test, X_eval, y_eval: Validation/eval sets (same columns as X) + x_test, y_test, x_eval, y_eval: Validation/eval sets (same columns as x) num_features (int): Number of features to select in each trial num_trials (int): Number of random combinations to try min_recall (float): Minimum recall threshold for score @@ -611,7 +613,7 @@ def select_best_feature_combination( """ rng = np.random.default_rng(random_state) - all_features = list(X_eval.columns) + all_features = list(x_eval.columns) best_score = -1.0 best_features = None best_mask = None @@ -625,7 +627,7 @@ def select_best_feature_combination( "n_jobs": 4, "tree_method": "hist", "verbosity": 0, - "eval_metric": ['aucpr', 'error', 'logloss'], + "eval_metric": ["aucpr", "error", "logloss"], "colsample_bytree": 0.81, "subsample": 0.68, "gamma": 3.41, @@ -637,115 +639,130 @@ def select_best_feature_combination( "nthread": 8, "seed": 19, "device": "cuda", - "early_stopping_rounds": 700 + "early_stopping_rounds": 700, } ) - logger.info(f"Trying {num_trials} random combinations of {num_features} features out of {len(all_features)}...") + logger.info( + f"Trying {num_trials} random combinations of {num_features} features out of {len(all_features)}..." 
+ ) for trial in range(num_trials): # Randomly select features selected = rng.choice(all_features, size=num_features, replace=False) selected = list(selected) # Subset data - X_train_sel = X[selected] - X_test_sel = X_test[selected] - X_eval_sel = X_eval[selected] + x_train_sel = x[selected] + x_test_sel = x_test[selected] + x_eval_sel = x_eval[selected] # Train model and get metrics try: - model, metrics = train_model( - X_train_sel, y, X_test_sel, y_test, X_eval_sel, y_eval, model_params + _, metrics = train_model( + x_train_sel, y, x_test_sel, y_test, x_eval_sel, y_eval, model_params ) recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) score = precision if recall >= min_recall else 0.0 - + if score > best_score: best_score = score best_features = selected # Create boolean mask for best features best_mask = np.array([f in best_features for f in all_features]) - logger.info(f"Trial {trial+1}/{num_trials}: Score={score:.4f} (Precision={precision:.4f}, Recall={recall:.4f} best_score={best_score:.4f})") + logger.info( + f"Trial {trial + 1}/{num_trials}: Score={score:.4f} (Precision={precision:.4f}, Recall={recall:.4f} best_score={best_score:.4f})" + ) except Exception as e: - logger.error(f"Trial {trial+1} failed: {e}") + logger.error(f"Trial {trial + 1} failed: {e}") logger.info(f"Best score: {best_score:.4f} with features: {best_features}") return best_features, best_mask, best_score -def select_top_features_xgb(model, X_features, n_features: int = 30) -> list[str]: + +def select_top_features_xgb(model, x_features, n_features: int = 30) -> list[str]: """ Selects the top N features based on XGBoost feature importances. Args: model: Trained xgb.XGBClassifier model. - X_features: DataFrame containing the features used for training (to get names). + x_features: DataFrame containing the features used for training (to get names). n_features: The number of top features to select. Returns: A list of the names of the top N features. 
""" import pandas as pd - if not hasattr(model, 'feature_importances_'): - raise ValueError("The provided model has not been trained yet or does not support feature importances.") + + if not hasattr(model, "feature_importances_"): + raise ValueError( + "The provided model has not been trained yet or does not support feature importances." + ) importances = model.feature_importances_ - feature_names = X_features.columns + feature_names = x_features.columns if len(importances) != len(feature_names): raise ValueError("Mismatch between the number of feature importances and feature names.") - feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}) - feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False) + feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances}) + feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False) - top_features = feature_importance_df['Feature'].head(n_features).tolist() + top_features = feature_importance_df["Feature"].head(n_features).tolist() logger.info(f"Selected top {n_features} features based on RF importance.") - logger.info(f"Top features: {top_features}") # Log the selected features for visibility + logger.info(f"Top features: {top_features}") # Log the selected features for visibility return top_features + def main(): """ Main execution function. 
""" try: logger.info("Starting XGBoost model training") - + # Load data dataloader = DataLoader() - X_train, y_train, X_test, y_test, X_eval, y_eval = dataloader.load_data() + x_train, y_train, x_test, y_test, x_eval, y_eval = dataloader.load_data() features = import_selected_features_ensemble_new(model_type="xgb") logger.info(f"Features: {len(features)}") - X_train = prepare_data(X_train, features) - X_test = prepare_data(X_test, features) - X_eval = prepare_data(X_eval, features) - # best_features, best_mask, best_score = select_best_feature_combination(X_train, y_train, X_test, y_test, X_eval, y_eval) + # Ensure features is a list of strings for prepare_data + assert isinstance(features, list), f"Expected list of features, got {type(features)}" + x_train = prepare_data(x_train, features) + x_test = prepare_data(x_test, features) + x_eval = prepare_data(x_eval, features) + # best_features, best_mask, best_score = select_best_feature_combination(x_train, y_train, x_test, y_test, x_eval, y_eval) # Log data shapes - logger.info(f"Training data shape: {X_train.shape}") - logger.info(f"Testing data shape: {X_test.shape}") - logger.info(f"Evaluation data shape: {X_eval.shape}") + logger.info(f"Training data shape: {x_train.shape}") + logger.info(f"Testing data shape: {x_test.shape}") + logger.info(f"Evaluation data shape: {x_eval.shape}") logger.info( f"Positive class ratio - Train: {y_train.mean():.3f}, Test: {y_test.mean():.3f}, Eval: {y_eval.mean():.3f}" ) # Run Hyperparameter Optimization hypertune_xgboost( - X_train, y_train, X_test, y_test, X_eval, y_eval, # Pass DataFrames - experiment_name + x_train, + y_train, + x_test, + y_test, + x_eval, + y_eval, # Pass DataFrames + experiment_name, ) - # logger.info("Proceeding to train final model with precision target settings.") - # best_model, best_metrics = train_with_precision_target( - # X_train, - # y_train, - # X_test, - # y_test, - # X_eval, - # y_eval, # Pass DataFrames - # ) - # logger.error("Hyperparameter 
optimization failed. Skipping precision target training.") + # best_model, best_metrics = train_with_precision_target( # noqa: E501 + # x_train, # noqa: E501 + # y_train, # noqa: E501 + # x_test, # noqa: E501 + # y_test, # noqa: E501 + # x_eval, # noqa: E501 + # y_eval, # Pass DataFrames # noqa: E501 + # ) # noqa: E501 + # logger.error("Hyperparameter optimization failed. Skipping precision target training.") # noqa: E501 except Exception as e: - logger.error(f"Error in main execution: {str(e)}") # Add traceback + logger.error(f"Error in main execution: {str(e)}") # Add traceback finally: # Clean up DMatrix objects if needed (usually not necessary) # del dtrain, dtest, deval diff --git a/src/models/StackedEnsemble/base/tree_based/xgboost_model_25.py b/src/models/StackedEnsemble/base/tree_based/xgboost_model_25.py index 0dd18c8..da8582d 100644 --- a/src/models/StackedEnsemble/base/tree_based/xgboost_model_25.py +++ b/src/models/StackedEnsemble/base/tree_based/xgboost_model_25.py @@ -97,62 +97,62 @@ def load_hyperparameter_space(): }, "early_stopping_rounds": { "type": "int", - "low": 100, # Slightly below min - "high": 1000, # Slightly above max + "low": 100, # Slightly below min + "high": 1000, # Slightly above max "step": 10, }, "learning_rate": { "type": "float", - "low": 0.02, # Slightly below min - "high": 0.20, # Slightly above max + "low": 0.02, # Slightly below min + "high": 0.20, # Slightly above max "step": 0.001, }, "max_depth": { "type": "int", - "low": 5, # At min - "high": 14, # Slightly above max + "low": 5, # At min + "high": 14, # Slightly above max "step": 1, }, "min_child_weight": { "type": "int", - "low": 100, # Slightly below min - "high": 1000, # Slightly above max + "low": 100, # Slightly below min + "high": 1000, # Slightly above max "step": 5, }, "colsample_bytree": { "type": "float", - "low": 0.30, # Slightly below min + "low": 0.30, # Slightly below min "high": 0.98, # Slightly above max "step": 0.005, }, "subsample": { "type": "float", - 
"low": 0.65, # Slightly below min + "low": 0.65, # Slightly below min "high": 0.97, # Slightly above max "step": 0.005, }, "gamma": { "type": "float", - "low": 0.20, # Slightly below min - "high": 7.5, # Slightly above max + "low": 0.20, # Slightly below min + "high": 7.5, # Slightly above max "step": 0.01, }, "lambda": { "type": "float", - "low": 4.0, # Slightly below min + "low": 4.0, # Slightly below min "high": 17.0, # Slightly above max "step": 0.01, }, "alpha": { "type": "float", - "low": 20.0, # Slightly below min + "low": 20.0, # Slightly below min "high": 75.0, # Slightly above max "step": 0.1, }, "scale_pos_weight": { "type": "float", - "low": 1.8, # Slightly below min - "high": 3.2, # Slightly above max + "low": 1.8, # Slightly below min + "high": 3.2, # Slightly above max "step": 0.01, }, } @@ -172,7 +172,7 @@ def create_model(model_params): """ # Update with provided parameters model_params.update(base_params) - + # Create model # Pass all params, including early_stopping_rounds, to the constructor model = xgb.XGBClassifier(**model_params) @@ -251,7 +251,15 @@ def optimize_hyperparameters( def objective_func(trial): nonlocal best_score return objective( - trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space, best_score + trial, + X_train, + y_train, + X_test, + y_test, + X_eval, + y_eval, + hyperparameter_space, + best_score, ) # Callback function defined outside the loop so that its modifications affect the outer scope. 
@@ -321,9 +329,7 @@ def callback(study, trial): f"Starting batch {batch + 1}/{num_batches} with new sampler (seed={random_seed})" ) # Pass the lambda function wrapping objective - study.optimize( - objective_func, n_trials=batch_size, callbacks=[callback] - ) + study.optimize(objective_func, n_trials=batch_size, callbacks=[callback]) # Merge current batch's top trials with global_top_trials for trial_record in top_trials: @@ -360,7 +366,9 @@ def callback(study, trial): # Objective function now needs to accept the data explicitly -def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space, best_score): +def objective( + trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperparameter_space, best_score +): try: params = base_params.copy() # Extract early_stopping_rounds separately @@ -409,16 +417,14 @@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara # Pruning Callback - Monitor AUC PR on eval set (default name 'validation_0') XGBoostPruningCallback(trial, "validation_0-aucpr") # Train model and get metrics using DataFrames - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, params - ) + model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) recall = metrics.get("recall", 0.0) precision = metrics.get("precision", 0.0) threshold = metrics.get("threshold", 0.5) # Optimize for precision while maintaining minimum recall score = precision if recall >= min_recall else 0.0 - + # Pruning: report the score back to Optuna trial.report(score, step=model.best_iteration if hasattr(model, "best_iteration") else 0) # if trial.should_prune(): @@ -430,7 +436,7 @@ def objective(trial, X_train, y_train, X_test, y_test, X_eval, y_eval, hyperpara for metric_name, metric_value in metrics.items(): trial.set_user_attr(metric_name, metric_value) - + if score > 0.33 and score > best_score: log_to_mlflow(model, metrics, params, experiment_name, X_eval) return 
score @@ -575,7 +581,7 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval logger.warning( "Training model with precision target - Ensure parameters are updated from HPO." ) - + params = base_params.copy() params.update( { @@ -589,26 +595,25 @@ def train_with_precision_target(X_train, y_train, X_test, y_test, X_eval, y_eval "lambda": 11.69, "alpha": 31.5, "scale_pos_weight": 1.84, - "eval_metric": ['aucpr', 'error', 'logloss'], + "eval_metric": ["aucpr", "error", "logloss"], } ) # Train final model with specific parameters - model, metrics = train_model( - X_train, y_train, X_test, y_test, X_eval, y_eval, params - ) + model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) # Log to MLflow using the DataFrame X_eval for signature # log_to_mlflow(model, metrics, params, experiment_name, X_eval) - top_features = compute_permutation_importance(model, X_eval, y_eval, metrics['threshold']) + top_features = compute_permutation_importance(model, X_eval, y_eval, metrics["threshold"]) logger.info(f"Top features: {top_features}") return model, metrics except Exception as e: logger.error(f"Error in precision-focused training: {str(e)}") return None, None + def compute_permutation_importance( model, - X_val: pd.DataFrame, + X_val: pd.DataFrame, y_val: np.ndarray, threshold: float = 0.3, n_repeats: int = 50, @@ -641,13 +646,15 @@ def compute_permutation_importance( drops = [] for i in range(n_repeats): feat_idx = feature_names.index(feat) + 1 - logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i+1}") + logger.info(f"Shuffling feature: {feat} ({feat_idx}) - Repeat: {i + 1}") X_shuffled = X_val.copy() X_shuffled[feat] = np.random.permutation(X_shuffled[feat].values) probs_shuffled = model.predict_proba(X_shuffled)[:, 1] preds_shuffled = (probs_shuffled >= threshold).astype(int) # Calculate precision directly instead of using metric parameter - precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / 
(np.sum(preds_shuffled == 1)) + precision = np.sum((y_val_np == 1) & (preds_shuffled == 1)) / ( + np.sum(preds_shuffled == 1) + ) drop = baseline - precision drops.append(drop) mean_drop = np.mean(drops) @@ -660,25 +667,29 @@ def compute_permutation_importance( logger.info(df_importance.head(number_of_features).to_string(index=False)) return df_importance -def hypertune_with_feature_importance(X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50): + +def hypertune_with_feature_importance( + X_train, y_train, X_test, y_test, X_eval, y_eval, n_trials=50 +): """ Perform hyperparameter optimization with Optuna while tracking feature importances. - + Args: X_train (pd.DataFrame): Training features - y_train (pd.Series): Training labels + y_train (pd.Series): Training labels X_test (pd.DataFrame): Test features y_test (pd.Series): Test labels n_trials (int): Number of optimization trials - + Returns: tuple: (best_params, feature_importance_df) """ logger.info(f"Starting hyperparameter optimization with {n_trials} trials") - + # Store feature importances across trials feature_importances = [] hyperparameter_space = load_hyperparameter_space() + def objective(trial): params = base_params.copy() # Add hyperparameters from config with step size if provided @@ -712,42 +723,41 @@ def objective(trial): param_name, param_config["low"], param_config["high"] ) - # Train model model, metrics = train_model(X_train, y_train, X_test, y_test, X_eval, y_eval, params) - + # Store feature importances for this trial importance_dict = dict(zip(X_train.columns, model.feature_importances_)) feature_importances.append(importance_dict) - - return metrics['precision'] - + + return metrics["precision"] + # Create and run study - study = optuna.create_study(direction='maximize') + study = optuna.create_study(direction="maximize") study.optimize(objective, n_trials=n_trials) - + # Calculate average feature importance across all trials avg_importances = {} for feature in X_train.columns: 
importance_values = [trial_imp[feature] for trial_imp in feature_importances] avg_importances[feature] = np.mean(importance_values) - + # Create DataFrame and sort by importance - importance_df = pd.DataFrame({ - 'feature': list(avg_importances.keys()), - 'importance': list(avg_importances.values()) - }) - importance_df = importance_df.sort_values('importance', ascending=False) - + importance_df = pd.DataFrame( + {"feature": list(avg_importances.keys()), "importance": list(avg_importances.values())} + ) + importance_df = importance_df.sort_values("importance", ascending=False) + # Get top 100 features top_100_features = importance_df.head(100) - + logger.info("Top 100 features by average importance across trials:") for idx, row in top_100_features.iterrows(): logger.info(f"{row['feature']}: {row['importance']:.4f} id: {idx}") - + return study.best_params, importance_df + def main(): """ Main execution function. @@ -780,15 +790,10 @@ def main(): # ) # Run Hyperparameter Optimization - hypertune_xgboost( - X_train, y_train, X_test, y_test, X_eval, y_eval, - experiment_name - ) - + hypertune_xgboost(X_train, y_train, X_test, y_test, X_eval, y_eval, experiment_name) + best_model, best_metrics = train_with_precision_target( - X_train, y_train, - X_test, y_test, - X_eval, y_eval + X_train, y_train, X_test, y_test, X_eval, y_eval ) except Exception as e: diff --git a/src/models/StackedEnsemble/shared/config_loader.py b/src/models/StackedEnsemble/shared/config_loader.py index 405300c..1034347 100644 --- a/src/models/StackedEnsemble/shared/config_loader.py +++ b/src/models/StackedEnsemble/shared/config_loader.py @@ -1,18 +1,21 @@ """Configuration loading and validation utilities.""" from pathlib import Path -from typing import Any +from typing import Any, Optional import yaml -from utils.logger import ExperimentLogger +from src.utils.logger import ExperimentLogger class ConfigurationLoader: """Handles loading and validation of model configurations.""" def __init__( - self, 
model_type: str = None, logger: ExperimentLogger = None, experiment_name: str = None + self, + model_type: Optional[str] = None, + logger: Optional[ExperimentLogger] = None, + experiment_name: Optional[str] = None, ): """Initialize the configuration loader. @@ -28,7 +31,7 @@ def __init__( self.project_root = Path(__file__).parent.parent.parent.parent self.base_config_path = self.project_root / "models" / "StackedEnsemble" / "config" - def load_model_config(self, model_type: str = None) -> dict[str, Any]: + def load_model_config(self, model_type: Optional[str] = None) -> dict[str, Any]: """Load model-specific configuration. Args: @@ -59,7 +62,7 @@ def load_model_config(self, model_type: str = None) -> dict[str, Any]: self.logger.error(f"Error parsing configuration file: {e}") raise - def load_hyperparameter_space(self, model_type: str = None) -> dict[str, Any]: + def load_hyperparameter_space(self, model_type: Optional[str] = None) -> dict[str, Any]: """Load hyperparameter search space configuration. 
Args: diff --git a/src/models/StackedEnsemble/shared/data_loader.py b/src/models/StackedEnsemble/shared/data_loader.py index c624315..08758c3 100644 --- a/src/models/StackedEnsemble/shared/data_loader.py +++ b/src/models/StackedEnsemble/shared/data_loader.py @@ -1,4 +1,5 @@ """Data loading utilities for the stacked ensemble.""" + import numpy as np import pandas as pd diff --git a/src/models/StackedEnsemble/shared/data_loader_new.py b/src/models/StackedEnsemble/shared/data_loader_new.py index 594290f..8f603f4 100644 --- a/src/models/StackedEnsemble/shared/data_loader_new.py +++ b/src/models/StackedEnsemble/shared/data_loader_new.py @@ -1,4 +1,5 @@ """Data loading utilities for the stacked ensemble.""" + import numpy as np import pandas as pd diff --git a/src/models/StackedEnsemble/shared/hypertuner_utils.py b/src/models/StackedEnsemble/shared/hypertuner_utils.py index 869cee9..3b644cd 100644 --- a/src/models/StackedEnsemble/shared/hypertuner_utils.py +++ b/src/models/StackedEnsemble/shared/hypertuner_utils.py @@ -43,6 +43,7 @@ logger = ExperimentLogger(experiment_name="hypertuner_utils") + def predict(model: Any, X: Union[pd.DataFrame, np.ndarray], threshold: float = 0.5) -> np.ndarray: """ Generate binary predictions using a trained model. 
@@ -284,5 +285,3 @@ def calculate_feature_importance( except Exception as e: logger.error(f"Error extracting feature importance: {str(e)}") return pd.DataFrame(columns=["Feature", "Importance"]) - - diff --git a/src/models/StackedEnsemble/shared/validation.py b/src/models/StackedEnsemble/shared/validation.py index ed201fe..c7db3ef 100644 --- a/src/models/StackedEnsemble/shared/validation.py +++ b/src/models/StackedEnsemble/shared/validation.py @@ -5,7 +5,7 @@ import sys import time from pathlib import Path -from typing import Any +from typing import Any, Optional import optuna from optuna.pruners import MedianPruner @@ -25,7 +25,7 @@ class OptunaValidator: """Optuna-based hyperparameter optimization with validation.""" - def __init__(self, model_type: str = None, logger: ExperimentLogger = None): + def __init__(self, model_type: Optional[str] = None, logger: Optional[ExperimentLogger] = None): """Initialize validator. Args: @@ -38,6 +38,7 @@ def __init__(self, model_type: str = None, logger: ExperimentLogger = None): # Load optimization configuration self.hyperparameter_space = self.config_loader.load_hyperparameter_space(model_type) + self.optimization_config = self.hyperparameter_space.get("optimization", {}) # Set up study parameters self.study_name = self.optimization_config.get("study_name", f"{model_type}_study") @@ -193,7 +194,7 @@ def optimize_hyperparameters( model: Any, X_train: Any, y_train: Any, - X_val: Any, + x_val: Any, y_val: Any, X_test: Any, y_test: Any, @@ -203,7 +204,7 @@ def optimize_hyperparameters( Args: model: Model instance X_train, y_train: Training data - X_val, y_val: Validation data + x_val, y_val: Validation data X_test, y_test: Test data Returns: @@ -218,7 +219,7 @@ def objective(trial: Trial) -> float: try: # Train and evaluate model - metrics = model.fit(X_train, y_train, X_val, y_val, X_test, y_test, **params) + metrics = model.fit(X_train, y_train, x_val, y_val, X_test, y_test, **params) # Get optimization metric score = 
metrics.get(self.metric, 0.0) diff --git a/src/models/ensemble/data_utils.py b/src/models/ensemble/data_utils.py index dde1bc8..af32437 100644 --- a/src/models/ensemble/data_utils.py +++ b/src/models/ensemble/data_utils.py @@ -35,7 +35,7 @@ def prepare_data(X: pd.DataFrame, selected_features: list[str]) -> pd.DataFrame: X_selected = X[selected_features].copy() # Handle infinite values X = X.replace([np.inf, -np.inf], np.nan) - + # Fill NaN values X = X.fillna(0.0) # Fill missing values with appropriate strategies @@ -51,9 +51,10 @@ def prepare_data(X: pd.DataFrame, selected_features: list[str]) -> pd.DataFrame: raise ValueError(f"Error in filling missing values: {str(e)} for column: {col}") from e # Convert all columns to float64 to ensure consistent data types X_selected = X_selected.astype("float64") - + return X_selected + def apply_adasyn_resampling( X_train: pd.DataFrame, y_train: pd.Series, @@ -133,6 +134,7 @@ def apply_adasyn_resampling( logger.error(f"ADASYN resampling failed: {str(e)}. Using original data.") return X_train, y_train + def balance_and_clean_dataset( X_train: pd.DataFrame, y_train: pd.Series, @@ -251,6 +253,7 @@ def balance_and_clean_dataset( logger.error(f"Dataset balancing failed: {str(e)}. Using cleaned data without balancing.") return X_clean, y_clean + def select_features_by_importance( X: pd.DataFrame, y: pd.Series, diff --git a/src/models/ensemble/diagnostics.py b/src/models/ensemble/diagnostics.py index 1fd9fb6..944eb83 100644 --- a/src/models/ensemble/diagnostics.py +++ b/src/models/ensemble/diagnostics.py @@ -4,7 +4,7 @@ Functions for diagnosing and explaining model predictions. 
""" -from typing import Optional +from typing import TYPE_CHECKING, Optional import mlflow import numpy as np @@ -12,22 +12,251 @@ import shap from sklearn.metrics import confusion_matrix +if TYPE_CHECKING: + from src.utils.logger import ExperimentLogger + from src.utils.logger import ExperimentLogger +# Constants +DEFAULT_LOG_DIR = "./logs/ensemble_model_diagnostics" + + +def _prepare_shap_data(x_val: pd.DataFrame, max_samples: int = 500) -> pd.DataFrame: + """Prepare data for SHAP analysis by subsampling if needed.""" + if len(x_val) > max_samples: + return x_val.sample(max_samples, random_state=42) + return x_val + + +def _create_ensemble_meta_features( + model, x_sample: pd.DataFrame, logger +) -> tuple[np.ndarray, list[str]]: + """Create meta-features for ensemble model SHAP analysis.""" + # Get prepared data + x_prepared = x_sample + if hasattr(model, "selected_features"): + if isinstance(model.selected_features, list) and len(model.selected_features) > 0: + x_prepared = x_sample[model.selected_features] + + # Generate predictions from base models + xgb_model = getattr(model, "model_xgb_calibrated", None) or model.model_xgb + cat_model = getattr(model, "model_cat_calibrated", None) or model.model_cat + lgb_model = getattr(model, "model_lgb_calibrated", None) or model.model_lgb + extra_model = getattr(model, "model_extra_calibrated", None) or model.model_extra + + if ( + hasattr(model, "extra_base_model_type") + and model.extra_base_model_type in ["mlp", "svm"] + and model.extra_model_scaler is not None + ): + x_scaled = model.extra_model_scaler.transform(x_prepared) + p_extra = extra_model.predict_proba(x_scaled)[:, 1] + else: + p_extra = extra_model.predict_proba(x_prepared)[:, 1] + + p_xgb = xgb_model.predict_proba(x_prepared)[:, 1] + p_cat = cat_model.predict_proba(x_prepared)[:, 1] + p_lgb = lgb_model.predict_proba(x_prepared)[:, 1] + + # Create meta-features from base model predictions + from models.ensemble.meta_features import create_meta_features + + 
dynamic_weights = None + if hasattr(model, "dynamic_weighting") and hasattr(model, "dynamic_weights"): + if model.dynamic_weighting: + dynamic_weights = model.dynamic_weights + + meta_features = create_meta_features(p_xgb, p_cat, p_lgb, p_extra, dynamic_weights) + + feature_names = [ + "prob_xgb", "prob_cat", "prob_lgb", "prob_extra", + "weighted_avg", "diff_xgb_cat", "diff_xgb_lgb", "diff_cat_lgb", + "diff_extra_xgb", "diff_extra_cat", "diff_extra_lgb", + "max_prob", "min_prob", "range_prob", + "rank_xgb", "rank_cat", "rank_lgb", "rank_extra", + "vote_sum", "vote_agreement", + ] + + logger.info(f"Created meta-features with shape: {meta_features.shape}") + return meta_features, feature_names + + +def _compute_shap_values(target_model, x_for_shap: np.ndarray, logger) -> np.ndarray: + """Compute SHAP values for the given model and data.""" + try: + if hasattr(target_model, "tree_method"): + # For tree-based models (XGBoost, LightGBM) + explainer = shap.TreeExplainer(target_model) + else: + # For other models + explainer = shap.KernelExplainer( + target_model.predict_proba, shap.sample(x_for_shap, 100, random_state=42) + ) + + # Calculate SHAP values + shap_values = explainer.shap_values(x_for_shap) + + # For binary classifiers, shap_values might be a list with one element + if isinstance(shap_values, list): + # Take SHAP values for positive class (class 1) + shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0] + + # Check if shap_values is a numpy array before proceeding + if not isinstance(shap_values, np.ndarray): + raise TypeError("SHAP values must be a numpy array") + + return shap_values + + except Exception as e: + logger.error(f"SHAP computation failed: {str(e)}") + raise + + +def _calculate_feature_importance( + shap_values: np.ndarray, feature_names: list[str], logger +) -> dict[str, float]: + """Calculate feature importance from SHAP values.""" + # Calculate mean absolute SHAP values for feature importance + feature_importance = 
np.abs(shap_values).mean(axis=0) + + # Ensure feature_importance is 1D and matches feature_names length + if feature_importance.ndim > 1: + feature_importance = feature_importance.mean(axis=1) + + if len(feature_importance) != len(feature_names): + logger.warning( + f"Feature importance length ({len(feature_importance)}) doesn't match feature names length ({len(feature_names)})" + ) + # Adjust feature_names if needed + feature_names = ( + feature_names[: len(feature_importance)] + if len(feature_names) > len(feature_importance) + else feature_names + ) + + # Create feature importance dictionary + importance_dict = dict(zip(feature_names, feature_importance)) + + # Sort features by importance + sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True) + + # Log top 10 features + logger.info("Top feature importances (SHAP):") + for feature, importance in sorted_importance[:10]: + logger.info(f" {feature}: {importance:.6f}") + mlflow.log_metric(f"shap_importance_{feature}", importance) + + return dict(sorted_importance) + + +def _get_model_predictions(model, x_val: pd.DataFrame, threshold: float) -> tuple[np.ndarray, np.ndarray]: + """Get probability predictions and binary predictions from model.""" + # Get predictions + y_prob = model.predict_proba(x_val) + + # Handle different return shapes from predict_proba + if isinstance(y_prob, tuple): + # Some models return (neg_class_prob, pos_class_prob) + y_prob = y_prob[1] + elif y_prob.ndim > 1 and y_prob.shape[1] > 1: + # Some models return [neg_class_prob, pos_class_prob] for each sample + y_prob = y_prob[:, 1] + + # Convert to binary predictions using threshold + y_pred = (y_prob >= threshold).astype(int) + + return y_prob, y_pred + + +def _compute_confusion_metrics(y_val: pd.Series, y_pred: np.ndarray) -> dict: + """Compute confusion matrix and basic classification metrics.""" + # Compute confusion matrix + tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel() + + # Compute various 
metrics + total = tn + fp + fn + tp + accuracy = (tp + tn) / total + precision = tp / (tp + fp) if (tp + fp) > 0 else 0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 0 + f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 + + return { + "confusion_matrix": (tn, fp, fn, tp), + "metrics": { + "accuracy": accuracy, + "precision": precision, + "recall": recall, + "f1_score": f1_score, + "total": total, + } + } + + +def _analyze_errors(y_prob: np.ndarray, y_pred: np.ndarray, y_val: pd.Series, threshold: float) -> dict: + """Analyze prediction errors and create detailed error analysis.""" + # Calculate class balance + pos_rate = np.mean(y_val) + + # Find incorrect predictions + incorrect_mask = y_pred != y_val + incorrect_indices = np.nonzero(incorrect_mask)[0] + + # False positives and false negatives + fp_mask = (y_pred == 1) & (y_val == 0) + fn_mask = (y_pred == 0) & (y_val == 1) + fp_indices = np.nonzero(fp_mask)[0] + fn_indices = np.nonzero(fn_mask)[0] + + # Note: fp_indices and fn_indices can be used to extract examples if needed + + # Analyze false positives (highest probability first) + fp_probs = y_prob[fp_mask] + fp_indices_sorted = np.argsort(-fp_probs) # Sort in descending order + fp_analysis = [] + + for i in range(min(5, len(fp_indices_sorted))): + idx = fp_indices[fp_indices_sorted[i]] + prob = y_prob[idx] + fp_analysis.append( + {"idx": idx, "probability": prob, "threshold_difference": prob - threshold} + ) + + # Analyze false negatives (closest to threshold first) + fn_probs = y_prob[fn_mask] + fn_indices_sorted = np.argsort(threshold - fn_probs) # Sort by proximity to threshold + fn_analysis = [] + + for i in range(min(5, len(fn_indices_sorted))): + idx = fn_indices[fn_indices_sorted[i]] + prob = y_prob[idx] + fn_analysis.append( + {"idx": idx, "probability": prob, "threshold_difference": threshold - prob} + ) + + return { + "class_balance": pos_rate, + "error_count": len(incorrect_indices), + "error_rate": 
len(incorrect_indices) / len(y_val), + "false_positives": len(fp_indices), + "false_negatives": len(fn_indices), + "fp_analysis": fp_analysis, + "fn_analysis": fn_analysis, + } + def detect_data_leakage( - X_train: pd.DataFrame, - X_test: pd.DataFrame, - X_val: pd.DataFrame, - logger: ExperimentLogger = None, + x_train: pd.DataFrame, + x_test: pd.DataFrame, + x_val: pd.DataFrame, + logger: Optional["ExperimentLogger"] = None, ) -> dict: """ Check for potential data leakage between datasets by detecting duplicate rows. Args: - X_train: Training dataset - X_test: Test dataset - X_val: Validation dataset + x_train: Training dataset + x_test: Test dataset + x_val: Validation dataset logger: Logger instance Returns: @@ -36,15 +265,15 @@ def detect_data_leakage( if logger is None: logger = ExperimentLogger( experiment_name="ensemble_model_diagnostics", - log_dir="./logs/ensemble_model_diagnostics", + log_dir=DEFAULT_LOG_DIR, ) logger.info("Checking for data leakage between datasets...") # Create unique identifier for each row (converting to tuples) - train_tuples = set(map(tuple, X_train.values)) - test_tuples = set(map(tuple, X_test.values)) - val_tuples = set(map(tuple, X_val.values)) + train_tuples = set(map(tuple, x_train.values)) + test_tuples = set(map(tuple, x_test.values)) + val_tuples = set(map(tuple, x_val.values)) # Find overlaps train_test_overlap = train_tuples.intersection(test_tuples) @@ -90,13 +319,13 @@ def detect_data_leakage( return results -def explain_predictions(model, X_val: pd.DataFrame, logger: ExperimentLogger = None) -> dict: +def explain_predictions(model, x_val: pd.DataFrame, logger: Optional["ExperimentLogger"] = None) -> dict: """ Generate feature importance explanations using SHAP values on validation data. 
Args: model: Trained model with predict_proba method - X_val: Validation features + x_val: Validation features logger: Logger instance Returns: @@ -105,159 +334,45 @@ def explain_predictions(model, X_val: pd.DataFrame, logger: ExperimentLogger = N if logger is None: logger = ExperimentLogger( experiment_name="ensemble_model_diagnostics", - log_dir="./logs/ensemble_model_diagnostics", + log_dir=DEFAULT_LOG_DIR, ) logger.info("Generating model explanations with SHAP...") - # Limit sample size for SHAP analysis to avoid excessive computation - max_shap_samples = min(500, len(X_val)) - - # Subsample data if needed - if len(X_val) > max_shap_samples: - X_sample = X_val.sample(max_shap_samples, random_state=42) - logger.info(f"Using {max_shap_samples} random samples for SHAP analysis.") - else: - X_sample = X_val - - # Check if the model has the meta_learner attribute (ensemble model) - if hasattr(model, "meta_learner") and model.meta_learner is not None: - # For ensemble model, use the meta-learner for SHAP analysis - target_model = model.meta_learner - logger.info("Using meta-learner for SHAP explanations.") - - # IMPORTANT FIX: Transform original features into meta-features first - logger.info("Transforming features to meta-features for ensemble explanation") - - try: - # Get prepared data - X_prepared = X_sample - if hasattr(model, "selected_features"): - # Prepare input data using selected features if available - if isinstance(model.selected_features, list) and len(model.selected_features) > 0: - X_prepared = X_sample[model.selected_features] - - # Generate predictions from base models - # Use calibrated models if available - xgb_model = getattr(model, "model_xgb_calibrated", None) or model.model_xgb - cat_model = getattr(model, "model_cat_calibrated", None) or model.model_cat - lgb_model = getattr(model, "model_lgb_calibrated", None) or model.model_lgb - extra_model = getattr(model, "model_extra_calibrated", None) or model.model_extra - - # Get predictions from 
base models - if ( - hasattr(model, "extra_base_model_type") - and model.extra_base_model_type in ["mlp", "svm"] - and model.extra_model_scaler is not None - ): - X_scaled = model.extra_model_scaler.transform(X_prepared) - p_extra = extra_model.predict_proba(X_scaled)[:, 1] - else: - p_extra = extra_model.predict_proba(X_prepared)[:, 1] - - p_xgb = xgb_model.predict_proba(X_prepared)[:, 1] - p_cat = cat_model.predict_proba(X_prepared)[:, 1] - p_lgb = lgb_model.predict_proba(X_prepared)[:, 1] - - # Create meta-features from base model predictions - from models.ensemble.meta_features import create_meta_features - - dynamic_weights = None - if hasattr(model, "dynamic_weighting") and hasattr(model, "dynamic_weights"): - if model.dynamic_weighting: - dynamic_weights = model.dynamic_weights - - meta_features = create_meta_features(p_xgb, p_cat, p_lgb, p_extra, dynamic_weights) - - # Use meta-features for SHAP analysis instead of original features - X_for_shap = meta_features - feature_names = [ - "prob_xgb", - "prob_cat", - "prob_lgb", - "prob_extra", - "weighted_avg", - "diff_xgb_cat", - "diff_xgb_lgb", - "diff_cat_lgb", - "diff_extra_xgb", - "diff_extra_cat", - "diff_extra_lgb", - "max_prob", - "min_prob", - "range_prob", - "rank_xgb", - "rank_cat", - "rank_lgb", - "rank_extra", - "vote_sum", - "vote_agreement", - ] - - logger.info(f"Created meta-features with shape: {X_for_shap.shape}") - except Exception as e: - logger.error(f"Failed to create meta-features: {str(e)}") - return {"error": f"Meta-feature creation failed: {str(e)}", "feature_importance": {}} - else: - # Otherwise use the model directly with original features - target_model = model - X_for_shap = X_sample - feature_names = X_sample.columns.tolist() + # Prepare data for SHAP analysis + x_sample = _prepare_shap_data(x_val) + if len(x_sample) < len(x_val): + logger.info(f"Using {len(x_sample)} random samples for SHAP analysis.") - # Initialize SHAP explainer based on model type try: - if 
hasattr(target_model, "tree_method"): - # For tree-based models (XGBoost, LightGBM) - explainer = shap.TreeExplainer(target_model) + # Check if the model has the meta_learner attribute (ensemble model) + if hasattr(model, "meta_learner") and model.meta_learner is not None: + # For ensemble model, use meta-features for SHAP analysis + target_model = model.meta_learner + logger.info("Using meta-learner for SHAP explanations.") + + logger.info("Transforming features to meta-features for ensemble explanation") + x_for_shap, feature_names = _create_ensemble_meta_features(model, x_sample, logger) else: - # For other models - explainer = shap.KernelExplainer( - target_model.predict_proba, shap.sample(X_for_shap, 100, random_state=42) - ) - - # Calculate SHAP values - shap_values = explainer.shap_values(X_for_shap) - - # For binary classifiers, shap_values might be a list with one element - if isinstance(shap_values, list): - # Take SHAP values for positive class (class 1) - shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0] - - # Check if shap_values is a numpy array before proceeding - if not isinstance(shap_values, np.ndarray): - raise TypeError("SHAP values must be a numpy array") - - # Calculate mean absolute SHAP values for feature importance - feature_importance = np.abs(shap_values).mean(axis=0) - - # Ensure feature_importance is 1D and matches feature_names length - if feature_importance.ndim > 1: - feature_importance = feature_importance.mean(axis=1) - - if len(feature_importance) != len(feature_names): - logger.warning( - f"Feature importance length ({len(feature_importance)}) doesn't match feature names length ({len(feature_names)})" - ) - # Adjust feature_names if needed - feature_names = ( - feature_names[: len(feature_importance)] - if len(feature_names) > len(feature_importance) - else feature_names - ) - - # Create feature importance dictionary - importance_dict = dict(zip(feature_names, feature_importance)) + # Otherwise use the 
model directly with original features + target_model = model + x_for_shap = x_sample.values if hasattr(x_sample, 'values') else x_sample + feature_names = x_sample.columns.tolist() + + # Compute SHAP values + if isinstance(x_for_shap, pd.DataFrame): + x_for_shap_array = x_for_shap.values + elif isinstance(x_for_shap, np.ndarray): + x_for_shap_array = x_for_shap + else: + x_for_shap_array = np.asarray(x_for_shap) - # Sort features by importance - sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True) + shap_values = _compute_shap_values(target_model, x_for_shap_array, logger) - # Log top 10 features - logger.info("Top feature importances (SHAP):") - for feature, importance in sorted_importance[:10]: - logger.info(f" {feature}: {importance:.6f}") - mlflow.log_metric(f"shap_importance_{feature}", importance) + # Calculate feature importance + feature_importance = _calculate_feature_importance(shap_values, feature_names, logger) - return {"feature_importance": dict(sorted_importance), "shap_values": shap_values} + return {"feature_importance": feature_importance, "shap_values": shap_values} except Exception as e: logger.error(f"SHAP explanation failed: {str(e)}") @@ -266,17 +381,17 @@ def explain_predictions(model, X_val: pd.DataFrame, logger: ExperimentLogger = N def analyze_prediction_errors( model, - X_val: pd.DataFrame, + x_val: pd.DataFrame, y_val: pd.Series, threshold: Optional[float] = None, - logger: ExperimentLogger = None, + logger: Optional["ExperimentLogger"] = None, ) -> dict: """ Analyze prediction errors on the validation set (most recent data). 
Args: model: Trained model with predict_proba method - X_val: Validation features + x_val: Validation features y_val: Validation target values threshold: Classification threshold (default: model.optimal_threshold or 0.5) logger: Logger instance @@ -287,7 +402,7 @@ def analyze_prediction_errors( if logger is None: logger = ExperimentLogger( experiment_name="ensemble_model_diagnostics", - log_dir="./logs/ensemble_model_diagnostics", + log_dir=DEFAULT_LOG_DIR, ) logger.info("Analyzing prediction errors...") @@ -301,79 +416,28 @@ def analyze_prediction_errors( logger.info(f"Using classification threshold: {threshold:.4f}") - # Get predictions - y_prob = model.predict_proba(X_val) - - # Handle different return shapes from predict_proba - if isinstance(y_prob, tuple): - # Some models return (neg_class_prob, pos_class_prob) - y_prob = y_prob[1] - elif y_prob.ndim > 1 and y_prob.shape[1] > 1: - # Some models return [neg_class_prob, pos_class_prob] for each sample - y_prob = y_prob[:, 1] - - # Convert to binary predictions using threshold - y_pred = (y_prob >= threshold).astype(int) + # Ensure threshold is not None + assert threshold is not None, "Threshold must be provided" - # Compute confusion matrix - tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel() - - # Compute various metrics - total = tn + fp + fn + tp - accuracy = (tp + tn) / total - precision = tp / (tp + fp) if (tp + fp) > 0 else 0 - recall = tp / (tp + fn) if (tp + fn) > 0 else 0 - f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 - - # Calculate class balance - pos_rate = np.mean(y_val) - - # Find incorrect predictions - incorrect_mask = y_pred != y_val - incorrect_indices = np.where(incorrect_mask)[0] - - # False positives and false negatives - fp_mask = (y_pred == 1) & (y_val == 0) - fn_mask = (y_pred == 0) & (y_val == 1) - fp_indices = np.where(fp_mask)[0] - fn_indices = np.where(fn_mask)[0] - - # Extract false positive and false negative examples - 
X_val.iloc[fp_indices] if len(fp_indices) > 0 else pd.DataFrame() - X_val.iloc[fn_indices] if len(fn_indices) > 0 else pd.DataFrame() - - # Analyze false positives (highest probability first) - fp_probs = y_prob[fp_mask] - fp_indices_sorted = np.argsort(-fp_probs) # Sort in descending order - fp_analysis = [] - - for i in range(min(5, len(fp_indices_sorted))): - idx = fp_indices[fp_indices_sorted[i]] - prob = y_prob[idx] - fp_analysis.append( - {"idx": idx, "probability": prob, "threshold_difference": prob - threshold} - ) + # Get predictions + y_prob, y_pred = _get_model_predictions(model, x_val, threshold) - # Analyze false negatives (closest to threshold first) - fn_probs = y_prob[fn_mask] - fn_indices_sorted = np.argsort(threshold - fn_probs) # Sort by proximity to threshold - fn_analysis = [] + # Compute confusion matrix and metrics + confusion_result = _compute_confusion_metrics(y_val, y_pred) + tn, fp, fn, tp = confusion_result["confusion_matrix"] + metrics = confusion_result["metrics"] - for i in range(min(5, len(fn_indices_sorted))): - idx = fn_indices[fn_indices_sorted[i]] - prob = y_prob[idx] - fn_analysis.append( - {"idx": idx, "probability": prob, "threshold_difference": threshold - prob} - ) + # Analyze errors + error_analysis = _analyze_errors(y_prob, y_pred, y_val, threshold) # Log results logger.info("Error analysis results:") - logger.info(f" Total samples: {total}") - logger.info(f" Class balance: {pos_rate:.2%} positive") - logger.info(f" Accuracy: {accuracy:.4f}") - logger.info(f" Precision: {precision:.4f}") - logger.info(f" Recall: {recall:.4f}") - logger.info(f" F1 Score: {f1_score:.4f}") + logger.info(f" Total samples: {metrics['total']}") + logger.info(f" Class balance: {error_analysis['class_balance']:.2%} positive") + logger.info(f" Accuracy: {metrics['accuracy']:.4f}") + logger.info(f" Precision: {metrics['precision']:.4f}") + logger.info(f" Recall: {metrics['recall']:.4f}") + logger.info(f" F1 Score: {metrics['f1_score']:.4f}") 
logger.info(f" Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}") logger.info(f" False Positive Rate: {fp / (fp + tn):.4f}") logger.info(f" False Negative Rate: {fn / (fn + tp):.4f}") @@ -381,10 +445,10 @@ def analyze_prediction_errors( # Log to MLflow mlflow.log_metrics( { - "error_analysis_accuracy": accuracy, - "error_analysis_precision": precision, - "error_analysis_recall": recall, - "error_analysis_f1": f1_score, + "error_analysis_accuracy": metrics["accuracy"], + "error_analysis_precision": metrics["precision"], + "error_analysis_recall": metrics["recall"], + "error_analysis_f1": metrics["f1_score"], "error_analysis_fps": fp, "error_analysis_fns": fn, } @@ -394,19 +458,19 @@ def analyze_prediction_errors( return { "confusion_matrix": {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)}, "metrics": { - "accuracy": accuracy, - "precision": precision, - "recall": recall, - "f1_score": f1_score, + "accuracy": metrics["accuracy"], + "precision": metrics["precision"], + "recall": metrics["recall"], + "f1_score": metrics["f1_score"], "false_positive_rate": fp / (fp + tn) if (fp + tn) > 0 else 0, "false_negative_rate": fn / (fn + tp) if (fn + tp) > 0 else 0, }, - "class_balance": pos_rate, + "class_balance": error_analysis["class_balance"], "threshold": threshold, - "error_count": len(incorrect_indices), - "error_rate": len(incorrect_indices) / total, - "false_positives": len(fp_indices), - "false_negatives": len(fn_indices), - "fp_analysis": fp_analysis, - "fn_analysis": fn_analysis, + "error_count": error_analysis["error_count"], + "error_rate": error_analysis["error_rate"], + "false_positives": error_analysis["false_positives"], + "false_negatives": error_analysis["false_negatives"], + "fp_analysis": error_analysis["fp_analysis"], + "fn_analysis": error_analysis["fn_analysis"], } diff --git a/src/models/ensemble/ensemble_model.py b/src/models/ensemble/ensemble_model.py index 92014b4..7726c5e 100644 --- a/src/models/ensemble/ensemble_model.py +++ 
b/src/models/ensemble/ensemble_model.py @@ -9,6 +9,7 @@ import os import sys from pathlib import Path +from typing import Optional import numpy as np import pandas as pd @@ -28,11 +29,11 @@ import random +import keras import tensorflow as tf +from keras import layers, regularizers from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -from tensorflow import keras -from tensorflow.keras import layers, regularizers # Add project root to Python path try: @@ -45,8 +46,8 @@ except Exception as e: print(f"Error setting project root path: {e}") # Fallback to current directory if path resolution fails - sys.path.append(os.getcwd().parent) - print(f"Current directory ensemble_model: {os.getcwd().parent}") + sys.path.append(str(Path(os.getcwd()).parent)) + print(f"Current directory ensemble_model: {Path(os.getcwd()).parent}") # Local imports # Module imports @@ -96,7 +97,7 @@ class EnsembleModel(BaseEstimator, ClassifierMixin): def __init__( self, - logger: ExperimentLogger = None, + logger: Optional[ExperimentLogger] = None, calibrate: bool = False, calibration_method: str = "sigmoid", individual_thresholding: bool = False, @@ -107,7 +108,7 @@ def __init__( complexity_penalty: float = 0.01, target_precision: float = 0.50, required_recall: float = 0.25, - X_train: pd.DataFrame = None, + X_train: Optional[pd.DataFrame] = None, ): """ Initialize the EnsembleModel with configuration parameters. 
@@ -129,7 +130,7 @@ def __init__( experiment_name="ensemble_model_improved", log_dir="./logs/ensemble_model_improved" ) # Load selected features (assumed common to all models) - self.selected_features = import_selected_features_ensemble("all") + self.selected_features = list(import_selected_features_ensemble("all")) self.required_recall = required_recall self.sampling_strategy = sampling_strategy # For ADASYN resampling self.complexity_penalty = complexity_penalty # For regularization @@ -195,10 +196,10 @@ def __init__( cat_smooth=16.8, max_bin=590, ) - self.xgb_features = import_selected_features_ensemble(model_type="xgb") - self.cat_features = import_selected_features_ensemble(model_type="cat") - self.lgb_features = import_selected_features_ensemble(model_type="lgbm") - self.rf_features = import_selected_features_ensemble(model_type="rf") + self.xgb_features = list(import_selected_features_ensemble(model_type="xgb")) + self.cat_features = list(import_selected_features_ensemble(model_type="cat")) + self.lgb_features = list(import_selected_features_ensemble(model_type="lgbm")) + self.rf_features = list(import_selected_features_ensemble(model_type="rf")) # Initialize the extra base model based on the selected type with reduced complexity self.extra_base_model_type = extra_base_model_type.lower() if self.extra_base_model_type == "random_forest": @@ -207,7 +208,7 @@ def __init__( max_depth=13, min_samples_split=10, min_samples_leaf=6, - max_features=0.4, + max_features=0.4, # type: ignore bootstrap=True, class_weight={0: 1.0, 1: 2.0}, criterion="entropy", @@ -226,34 +227,21 @@ def __init__( ) self.logger.info("Extra base model initialized as SVC.") elif self.extra_base_model_type == "mlp": - self.model_extra = keras.Sequential() - self.model_extra.add(layers.InputLayer(shape=(X_train.shape[1],))) - - # Add hidden layers - for _ in range(1): # 1 hidden layer - self.model_extra.add( - layers.Dense( - 120, # 120 neurons per layer - activation="tanh", - 
kernel_regularizer=regularizers.l1_l2( - l1=0.0006252292488020048, l2=0.0010179804312458536 - ), - ) - ) - self.model_extra.add(layers.BatchNormalization()) - self.model_extra.add(layers.Dropout(0.2685783444324335)) # Updated dropout rate - - # Output layer for binary classification - self.model_extra.add(layers.Dense(1, activation="sigmoid")) - optimizer = keras.optimizers.Adam( - learning_rate=0.0008586241362721754, beta_1=0.9, beta_2=0.999, epsilon=1e-8 - ) - self.model_extra.compile( - optimizer=optimizer, - loss="binary_crossentropy", - metrics=["accuracy", keras.metrics.AUC(name="auc")], - ) - self.logger.info("Extra base model initialized as MLPClassifier.") + # Defer MLP initialization until training when we have data shape + self.model_extra = None + self.mlp_config = { + "hidden_layers": 1, + "neurons_per_layer": 120, + "activation": "tanh", + "l1_reg": 0.0006252292488020048, + "l2_reg": 0.0010179804312458536, + "dropout_rate": 0.2685783444324335, + "learning_rate": 0.0008586241362721754, + "beta_1": 0.9, + "beta_2": 0.999, + "epsilon": 1e-8, + } + self.logger.info("Extra base model configured as MLPClassifier (deferred initialization).") else: raise ValueError(f"Unknown extra_base_model_type: {self.extra_base_model_type}") @@ -282,79 +270,111 @@ def __init__( self.model_lgb_calibrated = None self.model_extra_calibrated = None self.extra_model_scaler = None + self.mlp_config = None + + def _initialize_mlp_model(self, input_shape: int): + """Initialize the Keras MLP model with the given input shape.""" + if self.mlp_config is None: + raise ValueError("MLP configuration not found. 
This should not happen.") + + model = keras.Sequential() + model.add(layers.InputLayer(shape=(input_shape,))) + + # Add hidden layers + for _ in range(self.mlp_config["hidden_layers"]): + model.add( + layers.Dense( + self.mlp_config["neurons_per_layer"], + activation=self.mlp_config["activation"], + kernel_regularizer=regularizers.l1_l2( + l1=self.mlp_config["l1_reg"], + l2=self.mlp_config["l2_reg"] + ), + ) + ) + model.add(layers.BatchNormalization()) + model.add(layers.Dropout(self.mlp_config["dropout_rate"])) - def train( - self, - X_train, - y_train, - X_val=None, - y_val=None, - X_test=None, - y_test=None, - split_validation=True, - val_size=0.2, - ) -> dict: - """ - Train the ensemble model including base models and meta-learner. + # Output layer for binary classification + model.add(layers.Dense(1, activation="sigmoid")) - Args: - X_train: Training features - y_train: Training target values - X_val: Optional validation features - y_val: Optional validation target values - X_test: Optional test features - y_test: Optional test target values - split_validation: Whether to split training data for validation - val_size: Size of validation split if split_validation is True + optimizer = keras.optimizers.Adam( + learning_rate=self.mlp_config["learning_rate"], + beta_1=self.mlp_config["beta_1"], + beta_2=self.mlp_config["beta_2"], + epsilon=self.mlp_config["epsilon"] + ) - Returns: - Dictionary with training results and metrics - """ - self.logger.info("Starting ensemble model training...") + model.compile( + optimizer=optimizer, # type: ignore + loss="binary_crossentropy", + metrics=["accuracy", keras.metrics.AUC(name="auc")], + ) - # Step 1: Data preparation and validation - X_train_prepared = prepare_data(X_train, self.selected_features) + return model + + def _prepare_training_data( + self, + X_train, + y_train, + x_val, + y_val, + X_test, + y_test, + split_validation, + val_size, + ): + """Prepare and validate training data.""" + x_train_prepared = 
prepare_data(X_train, self.selected_features) # Check for data leakage if validation data is provided - if X_val is not None and X_test is not None: - X_val_prepared = prepare_data(X_val, self.selected_features) - X_test_prepared = prepare_data(X_test, self.selected_features) + if x_val is not None and X_test is not None: + x_val_prepared = prepare_data(x_val, self.selected_features) + x_test_prepared = prepare_data(X_test, self.selected_features) leakage_results = detect_data_leakage( - X_train_prepared, X_test_prepared, X_val_prepared, self.logger + x_train_prepared, x_test_prepared, x_val_prepared, self.logger ) if leakage_results["overlap_percentage"] > 5.0: self.logger.warning( f"Significant data leakage detected: {leakage_results['overlap_percentage']:.2f}%" ) + else: + x_val_prepared = None + x_test_prepared = None - # If validation data is not provided, split the training data - if split_validation or X_val is None or y_val is None: + # Handle validation/test data splitting + if split_validation or x_val is None or y_val is None: self.logger.info("Splitting training data for validation...") - X_train_split, X_val_split, y_train_split, y_val_split = train_test_split( - X_train_prepared, y_train, test_size=val_size, random_state=19, stratify=y_train + x_train_split, x_val_split, y_train_split, y_val_split = train_test_split( + x_train_prepared, y_train, test_size=val_size, random_state=19, stratify=y_train ) - X_train_prepared, y_train = X_train_split, y_train_split - X_val_prepared, y_val = X_val_split, y_val_split - else: - X_val_prepared = prepare_data(X_val, self.selected_features) + x_train_prepared, y_train = x_train_split, y_train_split + x_val_prepared, y_val = x_val_split, y_val_split + elif x_val_prepared is None: + x_val_prepared = prepare_data(x_val, self.selected_features) - # Split training data further if no test data is provided if X_test is None or y_test is None: self.logger.info("Creating test set from training data...") - X_train_split, 
X_test_prepared, y_train_split, y_test = train_test_split( - X_train_prepared, y_train, test_size=val_size, random_state=43, stratify=y_train + x_train_split, x_test_split, y_train_split, y_test_split = train_test_split( + x_train_prepared, y_train, test_size=val_size, random_state=43, stratify=y_train ) - X_train_prepared, y_train = X_train_split, y_train_split - else: - X_test_prepared = prepare_data(X_test, self.selected_features) + x_train_prepared, y_train = x_train_split, y_train_split + x_test_prepared, y_test = x_test_split, y_test_split + elif x_test_prepared is None: + x_test_prepared = prepare_data(X_test, self.selected_features) - # Step 2: Skip class imbalance handling - using original data - self.logger.info("Skipping ADASYN resampling, using original training data...") - X_train_resampled, y_train_resampled = X_train_prepared, y_train + return x_train_prepared, y_train, x_val_prepared, y_val, x_test_prepared, y_test + + def _train_base_models(self, x_train_resampled, y_train_resampled, x_val_prepared, y_val, x_test_prepared, y_test): + """Train base models and optionally calibrate them.""" + # Initialize MLP model if needed + if self.extra_base_model_type == "mlp" and self.model_extra is None: + self.logger.info("Initializing MLP model with input shape...") + self.model_extra = self._initialize_mlp_model(x_train_resampled.shape[1]) - # Step 3: Initialize base models dictionary + # Initialize base models dictionary base_models = { "xgb": self.model_xgb, "cat": self.model_cat, @@ -365,36 +385,41 @@ def train( # For MLP or SVM, we need scaling if self.extra_base_model_type in ["mlp", "svm"]: self.logger.info(f"Applying StandardScaler for {self.extra_base_model_type} model...") - self.extra_model_scaler = StandardScaler().fit(X_train_resampled) + self.extra_model_scaler = StandardScaler().fit(x_train_resampled) base_models["extra_scaler"] = self.extra_model_scaler - # Step 4: Train base models + # Train base models self.logger.info("Training base 
models...") trained_models = train_base_models( base_models, - X_train_resampled, + x_train_resampled, y_train_resampled, - X_test_prepared, + x_test_prepared, y_test, - X_val_prepared, + x_val_prepared, y_val, ) - X_val_prepared_xgb = X_val_prepared[self.xgb_features] - X_val_prepared_cat = X_val_prepared[self.cat_features] - X_val_prepared_lgb = X_val_prepared[self.lgb_features] - X_val_prepared_rf = X_val_prepared[self.rf_features] - X_train_prepared_xgb = X_train_prepared[self.xgb_features] - X_train_prepared_cat = X_train_prepared[self.cat_features] - X_train_prepared_lgb = X_train_prepared[self.lgb_features] - X_train_prepared_rf = X_train_prepared[self.rf_features] - X_test_prepared_xgb = X_test_prepared[self.xgb_features] - X_test_prepared_cat = X_test_prepared[self.cat_features] - X_test_prepared_lgb = X_test_prepared[self.lgb_features] - X_test_prepared_rf = X_test_prepared[self.rf_features] - X_combined_xgb = pd.concat([X_train_prepared_xgb, X_test_prepared_xgb], axis=0) - X_combined_cat = pd.concat([X_train_prepared_cat, X_test_prepared_cat], axis=0) - X_combined_lgb = pd.concat([X_train_prepared_lgb, X_test_prepared_lgb], axis=0) - X_combined_rf = pd.concat([X_train_prepared_rf, X_test_prepared_rf], axis=0) + + # Prepare feature subsets + x_val_prepared_xgb = x_val_prepared[self.xgb_features] + x_val_prepared_cat = x_val_prepared[self.cat_features] + x_val_prepared_lgb = x_val_prepared[self.lgb_features] + x_val_prepared_rf = x_val_prepared[self.rf_features] + + x_train_prepared_xgb = x_train_resampled[self.xgb_features] + x_train_prepared_cat = x_train_resampled[self.cat_features] + x_train_prepared_lgb = x_train_resampled[self.lgb_features] + x_train_prepared_rf = x_train_resampled[self.rf_features] + + x_test_prepared_xgb = x_test_prepared[self.xgb_features] + x_test_prepared_cat = x_test_prepared[self.cat_features] + x_test_prepared_lgb = x_test_prepared[self.lgb_features] + x_test_prepared_rf = x_test_prepared[self.rf_features] + + x_combined_xgb 
= pd.concat([x_train_prepared_xgb, x_test_prepared_xgb], axis=0) + x_combined_cat = pd.concat([x_train_prepared_cat, x_test_prepared_cat], axis=0) + x_combined_lgb = pd.concat([x_train_prepared_lgb, x_test_prepared_lgb], axis=0) + x_combined_rf = pd.concat([x_train_prepared_rf, x_test_prepared_rf], axis=0) # Update model references self.model_xgb = trained_models["xgb"] @@ -405,14 +430,14 @@ def train( if "extra_scaler" in trained_models: self.extra_model_scaler = trained_models["extra_scaler"] - # Step 5: Optionally calibrate models + # Optionally calibrate models if self.calibrate: self.logger.info(f"Calibrating base models using {self.calibration_method} method...") calibration_results = calibrate_models( trained_models, - X_train_resampled, + x_train_resampled, y_train_resampled, - X_val_prepared, + x_val_prepared, y_val, self.calibration_method, self.logger, @@ -429,129 +454,203 @@ def train( # Analyze calibration effectiveness analyze_calibration(calibration_results["calibration_results"], y_val, self.logger) - # Step 6: Get base model predictions on validation data - self.logger.info("Generating base model predictions on validation data...") + feature_data = ( + x_val_prepared_xgb, x_val_prepared_cat, x_val_prepared_lgb, x_val_prepared_rf, + x_combined_xgb, x_combined_cat, x_combined_lgb, x_combined_rf + ) + + return trained_models, feature_data + + def _generate_base_predictions(self, x_train_prepared, y_train, x_test_prepared, y_test, x_val_prepared, feature_data): + """Generate predictions from base models.""" + ( + x_val_prepared_xgb, x_val_prepared_cat, x_val_prepared_lgb, x_val_prepared_rf, + x_combined_xgb, x_combined_cat, x_combined_lgb, x_combined_rf + ) = feature_data # Use calibrated models if available xgb_model = self.model_xgb_calibrated if self.calibrate else self.model_xgb cat_model = self.model_cat_calibrated if self.calibrate else self.model_cat lgb_model = self.model_lgb_calibrated if self.calibrate else self.model_lgb extra_model = 
self.model_extra_calibrated if self.calibrate else self.model_extra + + # Ensure models are not None (should be initialized during training) + assert xgb_model is not None, "XGBoost model not initialized" + assert cat_model is not None, "CatBoost model not initialized" + assert lgb_model is not None, "LightGBM model not initialized" + assert extra_model is not None, "Extra model not initialized" + # Combine features and handle indexes - X_combined = pd.concat([X_train_prepared, X_test_prepared], axis=0) + x_combined = pd.concat([x_train_prepared, x_test_prepared], axis=0) y_combined = pd.concat([y_train, y_test], axis=0) - - # Reset indexes to maintain proper alignment - X_combined.reset_index(drop=True, inplace=True) + x_combined.reset_index(drop=True, inplace=True) y_combined.reset_index(drop=True, inplace=True) + # Get predictions if self.extra_base_model_type in ["mlp", "svm"] and self.extra_model_scaler is not None: - # Scale validation, training, and test data using the fitted scaler - X_val_scaled = self.extra_model_scaler.transform(X_val_prepared) - X_train_scaled = self.extra_model_scaler.transform(X_combined) + x_val_scaled = self.extra_model_scaler.transform(x_val_prepared) + x_train_scaled = self.extra_model_scaler.transform(x_combined) - # Get probabilities from scaled data if self.extra_base_model_type == "mlp": - p_extra = extra_model.predict(X_val_scaled, verbose=0).flatten() - p_extra_train = extra_model.predict(X_train_scaled, verbose=0).flatten() + p_extra = extra_model.predict(x_val_scaled, verbose=0).flatten() # type: ignore + p_extra_train = extra_model.predict(x_train_scaled, verbose=0).flatten() # type: ignore else: # SVM case - p_extra = extra_model.predict_proba(X_val_scaled)[:, 1] - p_extra_train = extra_model.predict_proba(X_train_scaled)[:, 1] + p_extra = np.asarray(extra_model.predict_proba(x_val_scaled))[:, 1] + p_extra_train = np.asarray(extra_model.predict_proba(x_train_scaled))[:, 1] else: - p_extra = 
extra_model.predict_proba(X_val_prepared_rf)[:, 1] - p_extra_train = extra_model.predict_proba(X_combined_rf)[:, 1] + p_extra = np.asarray(extra_model.predict_proba(x_val_prepared_rf))[:, 1] + p_extra_train = np.asarray(extra_model.predict_proba(x_combined_rf))[:, 1] - p_xgb = xgb_model.predict_proba(X_val_prepared_xgb)[:, 1] - p_xgb_train = xgb_model.predict_proba(X_combined_xgb)[:, 1] + p_xgb = np.asarray(xgb_model.predict_proba(x_val_prepared_xgb))[:, 1] + p_xgb_train = np.asarray(xgb_model.predict_proba(x_combined_xgb))[:, 1] - p_cat = cat_model.predict_proba(X_val_prepared_cat)[:, 1] - p_cat_train = cat_model.predict_proba(X_combined_cat)[:, 1] + p_cat = np.asarray(cat_model.predict_proba(x_val_prepared_cat))[:, 1] + p_cat_train = np.asarray(cat_model.predict_proba(x_combined_cat))[:, 1] - p_lgb = lgb_model.predict_proba(X_val_prepared_lgb)[:, 1] - p_lgb_train = lgb_model.predict_proba(X_combined_lgb)[:, 1] + p_lgb = np.asarray(lgb_model.predict_proba(x_val_prepared_lgb))[:, 1] + p_lgb_train = np.asarray(lgb_model.predict_proba(x_combined_lgb))[:, 1] - # Step 7: Optionally calculate dynamic weights based on validation performance + return { + 'p_xgb': p_xgb, 'p_cat': p_cat, 'p_lgb': p_lgb, 'p_extra': p_extra, + 'p_xgb_train': p_xgb_train, 'p_cat_train': p_cat_train, + 'p_lgb_train': p_lgb_train, 'p_extra_train': p_extra_train, + 'y_combined': y_combined + } + + def _compute_dynamic_weights(self, predictions, y_val, y_combined): + """Compute dynamic weights if enabled.""" if self.dynamic_weighting: self.logger.info("Computing dynamic weights based on validation performance...") self.dynamic_weights = compute_precision_focused_weights( - p_xgb, - p_cat, - p_lgb, - p_extra, - y_val, - self.target_precision, - self.required_recall, - self.logger, + predictions['p_xgb'], predictions['p_cat'], predictions['p_lgb'], predictions['p_extra'], + y_val, self.target_precision, self.required_recall, self.logger, ) self.dynamic_weights_train = 
compute_precision_focused_weights( - p_xgb_train, - p_cat_train, - p_lgb_train, - p_extra_train, - y_combined, - self.target_precision, - self.required_recall, - self.logger, + predictions['p_xgb_train'], predictions['p_cat_train'], + predictions['p_lgb_train'], predictions['p_extra_train'], + predictions['y_combined'], self.target_precision, self.required_recall, self.logger, ) - # Step 8: Create meta-features from base model predictions + def _train_meta_learner(self, x_train_prepared, y_train, x_test_prepared, y_test, x_val_prepared, y_val, feature_data): + """Train the meta-learner with predictions from base models.""" + # Generate base model predictions + predictions = self._generate_base_predictions( + x_train_prepared, y_train, x_test_prepared, y_test, x_val_prepared, feature_data + ) + + # Compute dynamic weights if enabled + self._compute_dynamic_weights(predictions, y_val, predictions['y_combined']) + + # Create meta-features self.logger.info("Creating meta-features for meta-learner...") meta_features = create_meta_features( - p_xgb, p_cat, p_lgb, p_extra, self.dynamic_weights if self.dynamic_weighting else None + predictions['p_xgb'], predictions['p_cat'], predictions['p_lgb'], predictions['p_extra'], + self.dynamic_weights if self.dynamic_weighting else None ) meta_features_train = create_meta_features( - p_xgb_train, - p_cat_train, - p_lgb_train, - p_extra_train, + predictions['p_xgb_train'], predictions['p_cat_train'], + predictions['p_lgb_train'], predictions['p_extra_train'], self.dynamic_weights_train if self.dynamic_weighting else None, ) - # Convert to DataFrame for better interpretability + + # Convert to DataFrame meta_df = create_meta_dataframe(meta_features) meta_df_train = create_meta_dataframe(meta_features_train) - # Step 9: Initialize and train meta-learner + # Initialize and train meta-learner self.logger.info(f"Initializing meta-learner of type {self.meta_learner_type}...") self.meta_learner = 
initialize_meta_learner(self.meta_learner_type) - # Train meta-learner self.logger.info("Training meta-learner...") - # self.meta_learner = train_meta_learner(self.meta_learner, meta_df, y_val) self.meta_learner = hypertune_meta_learner( - meta_df_train, - y_combined, - meta_df, - y_val, + meta_df_train.values, + predictions['y_combined'].values if hasattr(predictions['y_combined'], 'values') else predictions['y_combined'], + meta_df.values, + y_val.values if hasattr(y_val, 'values') else y_val, meta_learner_type=self.meta_learner_type, target_precision=self.target_precision, min_recall=self.required_recall, ) - # Step 10: Tune threshold for optimal precision-recall trade-off - self.logger.info(f"Tuning threshold for target precision {self.target_precision}...") - - # Get meta-learner predictions on validation data - meta_val_probs = self.meta_learner.predict_proba(meta_df)[:, 1] # Tune threshold - best_threshold, threshold_metrics = tune_threshold_for_precision( - meta_val_probs, - y_val, + self.logger.info(f"Tuning threshold for target precision {self.target_precision}...") + meta_val_probs = self.meta_learner.predict_proba(meta_df)[:, 1] # type: ignore + best_threshold, _ = tune_threshold_for_precision( + meta_val_probs, y_val, target_precision=self.target_precision, required_recall=self.required_recall, - logger=self.logger, + logger=self.logger ) - self.optimal_threshold = best_threshold - threshold_metrics = threshold_metrics self.logger.info(f"Optimal threshold set to {self.optimal_threshold:.4f}") - # Step 11: Final evaluation on validation data + # Final evaluation self.logger.info("Performing final evaluation on validation data...") eval_results = evaluate_model( - self.meta_learner, meta_df, y_val, self.optimal_threshold, self.logger + self.meta_learner, meta_df, # type: ignore + y_val.values if hasattr(y_val, 'values') else y_val, + self.optimal_threshold, self.logger # type: ignore ) - self.logger.info("Ensemble model training completed successfully.") 
+ return eval_results + + def train( + self, + X_train, + y_train, + x_val=None, + y_val=None, + X_test=None, + y_test=None, + split_validation=True, + val_size=0.2, + ) -> dict: + """ + Train the ensemble model including base models and meta-learner. + Args: + X_train: Training features + y_train: Training target values + x_val: Optional validation features + y_val: Optional validation target values + X_test: Optional test features + y_test: Optional test target values + split_validation: Whether to split training data for validation + val_size: Size of validation split if split_validation is True + + Returns: + Dictionary with training results and metrics + """ + self.logger.info("Starting ensemble model training...") + + # Step 1: Data preparation and validation + x_train_prepared, y_train, x_val_prepared, y_val, x_test_prepared, y_test = ( + self._prepare_training_data( + X_train, y_train, x_val, y_val, X_test, y_test, split_validation, val_size + ) + ) + + # Step 2: Skip class imbalance handling - using original data + self.logger.info("Skipping ADASYN resampling, using original training data...") + x_train_resampled, y_train_resampled = x_train_prepared, y_train + + # Step 3: Train and optionally calibrate base models + _, feature_data = self._train_base_models( + x_train_resampled, y_train_resampled, x_val_prepared, y_val, x_test_prepared, y_test + ) + + # Extract feature data + ( + x_val_prepared_xgb, x_val_prepared_cat, x_val_prepared_lgb, x_val_prepared_rf, + x_combined_xgb, x_combined_cat, x_combined_lgb, x_combined_rf + ) = feature_data + + # Step 4: Generate predictions and train meta-learner + eval_results = self._train_meta_learner( + x_train_prepared, y_train, x_test_prepared, y_test, + x_val_prepared, y_val, feature_data + ) + + self.logger.info("Ensemble model training completed successfully.") return eval_results def predict_proba(self, X) -> np.ndarray: @@ -568,7 +667,7 @@ def predict_proba(self, X) -> np.ndarray: raise ValueError("Model has not 
been trained. Call train() first.") # Prepare input data - X_prepared = prepare_data(X, X.columns) + x_prepared = prepare_data(X, X.columns) # Generate predictions from base models # Use calibrated models if available xgb_model = self.model_xgb_calibrated if self.calibrate else self.model_xgb @@ -576,26 +675,32 @@ def predict_proba(self, X) -> np.ndarray: lgb_model = self.model_lgb_calibrated if self.calibrate else self.model_lgb extra_model = self.model_extra_calibrated if self.calibrate else self.model_extra + # Ensure models are not None (should be initialized during training) + assert xgb_model is not None, "XGBoost model not initialized" + assert cat_model is not None, "CatBoost model not initialized" + assert lgb_model is not None, "LightGBM model not initialized" + assert extra_model is not None, "Extra model not initialized" + # Get predictions if self.extra_base_model_type in ["mlp", "svm"] and self.extra_model_scaler is not None: - X_scaled = self.extra_model_scaler.transform(X_prepared) + x_scaled = self.extra_model_scaler.transform(x_prepared) if self.extra_base_model_type == "mlp": - p_extra = extra_model.predict(X_scaled).flatten() + p_extra = extra_model.predict(x_scaled).flatten() else: # SVM case - p_extra = extra_model.predict_proba(X_scaled)[:, 1] + p_extra = np.asarray(extra_model.predict_proba(x_scaled))[:, 1] else: - p_extra = extra_model.predict_proba(X_prepared)[:, 1] + p_extra = np.asarray(extra_model.predict_proba(x_prepared))[:, 1] - p_xgb = xgb_model.predict_proba(X_prepared)[:, 1] - p_cat = cat_model.predict_proba(X_prepared)[:, 1] - p_lgb = lgb_model.predict_proba(X_prepared)[:, 1] + p_xgb = np.asarray(xgb_model.predict_proba(x_prepared))[:, 1] + p_cat = np.asarray(cat_model.predict_proba(x_prepared))[:, 1] + p_lgb = np.asarray(lgb_model.predict_proba(x_prepared))[:, 1] # Create meta-features meta_features = create_meta_features( p_xgb, p_cat, p_lgb, p_extra, self.dynamic_weights if self.dynamic_weighting else None ) # Get meta-learner 
predictions - meta_probs = self.meta_learner.predict_proba(meta_features) + meta_probs = self.meta_learner.predict_proba(meta_features) # type: ignore return meta_probs[:, 1] @@ -615,32 +720,32 @@ def predict(self, X) -> np.ndarray: # Apply threshold return (probabilities >= self.optimal_threshold).astype(int) - def explain_predictions(self, X_val) -> dict: + def explain_predictions(self, x_val) -> dict: """ Generate feature importance explanations using SHAP values. Args: - X_val: Validation features + x_val: Validation features Returns: Dictionary with explanation results """ - return explain_predictions(self, X_val, self.logger) + return explain_predictions(self, x_val, self.logger) - def analyze_prediction_errors(self, X_val, y_val) -> dict: + def analyze_prediction_errors(self, x_val, y_val) -> dict: """ Analyze prediction errors on the validation set. Args: - X_val: Validation features + x_val: Validation features y_val: Validation target values Returns: Dictionary with error analysis results """ - return analyze_prediction_errors(self, X_val, y_val, self.optimal_threshold, self.logger) + return analyze_prediction_errors(self, x_val, y_val, self.optimal_threshold, self.logger) # type: ignore - def precision_filter(self, X, probabilities): + def precision_filter(self, x, probabilities): """ Apply additional filtering to boost precision """ @@ -648,11 +753,11 @@ def precision_filter(self, X, probabilities): high_conf = probabilities > self.optimal_threshold # Get original features for these instances - X_high_conf = X[high_conf] + x_high_conf = x[high_conf] # Apply rule-based filters (examples) - if "home_form" in X_high_conf.columns and "away_form" in X_high_conf.columns: - form_diff = abs(X_high_conf["home_form"] - X_high_conf["away_form"]) + if "home_form" in x_high_conf.columns and "away_form" in x_high_conf.columns: + form_diff = abs(x_high_conf["home_form"] - x_high_conf["away_form"]) # Filter out likely non-draws (big form differences) likely_not_draw = 
form_diff > 0.5 high_conf[high_conf] = ~likely_not_draw diff --git a/src/models/ensemble/ensemble_model_0410.py b/src/models/ensemble/ensemble_model_0410.py index 6ac5870..faa4f59 100644 --- a/src/models/ensemble/ensemble_model_0410.py +++ b/src/models/ensemble/ensemble_model_0410.py @@ -70,30 +70,30 @@ def __init__( self.logger = logger or ExperimentLogger( experiment_name="ensemble_model_0410", log_dir="./logs/ensemble_model_0410" ) - self.required_recall = required_recall # For meta-learner - self.sampling_strategy = sampling_strategy # Used elsewhere? - self.complexity_penalty = complexity_penalty # Used elsewhere? - self.target_precision = target_precision # For dynamic weights + self.required_recall = required_recall # For meta-learner + self.sampling_strategy = sampling_strategy # Used elsewhere? + self.complexity_penalty = complexity_penalty # Used elsewhere? + self.target_precision = target_precision # For dynamic weights # --- MLflow Run IDs for Base Models --- - self.xgb_run_id = "4a3ebfc328af4041925d8b39786fb0ea" - self.lgb_run_id = "2c9ea4315c16460689e00596ed2b6d9d" - self.tabnet_run_id = "c531685eae4d429fb7fc1af4f6b38a95" - self.extra_run_id = "2830d0b8ebcb4c46809e6afab57da539" + self.xgb_run_id = "4a3ebfc328af4041925d8b39786fb0ea" + self.lgb_run_id = "2c9ea4315c16460689e00596ed2b6d9d" + self.tabnet_run_id = "c531685eae4d429fb7fc1af4f6b38a95" + self.extra_run_id = "2830d0b8ebcb4c46809e6afab57da539" self.mlp_run_id = "25b4a2f5478746d08253e31ea12161c4" self.pytorch_run_id = "fc1cfea4661b4603958894a956c1e91a" # Minimum recalls for dynamic weighting (order: xgb, tabnet, lgb, rf, mlp, pytorch) - self.min_recalls = [0.30, 0.20, 0.30, 0.40, 0.30, 0.30] + self.min_recalls = [0.30, 0.20, 0.30, 0.40, 0.30, 0.30] # Meta-learner settings self.meta_learner_type = meta_learner_type - self.optimal_threshold = 0.5 # Will be tuned - self.individual_thresholding = individual_thresholding # Unused currently? 
+ self.optimal_threshold = 0.5 # Will be tuned + self.individual_thresholding = individual_thresholding # Unused currently? self.calibrate = calibrate # Unused currently? - self.calibration_method = calibration_method # Unused currently? + self.calibration_method = calibration_method # Unused currently? self.dynamic_weighting = dynamic_weighting - num_models = 6 # Updated number of models + num_models = 6 # Updated number of models if self.dynamic_weighting: # Adjusted for 6 base models self.dynamic_weights = { @@ -102,26 +102,26 @@ def __init__( "lgb": 1 / num_models, "extra": 1 / num_models, "mlp": 1 / num_models, - "pytorch": 1 / num_models, # Added pytorch + "pytorch": 1 / num_models, # Added pytorch } - + # Placeholder attributes for models and features - will be populated by load_models self.meta_learner = None self.model_xgb = None self.model_lgb = None self.model_tabnet = None - self.model_extra = None + self.model_extra = None self.model_mlp = None - self.model_mlp_scaler = None - self.model_pytorch = None # Added pytorch model placeholder - self.model_pytorch_scaler = None # Added pytorch scaler placeholder + self.model_mlp_scaler = None + self.model_pytorch = None # Added pytorch model placeholder + self.model_pytorch_scaler = None # Added pytorch scaler placeholder self.xgb_features = [] self.lgb_features = [] self.tabnet_features = [] self.extra_features = [] self.mlp_features = [] - self.pytorch_features = [] # Added pytorch feature list placeholder + self.pytorch_features = [] # Added pytorch feature list placeholder # Load models on initialization self.load_models_from_mlflow() @@ -162,21 +162,21 @@ def train( X_val_lgb = X_val_prepared[self.lgb_features] X_val_extra = X_val_prepared[self.extra_features] X_val_mlp = X_val_prepared[self.mlp_features] # Prepare MLP features - X_val_pytorch = X_val_prepared[self.pytorch_features] # Added PyTorch features + X_val_pytorch = X_val_prepared[self.pytorch_features] # Added PyTorch features X_test_xgb = 
X_test_prepared[self.xgb_features] X_test_tabnet = X_test_prepared[self.tabnet_features] X_test_lgb = X_test_prepared[self.lgb_features] X_test_extra = X_test_prepared[self.extra_features] X_test_mlp = X_test_prepared[self.mlp_features] # Prepare MLP features - X_test_pytorch = X_test_prepared[self.pytorch_features] # Added PyTorch features + X_test_pytorch = X_test_prepared[self.pytorch_features] # Added PyTorch features # Obtain predictions from base models on validation set self.logger.info("Obtaining validation predictions from base models...") p_xgb_val = self.model_xgb.predict_proba(X_val_xgb)[:, 1] # TabNet input might need .values depending on saving format try: - p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet)[:, 1] + p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet)[:, 1] except TypeError: self.logger.warning("TabNet predict_proba failed on DataFrame, trying .values") p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet.values)[:, 1] @@ -202,25 +202,26 @@ def train( # MLP requires scaling X_test_mlp_scaled = self.model_mlp_scaler.transform(X_test_mlp) p_mlp_test = self.model_mlp.predict_proba(X_test_mlp_scaled)[:, 1] - # PyTorch + # PyTorch p_pytorch_test = self.model_pytorch.predict_proba(X_test_pytorch)[:, 1] # Optionally calculate dynamic weights based on validation performance if self.dynamic_weighting: - # Use test predictions for weights used during FINAL meta-learner TRAINING self.logger.info("Computing dynamic weights based on test performance...") - self.dynamic_weights_train, self.thresholds_train = compute_precision_focused_weights_optimized( - p_xgb_test, - p_tabnet_test, - p_lgb_test, - p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch - y_test, - self.target_precision, - self.min_recalls, - self.logger, + self.dynamic_weights_train, self.thresholds_train = ( + compute_precision_focused_weights_optimized( + p_xgb_test, + p_tabnet_test, + p_lgb_test, + p_extra_test, + p_mlp_test, + p_pytorch_test, # 
Added PyTorch + y_test, + self.target_precision, + self.min_recalls, + self.logger, + ) ) # Ensure weights_0410 is imported and used self.logger.info("Computing dynamic weights based on validation performance...") @@ -229,11 +230,11 @@ def train( p_tabnet_val, p_lgb_val, p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch - y_val, + p_mlp_val, + p_pytorch_val, # Added PyTorch + y_val, self.target_precision, - self.min_recalls, # Should now have 6 elements + self.min_recalls, # Should now have 6 elements self.logger, ) @@ -244,8 +245,8 @@ def train( p_tabnet_val, p_lgb_val, p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch + p_mlp_val, + p_pytorch_val, # Added PyTorch self.dynamic_weights if self.dynamic_weighting else None, self.thresholds if self.dynamic_weighting else None, ) @@ -256,8 +257,8 @@ def train( p_tabnet_test, p_lgb_test, p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch + p_mlp_test, + p_pytorch_test, # Added PyTorch self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -329,7 +330,7 @@ def predict_proba(self, X) -> np.ndarray: X_lgb = X_prepared[self.lgb_features] X_extra = X_prepared[self.extra_features] X_mlp = X_prepared[self.mlp_features] - X_pytorch = X_prepared[self.pytorch_features] # Added PyTorch + X_pytorch = X_prepared[self.pytorch_features] # Added PyTorch try: # Generate predictions @@ -353,8 +354,8 @@ def predict_proba(self, X) -> np.ndarray: p_tabnet, p_lgb, p_extra, - p_mlp, - p_pytorch, + p_mlp, + p_pytorch, self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -519,7 +520,7 @@ def load_models_from_mlflow( else: self.logger.warning("No feature signature found for MLP model") # Attempt to get features from the underlying sklearn model if possible - self.mlp_features = getattr(self.model_mlp, "feature_names_in_", []) + self.mlp_features = getattr(self.model_mlp, 
"feature_names_in_", []) # Load the associated MLP scaler self.logger.info( @@ -535,7 +536,7 @@ def load_models_from_mlflow( except Exception as e: self.logger.error(f"Failed to load MLP model or scaler: {str(e)}") raise ValueError(f"Failed to load MLP model or scaler: {str(e)}") from e - + # Load PyTorch model if not self.pytorch_run_id or self.pytorch_run_id == "YOUR_PYTORCH_RUN_ID_HERE": self.logger.warning( @@ -544,13 +545,13 @@ def load_models_from_mlflow( else: try: # Define artifact paths for PyTorch model and its scaler - pytorch_model_path = "model" # Assuming artifact path is 'model' - pytorch_scaler_path = "scaler/scaler.pkl" # Assuming scaler saved in 'scaler' dir - + pytorch_model_path = "model" # Assuming artifact path is 'model' + pytorch_scaler_path = "scaler/scaler.pkl" # Assuming scaler saved in 'scaler' dir + self.logger.info(f"Loading PyTorch model from run {self.pytorch_run_id}...") pytorch_uri = f"runs:/{self.pytorch_run_id}/{pytorch_model_path}" self.model_pytorch = mlflow.pytorch.load_model(pytorch_uri) - + # Load PyTorch model also as pyfunc to easily get signature pytorch_pyfunc = mlflow.pyfunc.load_model(pytorch_uri) if pytorch_pyfunc.metadata.signature and pytorch_pyfunc.metadata.signature.inputs: @@ -562,7 +563,7 @@ def load_models_from_mlflow( self.logger.warning("No feature signature found for PyTorch model.") # PyTorch models don't have a standard feature_names_in_ attribute # Consider storing feature names as a separate artifact if needed, or rely on signature - self.pytorch_features = [] + self.pytorch_features = [] # Load the associated PyTorch scaler self.logger.info( @@ -574,24 +575,38 @@ def load_models_from_mlflow( with open(scaler_local_path, "rb") as f: self.model_pytorch_scaler = pickle.load(f) self.logger.info("PyTorch scaler loaded successfully.") - - # Optional: Attach scaler and device to the loaded PyTorch model instance + + # Optional: Attach scaler and device to the loaded PyTorch model instance # if its 
predict_proba method relies on them being attributes (like in the hypertuner) - if hasattr(self.model_pytorch, 'scaler_') and hasattr(self.model_pytorch, 'device_'): - try: + if hasattr(self.model_pytorch, "scaler_") and hasattr( + self.model_pytorch, "device_" + ): + try: # Determine device (use CUDA if available, same logic as hypertuner) - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + pytorch_device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu" + ) self.model_pytorch.scaler_ = self.model_pytorch_scaler self.model_pytorch.device_ = pytorch_device - self.model_pytorch.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded PyTorch model.") + self.model_pytorch.to( + pytorch_device + ) # Ensure model is on the correct device + self.logger.info( + f"Attached scaler and device ({pytorch_device}) to loaded PyTorch model." + ) except Exception as attach_e: - self.logger.warning(f"Could not attach scaler/device to PyTorch model: {attach_e}") + self.logger.warning( + f"Could not attach scaler/device to PyTorch model: {attach_e}" + ) else: - self.logger.warning("Loaded PyTorch model does not have scaler_/device_ attributes for attachment.") + self.logger.warning( + "Loaded PyTorch model does not have scaler_/device_ attributes for attachment." 
+ ) # Ensure model is moved to the correct device anyway try: - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + pytorch_device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu" + ) self.model_pytorch.to(pytorch_device) self.logger.info(f"Moved loaded PyTorch model to device: {pytorch_device}") except Exception as move_e: diff --git a/src/models/ensemble/ensemble_model_0412.py b/src/models/ensemble/ensemble_model_0412.py index e1cc16d..55a966e 100644 --- a/src/models/ensemble/ensemble_model_0412.py +++ b/src/models/ensemble/ensemble_model_0412.py @@ -52,6 +52,7 @@ # PyTorch specific reproducibility settings torch.manual_seed(SEED) + class EnsembleModel(BaseEstimator, ClassifierMixin): def __init__( self, @@ -69,30 +70,30 @@ def __init__( self.logger = logger or ExperimentLogger( experiment_name="ensemble_model_0410", log_dir="./logs/ensemble_model_0410" ) - self.required_recall = required_recall # For meta-learner + self.required_recall = required_recall # For meta-learner - self.target_precision = target_precision # For dynamic weights + self.target_precision = target_precision # For dynamic weights # --- MLflow Run IDs for Base Models --- - self.xgb_run_id = "a002b5e26c544deb8d210e8bc4d360fc" - self.lgb_run_id = "50841f7ed0a8436aa354fc5de287a94c" - self.tabnet_run_id = "c531685eae4d429fb7fc1af4f6b38a95" - self.extra_run_id = "bde53176ab4f4c689a0ffa825f24e3a5" + self.xgb_run_id = "a002b5e26c544deb8d210e8bc4d360fc" + self.lgb_run_id = "50841f7ed0a8436aa354fc5de287a94c" + self.tabnet_run_id = "c531685eae4d429fb7fc1af4f6b38a95" + self.extra_run_id = "bde53176ab4f4c689a0ffa825f24e3a5" self.mlp_run_id = "f9725a6768f64e23904e0cc14ab1c010" self.pytorch_run_id = "54c6f88b705f4394b87995d28b969936" self.svm_run_id = "a0ec998ec11941a895e056a81ab04281" # <<< ADD SVM RUN ID HERE # Minimum recalls for dynamic weighting (order: xgb, tabnet, lgb, rf, mlp, pytorch, svm) - self.min_recalls = [0.30, 0.20, 0.30, 0.40, 0.30, 0.30, 0.30] # 
Added SVM recall + self.min_recalls = [0.30, 0.20, 0.30, 0.40, 0.30, 0.30, 0.30] # Added SVM recall # Meta-learner settings self.meta_learner_type = meta_learner_type - self.optimal_threshold = 0.5 # Will be tuned - self.individual_thresholding = individual_thresholding # Unused currently? + self.optimal_threshold = 0.5 # Will be tuned + self.individual_thresholding = individual_thresholding # Unused currently? self.calibrate = calibrate # Unused currently? - self.calibration_method = calibration_method # Unused currently? + self.calibration_method = calibration_method # Unused currently? self.dynamic_weighting = dynamic_weighting - num_models = 7 # Updated number of models (added SVM) + num_models = 7 # Updated number of models (added SVM) if self.dynamic_weighting: # Adjusted for 7 base models self.dynamic_weights = { @@ -101,30 +102,30 @@ def __init__( "tabnet": 1 / num_models, "extra": 1 / num_models, "mlp": 1 / num_models, - "pytorch": 1 / num_models, # Added pytorch - "svm": 1 / num_models, # Added SVM + "pytorch": 1 / num_models, # Added pytorch + "svm": 1 / num_models, # Added SVM } - + # Placeholder attributes for models and features - will be populated by load_models self.meta_learner = None self.model_xgb = None self.model_lgb = None self.model_tabnet = None - self.model_extra = None + self.model_extra = None self.model_mlp = None - self.model_mlp_scaler = None - self.model_pytorch = None # Added pytorch model placeholder - self.model_pytorch_scaler = None # Added pytorch scaler placeholder - self.model_svm = None # Added SVM model placeholder - self.model_svm_scaler = None # Added SVM scaler placeholder + self.model_mlp_scaler = None + self.model_pytorch = None # Added pytorch model placeholder + self.model_pytorch_scaler = None # Added pytorch scaler placeholder + self.model_svm = None # Added SVM model placeholder + self.model_svm_scaler = None # Added SVM scaler placeholder self.xgb_features = [] self.lgb_features = [] self.tabnet_features = [] 
self.extra_features = [] self.mlp_features = [] - self.pytorch_features = [] # Added pytorch feature list placeholder - self.svm_features = [] # Added SVM feature list placeholder + self.pytorch_features = [] # Added pytorch feature list placeholder + self.svm_features = [] # Added SVM feature list placeholder # Load models on initialization self.load_models_from_mlflow() @@ -144,7 +145,7 @@ def train( # Data preparation # Assume features are loaded during model loading, use a consistent set if needed # self.selected_features = self.xgb_features # Example: Use XGB features as the common set - X_train_prepared = prepare_data(X_train, X_train.columns) + prepare_data(X_train, X_train.columns) X_val_prepared = prepare_data(X_val, X_val.columns) X_test_prepared = prepare_data(X_test, X_test.columns) @@ -155,23 +156,23 @@ def train( X_val_lgb = X_val_prepared[self.lgb_features] X_val_extra = X_val_prepared[self.extra_features] X_val_mlp = X_val_prepared[self.mlp_features] # Prepare MLP features - X_val_pytorch = X_val_prepared[self.pytorch_features] # Added PyTorch features - X_val_svm = X_val_prepared[self.svm_features] # Added SVM features + X_val_pytorch = X_val_prepared[self.pytorch_features] # Added PyTorch features + X_val_svm = X_val_prepared[self.svm_features] # Added SVM features X_test_xgb = X_test_prepared[self.xgb_features] X_test_tabnet = X_test_prepared[self.tabnet_features] X_test_lgb = X_test_prepared[self.lgb_features] X_test_extra = X_test_prepared[self.extra_features] X_test_mlp = X_test_prepared[self.mlp_features] # Prepare MLP features - X_test_pytorch = X_test_prepared[self.pytorch_features] # Added PyTorch features - X_test_svm = X_test_prepared[self.svm_features] # Added SVM features + X_test_pytorch = X_test_prepared[self.pytorch_features] # Added PyTorch features + X_test_svm = X_test_prepared[self.svm_features] # Added SVM features # Obtain predictions from base models on validation set self.logger.info("Obtaining validation predictions from 
base models...") p_xgb_val = self.model_xgb.predict_proba(X_val_xgb)[:, 1] # TabNet input might need .values depending on saving format try: - p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet)[:, 1] + p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet)[:, 1] except TypeError: self.logger.warning("TabNet predict_proba failed on DataFrame, trying .values") p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet.values)[:, 1] @@ -202,7 +203,7 @@ def train( # MLP requires scaling X_test_mlp_scaled = self.model_mlp_scaler.transform(X_test_mlp) p_mlp_test = self.model_mlp.predict_proba(X_test_mlp_scaled)[:, 1] - # PyTorch + # PyTorch p_pytorch_test = self.model_pytorch.predict_proba(X_test_pytorch)[:, 1] # SVM requires scaling X_test_svm_scaled = self.model_svm_scaler.transform(X_test_svm) @@ -212,18 +213,20 @@ def train( if self.dynamic_weighting: # Use test predictions for weights used during FINAL meta-learner TRAINING self.logger.info("Computing dynamic weights based on test performance...") - self.dynamic_weights_train, self.thresholds_train = compute_precision_focused_weights_optimized( - p_xgb_test, - p_tabnet_test, - p_lgb_test, - p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch - p_svm_test, # Added SVM - y_test, - self.target_precision, - self.min_recalls, # Should now have 7 elements - self.logger, + self.dynamic_weights_train, self.thresholds_train = ( + compute_precision_focused_weights_optimized( + p_xgb_test, + p_tabnet_test, + p_lgb_test, + p_extra_test, + p_mlp_test, + p_pytorch_test, # Added PyTorch + p_svm_test, # Added SVM + y_test, + self.target_precision, + self.min_recalls, # Should now have 7 elements + self.logger, + ) ) # Ensure weights_0410 is imported and used self.logger.info("Computing dynamic weights based on validation performance...") @@ -232,12 +235,12 @@ def train( p_tabnet_val, p_lgb_val, p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch - p_svm_val, # Added SVM - y_val, + p_mlp_val, + 
p_pytorch_val, # Added PyTorch + p_svm_val, # Added SVM + y_val, self.target_precision, - self.min_recalls, # Should now have 7 elements + self.min_recalls, # Should now have 7 elements self.logger, ) @@ -248,9 +251,9 @@ def train( p_tabnet_val, p_lgb_val, p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch - p_svm_val, # Added SVM + p_mlp_val, + p_pytorch_val, # Added PyTorch + p_svm_val, # Added SVM self.dynamic_weights if self.dynamic_weighting else None, self.thresholds if self.dynamic_weighting else None, ) @@ -261,9 +264,9 @@ def train( p_tabnet_test, p_lgb_test, p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch - p_svm_test, # Added SVM + p_mlp_test, + p_pytorch_test, # Added PyTorch + p_svm_test, # Added SVM self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -335,8 +338,8 @@ def predict_proba(self, X) -> np.ndarray: X_lgb = X_prepared[self.lgb_features] X_extra = X_prepared[self.extra_features] X_mlp = X_prepared[self.mlp_features] - X_pytorch = X_prepared[self.pytorch_features] # Added PyTorch - X_svm = X_prepared[self.svm_features] # Added SVM + X_pytorch = X_prepared[self.pytorch_features] # Added PyTorch + X_svm = X_prepared[self.svm_features] # Added SVM try: # Generate predictions @@ -348,11 +351,13 @@ def predict_proba(self, X) -> np.ndarray: p_tabnet = self.model_tabnet.predict_proba(X_tabnet.values)[:, 1] p_lgb = self.model_lgb.predict_proba(X_lgb)[:, 1] p_extra = self.model_extra.predict_proba(X_extra)[:, 1] - + # --- MLP Scaling and Prediction --- # Ensure X_mlp is a DataFrame with correct columns before transform if not isinstance(X_mlp, pd.DataFrame): - self.logger.warning("X_mlp is not a DataFrame before scaling. Attempting conversion.") + self.logger.warning( + "X_mlp is not a DataFrame before scaling. Attempting conversion." 
+ ) X_mlp = pd.DataFrame(X_mlp, columns=self.mlp_features) # Re-select columns just in case order changed or to ensure DataFrame type X_mlp = X_mlp[self.mlp_features] @@ -362,17 +367,21 @@ def predict_proba(self, X) -> np.ndarray: # --- PyTorch Scaling and Prediction --- # Ensure X_pytorch is a DataFrame with correct columns before transform if not isinstance(X_pytorch, pd.DataFrame): - self.logger.warning("X_pytorch is not a DataFrame before scaling. Attempting conversion.") + self.logger.warning( + "X_pytorch is not a DataFrame before scaling. Attempting conversion." + ) X_pytorch = pd.DataFrame(X_pytorch, columns=self.pytorch_features) # Re-select columns X_pytorch = X_pytorch[self.pytorch_features] X_pytorch_scaled = self.model_pytorch_scaler.transform(X_pytorch) - p_pytorch = self.model_pytorch.predict_proba(X_pytorch_scaled)[:, 1] + p_pytorch = self.model_pytorch.predict_proba(X_pytorch_scaled)[:, 1] # --- SVM Scaling and Prediction --- # Ensure X_svm is a DataFrame with correct columns before transform if not isinstance(X_svm, pd.DataFrame): - self.logger.warning("X_svm is not a DataFrame before scaling. Attempting conversion.") + self.logger.warning( + "X_svm is not a DataFrame before scaling. Attempting conversion." 
+ ) X_svm = pd.DataFrame(X_svm, columns=self.svm_features) # Re-select columns X_svm = X_svm[self.svm_features] @@ -384,9 +393,9 @@ def predict_proba(self, X) -> np.ndarray: p_tabnet, p_lgb, p_extra, - p_mlp, - p_pytorch, - p_svm, # Added SVM + p_mlp, + p_pytorch, + p_svm, # Added SVM self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -452,10 +461,10 @@ def load_models_from_mlflow( pytorch_path="model", pytorch_scaler_path="scaler/scaler_pytorch.pkl", svm_path="model_svm", # Artifact path for SVM model (from svm_model.py) - svm_scaler_path="scaler_svm.pkl", # Artifact path for SVM scaler (default name in svm_model.py) + svm_scaler_path="scaler_svm.pkl", # Artifact path for SVM scaler (default name in svm_model.py) ): """ - Load pre-trained models (XGB, LGBM, TabNet, Extra Trees, MLP, PyTorch, SVM) + Load pre-trained models (XGB, LGBM, TabNet, Extra Trees, MLP, PyTorch, SVM) and scalers from MLflow. Updates feature signatures for each model. 
""" @@ -549,7 +558,7 @@ def load_models_from_mlflow( else: self.logger.warning("No feature signature found for MLP model") # Attempt to get features from the underlying sklearn model if possible - self.mlp_features = getattr(self.model_mlp, "feature_names_in_", []) + self.mlp_features = getattr(self.model_mlp, "feature_names_in_", []) # Load the associated MLP scaler self.logger.info( @@ -565,17 +574,19 @@ def load_models_from_mlflow( except Exception as e: self.logger.error(f"Failed to load MLP model or scaler: {str(e)}") raise ValueError(f"Failed to load MLP model or scaler: {str(e)}") from e - + # Load PyTorch model try: # Define artifact paths for PyTorch model and its scaler - pytorch_model_path = "model" # Assuming artifact path is 'model' - pytorch_scaler_path = "scaler/scaler_pytorch.pkl" # Assuming scaler saved in 'scaler' dir - + pytorch_model_path = "model" # Assuming artifact path is 'model' + pytorch_scaler_path = ( + "scaler/scaler_pytorch.pkl" # Assuming scaler saved in 'scaler' dir + ) + self.logger.info(f"Loading PyTorch model from run {self.pytorch_run_id}...") pytorch_uri = f"runs:/{self.pytorch_run_id}/{pytorch_model_path}" self.model_pytorch = mlflow.pytorch.load_model(pytorch_uri) - + # Load PyTorch model also as pyfunc to easily get signature pytorch_pyfunc = mlflow.pyfunc.load_model(pytorch_uri) if pytorch_pyfunc.metadata.signature and pytorch_pyfunc.metadata.signature.inputs: @@ -585,7 +596,7 @@ def load_models_from_mlflow( ) else: self.logger.warning("No feature signature found for PyTorch model.") - self.pytorch_features = [] + self.pytorch_features = [] # Load the associated PyTorch scaler self.logger.info( @@ -597,21 +608,27 @@ def load_models_from_mlflow( with open(scaler_local_path, "rb") as f: self.model_pytorch_scaler = pickle.load(f) self.logger.info("PyTorch scaler loaded successfully.") - - # Optional: Attach scaler and device to the loaded PyTorch model instance + + # Optional: Attach scaler and device to the loaded PyTorch 
model instance # if its predict_proba method relies on them being attributes (like in the hypertuner) - if hasattr(self.model_pytorch, 'scaler_') and hasattr(self.model_pytorch, 'device_'): - try: + if hasattr(self.model_pytorch, "scaler_") and hasattr(self.model_pytorch, "device_"): + try: # Determine device (use CUDA if available, same logic as hypertuner) pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model_pytorch.scaler_ = self.model_pytorch_scaler self.model_pytorch.device_ = pytorch_device - self.model_pytorch.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded PyTorch model.") + self.model_pytorch.to(pytorch_device) # Ensure model is on the correct device + self.logger.info( + f"Attached scaler and device ({pytorch_device}) to loaded PyTorch model." + ) except Exception as attach_e: - self.logger.warning(f"Could not attach scaler/device to PyTorch model: {attach_e}") + self.logger.warning( + f"Could not attach scaler/device to PyTorch model: {attach_e}" + ) else: - self.logger.warning("Loaded PyTorch model does not have scaler_/device_ attributes for attachment.") + self.logger.warning( + "Loaded PyTorch model does not have scaler_/device_ attributes for attachment." 
+ ) # Ensure model is moved to the correct device anyway try: pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -650,14 +667,17 @@ def load_models_from_mlflow( self.model_svm_scaler = pickle.load(f) self.logger.info("SVM scaler loaded successfully.") # Verify scaler type (optional) - if not isinstance(self.model_svm_scaler, (StandardScaler, sklearn.preprocessing.RobustScaler)): # Add other expected scaler types if needed - self.logger.warning(f"Loaded SVM scaler is of unexpected type: {type(self.model_svm_scaler).__name__}") + if not isinstance( + self.model_svm_scaler, (StandardScaler, sklearn.preprocessing.RobustScaler) + ): # Add other expected scaler types if needed + self.logger.warning( + f"Loaded SVM scaler is of unexpected type: {type(self.model_svm_scaler).__name__}" + ) except Exception as e: self.logger.error(f"Failed to load SVM model or scaler: {str(e)}") raise ValueError(f"Failed to load SVM model or scaler: {str(e)}") from e - self.logger.info("Base models loading complete.") # Consider setting self.selected_features based on intersection or a specific model # self.selected_features = self.xgb_features diff --git a/src/models/ensemble/ensemble_model_0414.py b/src/models/ensemble/ensemble_model_0414.py index b18c688..e0ae026 100644 --- a/src/models/ensemble/ensemble_model_0414.py +++ b/src/models/ensemble/ensemble_model_0414.py @@ -52,6 +52,7 @@ # PyTorch specific reproducibility settings torch.manual_seed(SEED) + class EnsembleModel(BaseEstimator, ClassifierMixin): def __init__( self, @@ -69,31 +70,31 @@ def __init__( self.logger = logger or ExperimentLogger( experiment_name="ensemble_model_0410", log_dir="./logs/ensemble_model_0410" ) - self.required_recall = required_recall # For meta-learner + self.required_recall = required_recall # For meta-learner - self.target_precision = target_precision # For dynamic weights + self.target_precision = target_precision # For dynamic weights # --- MLflow Run IDs for Base Models 
--- self.xgb_run_id = "5f827508ce8346a99206d37f626912bf" - self.lgb_run_id = "be439e143bd04b768309ca1f4e03199d" - self.tabnet_run_id = "e7d72ec3cd5c48ecb129630a50ed311d" - self.extra_run_id = "ec3e2fdbd57b4edab1988f7e39457de2" + self.lgb_run_id = "be439e143bd04b768309ca1f4e03199d" + self.tabnet_run_id = "e7d72ec3cd5c48ecb129630a50ed311d" + self.extra_run_id = "ec3e2fdbd57b4edab1988f7e39457de2" self.mlp_run_id = "2d921ea19abd472f8650055e1b9f92c2" self.pytorch_run_id = "b42c959962574bcc9c41289e80aee7e4" - self.svm_run_id = "2b10c9862c7d48db9a1922a3c30ba28e" + self.svm_run_id = "2b10c9862c7d48db9a1922a3c30ba28e" self.fnn_run_id = "9868d00ccb3c4629a2d2d00f100c8951" # Minimum recalls for dynamic weighting (order: xgb, tabnet, lgb, rf, mlp, pytorch, svm) - self.min_recalls = [0.30, 0.20, 0.30, 0.40, 0.30, 0.30, 0.30, 0.30] # Added SVM recall + self.min_recalls = [0.30, 0.20, 0.30, 0.40, 0.30, 0.30, 0.30, 0.30] # Added SVM recall # Meta-learner settings self.meta_learner_type = meta_learner_type - self.optimal_threshold = 0.5 # Will be tuned - self.individual_thresholding = individual_thresholding # Unused currently? + self.optimal_threshold = 0.5 # Will be tuned + self.individual_thresholding = individual_thresholding # Unused currently? self.calibrate = calibrate # Unused currently? - self.calibration_method = calibration_method # Unused currently? + self.calibration_method = calibration_method # Unused currently? 
self.dynamic_weighting = dynamic_weighting - num_models = 8 # Updated number of models (added SVM) + num_models = 8 # Updated number of models (added SVM) if self.dynamic_weighting: # Adjusted for 7 base models self.dynamic_weights = { @@ -102,34 +103,34 @@ def __init__( "tabnet": 1 / num_models, "extra": 1 / num_models, "mlp": 1 / num_models, - "pytorch": 1 / num_models, # Added pytorch - "svm": 1 / num_models, # Added SVM - "fnn": 1 / num_models, # Added FNN + "pytorch": 1 / num_models, # Added pytorch + "svm": 1 / num_models, # Added SVM + "fnn": 1 / num_models, # Added FNN } - + # Placeholder attributes for models and features - will be populated by load_models self.meta_learner = None self.model_xgb = None self.model_lgb = None self.model_tabnet = None - self.model_extra = None + self.model_extra = None self.model_mlp = None - self.model_mlp_scaler = None - self.model_pytorch = None # Added pytorch model placeholder - self.model_pytorch_scaler = None # Added pytorch scaler placeholder - self.model_svm = None # Added SVM model placeholder - self.model_svm_scaler = None # Added SVM scaler placeholder - self.model_fnn = None # Added FNN model placeholder - self.model_fnn_scaler = None # Added FNN scaler placeholder + self.model_mlp_scaler = None + self.model_pytorch = None # Added pytorch model placeholder + self.model_pytorch_scaler = None # Added pytorch scaler placeholder + self.model_svm = None # Added SVM model placeholder + self.model_svm_scaler = None # Added SVM scaler placeholder + self.model_fnn = None # Added FNN model placeholder + self.model_fnn_scaler = None # Added FNN scaler placeholder self.xgb_features = [] self.lgb_features = [] self.tabnet_features = [] self.extra_features = [] self.mlp_features = [] - self.pytorch_features = [] # Added pytorch feature list placeholder - self.svm_features = [] # Added SVM feature list placeholder - self.fnn_features = [] # Added FNN feature list placeholder + self.pytorch_features = [] # Added pytorch feature 
list placeholder + self.svm_features = [] # Added SVM feature list placeholder + self.fnn_features = [] # Added FNN feature list placeholder # Load models on initialization self.load_models_from_mlflow() @@ -156,31 +157,33 @@ def train( try: X_val_xgb = X_val_prepared[self.xgb_features] except KeyError: - self.logger.warning(f"XGB features not found in X_val_prepared. Using all features. {self.xgb_features}") + self.logger.warning( + f"XGB features not found in X_val_prepared. Using all features. {self.xgb_features}" + ) X_val_xgb = X_val_prepared X_val_tabnet = X_val_prepared[self.tabnet_features] X_val_lgb = X_val_prepared[self.lgb_features] X_val_extra = X_val_prepared[self.extra_features] X_val_mlp = X_val_prepared[self.mlp_features] # Prepare MLP features - X_val_pytorch = X_val_prepared[self.pytorch_features] # Added PyTorch features - X_val_svm = X_val_prepared[self.svm_features] # Added SVM features - X_val_fnn = X_val_prepared[self.fnn_features] # Added FNN features + X_val_pytorch = X_val_prepared[self.pytorch_features] # Added PyTorch features + X_val_svm = X_val_prepared[self.svm_features] # Added SVM features + X_val_fnn = X_val_prepared[self.fnn_features] # Added FNN features X_test_xgb = X_test_prepared[self.xgb_features] X_test_tabnet = X_test_prepared[self.tabnet_features] X_test_lgb = X_test_prepared[self.lgb_features] X_test_extra = X_test_prepared[self.extra_features] X_test_mlp = X_test_prepared[self.mlp_features] # Prepare MLP features - X_test_pytorch = X_test_prepared[self.pytorch_features] # Added PyTorch features - X_test_svm = X_test_prepared[self.svm_features] # Added SVM features - X_test_fnn = X_test_prepared[self.fnn_features] # Added FNN features + X_test_pytorch = X_test_prepared[self.pytorch_features] # Added PyTorch features + X_test_svm = X_test_prepared[self.svm_features] # Added SVM features + X_test_fnn = X_test_prepared[self.fnn_features] # Added FNN features # Obtain predictions from base models on validation set 
self.logger.info("Obtaining validation predictions from base models...") p_xgb_val = self.model_xgb.predict_proba(X_val_xgb)[:, 1] # TabNet input might need .values depending on saving format try: - p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet)[:, 1] + p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet)[:, 1] except TypeError: self.logger.warning("TabNet predict_proba failed on DataFrame, trying .values") p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet.values)[:, 1] @@ -213,7 +216,7 @@ def train( # MLP requires scaling X_test_mlp_scaled = self.model_mlp_scaler.transform(X_test_mlp) p_mlp_test = self.model_mlp.predict_proba(X_test_mlp_scaled)[:, 1] - # PyTorch + # PyTorch p_pytorch_test = self.model_pytorch.predict_proba(X_test_pytorch)[:, 1] # SVM requires scaling X_test_svm_scaled = self.model_svm_scaler.transform(X_test_svm) @@ -224,19 +227,21 @@ def train( if self.dynamic_weighting: # Use test predictions for weights used during FINAL meta-learner TRAINING self.logger.info("Computing dynamic weights based on test performance...") - self.dynamic_weights_train, self.thresholds_train = compute_precision_focused_weights_optimized( - p_xgb_test, - p_tabnet_test, - p_lgb_test, - p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch - p_svm_test, # Added SVM - p_fnn_test, # Added FNN - y_test, - self.target_precision, - self.min_recalls, # Should now have 7 elements - self.logger, + self.dynamic_weights_train, self.thresholds_train = ( + compute_precision_focused_weights_optimized( + p_xgb_test, + p_tabnet_test, + p_lgb_test, + p_extra_test, + p_mlp_test, + p_pytorch_test, # Added PyTorch + p_svm_test, # Added SVM + p_fnn_test, # Added FNN + y_test, + self.target_precision, + self.min_recalls, # Should now have 7 elements + self.logger, + ) ) # Ensure weights_0410 is imported and used self.logger.info("Computing dynamic weights based on validation performance...") @@ -245,13 +250,13 @@ def train( p_tabnet_val, p_lgb_val, 
p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch - p_svm_val, # Added SVM - p_fnn_val, # Added FNN - y_val, + p_mlp_val, + p_pytorch_val, # Added PyTorch + p_svm_val, # Added SVM + p_fnn_val, # Added FNN + y_val, self.target_precision, - self.min_recalls, # Should now have 7 elements + self.min_recalls, # Should now have 7 elements self.logger, ) @@ -262,10 +267,10 @@ def train( p_tabnet_val, p_lgb_val, p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch - p_svm_val, # Added SVM - p_fnn_val, # Added FNN + p_mlp_val, + p_pytorch_val, # Added PyTorch + p_svm_val, # Added SVM + p_fnn_val, # Added FNN self.dynamic_weights if self.dynamic_weighting else None, self.thresholds if self.dynamic_weighting else None, ) @@ -276,10 +281,10 @@ def train( p_tabnet_test, p_lgb_test, p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch - p_svm_test, # Added SVM - p_fnn_test, # Added FNN + p_mlp_test, + p_pytorch_test, # Added PyTorch + p_svm_test, # Added SVM + p_fnn_test, # Added FNN self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -351,26 +356,28 @@ def predict_proba(self, X) -> np.ndarray: X_lgb = X_prepared[self.lgb_features] X_extra = X_prepared[self.extra_features] X_mlp = X_prepared[self.mlp_features] - X_pytorch = X_prepared[self.pytorch_features] # Added PyTorch - X_svm = X_prepared[self.svm_features] # Added SVM - X_fnn = X_prepared[self.fnn_features] # Added FNN + X_pytorch = X_prepared[self.pytorch_features] # Added PyTorch + X_svm = X_prepared[self.svm_features] # Added SVM + X_fnn = X_prepared[self.fnn_features] # Added FNN try: # Generate predictions p_xgb = self.model_xgb.predict_proba(X_xgb)[:, 1] p_lgb = self.model_lgb.predict_proba(X_lgb)[:, 1] p_extra = self.model_extra.predict_proba(X_extra)[:, 1] - p_pytorch = self.model_pytorch.predict_proba(X_pytorch)[:, 1] + p_pytorch = self.model_pytorch.predict_proba(X_pytorch)[:, 1] p_fnn = 
self.model_fnn.predict_proba(X_fnn)[:, 1] # Handle potential TabNet input type error try: p_tabnet = self.model_tabnet.predict_proba(X_tabnet)[:, 1] except TypeError: p_tabnet = self.model_tabnet.predict_proba(X_tabnet.values)[:, 1] - + # --- MLP Scaling and Prediction --- # Ensure X_mlp is a DataFrame with correct columns before transform if not isinstance(X_mlp, pd.DataFrame): - self.logger.warning("X_mlp is not a DataFrame before scaling. Attempting conversion.") + self.logger.warning( + "X_mlp is not a DataFrame before scaling. Attempting conversion." + ) X_mlp = pd.DataFrame(X_mlp, columns=self.mlp_features) # Re-select columns just in case order changed or to ensure DataFrame type X_mlp = X_mlp[self.mlp_features] @@ -380,7 +387,9 @@ def predict_proba(self, X) -> np.ndarray: # --- SVM Scaling and Prediction --- # Ensure X_svm is a DataFrame with correct columns before transform if not isinstance(X_svm, pd.DataFrame): - self.logger.warning("X_svm is not a DataFrame before scaling. Attempting conversion.") + self.logger.warning( + "X_svm is not a DataFrame before scaling. Attempting conversion." 
+ ) X_svm = pd.DataFrame(X_svm, columns=self.svm_features) # Re-select columns X_svm = X_svm[self.svm_features] @@ -392,10 +401,10 @@ def predict_proba(self, X) -> np.ndarray: p_tabnet, p_lgb, p_extra, - p_mlp, - p_pytorch, - p_svm, # Added SVM - p_fnn, # Added FNN + p_mlp, + p_pytorch, + p_svm, # Added SVM + p_fnn, # Added FNN self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -461,12 +470,12 @@ def load_models_from_mlflow( pytorch_path="model", pytorch_scaler_path="scaler/scaler_pytorch.pkl", svm_path="model_svm", # Artifact path for SVM model (from svm_model.py) - svm_scaler_path="scaler_svm.pkl", # Artifact path for SVM scaler (default name in svm_model.py) + svm_scaler_path="scaler_svm.pkl", # Artifact path for SVM scaler (default name in svm_model.py) fnn_path="model", fnn_scaler_path="scaler/scaler_pytorch.pkl", ): """ - Load pre-trained models (XGB, LGBM, TabNet, Extra Trees, MLP, PyTorch, SVM) + Load pre-trained models (XGB, LGBM, TabNet, Extra Trees, MLP, PyTorch, SVM) and scalers from MLflow. Updates feature signatures for each model. 
""" @@ -560,7 +569,7 @@ def load_models_from_mlflow( else: self.logger.warning("No feature signature found for MLP model") # Attempt to get features from the underlying sklearn model if possible - self.mlp_features = getattr(self.model_mlp, "feature_names_in_", []) + self.mlp_features = getattr(self.model_mlp, "feature_names_in_", []) # Load the associated MLP scaler self.logger.info( @@ -572,18 +581,22 @@ def load_models_from_mlflow( with open(scaler_local_path, "rb") as f: self.model_mlp_scaler = pickle.load(f) self.logger.info("MLP scaler loaded successfully.") - if hasattr(self.model_mlp, 'scaler_') and hasattr(self.model_mlp, 'device_'): - try: + if hasattr(self.model_mlp, "scaler_") and hasattr(self.model_mlp, "device_"): + try: # Determine device (use CUDA if available, same logic as hypertuner) pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model_mlp.scaler_ = self.model_mlp_scaler self.model_mlp.device_ = pytorch_device - self.model_mlp.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded MLP model.") + self.model_mlp.to(pytorch_device) # Ensure model is on the correct device + self.logger.info( + f"Attached scaler and device ({pytorch_device}) to loaded MLP model." + ) except Exception as attach_e: self.logger.warning(f"Could not attach scaler/device to MLP model: {attach_e}") else: - self.logger.warning("Loaded MLP model does not have scaler_/device_ attributes for attachment.") + self.logger.warning( + "Loaded MLP model does not have scaler_/device_ attributes for attachment." 
+ ) # Ensure model is moved to the correct device anyway try: pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -594,17 +607,19 @@ def load_models_from_mlflow( except Exception as e: self.logger.error(f"Failed to load MLP model or scaler: {str(e)}") raise ValueError(f"Failed to load MLP model or scaler: {str(e)}") from e - + # Load PyTorch model try: # Define artifact paths for PyTorch model and its scaler - pytorch_model_path = "model" # Assuming artifact path is 'model' - pytorch_scaler_path = "scaler/scaler_pytorch.pkl" # Assuming scaler saved in 'scaler' dir - + pytorch_model_path = "model" # Assuming artifact path is 'model' + pytorch_scaler_path = ( + "scaler/scaler_pytorch.pkl" # Assuming scaler saved in 'scaler' dir + ) + self.logger.info(f"Loading PyTorch model from run {self.pytorch_run_id}...") pytorch_uri = f"runs:/{self.pytorch_run_id}/{pytorch_model_path}" self.model_pytorch = mlflow.pytorch.load_model(pytorch_uri) - + # Load PyTorch model also as pyfunc to easily get signature pytorch_pyfunc = mlflow.pyfunc.load_model(pytorch_uri) if pytorch_pyfunc.metadata.signature and pytorch_pyfunc.metadata.signature.inputs: @@ -614,7 +629,7 @@ def load_models_from_mlflow( ) else: self.logger.warning("No feature signature found for PyTorch model.") - self.pytorch_features = [] + self.pytorch_features = [] # Load the associated PyTorch scaler self.logger.info( @@ -626,21 +641,27 @@ def load_models_from_mlflow( with open(scaler_local_path, "rb") as f: self.model_pytorch_scaler = pickle.load(f) self.logger.info("PyTorch scaler loaded successfully.") - - # Optional: Attach scaler and device to the loaded PyTorch model instance + + # Optional: Attach scaler and device to the loaded PyTorch model instance # if its predict_proba method relies on them being attributes (like in the hypertuner) - if hasattr(self.model_pytorch, 'scaler_') and hasattr(self.model_pytorch, 'device_'): - try: + if hasattr(self.model_pytorch, "scaler_") and 
hasattr(self.model_pytorch, "device_"): + try: # Determine device (use CUDA if available, same logic as hypertuner) pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model_pytorch.scaler_ = self.model_pytorch_scaler self.model_pytorch.device_ = pytorch_device - self.model_pytorch.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded PyTorch model.") + self.model_pytorch.to(pytorch_device) # Ensure model is on the correct device + self.logger.info( + f"Attached scaler and device ({pytorch_device}) to loaded PyTorch model." + ) except Exception as attach_e: - self.logger.warning(f"Could not attach scaler/device to PyTorch model: {attach_e}") + self.logger.warning( + f"Could not attach scaler/device to PyTorch model: {attach_e}" + ) else: - self.logger.warning("Loaded PyTorch model does not have scaler_/device_ attributes for attachment.") + self.logger.warning( + "Loaded PyTorch model does not have scaler_/device_ attributes for attachment." 
+ ) # Ensure model is moved to the correct device anyway try: pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -678,8 +699,12 @@ def load_models_from_mlflow( self.model_svm_scaler = pickle.load(f) self.logger.info("SVM scaler loaded successfully.") # Verify scaler type (optional) - if not isinstance(self.model_svm_scaler, (StandardScaler, sklearn.preprocessing.RobustScaler)): # Add other expected scaler types if needed - self.logger.warning(f"Loaded SVM scaler is of unexpected type: {type(self.model_svm_scaler).__name__}") + if not isinstance( + self.model_svm_scaler, (StandardScaler, sklearn.preprocessing.RobustScaler) + ): # Add other expected scaler types if needed + self.logger.warning( + f"Loaded SVM scaler is of unexpected type: {type(self.model_svm_scaler).__name__}" + ) except Exception as e: self.logger.error(f"Failed to load SVM model or scaler: {str(e)}") raise ValueError(f"Failed to load SVM model or scaler: {str(e)}") from e @@ -689,7 +714,7 @@ def load_models_from_mlflow( self.logger.info(f"Loading FNN model from run {self.fnn_run_id}...") fnn_uri = f"runs:/{self.fnn_run_id}/{fnn_path}" self.model_fnn = mlflow.pytorch.load_model(fnn_uri) - + # Load PyTorch model also as pyfunc to easily get signature fnn_pyfunc = mlflow.pyfunc.load_model(fnn_uri) if fnn_pyfunc.metadata.signature and fnn_pyfunc.metadata.signature.inputs: @@ -699,7 +724,7 @@ def load_models_from_mlflow( ) else: self.logger.warning("No feature signature found for FNN model.") - self.fnn_features = [] + self.fnn_features = [] # Load the associated PyTorch scaler self.logger.info( @@ -711,21 +736,25 @@ def load_models_from_mlflow( with open(scaler_local_path, "rb") as f: self.model_fnn_scaler = pickle.load(f) self.logger.info("FNN scaler loaded successfully.") - - # Optional: Attach scaler and device to the loaded PyTorch model instance + + # Optional: Attach scaler and device to the loaded PyTorch model instance # if its predict_proba method relies on them 
being attributes (like in the hypertuner) - if hasattr(self.model_pytorch, 'scaler_') and hasattr(self.model_pytorch, 'device_'): - try: + if hasattr(self.model_pytorch, "scaler_") and hasattr(self.model_pytorch, "device_"): + try: # Determine device (use CUDA if available, same logic as hypertuner) pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model_fnn.scaler_ = self.model_fnn_scaler self.model_fnn.device_ = pytorch_device - self.model_fnn.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded FNN model.") + self.model_fnn.to(pytorch_device) # Ensure model is on the correct device + self.logger.info( + f"Attached scaler and device ({pytorch_device}) to loaded FNN model." + ) except Exception as attach_e: self.logger.warning(f"Could not attach scaler/device to FNN model: {attach_e}") else: - self.logger.warning("Loaded FNN model does not have scaler_/device_ attributes for attachment.") + self.logger.warning( + "Loaded FNN model does not have scaler_/device_ attributes for attachment." 
+ ) # Ensure model is moved to the correct device anyway try: pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/src/models/ensemble/ensemble_model_20.py b/src/models/ensemble/ensemble_model_20.py index b3a7c89..57af497 100644 --- a/src/models/ensemble/ensemble_model_20.py +++ b/src/models/ensemble/ensemble_model_20.py @@ -12,16 +12,15 @@ import random import mlflow # Ensure mlflow is imported for download_artifacts +import mlflow.artifacts # Explicit import for artifacts import mlflow.lightgbm import mlflow.pyfunc import mlflow.sklearn import mlflow.xgboost import numpy as np import pandas as pd # Add pandas import -import sklearn import torch from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.preprocessing import StandardScaler # Needed for SVM scaler type check potentially from src.models.ensemble.data_utils import prepare_data from src.models.ensemble.diagnostics import analyze_prediction_errors, explain_predictions @@ -52,6 +51,11 @@ # PyTorch specific reproducibility settings torch.manual_seed(SEED) +# String constants for repeated messages +XGB_PREDICT_PROBA_FALLBACK_MSG = "XGBoost predict_proba not available, using pyfunc predict_proba" +PYTORCH_SCALER_PATH = "scaler/scaler_pytorch.pkl" + + class EnsembleModel(BaseEstimator, ClassifierMixin): def __init__( self, @@ -69,31 +73,31 @@ def __init__( self.logger = logger or ExperimentLogger( experiment_name="ensemble_model_0410", log_dir="./logs/ensemble_model_0410" ) - self.required_recall = required_recall # For meta-learner + self.required_recall = required_recall # For meta-learner - self.target_precision = target_precision # For dynamic weights + self.target_precision = target_precision # For dynamic weights # --- MLflow Run IDs for Base Models --- self.xgb_run_id = "a8f84a8a82ef44b7a64bc72d5d797a82" - self.lgb_run_id = "ea8b1bb86aaa4faf9bf8c3d2d08145da" - self.tabnet_run_id = "431df2695dd9431f8c088e15b675e8a3" - self.extra_run_id = 
"625d925be2634d10b7da7f6a42576405" + self.lgb_run_id = "ea8b1bb86aaa4faf9bf8c3d2d08145da" + self.tabnet_run_id = "431df2695dd9431f8c088e15b675e8a3" + self.extra_run_id = "625d925be2634d10b7da7f6a42576405" self.mlp_run_id = "ebb5dfa8d32c4409a7edb21f3fad09d0" self.pytorch_run_id = "bb59b9589aef4638a1e0d3aa406c7da7" - self.svm_run_id = "8fded66e23fd422ab8d7db66687508dd" + self.svm_run_id = "8fded66e23fd422ab8d7db66687508dd" self.fnn_run_id = "c32a835ceac048e9af04334ebb3c3d57" # Minimum recalls for dynamic weighting (order: xgb, tabnet, lgb, rf, mlp, pytorch, svm) - self.min_recalls = [0.25, 0.25, 0.30, 0.30, 0.25, 0.20, 0.25, 0.20] # Added SVM recall + self.min_recalls = [0.25, 0.25, 0.30, 0.30, 0.25, 0.20, 0.25, 0.20] # Added SVM recall # Meta-learner settings self.meta_learner_type = meta_learner_type - self.optimal_threshold = 0.5 # Will be tuned - self.individual_thresholding = individual_thresholding # Unused currently? + self.optimal_threshold = 0.5 # Will be tuned + self.individual_thresholding = individual_thresholding # Unused currently? self.calibrate = calibrate # Unused currently? - self.calibration_method = calibration_method # Unused currently? + self.calibration_method = calibration_method # Unused currently? 
self.dynamic_weighting = dynamic_weighting - num_models = 8 # Updated number of models (added SVM) + num_models = 8 # Updated number of models (added SVM) if self.dynamic_weighting: # Adjusted for 7 base models self.dynamic_weights = { @@ -102,34 +106,34 @@ def __init__( "tabnet": 1 / num_models, "extra": 1 / num_models, "mlp": 1 / num_models, - "pytorch": 1 / num_models, # Added pytorch - "svm": 1 / num_models, # Added SVM - "fnn": 1 / num_models, # Added FNN + "pytorch": 1 / num_models, # Added pytorch + "svm": 1 / num_models, # Added SVM + "fnn": 1 / num_models, # Added FNN } - + # Placeholder attributes for models and features - will be populated by load_models self.meta_learner = None self.model_xgb = None self.model_lgb = None self.model_tabnet = None - self.model_extra = None + self.model_extra = None self.model_mlp = None - self.model_mlp_scaler = None - self.model_pytorch = None # Added pytorch model placeholder - self.model_pytorch_scaler = None # Added pytorch scaler placeholder - self.model_svm = None # Added SVM model placeholder - self.model_svm_scaler = None # Added SVM scaler placeholder - self.model_fnn = None # Added FNN model placeholder - self.model_fnn_scaler = None # Added FNN scaler placeholder + self.model_mlp_scaler = None + self.model_pytorch = None # Added pytorch model placeholder + self.model_pytorch_scaler = None # Added pytorch scaler placeholder + self.model_svm = None # Added SVM model placeholder + self.model_svm_scaler = None # Added SVM scaler placeholder + self.model_fnn = None # Added FNN model placeholder + self.model_fnn_scaler = None # Added FNN scaler placeholder self.xgb_features = [] self.lgb_features = [] self.tabnet_features = [] self.extra_features = [] self.mlp_features = [] - self.pytorch_features = [] # Added pytorch feature list placeholder - self.svm_features = [] # Added SVM feature list placeholder - self.fnn_features = [] # Added FNN feature list placeholder + self.pytorch_features = [] # Added pytorch feature 
list placeholder + self.svm_features = [] # Added SVM feature list placeholder + self.fnn_features = [] # Added FNN feature list placeholder # Load models on initialization self.load_models_from_mlflow() @@ -138,7 +142,7 @@ def train( self, X_train, y_train, - X_val=None, + x_val=None, y_val=None, X_test=None, y_test=None, @@ -147,96 +151,146 @@ def train( ) -> dict: self.logger.info("Starting ensemble model 0404 training...") # Data preparation - # X_train_prepared = prepare_data(X_train, X_train.columns) - X_val_prepared = prepare_data(X_val, X_val.columns) - X_test_prepared = prepare_data(X_test, X_test.columns) + # x_train_prepared = prepare_data(X_train, X_train.columns) + # Add type assertions for DataFrame operations + assert isinstance(x_val, pd.DataFrame), f"x_val must be a DataFrame, got {type(x_val)}" + assert isinstance(X_test, pd.DataFrame), f"X_test must be a DataFrame, got {type(X_test)}" + x_val_prepared = prepare_data(x_val, list(x_val.columns)) + x_test_prepared = prepare_data(X_test, list(X_test.columns)) # Prepare feature subsets for each model self.logger.info("Preparing feature subsets for base models...") try: - X_val_xgb = X_val_prepared[self.xgb_features] + x_val_xgb = x_val_prepared[self.xgb_features] except KeyError: - self.logger.warning(f"XGB features not found in X_val_prepared. Using all features. 
{self.xgb_features}") - X_val_xgb = X_val_prepared - X_val_tabnet = X_val_prepared[self.tabnet_features] - X_val_lgb = X_val_prepared[self.lgb_features] - X_val_extra = X_val_prepared[self.extra_features] - X_val_mlp = X_val_prepared[self.mlp_features] # Prepare MLP features - X_val_pytorch = X_val_prepared[self.pytorch_features] # Added PyTorch features - X_val_svm = X_val_prepared[self.svm_features] # Added SVM features - X_val_fnn = X_val_prepared[self.fnn_features] # Added FNN features - - X_test_xgb = X_test_prepared[self.xgb_features] - X_test_tabnet = X_test_prepared[self.tabnet_features] - X_test_lgb = X_test_prepared[self.lgb_features] - X_test_extra = X_test_prepared[self.extra_features] - X_test_mlp = X_test_prepared[self.mlp_features] # Prepare MLP features - X_test_pytorch = X_test_prepared[self.pytorch_features] # Added PyTorch features - X_test_svm = X_test_prepared[self.svm_features] # Added SVM features - X_test_fnn = X_test_prepared[self.fnn_features] # Added FNN features + self.logger.warning( + f"XGB features not found in x_val_prepared. Using all features. 
{self.xgb_features}" + ) + x_val_xgb = x_val_prepared + x_val_tabnet = x_val_prepared[self.tabnet_features] + x_val_lgb = x_val_prepared[self.lgb_features] + x_val_extra = x_val_prepared[self.extra_features] + x_val_mlp = x_val_prepared[self.mlp_features] # Prepare MLP features + x_val_pytorch = x_val_prepared[self.pytorch_features] # Added PyTorch features + x_val_svm = x_val_prepared[self.svm_features] # Added SVM features + x_val_fnn = x_val_prepared[self.fnn_features] # Added FNN features + + x_test_xgb = x_test_prepared[self.xgb_features] + x_test_tabnet = x_test_prepared[self.tabnet_features] + x_test_lgb = x_test_prepared[self.lgb_features] + x_test_extra = x_test_prepared[self.extra_features] + x_test_mlp = x_test_prepared[self.mlp_features] # Prepare MLP features + x_test_pytorch = x_test_prepared[self.pytorch_features] # Added PyTorch features + x_test_svm = x_test_prepared[self.svm_features] # Added SVM features + x_test_fnn = x_test_prepared[self.fnn_features] # Added FNN features # Obtain predictions from base models on validation set self.logger.info("Obtaining validation predictions from base models...") - p_xgb_val = self.model_xgb.predict_proba(X_val_xgb)[:, 1] + # Add null checks before accessing model attributes + if self.model_xgb is None: + raise ValueError("XGBoost model not loaded") + # XGBoost predict_proba might not be available on MLflow-loaded model, try pyfunc + try: + p_xgb_val = self.model_xgb.predict_proba(x_val_xgb)[:, 1] # type: ignore + except AttributeError: + self.logger.warning("XGBoost predict_proba not available, using pyfunc predict_proba") + # Load pyfunc version for prediction if direct model doesn't work + xgb_uri = f"runs:/{self.xgb_run_id}/{'model'}" + xgb_pyfunc = mlflow.pyfunc.load_model(xgb_uri) + p_xgb_val = xgb_pyfunc.predict_proba(x_val_xgb)[:, 1] # type: ignore + + if self.model_tabnet is None: + raise ValueError("TabNet model not loaded") # TabNet input might need .values depending on saving format try: - 
p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet)[:, 1] + p_tabnet_val = self.model_tabnet.predict_proba(x_val_tabnet)[:, 1] # type: ignore except TypeError: self.logger.warning("TabNet predict_proba failed on DataFrame, trying .values") - p_tabnet_val = self.model_tabnet.predict_proba(X_val_tabnet.values)[:, 1] - p_lgb_val = self.model_lgb.predict_proba(X_val_lgb)[:, 1] - p_extra_val = self.model_extra.predict_proba(X_val_extra)[:, 1] + p_tabnet_val = self.model_tabnet.predict_proba(x_val_tabnet.values)[:, 1] # type: ignore + + if self.model_lgb is None: + raise ValueError("LightGBM model not loaded") + p_lgb_val = self.model_lgb.predict_proba(x_val_lgb)[:, 1] # type: ignore + + if self.model_extra is None: + raise ValueError("Extra Trees model not loaded") + p_extra_val = self.model_extra.predict_proba(x_val_extra)[:, 1] # type: ignore + # MLP requires scaling - X_val_mlp_scaled = self.model_mlp_scaler.transform(X_val_mlp) - p_mlp_val = self.model_mlp.predict_proba(X_val_mlp_scaled)[:, 1] + if self.model_mlp is None: + raise ValueError("MLP model not loaded") + if self.model_mlp_scaler is None: + raise ValueError("MLP scaler not loaded") + x_val_mlp_scaled = self.model_mlp_scaler.transform(x_val_mlp) + p_mlp_val = self.model_mlp.predict_proba(x_val_mlp_scaled)[:, 1] # type: ignore + # PyTorch model has scaler_ and device_ attached during creation - p_pytorch_val = self.model_pytorch.predict_proba(X_val_pytorch)[:, 1] + if self.model_pytorch is None: + raise ValueError("PyTorch model not loaded") + p_pytorch_val = self.model_pytorch.predict_proba(x_val_pytorch)[:, 1] # type: ignore + # SVM requires scaling - X_val_svm_scaled = self.model_svm_scaler.transform(X_val_svm) - p_svm_val = self.model_svm.predict_proba(X_val_svm_scaled)[:, 1] + if self.model_svm is None: + raise ValueError("SVM model not loaded") + if self.model_svm_scaler is None: + raise ValueError("SVM scaler not loaded") + x_val_svm_scaled = self.model_svm_scaler.transform(x_val_svm) + 
p_svm_val = self.model_svm.predict_proba(x_val_svm_scaled)[:, 1] # type: ignore + # FNN - p_fnn_val = self.model_fnn.predict_proba(X_val_fnn)[:, 1] + if self.model_fnn is None: + raise ValueError("FNN model not loaded") + p_fnn_val = self.model_fnn.predict_proba(x_val_fnn)[:, 1] # type: ignore # Obtain predictions from base models on test set (used for meta-learner training) self.logger.info( "Obtaining test predictions from base models (for meta-learner training)..." ) - p_xgb_test = self.model_xgb.predict_proba(X_test_xgb)[:, 1] + # Null checks already performed above for models try: - p_tabnet_test = self.model_tabnet.predict_proba(X_test_tabnet)[:, 1] + p_xgb_test = self.model_xgb.predict_proba(x_test_xgb)[:, 1] # type: ignore + except AttributeError: + self.logger.warning(XGB_PREDICT_PROBA_FALLBACK_MSG) + xgb_uri = f"runs:/{self.xgb_run_id}/{'model'}" + xgb_pyfunc = mlflow.pyfunc.load_model(xgb_uri) + p_xgb_test = xgb_pyfunc.predict_proba(x_test_xgb)[:, 1] # type: ignore + try: + p_tabnet_test = self.model_tabnet.predict_proba(x_test_tabnet)[:, 1] # type: ignore except TypeError: - p_tabnet_test = self.model_tabnet.predict_proba(X_test_tabnet.values)[:, 1] + p_tabnet_test = self.model_tabnet.predict_proba(x_test_tabnet.values)[:, 1] # type: ignore # LightGBM - p_lgb_test = self.model_lgb.predict_proba(X_test_lgb)[:, 1] + p_lgb_test = self.model_lgb.predict_proba(x_test_lgb)[:, 1] # type: ignore # Extra Trees - p_extra_test = self.model_extra.predict_proba(X_test_extra)[:, 1] + p_extra_test = self.model_extra.predict_proba(x_test_extra)[:, 1] # type: ignore # MLP requires scaling - X_test_mlp_scaled = self.model_mlp_scaler.transform(X_test_mlp) - p_mlp_test = self.model_mlp.predict_proba(X_test_mlp_scaled)[:, 1] - # PyTorch - p_pytorch_test = self.model_pytorch.predict_proba(X_test_pytorch)[:, 1] + x_test_mlp_scaled = self.model_mlp_scaler.transform(x_test_mlp) + p_mlp_test = self.model_mlp.predict_proba(x_test_mlp_scaled)[:, 1] # type: ignore + # PyTorch + 
p_pytorch_test = self.model_pytorch.predict_proba(x_test_pytorch)[:, 1] # type: ignore # SVM requires scaling - X_test_svm_scaled = self.model_svm_scaler.transform(X_test_svm) - p_svm_test = self.model_svm.predict_proba(X_test_svm_scaled)[:, 1] + x_test_svm_scaled = self.model_svm_scaler.transform(x_test_svm) + p_svm_test = self.model_svm.predict_proba(x_test_svm_scaled)[:, 1] # type: ignore # FNN - p_fnn_test = self.model_fnn.predict_proba(X_test_fnn)[:, 1] + p_fnn_test = self.model_fnn.predict_proba(x_test_fnn)[:, 1] # type: ignore # Optionally calculate dynamic weights based on validation performance if self.dynamic_weighting: # Use test predictions for weights used during FINAL meta-learner TRAINING self.logger.info("Computing dynamic weights based on test performance...") - self.dynamic_weights_train, self.thresholds_train = compute_precision_focused_weights_optimized( - p_xgb_test, - p_tabnet_test, - p_lgb_test, - p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch - p_svm_test, # Added SVM - p_fnn_test, # Added FNN - y_test, - self.target_precision, - self.min_recalls, # Should now have 7 elements - self.logger, + self.dynamic_weights_train, self.thresholds_train = ( + compute_precision_focused_weights_optimized( + p_xgb_test, + p_tabnet_test, + p_lgb_test, + p_extra_test, + p_mlp_test, + p_pytorch_test, # Added PyTorch + p_svm_test, # Added SVM + p_fnn_test, # Added FNN + y_test, + self.target_precision, + self.min_recalls, # Should now have 7 elements + self.logger, + ) ) # Ensure weights_0410 is imported and used self.logger.info("Computing dynamic weights based on validation performance...") @@ -245,13 +299,13 @@ def train( p_tabnet_val, p_lgb_val, p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch - p_svm_val, # Added SVM - p_fnn_val, # Added FNN - y_val, + p_mlp_val, + p_pytorch_val, # Added PyTorch + p_svm_val, # Added SVM + p_fnn_val, # Added FNN + y_val, self.target_precision, - self.min_recalls, # Should now have 7 elements + 
self.min_recalls, # Should now have 7 elements self.logger, ) @@ -262,11 +316,11 @@ def train( p_tabnet_val, p_lgb_val, p_extra_val, - p_mlp_val, - p_pytorch_val, # Added PyTorch - p_svm_val, # Added SVM - p_fnn_val, # Added FNN - X_val, + p_mlp_val, + p_pytorch_val, # Added PyTorch + p_svm_val, # Added SVM + p_fnn_val, # Added FNN + x_val_prepared, self.dynamic_weights if self.dynamic_weighting else None, self.thresholds if self.dynamic_weighting else None, ) @@ -277,11 +331,11 @@ def train( p_tabnet_test, p_lgb_test, p_extra_test, - p_mlp_test, - p_pytorch_test, # Added PyTorch - p_svm_test, # Added SVM - p_fnn_test, # Added FNN - X_test, + p_mlp_test, + p_pytorch_test, # Added PyTorch + p_svm_test, # Added SVM + p_fnn_test, # Added FNN + x_test_prepared, self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -295,10 +349,22 @@ def train( # Train meta-learner using TEST meta-features and HYPERTUNE using VALIDATION meta-features self.logger.info("Hypertuning and training meta-learner...") + # Fix DataFrame parameter passing to hypertune_meta_learner - convert to numpy arrays + assert isinstance(meta_df_train, pd.DataFrame), ( + f"meta_df_train must be DataFrame, got {type(meta_df_train)}" + ) + assert isinstance(meta_df_val, pd.DataFrame), ( + f"meta_df_val must be DataFrame, got {type(meta_df_val)}" + ) + # Assert that y_test and y_val are not None (should be guaranteed by earlier checks) + assert y_test is not None, "y_test should not be None at this point" + assert y_val is not None, "y_val should not be None at this point" + meta_train_np = meta_df_train.values + meta_val_np = meta_df_val.values self.meta_learner = hypertune_meta_learner( - meta_df_train, + meta_train_np, y_test, # Train/evaluate HPO on test set - meta_df_val, + meta_val_np, y_val, # Use validation set for final HPO eval (or nested CV split) meta_learner_type=self.meta_learner_type, target_precision=self.target_precision, 
@@ -308,11 +374,13 @@ def train( self.logger.info( f"Tuning final threshold using validation data for target precision {self.target_precision}..." ) + # Initialize variables to avoid unbound variable issues + meta_df_val_np = None if self.meta_learner_type == "tabnet": meta_df_val_np = meta_df_val.to_numpy() - meta_val_probs = self.meta_learner.predict_proba(meta_df_val_np)[:, 1] + meta_val_probs = self.meta_learner.predict_proba(meta_df_val_np)[:, 1] # type: ignore else: - meta_val_probs = self.meta_learner.predict_proba(meta_df_val)[:, 1] + meta_val_probs = self.meta_learner.predict_proba(meta_df_val)[:, 1] # type: ignore best_threshold, threshold_metrics = tune_threshold_for_precision_optimized( meta_val_probs, y_val, @@ -326,14 +394,12 @@ def train( # Final evaluation on validation data using the tuned threshold self.logger.info("Performing final evaluation on validation data...") - if self.meta_learner_type == "tabnet": - eval_results = evaluate_model( - self.meta_learner, meta_df_val_np, y_val, self.optimal_threshold, self.logger - ) - else: - eval_results = evaluate_model( - self.meta_learner, meta_df_val, y_val, self.optimal_threshold, self.logger - ) + # Assert y_val is not None for evaluation + assert y_val is not None, "y_val should not be None for evaluation" + # evaluate_model expects DataFrame, so always pass meta_df_val + eval_results = evaluate_model( + self.meta_learner, meta_df_val, y_val, self.optimal_threshold, self.logger + ) eval_results.update(threshold_metrics) # Add threshold metrics to final results self.logger.info(f"Validation evaluation results: {eval_results}") self.logger.info("Ensemble model 0404 training completed successfully.") @@ -345,60 +411,107 @@ def predict_proba(self, X) -> np.ndarray: raise ValueError("Model has not been trained. 
Call train() first.") # Use original columns from X for prepare_data - X_prepared = prepare_data(X, X.columns) + x_prepared = prepare_data(X, X.columns) # Select features for each model - X_xgb = X_prepared[self.xgb_features] - X_tabnet = X_prepared[self.tabnet_features] - X_lgb = X_prepared[self.lgb_features] - X_extra = X_prepared[self.extra_features] - X_mlp = X_prepared[self.mlp_features] - X_pytorch = X_prepared[self.pytorch_features] # Added PyTorch - X_svm = X_prepared[self.svm_features] # Added SVM - X_fnn = X_prepared[self.fnn_features] # Added FNN + x_xgb = x_prepared[self.xgb_features] + x_tabnet = x_prepared[self.tabnet_features] + x_lgb = x_prepared[self.lgb_features] + x_extra = x_prepared[self.extra_features] + x_mlp = x_prepared[self.mlp_features] + x_pytorch = x_prepared[self.pytorch_features] # Added PyTorch + x_svm = x_prepared[self.svm_features] # Added SVM + x_fnn = x_prepared[self.fnn_features] # Added FNN try: - # Generate predictions - p_xgb = self.model_xgb.predict_proba(X_xgb)[:, 1] - p_lgb = self.model_lgb.predict_proba(X_lgb)[:, 1] - p_extra = self.model_extra.predict_proba(X_extra)[:, 1] - p_pytorch = self.model_pytorch.predict_proba(X_pytorch)[:, 1] - p_fnn = self.model_fnn.predict_proba(X_fnn)[:, 1] + # Generate predictions with null guards + if self.model_xgb is None: + raise ValueError("XGBoost model not loaded") + try: + p_xgb = self.model_xgb.predict_proba(x_xgb)[:, 1] # type: ignore + except AttributeError: + self.logger.warning(XGB_PREDICT_PROBA_FALLBACK_MSG) + xgb_uri = f"runs:/{self.xgb_run_id}/{'model'}" + xgb_pyfunc = mlflow.pyfunc.load_model(xgb_uri) + p_xgb = xgb_pyfunc.predict_proba(x_xgb)[:, 1] # type: ignore + + if self.model_lgb is None: + raise ValueError("LightGBM model not loaded") + p_lgb = self.model_lgb.predict_proba(x_lgb)[:, 1] # type: ignore + + if self.model_extra is None: + raise ValueError("Extra Trees model not loaded") + p_extra = self.model_extra.predict_proba(x_extra)[:, 1] # type: ignore + + if 
self.model_pytorch is None: + raise ValueError("PyTorch model not loaded") + p_pytorch = self.model_pytorch.predict_proba(x_pytorch)[:, 1] # type: ignore + + if self.model_fnn is None: + raise ValueError("FNN model not loaded") + p_fnn = self.model_fnn.predict_proba(x_fnn)[:, 1] # type: ignore + # Handle potential TabNet input type error + if self.model_tabnet is None: + raise ValueError("TabNet model not loaded") try: - p_tabnet = self.model_tabnet.predict_proba(X_tabnet)[:, 1] + p_tabnet = self.model_tabnet.predict_proba(x_tabnet)[:, 1] # type: ignore except TypeError: - p_tabnet = self.model_tabnet.predict_proba(X_tabnet.values)[:, 1] - + p_tabnet = self.model_tabnet.predict_proba(x_tabnet.values)[:, 1] # type: ignore + # --- MLP Scaling and Prediction --- - # Ensure X_mlp is a DataFrame with correct columns before transform - if not isinstance(X_mlp, pd.DataFrame): - self.logger.warning("X_mlp is not a DataFrame before scaling. Attempting conversion.") - X_mlp = pd.DataFrame(X_mlp, columns=self.mlp_features) + if self.model_mlp_scaler is None: + raise ValueError("MLP scaler not loaded") + # Ensure x_mlp is a DataFrame with correct columns before transform + if not isinstance(x_mlp, pd.DataFrame): + self.logger.warning( + "x_mlp is not a DataFrame before scaling. Attempting conversion." 
+ ) + # Assert that mlp_features is a list for DataFrame constructor + assert isinstance(self.mlp_features, list), ( + f"mlp_features must be list, got {type(self.mlp_features)}" + ) + # Explicitly cast to list[str] for type checker + mlp_columns: list[str] = list(self.mlp_features) + x_mlp = pd.DataFrame(x_mlp, columns=mlp_columns) # type: ignore # Re-select columns just in case order changed or to ensure DataFrame type - X_mlp = X_mlp[self.mlp_features] - X_mlp_scaled = self.model_mlp_scaler.transform(X_mlp) - p_mlp = self.model_mlp.predict_proba(X_mlp_scaled)[:, 1] + x_mlp = x_mlp[self.mlp_features] + x_mlp_scaled = self.model_mlp_scaler.transform(x_mlp) + if self.model_mlp is None: + raise ValueError("MLP model not loaded") + p_mlp = self.model_mlp.predict_proba(x_mlp_scaled)[:, 1] # type: ignore # --- SVM Scaling and Prediction --- - # Ensure X_svm is a DataFrame with correct columns before transform - if not isinstance(X_svm, pd.DataFrame): - self.logger.warning("X_svm is not a DataFrame before scaling. Attempting conversion.") - X_svm = pd.DataFrame(X_svm, columns=self.svm_features) + if self.model_svm_scaler is None: + raise ValueError("SVM scaler not loaded") + # Ensure x_svm is a DataFrame with correct columns before transform + if not isinstance(x_svm, pd.DataFrame): + self.logger.warning( + "x_svm is not a DataFrame before scaling. Attempting conversion." 
+ ) + # Assert that svm_features is a list for DataFrame constructor + assert isinstance(self.svm_features, list), ( + f"svm_features must be list, got {type(self.svm_features)}" + ) + # Explicitly cast to list[str] for type checker + svm_columns: list[str] = list(self.svm_features) + x_svm = pd.DataFrame(x_svm, columns=svm_columns) # type: ignore # Re-select columns - X_svm = X_svm[self.svm_features] - X_svm_scaled = self.model_svm_scaler.transform(X_svm) - p_svm = self.model_svm.predict_proba(X_svm_scaled)[:, 1] + x_svm = x_svm[self.svm_features] + x_svm_scaled = self.model_svm_scaler.transform(x_svm) + if self.model_svm is None: + raise ValueError("SVM model not loaded") + p_svm = self.model_svm.predict_proba(x_svm_scaled)[:, 1] # type: ignore meta_features = create_meta_features_optimized( p_xgb, p_tabnet, p_lgb, p_extra, - p_mlp, - p_pytorch, - p_svm, # Added SVM - p_fnn, # Added FNN - X_prepared, + p_mlp, + p_pytorch, + p_svm, # Added SVM + p_fnn, # Added FNN + x_prepared, self.dynamic_weights_train if self.dynamic_weighting else None, self.thresholds_train if self.dynamic_weighting else None, ) @@ -406,9 +519,9 @@ def predict_proba(self, X) -> np.ndarray: meta_df = create_meta_dataframe(meta_features) if self.meta_learner_type == "tabnet": meta_df_np = meta_df.to_numpy() - meta_probs = self.meta_learner.predict_proba(meta_df_np)[:, 1] + meta_probs = self.meta_learner.predict_proba(meta_df_np)[:, 1] # type: ignore else: - meta_probs = self.meta_learner.predict_proba(meta_df) + meta_probs = self.meta_learner.predict_proba(meta_df) # type: ignore # Handle both 1D and 2D probability arrays if len(meta_probs.shape) == 1: # Already 1D probabilities @@ -417,26 +530,26 @@ def predict_proba(self, X) -> np.ndarray: # Extract positive class probabilities from 2D array return meta_probs[:, 1] except Exception as e: - self.logger.error(f"Error predicting probabilities: {e}", exc_info=True) + self.logger.error(f"Error predicting probabilities: {e}") raise def 
predict(self, X) -> np.ndarray: probabilities = self.predict_proba(X) return (probabilities >= self.optimal_threshold).astype(int) - def explain_predictions(self, X_val) -> dict: + def explain_predictions(self, x_val) -> dict: # This might need updating if explain_predictions relies on specific base model types - return explain_predictions(self, X_val, self.logger) + return explain_predictions(self, x_val, self.logger) - def analyze_prediction_errors(self, X_val, y_val) -> dict: - return analyze_prediction_errors(self, X_val, y_val, self.optimal_threshold, self.logger) + def analyze_prediction_errors(self, x_val, y_val) -> dict: + return analyze_prediction_errors(self, x_val, y_val, self.optimal_threshold, self.logger) - def precision_filter(self, X, probabilities): + def precision_filter(self, x, probabilities): # This logic might be too specific, consider making it more general or removing high_conf = probabilities > self.optimal_threshold - X_high_conf = X[high_conf] - if "home_form" in X_high_conf.columns and "away_form" in X_high_conf.columns: - form_diff = abs(X_high_conf["home_form"] - X_high_conf["away_form"]) + x_high_conf = x[high_conf] + if "home_form" in x_high_conf.columns and "away_form" in x_high_conf.columns: + form_diff = abs(x_high_conf["home_form"] - x_high_conf["away_form"]) likely_not_draw = form_diff > 0.5 high_conf[high_conf] = ~likely_not_draw return high_conf @@ -453,291 +566,170 @@ def get_model_params(self, model): except Exception as e: return {"error": str(e)} - def load_models_from_mlflow( - self, - xgb_path="model", - lgb_path="model", - tabnet_path="model_sklearn", - extra_path="model", # Path for Extra Trees model - mlp_path="model", # Artifact path for MLP model - mlp_scaler_path="scaler/scaler_mlp.pkl", # Artifact path for MLP scaler - pytorch_path="model", - pytorch_scaler_path="scaler/scaler_pytorch.pkl", - svm_path="model_svm", # Artifact path for SVM model (from svm_model.py) - svm_scaler_path="scaler_svm.pkl", # Artifact path 
for SVM scaler (default name in svm_model.py) - fnn_path="model", - fnn_scaler_path="scaler/scaler_pytorch.pkl", - ): - """ - Load pre-trained models (XGB, LGBM, TabNet, Extra Trees, MLP, PyTorch, SVM) - and scalers from MLflow. - Updates feature signatures for each model. - """ - self.logger.info("Loading models and scaler from MLflow repository...") - - # Load XGBoost model + def _load_sklearn_model_with_features(self, run_id, model_path, model_name, flavor="sklearn"): + """Helper method to load sklearn-based models and extract features.""" try: - self.logger.info(f"Loading XGBoost model from run {self.xgb_run_id}...") - xgb_uri = f"runs:/{self.xgb_run_id}/{xgb_path}" - self.model_xgb = mlflow.xgboost.load_model(xgb_uri) - xgb_pyfunc = mlflow.pyfunc.load_model(xgb_uri) - if xgb_pyfunc.metadata.signature and xgb_pyfunc.metadata.signature.inputs: - self.xgb_features = xgb_pyfunc.metadata.signature.inputs.input_names() - self.logger.info( - f"Updated XGBoost feature signature: {len(self.xgb_features)} features" - ) - else: - self.logger.warning("No feature signature found for XGBoost model") - self.xgb_features = getattr(self.model_xgb, "feature_names_in_", []) - except Exception as e: - self.logger.error(f"Failed to load XGBoost model: {str(e)}") - raise ValueError(f"Failed to load XGBoost model: {str(e)}") from e + self.logger.info(f"Loading {model_name} model from run {run_id}...") + uri = f"runs:/{run_id}/{model_path}" - # Load LightGBM model - try: - self.logger.info(f"Loading LightGBM model from run {self.lgb_run_id}...") - lgb_uri = f"runs:/{self.lgb_run_id}/{lgb_path}" - self.model_lgb = mlflow.lightgbm.load_model(lgb_uri) - lgb_pyfunc = mlflow.pyfunc.load_model(lgb_uri) - if lgb_pyfunc.metadata.signature and lgb_pyfunc.metadata.signature.inputs: - self.lgb_features = lgb_pyfunc.metadata.signature.inputs.input_names() - self.logger.info( - f"Updated LightGBM feature signature: {len(self.lgb_features)} features" - ) + if flavor == "xgboost": + model = 
mlflow.xgboost.load_model(uri) + elif flavor == "lightgbm": + model = mlflow.lightgbm.load_model(uri) else: - self.logger.warning("No feature signature found for LightGBM model") - self.lgb_features = getattr(self.model_lgb, "feature_name_", []) - except Exception as e: - self.logger.error(f"Failed to load LightGBM model: {str(e)}") - raise ValueError(f"Failed to load LightGBM model: {str(e)}") from e + model = mlflow.sklearn.load_model(uri) - # Load TabNet model - try: - self.logger.info(f"Loading TabNet model from run {self.tabnet_run_id}...") - tabnet_uri = f"runs:/{self.tabnet_run_id}/{tabnet_path}" - self.model_tabnet = mlflow.sklearn.load_model( - tabnet_uri - ) # Assuming saved via sklearn flavor - tabnet_pyfunc = mlflow.pyfunc.load_model(tabnet_uri) - if tabnet_pyfunc.metadata.signature and tabnet_pyfunc.metadata.signature.inputs: - self.tabnet_features = tabnet_pyfunc.metadata.signature.inputs.input_names() + pyfunc = mlflow.pyfunc.load_model(uri) + features = [] + if pyfunc.metadata.signature and pyfunc.metadata.signature.inputs: + features = pyfunc.metadata.signature.inputs.input_names() self.logger.info( - f"Updated TabNet feature signature: {len(self.tabnet_features)} features" + f"Updated {model_name} feature signature: {len(features)} features" ) else: - self.logger.warning("No feature signature found for TabNet model") - self.tabnet_features = getattr(self.model_tabnet, "feature_names_in_", []) - except Exception as e: - self.logger.error(f"Failed to load TabNet model: {str(e)}") - raise ValueError(f"Failed to load TabNet model: {str(e)}") from e + self.logger.warning(f"No feature signature found for {model_name} model") + if flavor == "lightgbm": + features = getattr(model, "feature_name_", []) + else: + features = getattr(model, "feature_names_in_", []) - # Load Random Forest model - try: - self.logger.info(f"Loading Extra Trees model from run {self.extra_run_id}...") - extra_uri = f"runs:/{self.extra_run_id}/{extra_path}" - self.model_extra = 
mlflow.sklearn.load_model(extra_uri) - extra_pyfunc = mlflow.pyfunc.load_model(extra_uri) - if extra_pyfunc.metadata.signature and extra_pyfunc.metadata.signature.inputs: - self.extra_features = extra_pyfunc.metadata.signature.inputs.input_names() - self.logger.info( - f"Updated Extra Trees signature: {len(self.extra_features)} features" - ) - else: - self.logger.warning("No feature signature found for Extra Trees model") - self.extra_features = getattr(self.model_extra, "feature_names_in_", []) + return model, features except Exception as e: - self.logger.error(f"Failed to load Extra Trees model: {str(e)}") - raise ValueError(f"Failed to load Extra Trees model: {str(e)}") from e + self.logger.error(f"Failed to load {model_name} model: {str(e)}") + raise ValueError(f"Failed to load {model_name} model: {str(e)}") from e - # Load MLP (sklearn) model + def _load_sklearn_model_with_scaler(self, run_id, model_path, scaler_path, model_name): + """Helper method to load sklearn models with scalers.""" try: - self.logger.info(f"Loading MLP model from run {self.mlp_run_id}...") - mlp_uri = f"runs:/{self.mlp_run_id}/{mlp_path}" - self.model_mlp = mlflow.sklearn.load_model(mlp_uri) - mlp_pyfunc = mlflow.pyfunc.load_model(mlp_uri) - if mlp_pyfunc.metadata.signature and mlp_pyfunc.metadata.signature.inputs: - self.mlp_features = mlp_pyfunc.metadata.signature.inputs.input_names() + self.logger.info(f"Loading {model_name} model from run {run_id}...") + model_uri = f"runs:/{run_id}/{model_path}" + model = mlflow.sklearn.load_model(model_uri) + + pyfunc = mlflow.pyfunc.load_model(model_uri) + features = [] + if pyfunc.metadata.signature and pyfunc.metadata.signature.inputs: + features = pyfunc.metadata.signature.inputs.input_names() self.logger.info( - f"Updated MLP feature signature: {len(self.mlp_features)} features" + f"Updated {model_name} feature signature: {len(features)} features" ) else: - self.logger.warning("No feature signature found for MLP model") - # Attempt to get 
features from the underlying sklearn model if possible - self.mlp_features = getattr(self.model_mlp, "feature_names_in_", []) + self.logger.warning(f"No feature signature found for {model_name} model") + features = getattr(model, "feature_names_in_", []) - # Load the associated MLP scaler + # Load the associated scaler self.logger.info( - f"Loading MLP scaler artifact '{mlp_scaler_path}' from run {self.mlp_run_id}..." + f"Loading {model_name} scaler artifact '{scaler_path}' from run {run_id}..." ) scaler_local_path = mlflow.artifacts.download_artifacts( - run_id=self.mlp_run_id, artifact_path=mlp_scaler_path + run_id=run_id, artifact_path=scaler_path ) with open(scaler_local_path, "rb") as f: - self.model_mlp_scaler = pickle.load(f) - self.logger.info("MLP scaler loaded successfully.") - if hasattr(self.model_mlp, 'scaler_') and hasattr(self.model_mlp, 'device_'): - try: - # Determine device (use CUDA if available, same logic as hypertuner) - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model_mlp.scaler_ = self.model_mlp_scaler - self.model_mlp.device_ = pytorch_device - self.model_mlp.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded MLP model.") - except Exception as attach_e: - self.logger.warning(f"Could not attach scaler/device to MLP model: {attach_e}") - else: - self.logger.warning("Loaded MLP model does not have scaler_/device_ attributes for attachment.") - # Ensure model is moved to the correct device anyway - try: - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model_mlp.to(pytorch_device) - self.logger.info(f"Moved loaded MLP model to device: {pytorch_device}") - except Exception as move_e: - self.logger.error(f"Could not move MLP model to device: {move_e}") + scaler = pickle.load(f) + self.logger.info(f"{model_name} scaler loaded successfully.") + + return model, scaler, features except 
Exception as e: - self.logger.error(f"Failed to load MLP model or scaler: {str(e)}") - raise ValueError(f"Failed to load MLP model or scaler: {str(e)}") from e - - # Load PyTorch model + self.logger.error(f"Failed to load {model_name} model or scaler: {str(e)}") + raise ValueError(f"Failed to load {model_name} model or scaler: {str(e)}") from e + + def _load_pytorch_model_with_scaler(self, run_id, model_path, scaler_path, model_name): + """Helper method to load PyTorch models and their scalers.""" try: - # Define artifact paths for PyTorch model and its scaler - pytorch_model_path = "model" # Assuming artifact path is 'model' - pytorch_scaler_path = "scaler/scaler_pytorch.pkl" # Assuming scaler saved in 'scaler' dir - - self.logger.info(f"Loading PyTorch model from run {self.pytorch_run_id}...") - pytorch_uri = f"runs:/{self.pytorch_run_id}/{pytorch_model_path}" - self.model_pytorch = mlflow.pytorch.load_model(pytorch_uri) - + self.logger.info(f"Loading {model_name} model from run {run_id}...") + model_uri = f"runs:/{run_id}/{model_path}" + model = mlflow.pytorch.load_model(model_uri) + # Load PyTorch model also as pyfunc to easily get signature - pytorch_pyfunc = mlflow.pyfunc.load_model(pytorch_uri) - if pytorch_pyfunc.metadata.signature and pytorch_pyfunc.metadata.signature.inputs: - self.pytorch_features = pytorch_pyfunc.metadata.signature.inputs.input_names() + pyfunc = mlflow.pyfunc.load_model(model_uri) + features = [] + if pyfunc.metadata.signature and pyfunc.metadata.signature.inputs: + features = pyfunc.metadata.signature.inputs.input_names() self.logger.info( - f"Updated PyTorch feature signature: {len(self.pytorch_features)} features" + f"Updated {model_name} feature signature: {len(features)} features" ) else: - self.logger.warning("No feature signature found for PyTorch model.") - self.pytorch_features = [] + self.logger.warning(f"No feature signature found for {model_name} model.") + features = [] - # Load the associated PyTorch scaler + # Load the 
associated scaler self.logger.info( - f"Loading PyTorch scaler artifact '{pytorch_scaler_path}' from run {self.pytorch_run_id}..." + f"Loading {model_name} scaler artifact '{scaler_path}' from run {run_id}..." ) scaler_local_path = mlflow.artifacts.download_artifacts( - run_id=self.pytorch_run_id, artifact_path=pytorch_scaler_path + run_id=run_id, artifact_path=scaler_path ) with open(scaler_local_path, "rb") as f: - self.model_pytorch_scaler = pickle.load(f) - self.logger.info("PyTorch scaler loaded successfully.") - - # Optional: Attach scaler and device to the loaded PyTorch model instance - # if its predict_proba method relies on them being attributes (like in the hypertuner) - if hasattr(self.model_pytorch, 'scaler_') and hasattr(self.model_pytorch, 'device_'): - try: - # Determine device (use CUDA if available, same logic as hypertuner) - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model_pytorch.scaler_ = self.model_pytorch_scaler - self.model_pytorch.device_ = pytorch_device - self.model_pytorch.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded PyTorch model.") - except Exception as attach_e: - self.logger.warning(f"Could not attach scaler/device to PyTorch model: {attach_e}") - else: - self.logger.warning("Loaded PyTorch model does not have scaler_/device_ attributes for attachment.") - # Ensure model is moved to the correct device anyway - try: - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model_pytorch.to(pytorch_device) - self.logger.info(f"Moved loaded PyTorch model to device: {pytorch_device}") - except Exception as move_e: - self.logger.error(f"Could not move PyTorch model to device: {move_e}") + scaler = pickle.load(f) + self.logger.info(f"{model_name} scaler loaded successfully.") + + return model, scaler, features except Exception as e: - self.logger.error(f"Failed to load PyTorch 
model or scaler: {str(e)}") - raise ValueError(f"Failed to load PyTorch model or scaler: {str(e)}") from e + self.logger.error(f"Failed to load {model_name} model or scaler: {str(e)}") + raise ValueError(f"Failed to load {model_name} model or scaler: {str(e)}") from e - # Load SVM (sklearn) model - try: - self.logger.info(f"Loading SVM model from run {self.svm_run_id}...") - svm_uri = f"runs:/{self.svm_run_id}/{svm_path}" - self.model_svm = mlflow.sklearn.load_model(svm_uri) - svm_pyfunc = mlflow.pyfunc.load_model(svm_uri) - if svm_pyfunc.metadata.signature and svm_pyfunc.metadata.signature.inputs: - self.svm_features = svm_pyfunc.metadata.signature.inputs.input_names() - self.logger.info( - f"Updated SVM feature signature: {len(self.svm_features)} features" - ) - else: - self.logger.warning("No feature signature found for SVM model") - self.svm_features = getattr(self.model_svm, "feature_names_in_", []) + def load_models_from_mlflow( + self, + xgb_path="model", + lgb_path="model", + tabnet_path="model_sklearn", + extra_path="model", # Path for Extra Trees model + mlp_path="model", # Artifact path for MLP model + mlp_scaler_path="scaler/scaler_mlp.pkl", # Artifact path for MLP scaler + pytorch_path="model", + pytorch_scaler_path=PYTORCH_SCALER_PATH, + svm_path="model_svm", # Artifact path for SVM model (from svm_model.py) + svm_scaler_path="scaler_svm.pkl", # Artifact path for SVM scaler (default name in svm_model.py) + fnn_path="model", + fnn_scaler_path=PYTORCH_SCALER_PATH, + ): + """ + Load pre-trained models (XGB, LGBM, TabNet, Extra Trees, MLP, PyTorch, SVM) + and scalers from MLflow. + Updates feature signatures for each model. + """ + self.logger.info("Loading models and scaler from MLflow repository...") - # Load the associated SVM scaler - self.logger.info( - f"Loading SVM scaler artifact '{svm_scaler_path}' from run {self.svm_run_id}..." 
- ) - scaler_local_path = mlflow.artifacts.download_artifacts( - run_id=self.svm_run_id, artifact_path=svm_scaler_path - ) - with open(scaler_local_path, "rb") as f: - self.model_svm_scaler = pickle.load(f) - self.logger.info("SVM scaler loaded successfully.") - # Verify scaler type (optional) - if not isinstance(self.model_svm_scaler, (StandardScaler, sklearn.preprocessing.RobustScaler)): # Add other expected scaler types if needed - self.logger.warning(f"Loaded SVM scaler is of unexpected type: {type(self.model_svm_scaler).__name__}") - except Exception as e: - self.logger.error(f"Failed to load SVM model or scaler: {str(e)}") - raise ValueError(f"Failed to load SVM model or scaler: {str(e)}") from e + # Load XGBoost model + self.model_xgb, self.xgb_features = self._load_sklearn_model_with_features( + self.xgb_run_id, xgb_path, "XGBoost", "xgboost" + ) - # Load FNN model - try: - self.logger.info(f"Loading FNN model from run {self.fnn_run_id}...") - fnn_uri = f"runs:/{self.fnn_run_id}/{fnn_path}" - self.model_fnn = mlflow.pytorch.load_model(fnn_uri) - - # Load PyTorch model also as pyfunc to easily get signature - fnn_pyfunc = mlflow.pyfunc.load_model(fnn_uri) - if fnn_pyfunc.metadata.signature and fnn_pyfunc.metadata.signature.inputs: - self.fnn_features = fnn_pyfunc.metadata.signature.inputs.input_names() - self.logger.info( - f"Updated FNN feature signature: {len(self.fnn_features)} features" - ) - else: - self.logger.warning("No feature signature found for FNN model.") - self.fnn_features = [] + # Load LightGBM model + self.model_lgb, self.lgb_features = self._load_sklearn_model_with_features( + self.lgb_run_id, lgb_path, "LightGBM", "lightgbm" + ) - # Load the associated PyTorch scaler - self.logger.info( - f"Loading FNN scaler artifact '{fnn_scaler_path}' from run {self.fnn_run_id}..." 
- ) - scaler_local_path = mlflow.artifacts.download_artifacts( - run_id=self.fnn_run_id, artifact_path=fnn_scaler_path + # Load TabNet model + self.model_tabnet, self.tabnet_features = self._load_sklearn_model_with_features( + self.tabnet_run_id, tabnet_path, "TabNet", "sklearn" + ) + + # Load Random Forest model + self.model_extra, self.extra_features = self._load_sklearn_model_with_features( + self.extra_run_id, extra_path, "Extra Trees", "sklearn" + ) + + # Load MLP model and scaler + self.model_mlp, self.model_mlp_scaler, self.mlp_features = ( + self._load_sklearn_model_with_scaler(self.mlp_run_id, mlp_path, mlp_scaler_path, "MLP") + ) + + # Load PyTorch model and scaler + self.model_pytorch, self.model_pytorch_scaler, self.pytorch_features = ( + self._load_pytorch_model_with_scaler( + self.pytorch_run_id, pytorch_path, pytorch_scaler_path, "PyTorch" ) - with open(scaler_local_path, "rb") as f: - self.model_fnn_scaler = pickle.load(f) - self.logger.info("FNN scaler loaded successfully.") - - # Optional: Attach scaler and device to the loaded PyTorch model instance - if hasattr(self.model_pytorch, 'scaler_') and hasattr(self.model_pytorch, 'device_'): - try: - # Determine device (use CUDA if available, same logic as hypertuner) - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model_fnn.scaler_ = self.model_fnn_scaler - self.model_fnn.device_ = pytorch_device - self.model_fnn.to(pytorch_device) # Ensure model is on the correct device - self.logger.info(f"Attached scaler and device ({pytorch_device}) to loaded FNN model.") - except Exception as attach_e: - self.logger.warning(f"Could not attach scaler/device to FNN model: {attach_e}") - else: - self.logger.warning("Loaded FNN model does not have scaler_/device_ attributes for attachment.") - # Ensure model is moved to the correct device anyway - try: - pytorch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.model_fnn.to(pytorch_device) - 
self.logger.info(f"Moved loaded FNN model to device: {pytorch_device}") - except Exception as move_e: - self.logger.error(f"Could not move FNN model to device: {move_e}") - except Exception as e: - self.logger.error(f"Failed to load FNN model or scaler: {str(e)}") - raise ValueError(f"Failed to load FNN model or scaler: {str(e)}") from e + ) + + # Load SVM model and scaler + self.model_svm, self.model_svm_scaler, self.svm_features = ( + self._load_sklearn_model_with_scaler(self.svm_run_id, svm_path, svm_scaler_path, "SVM") + ) + + # Load FNN model and scaler + self.model_fnn, self.model_fnn_scaler, self.fnn_features = ( + self._load_pytorch_model_with_scaler(self.fnn_run_id, fnn_path, fnn_scaler_path, "FNN") + ) self.logger.info("Base models loading complete.") # Consider setting self.selected_features based on intersection or a specific model diff --git a/src/models/ensemble/meta_features_0410.py b/src/models/ensemble/meta_features_0410.py index d5fe879..12debac 100644 --- a/src/models/ensemble/meta_features_0410.py +++ b/src/models/ensemble/meta_features_0410.py @@ -46,7 +46,7 @@ def create_meta_features_optimized( default_weight = 1.0 / num_models # Basic probabilities - meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 6 + meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 6 # Add weighted average if dynamic_weights: @@ -56,11 +56,11 @@ def create_meta_features_optimized( + dynamic_weights.get("lgb", default_weight) * p_lgb + dynamic_weights.get("extra", default_weight) * p_extra + dynamic_weights.get("mlp", default_weight) * p_mlp - + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch + + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch ) else: # Simple average - weighted_avg = np.mean(all_preds, axis=0) # Simpler way to average + weighted_avg = np.mean(all_preds, axis=0) # Simpler way to average # Add to meta-features meta_features = 
np.column_stack([meta_features, weighted_avg.reshape(-1, 1)]) @@ -72,7 +72,7 @@ def create_meta_features_optimized( diff_lgb_pytorch = np.abs(p_lgb - p_pytorch) diff_extra_pytorch = np.abs(p_extra - p_pytorch) diff_mlp_pytorch = np.abs(p_mlp - p_pytorch) - + # Existing differences diff_xgb_tabnet = np.abs(p_xgb - p_tabnet) diff_xgb_lgb = np.abs(p_xgb - p_lgb) @@ -87,17 +87,27 @@ def create_meta_features_optimized( # Stack all differences (original 10 + new 5 = 15) all_diffs = [ - diff_xgb_tabnet, diff_xgb_lgb, diff_xgb_extra, diff_xgb_mlp, diff_xgb_pytorch, # xgb vs others - diff_tabnet_lgb, diff_tabnet_extra, diff_tabnet_mlp, diff_tabnet_pytorch, # tabnet vs others - diff_lgb_extra, diff_lgb_mlp, diff_lgb_pytorch, # lgb vs others - diff_extra_mlp, diff_extra_pytorch, # extra vs others - diff_mlp_pytorch # mlp vs pytorch + diff_xgb_tabnet, + diff_xgb_lgb, + diff_xgb_extra, + diff_xgb_mlp, + diff_xgb_pytorch, # xgb vs others + diff_tabnet_lgb, + diff_tabnet_extra, + diff_tabnet_mlp, + diff_tabnet_pytorch, # tabnet vs others + diff_lgb_extra, + diff_lgb_mlp, + diff_lgb_pytorch, # lgb vs others + diff_extra_mlp, + diff_extra_pytorch, # extra vs others + diff_mlp_pytorch, # mlp vs pytorch ] meta_features = np.column_stack([meta_features] + [d.reshape(-1, 1) for d in all_diffs]) # Add max and min probabilities - max_prob = np.maximum.reduce(all_preds) # Use all_preds list - min_prob = np.minimum.reduce(all_preds) # Use all_preds list + max_prob = np.maximum.reduce(all_preds) # Use all_preds list + min_prob = np.minimum.reduce(all_preds) # Use all_preds list range_prob = max_prob - min_prob meta_features = np.column_stack( @@ -106,9 +116,9 @@ def create_meta_features_optimized( # Add rank features # Use all_preds list for simpler ranking - probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) + probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) ranks = np.argsort(np.argsort(probs_stacked, axis=1), axis=1) - meta_features = 
np.column_stack([meta_features, ranks]) # Adds num_models columns + meta_features = np.column_stack([meta_features, ranks]) # Adds num_models columns # Add agreement features default_threshold = 0.5 @@ -118,16 +128,18 @@ def create_meta_features_optimized( thresholds.get("lgb", default_threshold) if thresholds else default_threshold, thresholds.get("extra", default_threshold) if thresholds else default_threshold, thresholds.get("mlp", default_threshold) if thresholds else default_threshold, - thresholds.get("pytorch", default_threshold) if thresholds else default_threshold, # Added PyTorch + thresholds.get("pytorch", default_threshold) + if thresholds + else default_threshold, # Added PyTorch ] - - votes = np.column_stack([ - (pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds) - ]) + + votes = np.column_stack( + [(pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds)] + ) vote_sum = np.sum(votes, axis=1) vote_agreement = np.where( - (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (6) votes + (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (6) votes 1, 0, ) @@ -153,20 +165,37 @@ def create_meta_dataframe(meta_features: np.ndarray) -> pd.DataFrame: num_features = meta_features.shape[1] # Define column names based on the expected structure (33 features) col_names = ( - ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch"] + # 6 base - ["weighted_avg"] + # 1 avg - [ - "diff_xgb_tabnet", "diff_xgb_lgb", "diff_xgb_extra", "diff_xgb_mlp", "diff_xgb_pytorch", - "diff_tabnet_lgb", "diff_tabnet_extra", "diff_tabnet_mlp", "diff_tabnet_pytorch", - "diff_lgb_extra", "diff_lgb_mlp", "diff_lgb_pytorch", - "diff_extra_mlp", "diff_extra_pytorch", - "diff_mlp_pytorch" - ] + # 15 diffs - ["max_prob", "min_prob", "range_prob"] + # 3 range - ["rank_xgb", "rank_tabnet", "rank_lgb", "rank_extra", "rank_mlp", "rank_pytorch"] + # 6 ranks - ["vote_sum", "vote_agreement"] # 2 votes 
+ ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch"] # 6 base + + ["weighted_avg"] # 1 avg + + [ + "diff_xgb_tabnet", + "diff_xgb_lgb", + "diff_xgb_extra", + "diff_xgb_mlp", + "diff_xgb_pytorch", + "diff_tabnet_lgb", + "diff_tabnet_extra", + "diff_tabnet_mlp", + "diff_tabnet_pytorch", + "diff_lgb_extra", + "diff_lgb_mlp", + "diff_lgb_pytorch", + "diff_extra_mlp", + "diff_extra_pytorch", + "diff_mlp_pytorch", + ] # 15 diffs + + ["max_prob", "min_prob", "range_prob"] # 3 range + + [ + "rank_xgb", + "rank_tabnet", + "rank_lgb", + "rank_extra", + "rank_mlp", + "rank_pytorch", + ] # 6 ranks + + ["vote_sum", "vote_agreement"] # 2 votes ) - + if num_features != len(col_names): # Fallback if the number of features doesn't match the expected 33 # This might happen if the assert in the creation function is commented out and logic changes diff --git a/src/models/ensemble/meta_features_0412.py b/src/models/ensemble/meta_features_0412.py index 9993515..3e2405c 100644 --- a/src/models/ensemble/meta_features_0412.py +++ b/src/models/ensemble/meta_features_0412.py @@ -48,7 +48,7 @@ def create_meta_features_optimized( default_weight = 1.0 / num_models # Basic probabilities - meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 7 + meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 7 # Add weighted average if dynamic_weights: @@ -58,12 +58,12 @@ def create_meta_features_optimized( + dynamic_weights.get("lgb", default_weight) * p_lgb + dynamic_weights.get("extra", default_weight) * p_extra + dynamic_weights.get("mlp", default_weight) * p_mlp - + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch - + dynamic_weights.get("svm", default_weight) * p_svm # Added SVM + + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch + + dynamic_weights.get("svm", default_weight) * p_svm # Added SVM ) else: # Simple average - weighted_avg = np.mean(all_preds, axis=0) # Simpler way 
to average + weighted_avg = np.mean(all_preds, axis=0) # Simpler way to average # Add to meta-features meta_features = np.column_stack([meta_features, weighted_avg.reshape(-1, 1)]) @@ -82,7 +82,7 @@ def create_meta_features_optimized( diff_extra_svm = np.abs(p_extra - p_svm) diff_mlp_svm = np.abs(p_mlp - p_svm) diff_pytorch_svm = np.abs(p_pytorch - p_svm) - + # Existing differences diff_xgb_tabnet = np.abs(p_xgb - p_tabnet) diff_xgb_lgb = np.abs(p_xgb - p_lgb) @@ -97,18 +97,33 @@ def create_meta_features_optimized( # Stack all differences (original 15 + new 6 = 21) all_diffs = [ - diff_xgb_tabnet, diff_xgb_lgb, diff_xgb_extra, diff_xgb_mlp, diff_xgb_pytorch, diff_xgb_svm, # xgb vs others - diff_tabnet_lgb, diff_tabnet_extra, diff_tabnet_mlp, diff_tabnet_pytorch, diff_tabnet_svm, # tabnet vs others - diff_lgb_extra, diff_lgb_mlp, diff_lgb_pytorch, diff_lgb_svm, # lgb vs others - diff_extra_mlp, diff_extra_pytorch, diff_extra_svm, # extra vs others - diff_mlp_pytorch, diff_mlp_svm, # mlp vs others - diff_pytorch_svm # pytorch vs svm + diff_xgb_tabnet, + diff_xgb_lgb, + diff_xgb_extra, + diff_xgb_mlp, + diff_xgb_pytorch, + diff_xgb_svm, # xgb vs others + diff_tabnet_lgb, + diff_tabnet_extra, + diff_tabnet_mlp, + diff_tabnet_pytorch, + diff_tabnet_svm, # tabnet vs others + diff_lgb_extra, + diff_lgb_mlp, + diff_lgb_pytorch, + diff_lgb_svm, # lgb vs others + diff_extra_mlp, + diff_extra_pytorch, + diff_extra_svm, # extra vs others + diff_mlp_pytorch, + diff_mlp_svm, # mlp vs others + diff_pytorch_svm, # pytorch vs svm ] meta_features = np.column_stack([meta_features] + [d.reshape(-1, 1) for d in all_diffs]) # Add max and min probabilities - max_prob = np.maximum.reduce(all_preds) # Use all_preds list - min_prob = np.minimum.reduce(all_preds) # Use all_preds list + max_prob = np.maximum.reduce(all_preds) # Use all_preds list + min_prob = np.minimum.reduce(all_preds) # Use all_preds list range_prob = max_prob - min_prob meta_features = np.column_stack( @@ -117,9 +132,9 @@ 
def create_meta_features_optimized( # Add rank features # Use all_preds list for simpler ranking - probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) + probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) ranks = np.argsort(np.argsort(probs_stacked, axis=1), axis=1) - meta_features = np.column_stack([meta_features, ranks]) # Adds num_models columns + meta_features = np.column_stack([meta_features, ranks]) # Adds num_models columns # Add agreement features default_threshold = 0.5 @@ -129,17 +144,19 @@ def create_meta_features_optimized( thresholds.get("lgb", default_threshold) if thresholds else default_threshold, thresholds.get("extra", default_threshold) if thresholds else default_threshold, thresholds.get("mlp", default_threshold) if thresholds else default_threshold, - thresholds.get("pytorch", default_threshold) if thresholds else default_threshold, # Added PyTorch - thresholds.get("svm", default_threshold) if thresholds else default_threshold, # Added SVM + thresholds.get("pytorch", default_threshold) + if thresholds + else default_threshold, # Added PyTorch + thresholds.get("svm", default_threshold) if thresholds else default_threshold, # Added SVM ] - - votes = np.column_stack([ - (pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds) - ]) + + votes = np.column_stack( + [(pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds)] + ) vote_sum = np.sum(votes, axis=1) vote_agreement = np.where( - (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (7) votes + (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (7) votes 1, 0, ) @@ -167,20 +184,44 @@ def create_meta_dataframe(meta_features: np.ndarray) -> pd.DataFrame: # 7 base + 1 avg + 21 diffs + 3 range + 7 ranks + 2 votes = 41 ? 
Mistake somewhere # Re-count: 7 base + 1 avg + 21 diffs + 3 range + 7 ranks + 2 votes = 41 features col_names = ( - ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch", "p_svm"] + # 7 base - ["weighted_avg"] + # 1 avg - [ "diff_xgb_tabnet", "diff_xgb_lgb", "diff_xgb_extra", "diff_xgb_mlp", "diff_xgb_pytorch", "diff_xgb_svm", - "diff_tabnet_lgb", "diff_tabnet_extra", "diff_tabnet_mlp", "diff_tabnet_pytorch", "diff_tabnet_svm", - "diff_lgb_extra", "diff_lgb_mlp", "diff_lgb_pytorch", "diff_lgb_svm", - "diff_extra_mlp", "diff_extra_pytorch", "diff_extra_svm", - "diff_mlp_pytorch", "diff_mlp_svm", - "diff_pytorch_svm" - ] + # 21 diffs - ["max_prob", "min_prob", "range_prob"] + # 3 range - ["rank_xgb", "rank_tabnet", "rank_lgb", "rank_extra", "rank_mlp", "rank_pytorch", "rank_svm"] + # 7 ranks - ["vote_sum", "vote_agreement"] # 2 votes + ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch", "p_svm"] # 7 base + + ["weighted_avg"] # 1 avg + + [ + "diff_xgb_tabnet", + "diff_xgb_lgb", + "diff_xgb_extra", + "diff_xgb_mlp", + "diff_xgb_pytorch", + "diff_xgb_svm", + "diff_tabnet_lgb", + "diff_tabnet_extra", + "diff_tabnet_mlp", + "diff_tabnet_pytorch", + "diff_tabnet_svm", + "diff_lgb_extra", + "diff_lgb_mlp", + "diff_lgb_pytorch", + "diff_lgb_svm", + "diff_extra_mlp", + "diff_extra_pytorch", + "diff_extra_svm", + "diff_mlp_pytorch", + "diff_mlp_svm", + "diff_pytorch_svm", + ] # 21 diffs + + ["max_prob", "min_prob", "range_prob"] # 3 range + + [ + "rank_xgb", + "rank_tabnet", + "rank_lgb", + "rank_extra", + "rank_mlp", + "rank_pytorch", + "rank_svm", + ] # 7 ranks + + ["vote_sum", "vote_agreement"] # 2 votes ) - + if num_features != len(col_names): # Fallback if the number of features doesn't match the expected 41 col_names = [f"meta_{i}" for i in range(num_features)] diff --git a/src/models/ensemble/meta_features_0414.py b/src/models/ensemble/meta_features_0414.py index 312b82c..88d5ced 100644 --- a/src/models/ensemble/meta_features_0414.py +++ 
b/src/models/ensemble/meta_features_0414.py @@ -50,7 +50,7 @@ def create_meta_features_optimized( default_weight = 1.0 / num_models # Basic probabilities - meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 7 + meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 7 # Add weighted average if dynamic_weights: @@ -60,13 +60,13 @@ def create_meta_features_optimized( + dynamic_weights.get("lgb", default_weight) * p_lgb + dynamic_weights.get("extra", default_weight) * p_extra + dynamic_weights.get("mlp", default_weight) * p_mlp - + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch - + dynamic_weights.get("svm", default_weight) * p_svm # Added SVM - + dynamic_weights.get("fnn", default_weight) * p_fnn # Added FNN + + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch + + dynamic_weights.get("svm", default_weight) * p_svm # Added SVM + + dynamic_weights.get("fnn", default_weight) * p_fnn # Added FNN ) else: # Simple average - weighted_avg = np.mean(all_preds, axis=0) # Simpler way to average + weighted_avg = np.mean(all_preds, axis=0) # Simpler way to average # Add to meta-features meta_features = np.column_stack([meta_features, weighted_avg.reshape(-1, 1)]) @@ -85,7 +85,7 @@ def create_meta_features_optimized( diff_extra_svm = np.abs(p_extra - p_svm) diff_mlp_svm = np.abs(p_mlp - p_svm) diff_pytorch_svm = np.abs(p_pytorch - p_svm) - + # Existing differences diff_xgb_tabnet = np.abs(p_xgb - p_tabnet) diff_xgb_lgb = np.abs(p_xgb - p_lgb) @@ -100,21 +100,37 @@ def create_meta_features_optimized( diff_fnn_pytorch = np.abs(p_fnn - p_pytorch) diff_fnn_svm = np.abs(p_fnn - p_svm) - # Stack all differences (original 15 + new 6 = 21) all_diffs = [ - diff_xgb_tabnet, diff_xgb_lgb, diff_xgb_extra, diff_xgb_mlp, diff_xgb_pytorch, diff_xgb_svm, # xgb vs others - diff_tabnet_lgb, diff_tabnet_extra, diff_tabnet_mlp, diff_tabnet_pytorch, diff_tabnet_svm, # tabnet vs others - 
diff_lgb_extra, diff_lgb_mlp, diff_lgb_pytorch, diff_lgb_svm, # lgb vs others - diff_extra_mlp, diff_extra_pytorch, diff_extra_svm, # extra vs others - diff_mlp_pytorch, diff_mlp_svm, # mlp vs others - diff_pytorch_svm, diff_fnn_pytorch, diff_fnn_svm, # pytorch vs svm + diff_xgb_tabnet, + diff_xgb_lgb, + diff_xgb_extra, + diff_xgb_mlp, + diff_xgb_pytorch, + diff_xgb_svm, # xgb vs others + diff_tabnet_lgb, + diff_tabnet_extra, + diff_tabnet_mlp, + diff_tabnet_pytorch, + diff_tabnet_svm, # tabnet vs others + diff_lgb_extra, + diff_lgb_mlp, + diff_lgb_pytorch, + diff_lgb_svm, # lgb vs others + diff_extra_mlp, + diff_extra_pytorch, + diff_extra_svm, # extra vs others + diff_mlp_pytorch, + diff_mlp_svm, # mlp vs others + diff_pytorch_svm, + diff_fnn_pytorch, + diff_fnn_svm, # pytorch vs svm ] meta_features = np.column_stack([meta_features] + [d.reshape(-1, 1) for d in all_diffs]) # Add max and min probabilities - max_prob = np.maximum.reduce(all_preds) # Use all_preds list - min_prob = np.minimum.reduce(all_preds) # Use all_preds list + max_prob = np.maximum.reduce(all_preds) # Use all_preds list + min_prob = np.minimum.reduce(all_preds) # Use all_preds list range_prob = max_prob - min_prob meta_features = np.column_stack( @@ -123,9 +139,9 @@ def create_meta_features_optimized( # Add rank features # Use all_preds list for simpler ranking - probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) + probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) ranks = np.argsort(np.argsort(probs_stacked, axis=1), axis=1) - meta_features = np.column_stack([meta_features, ranks]) # Adds num_models columns + meta_features = np.column_stack([meta_features, ranks]) # Adds num_models columns # Add agreement features default_threshold = 0.5 @@ -135,18 +151,20 @@ def create_meta_features_optimized( thresholds.get("lgb", default_threshold) if thresholds else default_threshold, thresholds.get("extra", default_threshold) if thresholds else 
default_threshold, thresholds.get("mlp", default_threshold) if thresholds else default_threshold, - thresholds.get("pytorch", default_threshold) if thresholds else default_threshold, # Added PyTorch - thresholds.get("svm", default_threshold) if thresholds else default_threshold, # Added SVM - thresholds.get("fnn", default_threshold) if thresholds else default_threshold, # Added FNN + thresholds.get("pytorch", default_threshold) + if thresholds + else default_threshold, # Added PyTorch + thresholds.get("svm", default_threshold) if thresholds else default_threshold, # Added SVM + thresholds.get("fnn", default_threshold) if thresholds else default_threshold, # Added FNN ] - - votes = np.column_stack([ - (pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds) - ]) + + votes = np.column_stack( + [(pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds)] + ) vote_sum = np.sum(votes, axis=1) vote_agreement = np.where( - (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (7) votes + (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (7) votes 1, 0, ) @@ -174,20 +192,47 @@ def create_meta_dataframe(meta_features: np.ndarray) -> pd.DataFrame: # 7 base + 1 avg + 21 diffs + 3 range + 7 ranks + 2 votes = 41 ? 
Mistake somewhere # Re-count: 7 base + 1 avg + 21 diffs + 3 range + 7 ranks + 2 votes = 41 features col_names = ( - ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch", "p_svm", "p_fnn"] + # 8 base - ["weighted_avg"] + # 1 avg - [ "diff_xgb_tabnet", "diff_xgb_lgb", "diff_xgb_extra", "diff_xgb_mlp", "diff_xgb_pytorch", "diff_xgb_svm", - "diff_tabnet_lgb", "diff_tabnet_extra", "diff_tabnet_mlp", "diff_tabnet_pytorch", "diff_tabnet_svm", - "diff_lgb_extra", "diff_lgb_mlp", "diff_lgb_pytorch", "diff_lgb_svm", - "diff_extra_mlp", "diff_extra_pytorch", "diff_extra_svm", - "diff_mlp_pytorch", "diff_mlp_svm", - "diff_pytorch_svm", "diff_fnn_pytorch", "diff_fnn_svm" - ] + # 21 diffs - ["max_prob", "min_prob", "range_prob"] + # 3 range - ["rank_xgb", "rank_tabnet", "rank_lgb", "rank_extra", "rank_mlp", "rank_pytorch", "rank_svm", "rank_fnn"] + # 8 ranks - ["vote_sum", "vote_agreement"] # 2 votes + ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch", "p_svm", "p_fnn"] # 8 base + + ["weighted_avg"] # 1 avg + + [ + "diff_xgb_tabnet", + "diff_xgb_lgb", + "diff_xgb_extra", + "diff_xgb_mlp", + "diff_xgb_pytorch", + "diff_xgb_svm", + "diff_tabnet_lgb", + "diff_tabnet_extra", + "diff_tabnet_mlp", + "diff_tabnet_pytorch", + "diff_tabnet_svm", + "diff_lgb_extra", + "diff_lgb_mlp", + "diff_lgb_pytorch", + "diff_lgb_svm", + "diff_extra_mlp", + "diff_extra_pytorch", + "diff_extra_svm", + "diff_mlp_pytorch", + "diff_mlp_svm", + "diff_pytorch_svm", + "diff_fnn_pytorch", + "diff_fnn_svm", + ] # 21 diffs + + ["max_prob", "min_prob", "range_prob"] # 3 range + + [ + "rank_xgb", + "rank_tabnet", + "rank_lgb", + "rank_extra", + "rank_mlp", + "rank_pytorch", + "rank_svm", + "rank_fnn", + ] # 8 ranks + + ["vote_sum", "vote_agreement"] # 2 votes ) - + if num_features != len(col_names): # Fallback if the number of features doesn't match the expected 41 col_names = [f"meta_{i}" for i in range(num_features)] diff --git a/src/models/ensemble/meta_features_20.py 
b/src/models/ensemble/meta_features_20.py index d505a68..9343f41 100644 --- a/src/models/ensemble/meta_features_20.py +++ b/src/models/ensemble/meta_features_20.py @@ -51,7 +51,7 @@ def create_meta_features_optimized( default_weight = 1.0 / num_models # Basic probabilities - meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 7 + meta_features = np.column_stack([p.reshape(-1, 1) for p in all_preds]) # Stack all 7 # Add weighted average if dynamic_weights: @@ -61,13 +61,13 @@ def create_meta_features_optimized( + dynamic_weights.get("lgb", default_weight) * p_lgb + dynamic_weights.get("extra", default_weight) * p_extra + dynamic_weights.get("mlp", default_weight) * p_mlp - + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch - + dynamic_weights.get("svm", default_weight) * p_svm # Added SVM - + dynamic_weights.get("fnn", default_weight) * p_fnn # Added FNN + + dynamic_weights.get("pytorch", default_weight) * p_pytorch # Added PyTorch + + dynamic_weights.get("svm", default_weight) * p_svm # Added SVM + + dynamic_weights.get("fnn", default_weight) * p_fnn # Added FNN ) else: # Simple average - weighted_avg = np.mean(all_preds, axis=0) # Simpler way to average + weighted_avg = np.mean(all_preds, axis=0) # Simpler way to average # Add to meta-features meta_features = np.column_stack([meta_features, weighted_avg.reshape(-1, 1)]) @@ -86,7 +86,7 @@ def create_meta_features_optimized( diff_extra_svm = np.abs(p_extra - p_svm) diff_mlp_svm = np.abs(p_mlp - p_svm) diff_pytorch_svm = np.abs(p_pytorch - p_svm) - + # Existing differences diff_xgb_tabnet = np.abs(p_xgb - p_tabnet) diff_xgb_lgb = np.abs(p_xgb - p_lgb) @@ -104,21 +104,37 @@ def create_meta_features_optimized( date_encoded = data["date_encoded"] season_encoded = data["season_encoded"] - # Stack all differences (original 15 + new 6 = 21) all_diffs = [ - diff_xgb_tabnet, diff_xgb_lgb, diff_xgb_extra, diff_xgb_mlp, diff_xgb_pytorch, diff_xgb_svm, # xgb vs others - 
diff_tabnet_lgb, diff_tabnet_extra, diff_tabnet_mlp, diff_tabnet_pytorch, diff_tabnet_svm, # tabnet vs others - diff_lgb_extra, diff_lgb_mlp, diff_lgb_pytorch, diff_lgb_svm, # lgb vs others - diff_extra_mlp, diff_extra_pytorch, diff_extra_svm, # extra vs others - diff_mlp_pytorch, diff_mlp_svm, # mlp vs others - diff_pytorch_svm, diff_fnn_pytorch, diff_fnn_svm, # pytorch vs svm + diff_xgb_tabnet, + diff_xgb_lgb, + diff_xgb_extra, + diff_xgb_mlp, + diff_xgb_pytorch, + diff_xgb_svm, # xgb vs others + diff_tabnet_lgb, + diff_tabnet_extra, + diff_tabnet_mlp, + diff_tabnet_pytorch, + diff_tabnet_svm, # tabnet vs others + diff_lgb_extra, + diff_lgb_mlp, + diff_lgb_pytorch, + diff_lgb_svm, # lgb vs others + diff_extra_mlp, + diff_extra_pytorch, + diff_extra_svm, # extra vs others + diff_mlp_pytorch, + diff_mlp_svm, # mlp vs others + diff_pytorch_svm, + diff_fnn_pytorch, + diff_fnn_svm, # pytorch vs svm ] meta_features = np.column_stack([meta_features] + [d.reshape(-1, 1) for d in all_diffs]) # Add max and min probabilities - max_prob = np.maximum.reduce(all_preds) # Use all_preds list - min_prob = np.minimum.reduce(all_preds) # Use all_preds list + max_prob = np.maximum.reduce(all_preds) # Use all_preds list + min_prob = np.minimum.reduce(all_preds) # Use all_preds list range_prob = max_prob - min_prob meta_features = np.column_stack( @@ -127,9 +143,9 @@ def create_meta_features_optimized( # Add rank features # Use all_preds list for simpler ranking - probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) + probs_stacked = np.stack(all_preds, axis=1) # Shape (n_samples, num_models) ranks = np.argsort(np.argsort(probs_stacked, axis=1), axis=1) - meta_features = np.column_stack([meta_features, ranks]) # Adds num_models columns + meta_features = np.column_stack([meta_features, ranks]) # Adds num_models columns # Add agreement features default_threshold = 0.5 @@ -139,18 +155,20 @@ def create_meta_features_optimized( thresholds.get("lgb", 
default_threshold) if thresholds else default_threshold, thresholds.get("extra", default_threshold) if thresholds else default_threshold, thresholds.get("mlp", default_threshold) if thresholds else default_threshold, - thresholds.get("pytorch", default_threshold) if thresholds else default_threshold, # Added PyTorch - thresholds.get("svm", default_threshold) if thresholds else default_threshold, # Added SVM - thresholds.get("fnn", default_threshold) if thresholds else default_threshold, # Added FNN + thresholds.get("pytorch", default_threshold) + if thresholds + else default_threshold, # Added PyTorch + thresholds.get("svm", default_threshold) if thresholds else default_threshold, # Added SVM + thresholds.get("fnn", default_threshold) if thresholds else default_threshold, # Added FNN ] - - votes = np.column_stack([ - (pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds) - ]) + + votes = np.column_stack( + [(pred > thresh).astype(int) for pred, thresh in zip(all_preds, vote_thresholds)] + ) vote_sum = np.sum(votes, axis=1) vote_agreement = np.where( - (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (7) votes + (vote_sum == 0) | (vote_sum == num_models), # Check for 0 or num_models (7) votes 1, 0, ) @@ -179,21 +197,48 @@ def create_meta_dataframe(meta_features: np.ndarray) -> pd.DataFrame: # 7 base + 1 avg + 21 diffs + 3 range + 7 ranks + 2 votes = 41 ? 
Mistake somewhere # Re-count: 7 base + 1 avg + 21 diffs + 3 range + 7 ranks + 2 votes = 41 features col_names = ( - ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch", "p_svm", "p_fnn"] + # 8 base - ["weighted_avg"] + # 1 avg - ["diff_xgb_tabnet", "diff_xgb_lgb", "diff_xgb_extra", "diff_xgb_mlp", "diff_xgb_pytorch", "diff_xgb_svm", - "diff_tabnet_lgb", "diff_tabnet_extra", "diff_tabnet_mlp", "diff_tabnet_pytorch", "diff_tabnet_svm", - "diff_lgb_extra", "diff_lgb_mlp", "diff_lgb_pytorch", "diff_lgb_svm", - "diff_extra_mlp", "diff_extra_pytorch", "diff_extra_svm", - "diff_mlp_pytorch", "diff_mlp_svm", - "diff_pytorch_svm", "diff_fnn_pytorch", "diff_fnn_svm" - ] + # 21 diffs - ["max_prob", "min_prob", "range_prob"] + # 3 range - ["rank_xgb", "rank_tabnet", "rank_lgb", "rank_extra", "rank_mlp", "rank_pytorch", "rank_svm", "rank_fnn"] + # 8 ranks - ["vote_sum", "vote_agreement"] + # 2 votes - ["league_encoded", "season_encoded", "date_encoded"] # 4 meta + ["p_xgb", "p_tabnet", "p_lgb", "p_extra", "p_mlp", "p_pytorch", "p_svm", "p_fnn"] # 8 base + + ["weighted_avg"] # 1 avg + + [ + "diff_xgb_tabnet", + "diff_xgb_lgb", + "diff_xgb_extra", + "diff_xgb_mlp", + "diff_xgb_pytorch", + "diff_xgb_svm", + "diff_tabnet_lgb", + "diff_tabnet_extra", + "diff_tabnet_mlp", + "diff_tabnet_pytorch", + "diff_tabnet_svm", + "diff_lgb_extra", + "diff_lgb_mlp", + "diff_lgb_pytorch", + "diff_lgb_svm", + "diff_extra_mlp", + "diff_extra_pytorch", + "diff_extra_svm", + "diff_mlp_pytorch", + "diff_mlp_svm", + "diff_pytorch_svm", + "diff_fnn_pytorch", + "diff_fnn_svm", + ] # 21 diffs + + ["max_prob", "min_prob", "range_prob"] # 3 range + + [ + "rank_xgb", + "rank_tabnet", + "rank_lgb", + "rank_extra", + "rank_mlp", + "rank_pytorch", + "rank_svm", + "rank_fnn", + ] # 8 ranks + + ["vote_sum", "vote_agreement"] # 2 votes + + ["league_encoded", "season_encoded", "date_encoded"] # 4 meta ) - + if num_features != len(col_names): # Fallback if the number of features doesn't match the expected 
41 col_names = [f"meta_{i}" for i in range(num_features)] diff --git a/src/models/ensemble/run_ensemble.py b/src/models/ensemble/run_ensemble.py index 560c051..b036ce3 100644 --- a/src/models/ensemble/run_ensemble.py +++ b/src/models/ensemble/run_ensemble.py @@ -13,6 +13,7 @@ from pathlib import Path import mlflow +import mlflow.models import mlflow.sklearn import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin @@ -32,8 +33,9 @@ except Exception as e: print(f"Error setting project root path: {e}") # Fallback to current directory if path resolution fails - sys.path.append(os.getcwd().parent) - print(f"Current directory run_ensemble: {os.getcwd().parent}") + current_dir = Path(os.getcwd()).parent + sys.path.append(str(current_dir)) + print(f"Current directory run_ensemble: {current_dir}") # Set environment variables for Git os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = "C:/Program Files/Git/bin/git.exe" @@ -107,26 +109,33 @@ def run_ensemble( ) logger.info("Starting ensemble model execution...") + # Initialize variables to None to handle potential loading failures + X_train, y_train, X_test, y_test, x_val, y_val = None, None, None, None, None, None + try: - logger.info("Loading data...") - X_train, y_train, X_test, y_test, X_val, y_val = DataLoader().load_data() + X_train, y_train, X_test, y_test, x_val, y_val = DataLoader().load_data() # Convert all columns to float64 to ensure consistent data types X_train = X_train.astype("float64") X_test = X_test.astype("float64") - X_val = X_val.astype("float64") + x_val = x_val.astype("float64") except Exception as e: logger.error(f"Error loading time-based data: {str(e)}") logger.info("Falling back to standard data loading...") + raise ValueError(f"Data loading failed: {str(e)}") from e + + # Add null checks for unbound variables + if any(var is None for var in [X_train, y_train, X_test, y_test, x_val, y_val]): + raise ValueError("Data loading failed - some variables were not initialized") # Log dataset sizes 
logger.info( - f"Dataset sizes - Training: {X_train.shape}, Test: {X_test.shape}, Validation: {X_val.shape}" + f"Dataset sizes - Training: {X_train.shape}, Test: {X_test.shape}, Validation: {x_val.shape}" ) mlflow.log_params( { "train_size": len(X_train), "test_size": len(X_test), - "val_size": len(X_val), + "val_size": len(x_val), "positive_rate_train": y_train.mean(), "positive_rate_test": y_test.mean(), "positive_rate_val": y_val.mean(), @@ -135,17 +144,26 @@ def run_ensemble( # Feature selection logger.info("Selecting features...") - features = import_selected_features_ensemble_new(model_type="all") + # Add type guard for selected_features parameter + try: + features = import_selected_features_ensemble_new(model_type="all") + if not isinstance(features, list): + raise TypeError(f"Expected features to be a list, got {type(features)}") + if not features: + raise ValueError("No features selected") + except Exception as e: + logger.error(f"Error selecting features: {str(e)}") + raise ValueError(f"Feature selection failed: {str(e)}") from e # Filter features for all datasets - X_train_filtered = prepare_data(X_train, features) - X_test_filtered = prepare_data(X_test, features) - X_val_filtered = prepare_data(X_val, features) + x_train_filtered = prepare_data(X_train, features) + x_test_filtered = prepare_data(X_test, features) + x_val_filtered = prepare_data(x_val, features) # Log the conversion mlflow.log_param("data_type_conversion", "all_columns_to_float64") logger.info( - f"Data types after conversion: {X_train_filtered.dtypes.value_counts().to_dict()}" + f"Data types after conversion: {x_train_filtered.dtypes.value_counts().to_dict()}" ) # Create ensemble with configuration ensemble_model = EnsembleModel( @@ -161,11 +179,11 @@ def run_ensemble( # Train the model logger.info("Training ensemble model...") training_results = ensemble_model.train( - X_train=X_train_filtered, + X_train=x_train_filtered, y_train=y_train, - X_test=X_test_filtered, + 
X_test=x_test_filtered, y_test=y_test, - X_val=X_val_filtered, + x_val=x_val_filtered, y_val=y_val, split_validation=False, # Don't split again, we already have splits ) @@ -179,7 +197,7 @@ def run_ensemble( logger.info("Ensemble model execution completed successfully.") # Save model with signature to MLflow logger.info("Saving ensemble model with signature to MLflow...") - input_example = X_val_filtered.iloc[0:1].copy() + input_example = x_val_filtered.iloc[0:1].copy() best_threshold = training_results["threshold"] # Get prediction for output example output_example = ensemble_model.predict_proba(input_example) @@ -232,8 +250,12 @@ def set_params(self, **parameters): ) logger.info(f"Model saved with signature and registered as: {model_name}") # Log the run ID for future reference - run_id = mlflow.active_run().info.run_id - logger.info(f"MLflow Run ID: {run_id}") + active_run = mlflow.active_run() + if active_run is not None: + run_id = active_run.info.run_id + logger.info(f"MLflow Run ID: {run_id}") + else: + logger.warning("No active MLflow run found") return ensemble_model except Exception as e: @@ -255,6 +277,48 @@ def get_model_params(model): return {"error": str(e)} +def _extract_base_model_params(ensemble_model): + """Extract parameters from base models.""" + base_models = { + "model_xgb": "XGBoost", + "model_tabnet": "TabNet", + "model_lgb": "LightGBM", + "model_mlp": "MLP", + "model_extra": "Extra", + "model_svm": "SVM" + } + + params_dict = {} + for attr_name, model_name in base_models.items(): + if hasattr(ensemble_model, attr_name): + params_dict[model_name] = ensemble_model.get_model_params( + getattr(ensemble_model, attr_name) + ) + + return params_dict + + +def _extract_calibrated_model_params(ensemble_model): + """Extract parameters from calibrated models.""" + calibrated_models = { + "model_xgb_calibrated": "XGBoost_calibrated", + "model_tabnet_calibrated": "TabNet_calibrated", + "model_lgb_calibrated": "LightGBM_calibrated", + 
"model_extra_calibrated": "Extra_calibrated", + "model_mlp_calibrated": "MLP_calibrated", + "model_mlp_sklearn_calibrated": "MLP_sklearn_calibrated", + "model_svm_calibrated": "SVM_calibrated" + } + + params_dict = {} + for attr_name, model_name in calibrated_models.items(): + model = getattr(ensemble_model, attr_name, None) + if model is not None: + params_dict[model_name] = ensemble_model.get_model_params(model) + + return params_dict + + def log_all_model_params(ensemble_model): """ Extracts and logs parameters for each base and extra model in the ensemble. @@ -265,79 +329,26 @@ def log_all_model_params(ensemble_model): """ params_dict = {} - # Log parameters from each base model. - if hasattr(ensemble_model, "model_xgb"): - params_dict["XGBoost"] = ensemble_model.get_model_params(ensemble_model.model_xgb) - if hasattr(ensemble_model, "model_tabnet"): - params_dict["TabNet"] = ensemble_model.get_model_params(ensemble_model.model_tabnet) - if hasattr(ensemble_model, "model_lgb"): - params_dict["LightGBM"] = ensemble_model.get_model_params(ensemble_model.model_lgb) - if hasattr(ensemble_model, "model_mlp"): - params_dict["MLP"] = ensemble_model.get_model_params(ensemble_model.model_mlp) - if hasattr(ensemble_model, "model_extra"): - params_dict["Extra"] = ensemble_model.get_model_params(ensemble_model.model_extra) - if hasattr(ensemble_model, "model_svm"): - params_dict["SVM"] = ensemble_model.get_model_params(ensemble_model.model_svm) - - # Optionally, log calibrated versions if available. 
- if ( - hasattr(ensemble_model, "model_xgb_calibrated") - and ensemble_model.model_xgb_calibrated is not None - ): - params_dict["XGBoost_calibrated"] = ensemble_model.get_model_params( - ensemble_model.model_xgb_calibrated - ) - if ( - hasattr(ensemble_model, "model_tabnet_calibrated") - and ensemble_model.model_tabnet_calibrated is not None - ): - params_dict["TabNet_calibrated"] = ensemble_model.get_model_params( - ensemble_model.model_tabnet_calibrated - ) - if ( - hasattr(ensemble_model, "model_lgb_calibrated") - and ensemble_model.model_lgb_calibrated is not None - ): - params_dict["LightGBM_calibrated"] = ensemble_model.get_model_params( - ensemble_model.model_lgb_calibrated - ) - if ( - hasattr(ensemble_model, "model_extra_calibrated") - and ensemble_model.model_extra_calibrated is not None - ): - params_dict["Extra_calibrated"] = ensemble_model.get_model_params( - ensemble_model.model_extra_calibrated - ) - if ( - hasattr(ensemble_model, "model_mlp_calibrated") - and ensemble_model.model_mlp_calibrated is not None - ): - params_dict["MLP_calibrated"] = ensemble_model.get_model_params( - ensemble_model.model_mlp_calibrated - ) - if ( - hasattr(ensemble_model, "model_mlp_sklearn_calibrated") - and ensemble_model.model_mlp_sklearn_calibrated is not None - ): - params_dict["MLP_sklearn_calibrated"] = ensemble_model.get_model_params( - ensemble_model.model_mlp_sklearn_calibrated - ) - if ( - hasattr(ensemble_model, "model_svm_calibrated") - and ensemble_model.model_svm_calibrated is not None - ): - params_dict["SVM_calibrated"] = ensemble_model.get_model_params( - ensemble_model.model_svm_calibrated - ) - # Log additional settings (such as meta-learner parameters) if applicable. 
+ # Extract base model parameters + params_dict.update(_extract_base_model_params(ensemble_model)) + + # Extract calibrated model parameters + params_dict.update(_extract_calibrated_model_params(ensemble_model)) + + # Extract meta-learner parameters if hasattr(ensemble_model, "meta_learner") and ensemble_model.meta_learner is not None: params_dict["MetaLearner"] = ensemble_model.get_model_params(ensemble_model.meta_learner) # Log the complete parameters dictionary as a JSON artifact to MLflow. mlflow.log_dict(params_dict, "ensemble_model_parameters.json") - # Optionally, also log some keys using mlflow.log_param for faster comparison in the UI. + + # Log summary parameters for UI comparison + _log_model_summary_params(params_dict) + + +def _log_model_summary_params(params_dict): + """Log first few parameters from each model for UI comparison.""" for model_name, params in params_dict.items(): - # For each top-level model, log a summary (e.g., only the first few keys). if isinstance(params, dict): for key, value in list(params.items())[:3]: mlflow.log_param(f"{model_name}_{key}", str(value)) diff --git a/src/models/ensemble/training.py b/src/models/ensemble/training.py index fc6c9d8..b93569d 100644 --- a/src/models/ensemble/training.py +++ b/src/models/ensemble/training.py @@ -530,12 +530,14 @@ def hypertune_meta_learner( np.random.seed(random_seed) tf.random.set_seed(random_seed) os.environ["PYTHONHASHSEED"] = str(random_seed) - + # PyTorch specific reproducibility settings and optimizations torch.manual_seed(random_seed) if torch.cuda.is_available(): torch.cuda.manual_seed_all(random_seed) - torch.backends.cudnn.benchmark = True # Auto-optimizes for hardware if input sizes don't change + torch.backends.cudnn.benchmark = ( + True # Auto-optimizes for hardware if input sizes don't change + ) torch.backends.cudnn.deterministic = False # Better performance, less deterministic # Enable TF32 for better performance on Ampere GPUs (RTX 30xx and newer) 
torch.backends.cuda.matmul.allow_tf32 = True @@ -550,9 +552,7 @@ def hypertune_meta_learner( # Use to_numpy() for modern pandas meta_features_np = meta_features.to_numpy() meta_targets_np = ( - meta_targets.to_numpy() - if hasattr(meta_targets, "to_numpy") - else meta_targets + meta_targets.to_numpy() if hasattr(meta_targets, "to_numpy") else meta_targets ) eval_features_np = ( eval_meta_features.to_numpy() @@ -626,8 +626,10 @@ def objective(trial): batch_size = trial.suggest_int("batch_size", 1024, 24576) # Fit param # Ensure virtual_batch_size is always <= batch_size to prevent CUDA errors max_virtual_batch_size = min(4096, batch_size) - virtual_batch_size = trial.suggest_int("virtual_batch_size", 128, max_virtual_batch_size) # Fit param - + virtual_batch_size = trial.suggest_int( + "virtual_batch_size", 128, max_virtual_batch_size + ) # Fit param + fit_params = { "max_epochs": trial.suggest_int("max_epochs", 50, 250, step=5), # Fit param "patience": trial.suggest_int("patience", 4, 40, step=2), # Fit param @@ -655,10 +657,12 @@ def objective(trial): meta_learner = TabNetClassifier(**train_params) if torch.cuda.is_available(): try: - if hasattr(meta_learner, 'network') and hasattr(torch, 'compile'): + if hasattr(meta_learner, "network") and hasattr(torch, "compile"): logger.info("Applying torch.compile to TabNet network for GPU acceleration") # Apply compilation with 'reduce-overhead' mode which is good for GPU performance - meta_learner.network = torch.compile(meta_learner.network, mode="reduce-overhead") + meta_learner.network = torch.compile( + meta_learner.network, mode="reduce-overhead" + ) logger.info("Successfully applied torch.compile to TabNet network") except Exception as e: logger.warning(f"Could not apply torch.compile: {str(e)}") @@ -784,15 +788,17 @@ def objective(trial): # Clear GPU cache before training to prevent memory issues if torch.cuda.is_available(): torch.cuda.empty_cache() - + # Validate batch size relationship batch_size = 
params["batch_size"] virtual_batch_size = params["virtual_batch_size"] if virtual_batch_size > batch_size: - logger.warning(f"Invalid batch size relationship: virtual_batch_size ({virtual_batch_size}) > batch_size ({batch_size}). Adjusting virtual_batch_size.") + logger.warning( + f"Invalid batch size relationship: virtual_batch_size ({virtual_batch_size}) > batch_size ({batch_size}). Adjusting virtual_batch_size." + ) virtual_batch_size = min(virtual_batch_size, batch_size) params["virtual_batch_size"] = virtual_batch_size - + # Special handling for TabNet's training meta_learner.fit( meta_features_np, @@ -853,7 +859,8 @@ def objective(trial): eval_set=[(eval_meta_features, eval_meta_targets)], callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)], ) - elif (hasattr(meta_learner, "early_stopping_rounds") + elif ( + hasattr(meta_learner, "early_stopping_rounds") or hasattr(meta_learner, "early_stopping") and meta_learner_type != "sgd" ): @@ -886,16 +893,19 @@ def objective(trial): except Exception as e: error_msg = str(e) logger.error(f"Error in trial {trial.number}: {error_msg}") - + # Handle CUDA-specific errors if "CUDA" in error_msg or "device-side assert" in error_msg: - logger.error(f"CUDA error detected in trial {trial.number}. Clearing GPU cache and continuing.") + logger.error( + f"CUDA error detected in trial {trial.number}. Clearing GPU cache and continuing." 
+ ) if torch.cuda.is_available(): torch.cuda.empty_cache() # Force garbage collection import gc + gc.collect() - + return -1.0 # Initialize variables for batch training @@ -958,9 +968,7 @@ def callback(study, trial): # Create and run Optuna study with persistent storage study = optuna.create_study( - study_name=study_name, - direction="maximize", - sampler=random_sampler + study_name=study_name, direction="maximize", sampler=random_sampler ) logger.info( diff --git a/src/models/ensemble/weights_0410.py b/src/models/ensemble/weights_0410.py index a410ade..ce7c230 100644 --- a/src/models/ensemble/weights_0410.py +++ b/src/models/ensemble/weights_0410.py @@ -11,8 +11,16 @@ def compute_precision_focused_weights_optimized( - p_xgb, p_tabnet, p_lgb, p_extra, p_mlp, p_pytorch, - y_true, target_precision, required_recalls, logger=None + p_xgb, + p_tabnet, + p_lgb, + p_extra, + p_mlp, + p_pytorch, + y_true, + target_precision, + required_recalls, + logger=None, ): """ Compute weights with strong focus on precision, including MLP and PyTorch models. @@ -22,11 +30,13 @@ def compute_precision_focused_weights_optimized( logger = ExperimentLogger(experiment_name="ensemble_weights_0410") logger.info("Computing precision-focused weights for 6 models...") - + # Ensure required_recalls has 6 elements if len(required_recalls) != 6: - raise ValueError(f"Expected required_recalls list to have 6 elements, got {len(required_recalls)}") - + raise ValueError( + f"Expected required_recalls list to have 6 elements, got {len(required_recalls)}" + ) + xgb_recall = required_recalls[0] lgb_recall = required_recalls[1] tabnet_recall = required_recalls[2] @@ -69,15 +79,21 @@ def compute_precision_focused_weights_optimized( pytorch_weight = pytorch_metrics["precision"] ** 2 # Ensure minimum contribution from each model (e.g., 5% -> 1/num_models? 
Let's keep 5% for now) - min_contrib = 0.05 - total_weight = xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight - if total_weight <= 0: # Avoid division by zero if all precisions are 0 + min_contrib = 0.05 + total_weight = ( + xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight + ) + if total_weight <= 0: # Avoid division by zero if all precisions are 0 logger.warning("All base model precisions are zero. Assigning equal weights.") num_models = 6 - weights = {m: 1.0/num_models for m in ["xgb", "tabnet", "lgb", "extra", "mlp", "pytorch"]} + weights = {m: 1.0 / num_models for m in ["xgb", "tabnet", "lgb", "extra", "mlp", "pytorch"]} thresholds = { - "xgb": xgb_threshold, "tabnet": tabnet_threshold, "lgb": lgb_threshold, - "extra": extra_threshold, "mlp": mlp_threshold, "pytorch": pytorch_threshold + "xgb": xgb_threshold, + "tabnet": tabnet_threshold, + "lgb": lgb_threshold, + "extra": extra_threshold, + "mlp": mlp_threshold, + "pytorch": pytorch_threshold, } return weights, thresholds @@ -89,7 +105,9 @@ def compute_precision_focused_weights_optimized( pytorch_weight = max(min_contrib, pytorch_weight / total_weight) # Renormalize - total_weight = xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight + total_weight = ( + xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight + ) weights = { "xgb": xgb_weight / total_weight, "tabnet": tabnet_weight / total_weight, diff --git a/src/models/ensemble/weights_0412.py b/src/models/ensemble/weights_0412.py index c7a3dd2..ac0e18d 100644 --- a/src/models/ensemble/weights_0412.py +++ b/src/models/ensemble/weights_0412.py @@ -11,8 +11,17 @@ def compute_precision_focused_weights_optimized( - p_xgb, p_tabnet, p_lgb, p_extra, p_mlp, p_pytorch, p_svm, - y_true, target_precision, required_recalls, logger=None + p_xgb, + p_tabnet, + p_lgb, + p_extra, + p_mlp, + p_pytorch, + p_svm, + y_true, + target_precision, + 
required_recalls, + logger=None, ): """ Compute weights with strong focus on precision, including MLP, PyTorch and SVM models. @@ -22,12 +31,14 @@ def compute_precision_focused_weights_optimized( logger = ExperimentLogger(experiment_name="ensemble_weights_0412") logger.info("Computing precision-focused weights for 7 models...") - + # Ensure required_recalls has 7 elements num_models = 7 if len(required_recalls) != num_models: - raise ValueError(f"Expected required_recalls list to have {num_models} elements, got {len(required_recalls)}") - + raise ValueError( + f"Expected required_recalls list to have {num_models} elements, got {len(required_recalls)}" + ) + xgb_recall = required_recalls[0] lgb_recall = required_recalls[1] tabnet_recall = required_recalls[2] @@ -76,15 +87,29 @@ def compute_precision_focused_weights_optimized( svm_weight = svm_metrics["precision"] ** 2 # Ensure minimum contribution from each model (e.g., 5% -> 1/num_models? Let's keep 5% for now) - min_contrib = 0.05 - total_weight = xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight + svm_weight - if total_weight <= 0: # Avoid division by zero if all precisions are 0 + min_contrib = 0.05 + total_weight = ( + xgb_weight + + tabnet_weight + + lgb_weight + + extra_weight + + mlp_weight + + pytorch_weight + + svm_weight + ) + if total_weight <= 0: # Avoid division by zero if all precisions are 0 logger.warning("All base model precisions are zero. 
Assigning equal weights.") - weights = {m: 1.0/num_models for m in ["xgb", "tabnet", "lgb", "extra", "mlp", "pytorch", "svm"]} + weights = { + m: 1.0 / num_models for m in ["xgb", "tabnet", "lgb", "extra", "mlp", "pytorch", "svm"] + } thresholds = { - "xgb": xgb_threshold, "tabnet": tabnet_threshold, "lgb": lgb_threshold, - "extra": extra_threshold, "mlp": mlp_threshold, "pytorch": pytorch_threshold, - "svm": svm_threshold + "xgb": xgb_threshold, + "tabnet": tabnet_threshold, + "lgb": lgb_threshold, + "extra": extra_threshold, + "mlp": mlp_threshold, + "pytorch": pytorch_threshold, + "svm": svm_threshold, } return weights, thresholds @@ -97,7 +122,15 @@ def compute_precision_focused_weights_optimized( svm_weight = max(min_contrib, svm_weight / total_weight) # Renormalize - total_weight = xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight + svm_weight + total_weight = ( + xgb_weight + + tabnet_weight + + lgb_weight + + extra_weight + + mlp_weight + + pytorch_weight + + svm_weight + ) weights = { "xgb": xgb_weight / total_weight, "tabnet": tabnet_weight / total_weight, @@ -143,4 +176,4 @@ def compute_precision_focused_weights_optimized( for model, weight in weights.items(): logger.info(f" {model}: {weight:.4f}") - return weights, thresholds \ No newline at end of file + return weights, thresholds diff --git a/src/models/ensemble/weights_0414.py b/src/models/ensemble/weights_0414.py index 47e0c86..20e67b3 100644 --- a/src/models/ensemble/weights_0414.py +++ b/src/models/ensemble/weights_0414.py @@ -11,8 +11,18 @@ def compute_precision_focused_weights_optimized( - p_xgb, p_tabnet, p_lgb, p_extra, p_mlp, p_pytorch, p_svm, p_fnn, - y_true, target_precision, required_recalls, logger=None + p_xgb, + p_tabnet, + p_lgb, + p_extra, + p_mlp, + p_pytorch, + p_svm, + p_fnn, + y_true, + target_precision, + required_recalls, + logger=None, ): """ Compute weights with strong focus on precision, including MLP, PyTorch and SVM models. 
@@ -22,12 +32,14 @@ def compute_precision_focused_weights_optimized( logger = ExperimentLogger(experiment_name="ensemble_weights_0412") logger.info("Computing precision-focused weights for 8 models...") - + # Ensure required_recalls has 7 elements num_models = 8 if len(required_recalls) != num_models: - raise ValueError(f"Expected required_recalls list to have {num_models} elements, got {len(required_recalls)}") - + raise ValueError( + f"Expected required_recalls list to have {num_models} elements, got {len(required_recalls)}" + ) + xgb_recall = required_recalls[0] lgb_recall = required_recalls[1] tabnet_recall = required_recalls[2] @@ -82,15 +94,32 @@ def compute_precision_focused_weights_optimized( fnn_weight = fnn_metrics["precision"] ** 2 # Ensure minimum contribution from each model (e.g., 5% -> 1/num_models? Let's keep 5% for now) - min_contrib = 0.05 - total_weight = xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight + svm_weight + fnn_weight - if total_weight <= 0: # Avoid division by zero if all precisions are 0 + min_contrib = 0.05 + total_weight = ( + xgb_weight + + tabnet_weight + + lgb_weight + + extra_weight + + mlp_weight + + pytorch_weight + + svm_weight + + fnn_weight + ) + if total_weight <= 0: # Avoid division by zero if all precisions are 0 logger.warning("All base model precisions are zero. 
Assigning equal weights.") - weights = {m: 1.0/num_models for m in ["xgb", "tabnet", "lgb", "extra", "mlp", "pytorch", "svm", "fnn"]} + weights = { + m: 1.0 / num_models + for m in ["xgb", "tabnet", "lgb", "extra", "mlp", "pytorch", "svm", "fnn"] + } thresholds = { - "xgb": xgb_threshold, "tabnet": tabnet_threshold, "lgb": lgb_threshold, - "extra": extra_threshold, "mlp": mlp_threshold, "pytorch": pytorch_threshold, - "svm": svm_threshold, "fnn": fnn_threshold + "xgb": xgb_threshold, + "tabnet": tabnet_threshold, + "lgb": lgb_threshold, + "extra": extra_threshold, + "mlp": mlp_threshold, + "pytorch": pytorch_threshold, + "svm": svm_threshold, + "fnn": fnn_threshold, } return weights, thresholds @@ -104,7 +133,16 @@ def compute_precision_focused_weights_optimized( fnn_weight = max(min_contrib, fnn_weight / total_weight) # Renormalize - total_weight = xgb_weight + tabnet_weight + lgb_weight + extra_weight + mlp_weight + pytorch_weight + svm_weight + fnn_weight + total_weight = ( + xgb_weight + + tabnet_weight + + lgb_weight + + extra_weight + + mlp_weight + + pytorch_weight + + svm_weight + + fnn_weight + ) weights = { "xgb": xgb_weight / total_weight, "tabnet": tabnet_weight / total_weight, @@ -153,4 +191,4 @@ def compute_precision_focused_weights_optimized( for model, weight in weights.items(): logger.info(f" {model}: {weight:.4f}") - return weights, thresholds \ No newline at end of file + return weights, thresholds diff --git a/src/predictors/predict_ensemble.py b/src/predictors/predict_ensemble.py index db9098b..42fe629 100644 --- a/src/predictors/predict_ensemble.py +++ b/src/predictors/predict_ensemble.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=SettingWithCopyWarning) sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Suppress pandas chained assignment warnings -pd.options.mode.chained_assignment = None # default='warn' +pd.options.mode.chained_assignment = None from src.utils.create_evaluation_set import ( 
create_prediction_set_ensemble, @@ -39,6 +39,10 @@ os.environ["MKL_NUM_THREADS"] = "4" os.environ["OPENBLAS_NUM_THREADS"] = "4" +# Error message constants +MODEL_NOT_LOADED_ERROR = "Model not loaded" +MODEL_MISSING_PREDICT_PROBA_ERROR = "Model must have predict_proba method" + class DrawPredictor: """Predictor class for draw predictions using the stacked model.""" @@ -46,7 +50,6 @@ class DrawPredictor: def __init__(self, model_uri: str): """Initialize predictor with model URI.""" # Set up MLflow tracking URI based on current environment - os.getcwd() try: self.model = mlflow.sklearn.load_model(model_uri) self.test_model = mlflow.pyfunc.load_model(model_uri) @@ -55,13 +58,15 @@ def __init__(self, model_uri: str): self.model = mlflow.pyfunc.load_model(model_uri) self.test_model = self.model try: - # Retrieve the optimal threshold if set during training. - if hasattr(self.model, "optimal_threshold"): - self.threshold = self.model.optimal_threshold - print(f"Using model's optimal threshold: {self.threshold:.2%}") - # elif hasattr(self.test_model, "metadata") and "threshold" in self.test_model.metadata.get_all_tags(): - # self.threshold = float(self.test_model.metadata.get_tag("threshold")) - # print(f"Using model's threshold: {self.threshold:.2%}") + # Retrieve the optimal threshold if set during training with null checks + if self.model is not None and hasattr(self.model, "optimal_threshold"): + optimal_threshold = getattr(self.model, "optimal_threshold", None) + if optimal_threshold is not None: + self.threshold = optimal_threshold + print(f"Using model's optimal threshold: {self.threshold:.2%}") + else: + self.threshold = 0.27 + print("Model optimal_threshold attribute is None, using default 27% threshold") else: self.threshold = 0.27 print("No optimal threshold found in model, using default 27% threshold") @@ -89,10 +94,33 @@ def predict(self, df: pd.DataFrame) -> dict[str, Any]: # Validate input if needed. 
self._validate_input(df) - # Get probabilities - our ensemble model returns a 1D array of positive class probabilities + # Initialize variables to avoid unbound variable errors + predictions = None + pos_probas = None + + # Get probabilities - extract positive class probabilities (draw class) try: - predictions = self.model.predict(df) - pos_probas = self.model.predict_proba(df) + # Add null guards for model operations + if self.model is None: + raise ValueError(MODEL_NOT_LOADED_ERROR) + + # Handle different model types + if hasattr(self.model, "predict_proba") and callable(getattr(self.model, "predict_proba", None)): + # sklearn model + predictions = self.model.predict(df) + pos_probas = self.model.predict_proba(df)[:, 1] # type: ignore # Get positive class probabilities + else: + # PyFuncModel - use predict method which returns probabilities + predictions = self.test_model.predict(df) + # For binary classification, PyFuncModel predict returns probabilities for both classes + if predictions.ndim == 2 and predictions.shape[1] == 2: + pos_probas = predictions[:, 1] + predictions = (pos_probas >= self.threshold).astype(int) + else: + # If predict returns 1D array, it might be class predictions + pos_probas = predictions + predictions = (pos_probas >= self.threshold).astype(int) + # Ensure we have a 1D numpy array if not isinstance(pos_probas, np.ndarray): pos_probas = np.array(pos_probas) @@ -100,9 +128,20 @@ def predict(self, df: pd.DataFrame) -> dict[str, Any]: print(f"Error predicting: {e}") if "use_label_encoder" in str(e): print("Attribute error due to missing 'use_label_encoder'. 
Patching model...") - self.model.use_label_encoder = False - predictions = self.model.predict(df) - pos_probas = self.model.predict_proba(df) + if self.model is None: + raise ValueError(MODEL_NOT_LOADED_ERROR) from e + # Fallback to PyFuncModel predict + predictions = self.test_model.predict(df) + if predictions.ndim == 2 and predictions.shape[1] == 2: + pos_probas = predictions[:, 1] + predictions = (pos_probas >= self.threshold).astype(int) + else: + pos_probas = predictions + predictions = (pos_probas >= self.threshold).astype(int) + + # Ensure predictions and probabilities were obtained + if predictions is None or pos_probas is None: + raise RuntimeError("Failed to obtain predictions from model") results = { "predictions": predictions.tolist(), @@ -129,7 +168,21 @@ def _find_optimal_threshold( try: prediction_df = features_val.copy() prediction_df = prediction_df[self.required_features] - probas = self.model.predict_proba(prediction_df)[:, 1] + # Add null guard for model operations + if self.model is None: + raise ValueError(MODEL_NOT_LOADED_ERROR) + + # Handle different model types + if hasattr(self.model, "predict_proba") and callable(getattr(self.model, "predict_proba", None)): + # sklearn model + probas = self.model.predict_proba(prediction_df)[:, 1] # type: ignore + else: + # PyFuncModel - use predict method which returns probabilities + predictions = self.test_model.predict(prediction_df) + if predictions.ndim == 2 and predictions.shape[1] == 2: + probas = predictions[:, 1] + else: + probas = predictions best_metrics = {"precision": 0, "recall": 0, "f1": 0, "threshold": 0.5} best_score = 0 @@ -138,7 +191,6 @@ def _find_optimal_threshold( preds = (probas >= threshold).astype(int) true_positives = ((preds == 1) & (target_val == 1)).sum() false_positives = ((preds == 1) & (target_val == 0)).sum() - ((preds == 0) & (target_val == 0)).sum() false_negatives = ((preds == 0) & (target_val == 1)).sum() # Calculate metrics recall = ( @@ -187,12 +239,15 @@ def 
_find_optimal_threshold( raise -def make_prediction(prediction_data, model_uri, real_scores_df) -> pd.DataFrame: +def make_prediction( + prediction_data, model_uri, real_scores_df +) -> tuple[pd.DataFrame, float, float]: """Make predictions and return results with probabilities.""" try: # Initialize default values precision = 0.0 draws_recall = 0.0 + matches_with_results = pd.DataFrame() # Initialize to avoid unbound variable error # Initialize predictor predictor = DrawPredictor(model_uri) @@ -201,7 +256,7 @@ def make_prediction(prediction_data, model_uri, real_scores_df) -> pd.DataFrame: # Ensure data types are compatible with model expectations # Convert numeric columns to float64 to match model expectations numeric_columns = prediction_df.select_dtypes(include=["number"]).columns - prediction_df = prediction_df.astype({col: "float64" for col in numeric_columns}) + prediction_df = prediction_df.astype(dict.fromkeys(numeric_columns, "float64")) # Add column validation predictor._validate_input(prediction_df) @@ -231,8 +286,7 @@ def make_prediction(prediction_data, model_uri, real_scores_df) -> pd.DataFrame: ] # Get real scores and merge - this is where the error occurs if "fixture_id" in prediction_data.columns: - print(f"prediction_data.columns: {prediction_data.shape}") - # valid_fixture_ids = prediction_df['fixture_id'].dropna().astype('Int64').tolist() + print(f"prediction_data.shape: {prediction_data.shape}") if not real_scores_df.empty: # Only proceed if we have real scores # Ensure is_draw column exists and is properly formatted if "is_draw" not in real_scores_df.columns: @@ -277,14 +331,16 @@ def make_prediction(prediction_data, model_uri, real_scores_df) -> pd.DataFrame: matches_with_results["is_draw"].fillna(-1).astype(int) ) - # Filter matches with results for date >= 2025-04-01 and order by date descending + # Filter matches with results for date >= 2025-05-01 and order by date descending if "Date" in matches_with_results.columns: 
matches_with_results["Date"] = pd.to_datetime(matches_with_results["Date"]) matches_with_results = matches_with_results[ matches_with_results["Date"] >= "2025-05-01" ] - matches_with_results = matches_with_results.sort_values(by="Date", ascending=False) - + matches_with_results = matches_with_results.sort_values( + by="Date", ascending=False + ) + if len(matches_with_results) > 0 and "is_draw" in matches_with_results.columns: # Filter out rows without valid is_draw values valid_matches = matches_with_results[matches_with_results["is_draw"] != -1] @@ -309,7 +365,7 @@ def make_prediction(prediction_data, model_uri, real_scores_df) -> pd.DataFrame: print(f"Predicted Draws: {valid_matches['draw_predicted'].sum()}") # Calculate metrics - accuracy = (true_positives + true_negatives) / len(matches_with_results) + accuracy = (true_positives + true_negatives) / len(valid_matches) if true_positives + false_negatives > 0: draws_recall = true_positives / (true_positives + false_negatives) @@ -330,12 +386,19 @@ def make_prediction(prediction_data, model_uri, real_scores_df) -> pd.DataFrame: matches_with_results = matches_with_results.loc[ :, ~matches_with_results.columns.duplicated(keep="last") ] - return matches_with_results, precision, draws_recall + # Ensure return type is DataFrame + assert isinstance(matches_with_results, pd.DataFrame), ( + f"matches_with_results must be DataFrame, got {type(matches_with_results)}" + ) + return matches_with_results, precision, draws_recall except Exception as e: print(f"Error during prediction: {str(e)}") print(f"Error type: {type(e).__name__}") return pd.DataFrame(), 0.0, 0.0 + # Fallback return (should never be reached) + return pd.DataFrame(), 0.0, 0.0 + def apply_threshold_filter(df: pd.DataFrame, remove_thresholds: list[float]) -> pd.DataFrame: """ @@ -356,7 +419,7 @@ def apply_threshold_filter(df: pd.DataFrame, remove_thresholds: list[float]) -> print(f"Deleted {num_deleted} rows at threshold {threshold}") # Apply the filter and 
return the filtered DataFrame - return df[mask] + return pd.DataFrame(df[mask].copy()) def apply_keep_thresholds_filter(df: pd.DataFrame, allowed_thresholds: list[float]) -> pd.DataFrame: @@ -369,7 +432,7 @@ def apply_keep_thresholds_filter(df: pd.DataFrame, allowed_thresholds: list[floa pd.DataFrame: Filtered DataFrame. """ condition = df["draw_probability"].isin(allowed_thresholds) - return df[condition] + return pd.DataFrame(df[condition].copy()) def main(): @@ -378,19 +441,22 @@ def main(): best_predictions = pd.DataFrame() # Initialize empty DataFrame predicted_df = pd.DataFrame() # Initialize predicted_df # Model URIs to evaluate - model_uris = [ - "7b3d6490c26e499f99e999a7a86975f8", - "cc5ff9dcc34a4f1aab2f0e270bb920b6" - ] + model_uris = ["7b3d6490c26e499f99e999a7a86975f8", "cc5ff9dcc34a4f1aab2f0e270bb920b6"] # Filter configuration to remove predictions near specific thresholds - filter_config = { - "8d80522037ae4a9790b72129c06851a4": {"remove_thresholds": [0.45, 0.47]}, - } + # Note: URIs in config don't match current model_uris - commented out + # filter_config = { + # "8d80522037ae4a9790b72129c06851a4": {"remove_thresholds": [0.45, 0.47]}, + # } # Keep configuration to only allow predictions near specific thresholds - keep_config = { - "97207cdaab54477fa267d8cd29ce35e9": {"keep_thresholds": [0.31, 0.32, 0.34, 0.37]}, - } + # Note: URIs in config don't match current model_uris - commented out + # keep_config = { + # "97207cdaab54477fa267d8cd29ce35e9": {"keep_thresholds": [0.31, 0.32, 0.34, 0.37]}, + # } + + # Empty configs since URIs don't match + filter_config = {} + keep_config = {} # Get preprocessed prediction data using standardized function prediction_df = create_prediction_set_ensemble() @@ -416,14 +482,21 @@ def main(): print(f"Skipping invalid predictions from model {uri}") continue + # Ensure predicted_df is a DataFrame for subsequent operations + assert isinstance(predicted_df, pd.DataFrame), ( + f"predicted_df must be DataFrame, got 
{type(predicted_df)}" + ) + # Type annotation to help type checker + predictions_df: pd.DataFrame = predicted_df + # Reorder columns to place draw_predicted and draw_probability last cols = [ col - for col in predicted_df.columns + for col in predictions_df.columns if col not in ["draw_predicted", "draw_probability"] ] cols.extend(["draw_predicted", "draw_probability"]) - predicted_df = predicted_df[cols] + predictions_df = pd.DataFrame(predictions_df[cols].copy()) # Save individual model predictions # model_output_path = Path(f"./data/prediction/ensemble/predictions_{uri}.xlsx") # predicted_df.to_excel(model_output_path, index=False) @@ -437,7 +510,7 @@ def main(): print( f"Applying remove threshold filter for model {uri}: removing all predictions with draw_probability in {remove_thresholds}" ) - predicted_df = apply_threshold_filter(predicted_df, remove_thresholds) + predictions_df = apply_threshold_filter(predictions_df, remove_thresholds) # --- Apply keep threshold filtering if configured for this model --- config_keep = keep_config.get(uri, None) if config_keep is not None: @@ -446,20 +519,22 @@ def main(): print( f"Applying keep threshold filter for model {uri}: keeping only predictions with draw_probability in {allowed_thresholds}" ) - predicted_df = apply_keep_thresholds_filter(predicted_df, allowed_thresholds) + predictions_df = apply_keep_thresholds_filter( + predictions_df, allowed_thresholds + ) # Remove rows where draw_predicted is 0 - predicted_df = predicted_df[predicted_df["draw_predicted"] == 1] - print(f"Filtered to {len(predicted_df)} rows where draw_predicted = 1") + predictions_df = pd.DataFrame(predictions_df[predictions_df["draw_predicted"] == 1].copy()) + print(f"Filtered to {len(predictions_df)} rows where draw_predicted = 1") # Save individual model predictions model_output_path = Path(f"./data/prediction/ensemble/predictions_model_{uri}.xlsx") - - predicted_df.to_excel(model_output_path, index=False) + + 
predictions_df.to_excel(model_output_path, index=False) print(f"Predictions for model {uri} saved to: {model_output_path}") if precision > best_precision and draws_recall > 0.20: best_precision = precision best_model_uri = uri - best_predictions = predicted_df.copy() + best_predictions = predictions_df.copy() print(f"New best model: {uri} with precision: {precision:.2%}") print(f"Draws recall: {draws_recall:.2%}") except Exception as e: @@ -471,7 +546,9 @@ def main(): # Handle empty predictions for best model if best_predictions.empty: print("Warning: No valid predictions generated. Creating empty result.") - predicted_df = pd.DataFrame(columns=["fixture_id", "draw_predicted", "draw_probability"]) + # Create empty DataFrame with specified columns + columns_list = ["fixture_id", "draw_predicted", "draw_probability"] + predicted_df = pd.DataFrame(columns=columns_list) # type: ignore else: predicted_df = best_predictions cols = [ diff --git a/src/utils/K_factor_calculation.py b/src/utils/K_factor_calculation.py index 0c28045..985e254 100644 --- a/src/utils/K_factor_calculation.py +++ b/src/utils/K_factor_calculation.py @@ -10,27 +10,31 @@ def calculate_draw_k_factor(league_data, logger): try: # 1. 
Draw Rate Stability (most important for draws) draw_rates = [] - for team_id in league_data['home_encoded'].unique(): - home_draws = league_data[league_data['home_encoded'] == team_id]['Home_draws'].iloc[0] - home_matches = league_data[league_data['home_encoded'] == team_id]['Home_team_matches'].iloc[0] - away_draws = league_data[league_data['away_encoded'] == team_id]['Away_draws'].iloc[0] - away_matches = league_data[league_data['away_encoded'] == team_id]['Away_team_matches'].iloc[0] - + for team_id in league_data["home_encoded"].unique(): + home_draws = league_data[league_data["home_encoded"] == team_id]["Home_draws"].iloc[0] + home_matches = league_data[league_data["home_encoded"] == team_id][ + "Home_team_matches" + ].iloc[0] + away_draws = league_data[league_data["away_encoded"] == team_id]["Away_draws"].iloc[0] + away_matches = league_data[league_data["away_encoded"] == team_id][ + "Away_team_matches" + ].iloc[0] + if home_matches > 0 and away_matches > 0: team_draw_rate = (home_draws + away_draws) / (home_matches + away_matches) draw_rates.append(team_draw_rate) - + draw_rate_std = np.std(draw_rates) if draw_rates else 0.5 draw_stability_factor = 1 - (draw_rate_std / 0.5) # 0.5 = max possible std - + # 2. 
Goal Difference Concentration (draws happen with small goal differences) goal_diff_data = [] for _, row in league_data.iterrows(): - if row['match_outcome'] == 2: # Draw matches only - home_goals = row.get('home_goals', 0) - away_goals = row.get('away_goals', 0) + if row["match_outcome"] == 2: # Draw matches only + home_goals = row.get("home_goals", 0) + away_goals = row.get("away_goals", 0) goal_diff_data.append(abs(home_goals - away_goals)) - + if goal_diff_data: # More 0-0, 1-1, 2-2 draws = higher predictability zero_diff_rate = sum(1 for diff in goal_diff_data if diff == 0) / len(goal_diff_data) @@ -38,18 +42,19 @@ def calculate_draw_k_factor(league_data, logger): goal_pattern_factor = (zero_diff_rate * 0.6) + (small_diff_rate * 0.4) else: goal_pattern_factor = 0.5 - + # 3. Temporal Draw Distribution (consistent vs. clustered draws) - league_draws = league_data[league_data['match_outcome'] == 2] + league_draws = league_data[league_data["match_outcome"] == 2] if len(league_draws) > 5: # Calculate draw frequency consistency across season - league_draws_sorted = league_draws.sort_values('Date') + league_draws_sorted = league_draws.sort_values("Date") draw_intervals = [] for i in range(1, len(league_draws_sorted)): - interval = (league_draws_sorted.iloc[i]['Date'] - - league_draws_sorted.iloc[i-1]['Date']).days + interval = ( + league_draws_sorted.iloc[i]["Date"] - league_draws_sorted.iloc[i - 1]["Date"] + ).days draw_intervals.append(interval) - + if draw_intervals: interval_std = np.std(draw_intervals) # Lower std = more consistent draw timing = higher predictability @@ -58,31 +63,34 @@ def calculate_draw_k_factor(league_data, logger): temporal_factor = 0.5 else: temporal_factor = 0.5 - + # 4. 
ELO Difference in Draw Matches (balanced teams = more draws) draw_elo_diffs = [] for _, row in league_data.iterrows(): - if row['match_outcome'] == 2 and 'home_team_elo' in row and 'away_team_elo' in row: - elo_diff = abs(row['home_team_elo'] - row['away_team_elo']) + if row["match_outcome"] == 2 and "home_team_elo" in row and "away_team_elo" in row: + elo_diff = abs(row["home_team_elo"] - row["away_team_elo"]) draw_elo_diffs.append(elo_diff) - + if draw_elo_diffs: avg_draw_elo_diff = np.mean(draw_elo_diffs) # Smaller ELO differences in draws = more predictable elo_balance_factor = max(0, 1 - (avg_draw_elo_diff / 200)) # 200 ELO normalization else: elo_balance_factor = 0.5 - + # Combine factors with weights optimized for draw prediction - draw_k_factor = 15 + (25 * ( - draw_stability_factor * 0.4 + # Most important - goal_pattern_factor * 0.3 + # Second most important - temporal_factor * 0.2 + # Timing consistency - elo_balance_factor * 0.1 # Team balance - )) - + draw_k_factor = 15 + ( + 25 + * ( + draw_stability_factor * 0.4 # Most important + + goal_pattern_factor * 0.3 # Second most important + + temporal_factor * 0.2 # Timing consistency + + elo_balance_factor * 0.1 # Team balance + ) + ) + return round(draw_k_factor, 2) - + except Exception as e: logger.error(f"Error calculating draw K-factor: {str(e)}") - return 25 # Default draw K-factor \ No newline at end of file + return 25 # Default draw K-factor diff --git a/src/utils/create_evaluation_set.py b/src/utils/create_evaluation_set.py index 7f26d16..52597da 100644 --- a/src/utils/create_evaluation_set.py +++ b/src/utils/create_evaluation_set.py @@ -4,28 +4,26 @@ import time from functools import wraps from pathlib import Path -from typing import Optional, Union +from typing import Optional, Union, cast import numpy as np import pandas as pd -from openpyxl import Workbook -from pyexcelerate import Workbook from sklearn.calibration import LabelEncoder from sklearn.model_selection import train_test_split # Add 
project root to Python path +project_root = Path(__file__).parent.parent.parent +current_dir = Path(os.getcwd()).parent +project_root_error = None try: - project_root = Path(__file__).parent.parent.parent if not project_root.exists(): # Handle network path by using raw string project_root = Path(r"\\".join(str(project_root).split("\\"))) sys.path.append(str(project_root)) - print(f"Project root create_evaluation_set: {project_root}") except Exception as e: - print(f"Error setting project root path: {str(e)}") + project_root_error = str(e) # Fallback to current directory if path resolution fails - sys.path.append(os.getcwd().parent) - print(f"Fallback to current directory: {os.getcwd().parent}") + sys.path.append(str(current_dir)) from src.utils.advanced_goal_features import AdvancedGoalFeatureEngineer from src.utils.K_factor_calculation import calculate_draw_k_factor @@ -36,6 +34,13 @@ experiment_name="create_evaluation_set", log_dir="logs/create_evaluation_set" ) +# Log project root setup after logger is initialized +if project_root_error: + logger.info(f"Error setting project root path: {project_root_error}") + logger.info(f"Fallback to current directory: {current_dir}") +else: + logger.info(f"Project root create_evaluation_set: {project_root}") + # Error codes for standardized logging class DataProcessingError: @@ -59,6 +64,38 @@ class DataProcessingError: MLFLOW_ERROR = "E302" +# Constants for commonly used file paths and messages +class FilePaths: + API_TRAINING_FINAL_XLSX = "data/api_training_final.xlsx" + API_TRAINING_FINAL_PARQUET = "data/api_training_final.parquet" + API_PREDICTION_DATA_XLSX = "data/prediction/api_prediction_data.xlsx" + API_PREDICTION_DATA_NEW_XLSX = "data/prediction/api_prediction_data_new.xlsx" + API_PREDICTION_EVAL_XLSX = "data/prediction/api_prediction_eval.xlsx" + API_PREDICTION_EVAL_PARQUET = "data/prediction/api_prediction_eval.parquet" + API_PREDICTIONS_DATA_PARQUET = "data/prediction/api_predictions_data.parquet" + 
NEW_API_TRAINING_FINAL_XLSX = "data/new_api_training_final.xlsx" + NEW_API_TRAINING_FINAL_PARQUET = "data/new_api_training_final.parquet" + NEW_API_PREDICTION_DATA_XLSX = "data/prediction/new_api_prediction_data.xlsx" + NEW_API_PREDICTION_EVAL_XLSX = "data/prediction/new_api_prediction_eval.xlsx" + NEW_API_PREDICTION_EVAL_PARQUET = "data/prediction/new_api_prediction_eval.parquet" + NEW_API_PREDICTIONS_DATA_XLSX = "data/prediction/new_api_predictions_data.xlsx" + NEW_API_PREDICTIONS_DATA_PARQUET = "data/prediction/new_api_predictions_data.parquet" + + +class Messages: + LOADED_DATASET_EMPTY = "Loaded dataset is empty" + DATASET_EMPTY = "Dataset is empty" + STARTING_NUMERIC_CONVERSION = "Starting numeric conversion" + DATA_NOT_DATAFRAME_SKIP_PARQUET = "data is not a DataFrame, skipping parquet creation" + IS_DRAW_COLUMN_NOT_FOUND = "is_draw column not found in parquet file, creating target variable" + X_TRAIN_MUST_BE_DATAFRAME = "X_train must be DataFrame" + X_TEST_MUST_BE_DATAFRAME = "X_test must be DataFrame" + Y_TRAIN_MUST_BE_SERIES = "y_train must be Series" + Y_TEST_MUST_BE_SERIES = "y_test must be Series" + DATE_ENCODED_MUST_BE_SERIES = "date_encoded must be Series" + EVALUATION_DATA_MUST_BE_DATAFRAME = "evaluation_data must be DataFrame" + + # Retry decorator for file operations def retry_on_error(max_retries: int = 3, delay: float = 1.0): def decorator(func): @@ -125,6 +162,8 @@ def convert_numeric_columns( # If no columns specified, use all columns if columns is None: columns = df.columns.tolist() + # Ensure columns is a list at this point + assert isinstance(columns, list), "columns must be a list or None" # Track problematic columns failed_columns = [] @@ -179,10 +218,13 @@ def convert_numeric_columns( ) # Handle negatives ) - # Try converting to numeric + # Try converting to numeric with type guard numeric_series = pd.to_numeric(series, errors="coerce") + assert isinstance(numeric_series, pd.Series), ( + f"pd.to_numeric must return Series, got 
{type(numeric_series)}" + ) - # Check if conversion was successful + # Check if conversion was successful - ensure proper Series typing if numeric_series.isna().all(): if verbose: logger.info(f"Column {col} contains no valid numeric values") @@ -191,10 +233,12 @@ def convert_numeric_columns( columns_to_drop.append(col) continue - # Apply the conversion + # Apply the conversion with proper type annotations for pandas operations df[col] = numeric_series.replace([np.inf, -np.inf], fill_value).fillna(fill_value) - if verbose and df[col].isna().any(): + # Type guard for DataFrame operations + has_na = bool(df[col].isna().any()) + if verbose and has_na: logger.info(f"Warning: Column {col} contains NaN values after conversion") except Exception as e: @@ -347,13 +391,13 @@ def update_api_training_data_for_draws(): """ try: # Load existing training data - data_path = "data/api_training_final.xlsx" + data_path = FilePaths.API_TRAINING_FINAL_XLSX data = pd.read_excel(data_path) # Initialize the feature engineer feature_engineer = AdvancedGoalFeatureEngineer() # Add advanced goal features updated_data = feature_engineer.add_goal_features(data) - logger.info(updated_data.shape) + logger.info(f"Updated data shape: {updated_data.shape}") # Save updated data back to Excel updated_data.to_excel(data_path, index=False) except Exception as e: @@ -366,14 +410,14 @@ def update_api_data_for_draws(): """ try: # Load existing training data - data_path = "data/prediction/api_prediction_data.xlsx" - data_path_new = "data/prediction/api_prediction_data_new.xlsx" + data_path = FilePaths.API_PREDICTION_DATA_XLSX + data_path_new = FilePaths.API_PREDICTION_DATA_NEW_XLSX data = pd.read_excel(data_path) # Initialize the feature engineer feature_engineer = AdvancedGoalFeatureEngineer() # Add advanced goal features updated_data = feature_engineer.add_goal_features(data) - logger.info(updated_data.shape) + logger.info(f"Updated data shape: {updated_data.shape}") # Filter data for dates before 
2024-11-01 api_training_data = updated_data[updated_data["Date"] < "2025-03-01"] @@ -385,8 +429,13 @@ def update_api_data_for_draws(): (updated_data["Date"] >= "2025-03-01") & (updated_data["match_outcome"].notna()) ] # Filter data for dates after 2024-11-01 where match_outcome is blank + # Add type guard for pandas Series operations + match_outcome_series = updated_data["match_outcome"] + assert isinstance(match_outcome_series, pd.Series), ( + f"match_outcome must be Series, got {type(match_outcome_series)}" + ) api_prediction_data = updated_data[ - (updated_data["Date"] >= "2025-03-01") & (updated_data["match_outcome"].isna()) + (updated_data["Date"] >= "2025-03-01") & (match_outcome_series.isna()) ] logger.info(f"api_prediction_data.shape: {api_prediction_data.shape}") @@ -396,22 +445,38 @@ def update_api_data_for_draws(): updated_data = pd.concat([api_prediction_eval, api_prediction_data], ignore_index=True) # Export df_before_2024_11_01 to data/api_training_final.xlsx and .parquet - save_data_to_excel(api_training_data, "data/api_training_final.xlsx", "api_training_final") - create_parquet_files(api_training_data, "data/api_training_final.parquet") + save_data_to_excel(api_training_data, FilePaths.API_TRAINING_FINAL_XLSX, "api_training_final") + # Ensure api_training_data is a DataFrame + if isinstance(api_training_data, pd.DataFrame): + create_parquet_files(api_training_data, FilePaths.API_TRAINING_FINAL_PARQUET) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("api_training_final.xlsx and .parquet updated") # Export df_after_2024_11_01_not_blank to data/prediction/api_predictions_eval.xlsx and .parquet save_data_to_excel( - api_prediction_eval, "data/prediction/api_prediction_eval.xlsx", "api_prediction_eval" + api_prediction_eval, FilePaths.API_PREDICTION_EVAL_XLSX, "api_prediction_eval" ) - create_parquet_files(api_prediction_eval, "data/prediction/api_prediction_eval.parquet") + # Ensure api_prediction_eval is a DataFrame + 
if isinstance(api_prediction_eval, pd.DataFrame): + create_parquet_files(api_prediction_eval, FilePaths.API_PREDICTION_EVAL_PARQUET) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("api_prediction_eval.xlsx and .parquet updated") # Export df_after_2024_11_01_blank to data/prediction/api_predictions_data.xlsx and .parquet + # Note: This appears to be a typo in the original code - should be api_predictions_data.xlsx + api_predictions_data_xlsx = "data/prediction/api_predictions_data.xlsx" save_data_to_excel( - api_prediction_data, "data/prediction/api_predictions_data.xlsx", "api_prediction_data" + api_prediction_data, api_predictions_data_xlsx, "api_prediction_data" ) - create_parquet_files(api_prediction_data, "data/prediction/api_predictions_data.parquet") + # Ensure api_prediction_data is a DataFrame + if isinstance(api_prediction_data, pd.DataFrame): + create_parquet_files( + api_prediction_data, FilePaths.API_PREDICTIONS_DATA_PARQUET + ) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("api_predictions_data.xlsx and .parquet updated") # Save updated data back to Excel updated_data.to_excel(data_path_new, index=False) @@ -576,13 +641,13 @@ def import_training_data_draws_api() -> tuple[pd.DataFrame, pd.Series, pd.DataFr ValueError: If data validation fails Exception: For other processing errors """ - data_path = "data/api_training_final.xlsx" + data_path = FilePaths.API_TRAINING_FINAL_XLSX logger.info(f"Loading training data from: {data_path}") # Get selected columns selected_columns = get_selected_api_columns_draws() try: - parquet_path = "data/api_training_final.parquet" + parquet_path = FilePaths.API_TRAINING_FINAL_PARQUET if os.path.exists(parquet_path): data = pd.read_parquet(parquet_path) logger.info(f"Loaded data from Parquet: {parquet_path}") @@ -590,8 +655,8 @@ def import_training_data_draws_api() -> tuple[pd.DataFrame, pd.Series, pd.DataFr # Load data with retry mechanism data = 
pd.read_excel(data_path) if data.empty: - logger.info("Loaded dataset is empty", error_code=DataProcessingError.EMPTY_DATASET) - raise ValueError("Dataset is empty") + logger.info(Messages.LOADED_DATASET_EMPTY, error_code=DataProcessingError.EMPTY_DATASET) + raise ValueError(Messages.DATASET_EMPTY) logger.info(f"Successfully loaded data with shape: {data.shape}") @@ -607,12 +672,13 @@ def import_training_data_draws_api() -> tuple[pd.DataFrame, pd.Series, pd.DataFr ) raise ValueError(f"Missing required columns: {missing_columns}") - # Replace inf and nan values + # Replace inf and nan values with type guard + assert isinstance(data, pd.DataFrame), f"data must be DataFrame, got {type(data)}" data = data.replace([np.inf, -np.inf], np.nan) logger.info("Replaced infinite values with NaN") # Convert numeric columns - logger.info("Starting numeric conversion") + logger.info(Messages.STARTING_NUMERIC_CONVERSION) data = convert_numeric_columns( data=data, columns=data.columns.tolist(), @@ -660,18 +726,37 @@ def import_training_data_draws_api() -> tuple[pd.DataFrame, pd.Series, pd.DataFr error_code=DataProcessingError.INVALID_DATA_TYPE, ) raise ValueError(f"Non-numeric columns found: {object_columns}") - create_parquet_files(data, "data/api_training_final.parquet") + # Ensure data is a DataFrame + if isinstance(data, pd.DataFrame): + create_parquet_files(data, FilePaths.API_TRAINING_FINAL_PARQUET) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) # Split into train and test sets logger.info("Splitting data into train and test sets") train_data, test_data = train_test_split( data, test_size=0.2, random_state=42, stratify=data["is_draw"] ) + # Convert back to DataFrames since train_test_split returns numpy arrays + train_data = pd.DataFrame(train_data, columns=data.columns) + test_data = pd.DataFrame(test_data, columns=data.columns) # Select features and target - X_train = train_data[selected_columns] + if isinstance(selected_columns, list) and 
len(selected_columns) > 0: + X_train = train_data[selected_columns] + X_test = test_data[selected_columns] + # Ensure they are DataFrames + assert isinstance(X_train, pd.DataFrame), Messages.X_TRAIN_MUST_BE_DATAFRAME + assert isinstance(X_test, pd.DataFrame), Messages.X_TEST_MUST_BE_DATAFRAME + else: + # If selected_columns is not a valid list, use all columns except target + X_train = train_data.drop(columns=["is_draw"], errors="ignore") + X_test = test_data.drop(columns=["is_draw"], errors="ignore") + y_train = train_data["is_draw"] - X_test = test_data[selected_columns] y_test = test_data["is_draw"] + # Ensure targets are Series + assert isinstance(y_train, pd.Series), Messages.Y_TRAIN_MUST_BE_SERIES + assert isinstance(y_test, pd.Series), Messages.Y_TEST_MUST_BE_SERIES # Final validation logger.info(f"Training set shape: {X_train.shape}") logger.info(f"Test set shape: {X_test.shape}") @@ -697,7 +782,7 @@ def import_training_data_draws_api() -> tuple[pd.DataFrame, pd.Series, pd.DataFr def import_feature_select_draws_api(): """Import training data for draw predictions.""" - data_path = "data/api_training_final.xlsx" + data_path = FilePaths.API_TRAINING_FINAL_XLSX data = pd.read_excel(data_path) # Create target variable data["is_draw"] = (data["match_outcome"] == 2).astype(int) @@ -749,10 +834,19 @@ def import_feature_select_draws_api(): train_data, test_data = train_test_split( data, test_size=0.2, random_state=42, stratify=data["is_draw"] ) + # Convert back to DataFrames since train_test_split returns numpy arrays + train_data = pd.DataFrame(train_data, columns=data.columns) + test_data = pd.DataFrame(test_data, columns=data.columns) + # Select features and target X_train = train_data.drop(columns="is_draw", errors="ignore") y_train = train_data["is_draw"] X_test = test_data.drop(columns="is_draw", errors="ignore") y_test = test_data["is_draw"] + # Ensure proper types + assert isinstance(X_train, pd.DataFrame), Messages.X_TRAIN_MUST_BE_DATAFRAME + assert 
isinstance(y_train, pd.Series), Messages.Y_TRAIN_MUST_BE_SERIES + assert isinstance(X_test, pd.DataFrame), Messages.X_TEST_MUST_BE_DATAFRAME + assert isinstance(y_test, pd.Series), Messages.Y_TEST_MUST_BE_SERIES # Add verification of dtypes logger.info("\nVerifying final dtypes:") non_numeric_cols = X_train.select_dtypes(include=["object"]).columns @@ -783,21 +877,23 @@ def create_evaluation_sets_draws_api(use_selected_columns: bool = True): ValueError: If data validation fails Exception: For other processing errors """ - file_path = "data/prediction/api_prediction_eval.xlsx" + file_path = FilePaths.API_PREDICTION_EVAL_XLSX logger.info(f"Loading evaluation data from: {file_path}") try: # Load data from the Excel file data = pd.read_excel(file_path) if data.empty: - logger.info("Loaded dataset is empty", error_code=DataProcessingError.EMPTY_DATASET) - raise ValueError("Dataset is empty") + logger.info(Messages.LOADED_DATASET_EMPTY, error_code=DataProcessingError.EMPTY_DATASET) + raise ValueError(Messages.DATASET_EMPTY) logger.info(f"Successfully loaded data with shape: {data.shape}") # Filter data where match_outcome is not NA data = data.dropna(subset=["match_outcome"]) logger.info(f"Data shape after filtering NA match outcomes: {data.shape}") # Replace inf and nan values + # Add type guard for DataFrame operations + assert isinstance(data, pd.DataFrame), f"data must be DataFrame, got {type(data)}" data = data.replace([np.inf, -np.inf], np.nan) logger.info("Replaced infinite values with NaN") # Get selected columns if needed @@ -861,7 +957,7 @@ def create_evaluation_sets_draws_api(use_selected_columns: bool = True): error_code=DataProcessingError.NUMERIC_CONVERSION_FAILED, ) # Convert numeric columns - logger.info("Starting numeric conversion") + logger.info(Messages.STARTING_NUMERIC_CONVERSION) data = convert_numeric_columns( data=data, columns=selected_columns, drop_errors=False, fill_value=0.0, verbose=True ) @@ -920,15 +1016,15 @@ def 
create_prediction_set_api() -> pd.DataFrame: ValueError: If data validation fails Exception: For other processing errors """ - file_path = "data/prediction/api_prediction_data_new.xlsx" + file_path = FilePaths.API_PREDICTION_DATA_NEW_XLSX logger.info(f"Loading prediction data from: {file_path}") try: # Load data from the Excel file data = pd.read_excel(file_path) if data.empty: - logger.info("Loaded dataset is empty", error_code=DataProcessingError.EMPTY_DATASET) - raise ValueError("Dataset is empty") + logger.info(Messages.LOADED_DATASET_EMPTY, error_code=DataProcessingError.EMPTY_DATASET) + raise ValueError(Messages.DATASET_EMPTY) logger.info(f"Successfully loaded data with shape: {data.shape}") # Get selected columns @@ -962,7 +1058,7 @@ def create_prediction_set_api() -> pd.DataFrame: data = data.drop(columns=["Away"]) logger.info("Dropped Date, Home, and Away columns from data") # Convert numeric columns - logger.info("Starting numeric conversion") + logger.info(Messages.STARTING_NUMERIC_CONVERSION) data = convert_numeric_columns( data=data, columns=None, # Convert all columns @@ -978,7 +1074,12 @@ def create_prediction_set_api() -> pd.DataFrame: data["Away"] = data_copy["Away"] logger.info("Restored Date, Home, and Away columns from original data") # Select only the required columns - X = data[selected_columns] + if isinstance(selected_columns, list) and len(selected_columns) > 0: + X = data[selected_columns].copy() + else: + X = data.copy() + # Ensure X is a DataFrame + assert isinstance(X, pd.DataFrame), "X must be DataFrame" # Final validation logger.info(f"Final feature set shape: {X.shape}") logger.info("Feature set ready for prediction") @@ -1029,7 +1130,10 @@ def import_selected_features_ensemble(model_type: Optional[str] = None) -> Union with open(json_path) as f: features = json.load(f) # Validate loaded data structure - if not all(key in features for key in ["xgb", "cat", "lgbm", "rf", "tabnet", "mlp", "pytorch", "svm"]): + if not all( + key in 
features + for key in ["xgb", "cat", "lgbm", "rf", "tabnet", "mlp", "pytorch", "svm"] + ): raise ValueError("JSON file missing required model keys") # Return specific model type if requested if model_type is not None: @@ -1047,7 +1151,17 @@ def import_selected_features_ensemble(model_type: Optional[str] = None) -> Union common_features = features["all"] logger.info("Returning features common to all models") return common_features - elif model_type not in ["xgb", "cat", "lgbm", "rf", "tabnet", "mlp", "pytorch", "svm", "all"]: + elif model_type not in [ + "xgb", + "cat", + "lgbm", + "rf", + "tabnet", + "mlp", + "pytorch", + "svm", + "all", + ]: raise ValueError( f"Invalid model_type: {model_type}. Must be one of: 'xgb', 'cat', 'lgbm', 'rf', 'tabnet', 'mlp', 'pytorch', 'svm', 'all'" ) @@ -1075,15 +1189,14 @@ def import_selected_features_ensemble(model_type: Optional[str] = None) -> Union raise -def create_ensemble_evaluation_set() -> pd.DataFrame: +def create_ensemble_evaluation_set() -> tuple[pd.DataFrame, pd.Series]: """Create evaluation set for ensemble training with selected features and evaluation columns. This function creates a dataset containing all features from selected_features_ensemble.json along with the target variable (is_draw) and an evaluator column for model comparison. 
Returns: - pd.DataFrame: DataFrame containing: - - All features from selected_features_ensemble - - is_draw: Target variable (1 for draw, 0 otherwise) - - evaluator: Column for model evaluation tracking + tuple[pd.DataFrame, pd.Series]: A tuple containing: + - x_val (pd.DataFrame): Features for evaluation + - y_val (pd.Series): Target variable (1 for draw, 0 for non-draw) Raises: FileNotFoundError: If required data files are not found ValueError: If data validation fails @@ -1137,22 +1250,29 @@ def create_ensemble_evaluation_set() -> pd.DataFrame: ) raise ValueError(f"Missing required features: {missing_features}") # Select features and add evaluator column - evaluation_data = data[selected_features].copy() + selected_data = data[selected_features] + evaluation_data = selected_data.copy() evaluation_data["is_draw"] = data["is_draw"] # Convert numeric columns + # Ensure evaluation_data is DataFrame before conversion + assert isinstance(evaluation_data, pd.DataFrame), Messages.EVALUATION_DATA_MUST_BE_DATAFRAME evaluation_data = convert_numeric_columns( data=evaluation_data, columns=None, drop_errors=True, fill_value=0.0, verbose=True ) + assert isinstance(evaluation_data, pd.DataFrame), Messages.EVALUATION_DATA_MUST_BE_DATAFRAME # Split into features and target - X_val = evaluation_data.drop(columns=["is_draw"]) + x_val = evaluation_data.drop(columns=["is_draw"]) y_val = evaluation_data["is_draw"] + # Ensure proper types + assert isinstance(x_val, pd.DataFrame), "x_val must be DataFrame" + assert isinstance(y_val, pd.Series), "y_val must be Series" # Final validation logger.info(f"Ensemble evaluation set created with shape: {evaluation_data.shape}") logger.info(f"Draw rate: {evaluation_data['is_draw'].mean():.2%}") - logger.info(f"Train set shape: {X_val.shape}") + logger.info(f"Train set shape: {x_val.shape}") logger.info(f"Test set shape: {y_val.shape}") - return X_val, y_val + return x_val, y_val except FileNotFoundError as e: logger.info(f"Data file not found: 
{str(e)}", error_code=DataProcessingError.FILE_NOT_FOUND) raise @@ -1171,8 +1291,8 @@ def create_ensemble_evaluation_set() -> pd.DataFrame: def import_training_data_ensemble(): """Import training data for draw predictions.""" - parquet_path = os.path.join(project_root, "data", "api_training_final.parquet") - data_path = os.path.join(project_root, "data", "api_training_final.xlsx") + parquet_path = os.path.join(project_root, FilePaths.API_TRAINING_FINAL_PARQUET) + data_path = os.path.join(project_root, FilePaths.API_TRAINING_FINAL_XLSX) # Check if parquet file exists and is valid if os.path.exists(parquet_path): @@ -1180,7 +1300,7 @@ def import_training_data_ensemble(): data = pd.read_parquet(parquet_path) logger.info(f"Loaded training data from parquet: {parquet_path}") if "is_draw" not in data.columns: - logger.info("is_draw column not found in parquet file, creating target variable") + logger.info(Messages.IS_DRAW_COLUMN_NOT_FOUND) data["is_draw"] = (data["match_outcome"] == 2).astype(int) except Exception as e: logger.info(f"Failed to load parquet file, falling back to Excel: {str(e)}") @@ -1242,32 +1362,45 @@ def import_training_data_ensemble(): if col in data.columns: data[col] = data[col].astype("int64") # Export processed data to parquet for efficient storage and retrieval - create_parquet_files(data, "data/api_training_final.parquet") + # Ensure data is a DataFrame + if isinstance(data, pd.DataFrame): + create_parquet_files(data, FilePaths.API_TRAINING_FINAL_PARQUET) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("Exported processed training data to parquet format") # Split into train and test sets train_data, test_data = train_test_split( data, test_size=0.3, random_state=42, stratify=data["is_draw"] ) + # Convert back to DataFrames since train_test_split returns numpy arrays + train_data = pd.DataFrame(train_data, columns=data.columns) + test_data = pd.DataFrame(test_data, columns=data.columns) + # Select features and 
target X_train = train_data.drop(columns="is_draw", errors="ignore") y_train = train_data["is_draw"] X_test = test_data.drop(columns="is_draw", errors="ignore") y_test = test_data["is_draw"] + # Ensure proper types + assert isinstance(X_train, pd.DataFrame), Messages.X_TRAIN_MUST_BE_DATAFRAME + assert isinstance(y_train, pd.Series), Messages.Y_TRAIN_MUST_BE_SERIES + assert isinstance(X_test, pd.DataFrame), Messages.X_TEST_MUST_BE_DATAFRAME + assert isinstance(y_test, pd.Series), Messages.Y_TEST_MUST_BE_SERIES return X_train, y_train, X_test, y_test -def save_data_to_excel(df, output_path, type): - # Replace NaN/None with empty string - df = df.fillna('') - wb = Workbook() - wb.new_sheet("Sheet1", data=[df.columns.tolist()] + df.values.tolist()) - wb.save(output_path) +def save_data_to_excel(df, output_path, sheet_name="Sheet1"): + # Replace NaN/None with empty string with type guard + assert isinstance(df, pd.DataFrame), f"df must be DataFrame, got {type(df)}" + df = df.fillna("") + # Use pandas to_excel which is more reliable + df.to_excel(output_path, sheet_name=sheet_name, index=False) return df @retry_on_error(max_retries=3, delay=1.0) def create_prediction_set_ensemble() -> pd.DataFrame: """Optimized data loading and preprocessing for predictions.""" - file_path = os.path.join(project_root, "data", "prediction", "new_api_prediction_data.xlsx") + file_path = os.path.join(project_root, FilePaths.NEW_API_PREDICTION_DATA_XLSX) logger.info(f"Loading prediction data from: {file_path}") import_selected_features_ensemble_new("all") try: @@ -1276,7 +1409,7 @@ def create_prediction_set_ensemble() -> pd.DataFrame: # Validate early if data.empty: logger.info("Empty dataset loaded", error_code=DataProcessingError.EMPTY_DATASET) - raise ValueError("Dataset is empty") + raise ValueError(Messages.DATASET_EMPTY) logger.info(f"Initial data shape: {data.shape}") data_copy = data.copy() @@ -1292,10 +1425,14 @@ def create_prediction_set_ensemble() -> pd.DataFrame: # Date handling 
if "date_encoded" not in data.columns: try: - data["date_encoded"] = ( + date_encoded_series = ( pd.to_datetime(data["Date"], errors="coerce") - pd.Timestamp("2020-08-11") ).dt.days - data["date_encoded"] = data["date_encoded"].fillna(0).astype("int64") + # Add type guard for pandas Series operations + assert isinstance(date_encoded_series, pd.Series), ( + Messages.DATE_ENCODED_MUST_BE_SERIES + ) + data["date_encoded"] = date_encoded_series.fillna(0).astype("int64") except Exception as e: logger.info( f"Date encoding failed: {str(e)}", @@ -1303,11 +1440,12 @@ def create_prediction_set_ensemble() -> pd.DataFrame: ) raise # Optimized column dropping - cols_to_drop = {"Date", "Home", "Away", "league_name"} - if cols_to_drop: - data = data.drop(columns=cols_to_drop) + cols_to_drop = ["Date", "Home", "Away", "league_name"] + existing_cols_to_drop = [col for col in cols_to_drop if col in data.columns] + if existing_cols_to_drop: + data = data.drop(columns=existing_cols_to_drop) logger.info(f"Dropped columns: {cols_to_drop}") - print(f"data.columns: {data.shape}") + logger.info(f"Data shape before numeric conversion: {data.shape}") # Numeric conversion with parallel processing data = convert_numeric_columns(data=data, drop_errors=True, fill_value=0.0, verbose=False) # Restore columns using vectorized merge @@ -1315,10 +1453,15 @@ def create_prediction_set_ensemble() -> pd.DataFrame: restore_cols = data_copy[["fixture_id", "Date", "Home", "Away", "league_name"]] # Only merge columns that don't already exist in data cols_to_restore = [col for col in restore_cols.columns if col not in data.columns] - print(f"cols_to_restore: {cols_to_restore}") + logger.info(f"Columns to restore: {cols_to_restore}") if cols_to_restore: + merge_columns = ["fixture_id"] + cols_to_restore + merge_data = restore_cols[merge_columns] + # Ensure merge_data is a DataFrame + if not isinstance(merge_data, pd.DataFrame): + merge_data = pd.DataFrame(merge_data) data = data.merge( - 
restore_cols[["fixture_id"] + cols_to_restore], + merge_data, on="fixture_id", how="left", validate="one_to_one", # Ensure no duplicate fixture_ids @@ -1342,21 +1485,27 @@ def get_real_api_scores_from_excel() -> pd.DataFrame: ValueError: If data validation fails Exception: For other processing errors """ - file_path = os.path.join(project_root, "data", "prediction", "new_api_prediction_eval.xlsx") + file_path = os.path.join(project_root, FilePaths.NEW_API_PREDICTION_EVAL_XLSX) logger.info(f"Loading match results from: {file_path}") try: # Load Excel file df = pd.read_excel(file_path) if df.empty: - logger.info("Loaded dataset is empty", error_code=DataProcessingError.EMPTY_DATASET) - raise ValueError("Dataset is empty") + logger.info(Messages.LOADED_DATASET_EMPTY, error_code=DataProcessingError.EMPTY_DATASET) + raise ValueError(Messages.DATASET_EMPTY) logger.info(f"Successfully loaded data with shape: {df.shape}") # Filter rows where match_outcome is not NA df = df.dropna(subset=["match_outcome"]) logger.info(f"Data shape after filtering NA match outcomes: {df.shape}") # Convert fixture_id column to integer type - df["fixture_id"] = pd.to_numeric(df["fixture_id"], errors="coerce").astype("Int64") + fixture_id_numeric = pd.to_numeric(df["fixture_id"], errors="coerce") + # Ensure it's a Series and handle potential NaN values + if isinstance(fixture_id_numeric, pd.Series): + df["fixture_id"] = fixture_id_numeric.fillna(-1).astype(int) + else: + # Fallback if conversion didn't return a Series + df["fixture_id"] = df["fixture_id"].astype(int) # Create new column for is_draw based on match_outcome df["is_draw"] = (df["match_outcome"] == 2).astype(int) @@ -1364,7 +1513,10 @@ def get_real_api_scores_from_excel() -> pd.DataFrame: # Select and rename relevant columns results_df = df[ ["fixture_id", "Home", "Away", "Date", "league_name", "match_outcome", "is_draw"] - ].rename( + ] + # Ensure results_df is a DataFrame before renaming + assert isinstance(results_df, 
pd.DataFrame), "results_df must be DataFrame" + results_df = results_df.rename( columns={ "Home": "home_team", "Away": "away_team", @@ -1404,18 +1556,18 @@ def update_api_data_new_for_draws(): """ try: # Load existing training data - data_path = "data/prediction/new_api_prediction_data.xlsx" - data_path_new = "data/prediction/new_api_prediction_data.xlsx" + data_path = FilePaths.NEW_API_PREDICTION_DATA_XLSX + data_path_new = FilePaths.NEW_API_PREDICTION_DATA_XLSX data = pd.read_excel(data_path) - + logger.info(f"Loaded initial data with shape: {data.shape}") - + # Extract year and month from Date column - data['Date'] = pd.to_datetime(data['Date']) - data['year'] = data['Date'].dt.year.astype(int) - data['month'] = data['Date'].dt.month.astype(int) + data["Date"] = pd.to_datetime(data["Date"]) + data["year"] = data["Date"].dt.year.astype(int) + data["month"] = data["Date"].dt.month.astype(int) logger.info("Added year and month columns from Date") - + # Initialize the feature engineer feature_engineer = AdvancedGoalFeatureEngineer() # Add advanced goal features @@ -1435,74 +1587,114 @@ def update_api_data_new_for_draws(): api_training_data = api_training_data.copy() api_training_data["is_draw"] = (api_training_data["match_outcome"] == 2).astype(int) logger.info("Added is_draw column to training data") - + # Filter data for dates after 2025-04-15 where match_outcome is not blank (evaluation data) api_prediction_eval = updated_data[ (updated_data["Date"] >= "2025-04-15") & (updated_data["match_outcome"].notna()) ] - + # Filter data for dates after 2025-04-15 where match_outcome is blank (prediction data) + # Add type guard for pandas Series operations + match_outcome_series_2 = updated_data["match_outcome"] + assert isinstance(match_outcome_series_2, pd.Series), ( + f"match_outcome must be Series, got {type(match_outcome_series_2)}" + ) api_prediction_data = updated_data[ - (updated_data["Date"] >= "2025-04-15") & (updated_data["match_outcome"].isna()) + 
(updated_data["Date"] >= "2025-04-15") & (match_outcome_series_2.isna()) ] logger.info("Data split summary:") logger.info(f" - Training data shape: {api_training_data.shape}") logger.info(f" - Evaluation data shape: {api_prediction_eval.shape}") logger.info(f" - Prediction data shape: {api_prediction_data.shape}") - + # Check if featuretools features are present in the splits - ft_features_in_data = [col for col in updated_data.columns if col.startswith('ft_')] + ft_features_in_data = [col for col in updated_data.columns if col.startswith("ft_")] if ft_features_in_data: - for dataset_name, dataset in [("Training", api_training_data), ("Evaluation", api_prediction_eval), ("Prediction", api_prediction_data)]: + for dataset_name, dataset in [ + ("Training", api_training_data), + ("Evaluation", api_prediction_eval), + ("Prediction", api_prediction_data), + ]: ft_features_present = [f for f in ft_features_in_data if f in dataset.columns] - logger.info(f" - {dataset_name} data has {len(ft_features_present)} featuretools features") - + logger.info( + f" - {dataset_name} data has {len(ft_features_present)} featuretools features" + ) + # Concatenate the filtered dataframes for final output updated_data = pd.concat([api_prediction_eval, api_prediction_data], ignore_index=True) # Export training data - save_data_to_excel(api_training_data, "data/new_api_training_final.xlsx", "api_training_final") - create_parquet_files(api_training_data, "data/new_api_training_final.parquet") + save_data_to_excel( + api_training_data, FilePaths.NEW_API_TRAINING_FINAL_XLSX, "api_training_final" + ) + # Ensure api_training_data is a DataFrame + if isinstance(api_training_data, pd.DataFrame): + create_parquet_files(api_training_data, FilePaths.NEW_API_TRAINING_FINAL_PARQUET) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("api_training_final.xlsx and .parquet updated with featuretools features") # Export evaluation data save_data_to_excel( - api_prediction_eval, 
"data/prediction/new_api_prediction_eval.xlsx", "api_prediction_eval" + api_prediction_eval, + FilePaths.NEW_API_PREDICTION_EVAL_XLSX, + "api_prediction_eval", ) - create_parquet_files(api_prediction_eval, "data/prediction/new_api_prediction_eval.parquet") + # Ensure api_prediction_eval is a DataFrame + if isinstance(api_prediction_eval, pd.DataFrame): + create_parquet_files( + api_prediction_eval, FilePaths.NEW_API_PREDICTION_EVAL_PARQUET + ) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("api_prediction_eval.xlsx and .parquet updated with featuretools features") # Export prediction data save_data_to_excel( - api_prediction_data, "data/prediction/new_api_predictions_data.xlsx", "api_prediction_data" + api_prediction_data, + FilePaths.NEW_API_PREDICTIONS_DATA_XLSX, + "api_prediction_data", ) - create_parquet_files(api_prediction_data, "data/prediction/new_api_predictions_data.parquet") + # Ensure api_prediction_data is a DataFrame + if isinstance(api_prediction_data, pd.DataFrame): + create_parquet_files( + api_prediction_data, FilePaths.NEW_API_PREDICTIONS_DATA_PARQUET + ) + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("api_predictions_data.xlsx and .parquet updated with featuretools features") - + # Save updated data back to Excel updated_data.to_excel(data_path_new, index=False) - + logger.info("=== Data Update Complete ===") logger.info(f"Final data shape: {updated_data.shape}") - + # Feature summary if ft_features_in_data: - logger.info(f"Successfully integrated {len(ft_features_in_data)} automated features from featuretools") - logger.info("Your ensemble models can now use these enhanced features for better performance") + logger.info( + f"Successfully integrated {len(ft_features_in_data)} automated features from featuretools" + ) + logger.info( + "Your ensemble models can now use these enhanced features for better performance" + ) else: - logger.info("Data updated with existing feature 
engineering (featuretools features not added)") - + logger.info( + "Data updated with existing feature engineering (featuretools features not added)" + ) + except Exception as e: - logger.error(f"Error updating training data for draws: {str(e)}") + logger.info(f"Error updating training data for draws: {str(e)}") import traceback + traceback.print_exc() def import_training_data_ensemble_new(): """Import training data for draw predictions.""" - parquet_path = os.path.join(project_root, "data", "new_api_training_final.parquet") - data_path = os.path.join(project_root, "data", "new_api_training_final.xlsx") + parquet_path = os.path.join(project_root, FilePaths.NEW_API_TRAINING_FINAL_PARQUET) + data_path = os.path.join(project_root, FilePaths.NEW_API_TRAINING_FINAL_XLSX) # Check if parquet file exists and is valid if os.path.exists(parquet_path): @@ -1510,7 +1702,7 @@ def import_training_data_ensemble_new(): data = pd.read_parquet(parquet_path) logger.info(f"Loaded training data from parquet: {parquet_path}") if "is_draw" not in data.columns: - logger.info("is_draw column not found in parquet file, creating target variable") + logger.info(Messages.IS_DRAW_COLUMN_NOT_FOUND) data["is_draw"] = (data["match_outcome"] == 2).astype(int) # Drop rows where home_failed_to_score_away is NA # data = data.dropna(subset=['home_failed_to_score_away']) @@ -1573,12 +1765,20 @@ def import_training_data_ensemble_new(): if col in data.columns: data[col] = data[col].astype("int64") # Export processed data to parquet for efficient storage and retrieval - create_parquet_files(data, "data/new_api_training_final.parquet") + # Ensure data is a DataFrame + if isinstance(data, pd.DataFrame): + create_parquet_files(data, "data/new_api_training_final.parquet") + else: + logger.warning(Messages.DATA_NOT_DATAFRAME_SKIP_PARQUET) logger.info("Exported processed training data to parquet format") # Split into train and test sets train_data, test_data = train_test_split( data, test_size=0.2, 
random_state=42, stratify=data["is_draw"] ) + # Convert back to DataFrames since train_test_split returns numpy arrays + train_data = pd.DataFrame(train_data, columns=data.columns) + test_data = pd.DataFrame(test_data, columns=data.columns) + # Select features and target X_train = train_data.drop(columns="is_draw", errors="ignore") y_train = train_data["is_draw"] X_test = test_data.drop(columns="is_draw", errors="ignore") @@ -1586,7 +1786,7 @@ def import_training_data_ensemble_new(): return X_train, y_train, X_test, y_test -def create_evaluation_set_new() -> pd.DataFrame: +def create_evaluation_set_new() -> tuple[pd.DataFrame, pd.Series]: """Create evaluation set for ensemble training with selected features and evaluation columns. This function creates a dataset containing all features from selected_features_ensemble.json along with the target variable (is_draw) and an evaluator column for model comparison. @@ -1602,7 +1802,9 @@ def create_evaluation_set_new() -> pd.DataFrame: """ try: # Load training data - data_path = os.path.join(project_root, "data", "prediction", "new_api_prediction_eval.parquet") + data_path = os.path.join( + project_root, FilePaths.NEW_API_PREDICTION_EVAL_PARQUET + ) logger.info(f"Loading training data from: {data_path}") data = pd.read_parquet(data_path) # Create target variable @@ -1648,22 +1850,30 @@ def create_evaluation_set_new() -> pd.DataFrame: ) raise ValueError(f"Missing required features: {missing_features}") # Select features and add evaluator column - evaluation_data = data[selected_features].copy() + selected_data = data[selected_features] + evaluation_data = selected_data.copy() evaluation_data["is_draw"] = data["is_draw"] # Convert numeric columns + # Ensure evaluation_data is DataFrame before conversion + assert isinstance(evaluation_data, pd.DataFrame), Messages.EVALUATION_DATA_MUST_BE_DATAFRAME evaluation_data = convert_numeric_columns( data=evaluation_data, columns=None, drop_errors=True, fill_value=0.0, verbose=True ) + 
assert isinstance(evaluation_data, pd.DataFrame), Messages.EVALUATION_DATA_MUST_BE_DATAFRAME # Split into features and target - X_val = evaluation_data.drop(columns=["is_draw"]) + x_val = evaluation_data.drop(columns=["is_draw"]) y_val = evaluation_data["is_draw"] + # Ensure proper types + assert isinstance(x_val, pd.DataFrame), "x_val must be DataFrame" + assert isinstance(y_val, pd.Series), "y_val must be Series" # Final validation logger.info(f"Ensemble evaluation set created with shape: {evaluation_data.shape}") logger.info(f"Draw rate: {evaluation_data['is_draw'].mean():.2%}") - logger.info(f"Train set shape: {X_val.shape}") + logger.info(f"Train set shape: {x_val.shape}") logger.info(f"Test set shape: {y_val.shape}") - return X_val, y_val + return x_val, y_val + except FileNotFoundError as e: logger.info(f"Data file not found: {str(e)}", error_code=DataProcessingError.FILE_NOT_FOUND) raise @@ -1709,7 +1919,10 @@ def import_selected_features_ensemble_new(model_type: Optional[str] = None) -> U with open(json_path) as f: features = json.load(f) # Validate loaded data structure - if not all(key in features for key in ["xgb", "catboost", "lgbm", "rf", "tabnet", "mlp", "pytorch", "svm"]): + if not all( + key in features + for key in ["xgb", "catboost", "lgbm", "rf", "tabnet", "mlp", "pytorch", "svm"] + ): raise ValueError("JSON file missing required model keys") # Return specific model type if requested if model_type is not None: @@ -1717,7 +1930,17 @@ def import_selected_features_ensemble_new(model_type: Optional[str] = None) -> U common_features = features["all"] logger.info("Returning features common to all models") return common_features - elif model_type not in ["xgb", "catboost", "lgbm", "rf", "tabnet", "mlp", "pytorch", "svm", "all"]: + elif model_type not in [ + "xgb", + "catboost", + "lgbm", + "rf", + "tabnet", + "mlp", + "pytorch", + "svm", + "all", + ]: raise ValueError( f"Invalid model_type: {model_type}. 
Must be one of: 'xgb', 'catboost', 'lgbm', 'rf', 'tabnet', 'mlp', 'pytorch', 'svm', 'all'" ) @@ -1745,130 +1968,245 @@ def import_selected_features_ensemble_new(model_type: Optional[str] = None) -> U raise -def create_enhanced_categorical_features(data): - """ - Create enhanced categorical features that leverage team, league, and temporal interactions. - - Args: - X_train (pd.DataFrame): Training features - X_test (pd.DataFrame): Test features - X_eval (pd.DataFrame): Evaluation features - - Returns: - tuple: (X_train_enhanced, X_test_enhanced, X_eval_enhanced) - """ - logger.info("Creating enhanced categorical features for better model performance") - - # Create copies to avoid modifying original data - data_enh = data.copy() +def _clean_base_encoded_columns(data): + """Clean NaN, inf, and -inf values in base encoded columns.""" + data_clean = data.copy() - # Handle NaN, inf, and -inf values in base encoded columns base_encoded_columns = [ - 'home_encoded', 'away_encoded', 'venue_encoded', 'league_encoded', - 'season_encoded', 'referee_encoded','away_league_position','home_league_position' + "home_encoded", + "away_encoded", + "venue_encoded", + "league_encoded", + "season_encoded", + "referee_encoded", + "away_league_position", + "home_league_position", ] - + for col in base_encoded_columns: - if col in data_enh.columns: + if col in data_clean.columns: # Replace NaN, inf, and -inf with 0 - data_enh[col] = data_enh[col].replace([np.inf, -np.inf], np.nan).fillna(0) + # Add type guards for pandas Series operations + col_series = data_clean[col] + assert isinstance(col_series, pd.Series), ( + f"data_clean[{col}] must be Series, got {type(col_series)}" + ) + data_clean[col] = col_series.replace([np.inf, -np.inf], np.nan).fillna(0) # Ensure integer type for encoded columns - data_enh[col] = data_enh[col].astype(int) - + data_clean[col] = data_clean[col].astype(int) + logger.info("Replaced NaN, inf, and -inf values in base encoded columns with 0") + return data_clean + 
+ +def _add_team_league_interactions(data): + """Add team-league interaction features.""" + if "home_encoded" not in data.columns or "league_encoded" not in data.columns: + return data + encoder = LabelEncoder() - - for df in [data_enh]: - # 1. Team-League Interaction Features - if 'home_encoded' in df.columns and 'league_encoded' in df.columns: - df['home_team_league'] = df['home_encoded'].astype(str) + '_' + df['league_encoded'].astype(str) - df['away_team_league'] = df['away_encoded'].astype(str) + '_' + df['league_encoded'].astype(str) - df['home_team_league_encoded'] = encoder.fit_transform(df['home_team_league']) - df['away_team_league_encoded'] = encoder.fit_transform(df['away_team_league']) - - # 2. Team Matchup Feature (captures historical team dynamics) - if 'home_encoded' in df.columns and 'away_encoded' in df.columns: - # Create consistent team pair identifier (smaller ID first) - df['team_pair'] = df.apply( - lambda row: f"{min(row['home_encoded'], row['away_encoded'])}_{max(row['home_encoded'], row['away_encoded'])}", - axis=1 - ) - df['team_pair_encoded'] = encoder.fit_transform(df['team_pair']) - - # 3. Season-League Context - if 'season_encoded' in df.columns and 'league_encoded' in df.columns: - df['season_league'] = df['season_encoded'].astype(str) + '_' + df['league_encoded'].astype(str) - df['season_league_encoded'] = encoder.fit_transform(df['season_league']) - - # 4. Venue-League Context (captures venue effects within leagues) - if 'venue_encoded' in df.columns and 'league_encoded' in df.columns: - df['venue_league'] = df['venue_encoded'].astype(str) + '_' + df['league_encoded'].astype(str) - df['venue_league_encoded'] = encoder.fit_transform(df['venue_league']) - - # 5. Temporal Features for Seasonality - if 'year' in df.columns and 'month' in df.columns: - df['year_month'] = df['year'].astype(str) + '_' + df['month'].astype(str) - df['year_month_encoded'] = encoder.fit_transform(df['year_month']) - - # 6. 
Team Strength Tier (based on actual league positions) - if 'home_league_position' in df.columns and 'away_league_position' in df.columns: - # Create team strength tiers based on league positions - # 1-4: tier 1 (elite), 5-8: tier 2 (strong), 9-12: tier 3 (medium), 13+: tier 4 (weak) - def get_strength_tier(position): - if pd.isna(position): - return 0 # Unknown/missing position - elif position <= 4: - return 1 # Elite (top 4) - elif position <= 8: - return 2 # Strong (5-8) - elif position <= 12: - return 3 # Medium (9-12) - else: - return 4 # Weak (13+) - - df['home_strength_tier'] = df['home_league_position'].apply(get_strength_tier) - df['away_strength_tier'] = df['away_league_position'].apply(get_strength_tier) - - # Create strength difference feature (useful for predicting outcomes) - df['strength_tier_difference'] = abs(df['home_strength_tier'] - df['away_strength_tier']) - - logger.info(f"Home strength tier distribution: {df['home_strength_tier'].value_counts().sort_index().to_dict()}") - logger.info(f"Away strength tier distribution: {df['away_strength_tier'].value_counts().sort_index().to_dict()}") - logger.info(f"Strength difference distribution: {df['strength_tier_difference'].value_counts().sort_index().to_dict()}") - - # 7. 
Match Competitiveness Level (average match quality) - if 'home_strength_tier' in df.columns and 'away_strength_tier' in df.columns: - # Create competitiveness as average of both team tiers: (home_tier + away_tier) / 2 - df['match_competitiveness'] = (df['home_strength_tier'] + df['away_strength_tier']) / 2 - + data_copy = data.copy() + + data_copy["home_team_league"] = ( + data_copy["home_encoded"].astype(str) + "_" + data_copy["league_encoded"].astype(str) + ) + data_copy["away_team_league"] = ( + data_copy["away_encoded"].astype(str) + "_" + data_copy["league_encoded"].astype(str) + ) + data_copy["home_team_league_encoded"] = encoder.fit_transform(data_copy["home_team_league"]) + data_copy["away_team_league_encoded"] = encoder.fit_transform(data_copy["away_team_league"]) + + return data_copy + + +def _add_team_matchup_features(data): + """Add team matchup features capturing historical team dynamics.""" + if "home_encoded" not in data.columns or "away_encoded" not in data.columns: + return data + + encoder = LabelEncoder() + data_copy = data.copy() + + # Create consistent team pair identifier (smaller ID first) + data_copy["team_pair"] = data_copy.apply( + lambda row: f"{min(row['home_encoded'], row['away_encoded'])}_{max(row['home_encoded'], row['away_encoded'])}", + axis=1, + ) + data_copy["team_pair_encoded"] = encoder.fit_transform(data_copy["team_pair"]) + + return data_copy + + +def _add_season_league_context(data): + """Add season-league context features.""" + if "season_encoded" not in data.columns or "league_encoded" not in data.columns: + return data + + encoder = LabelEncoder() + data_copy = data.copy() + + data_copy["season_league"] = ( + data_copy["season_encoded"].astype(str) + "_" + data_copy["league_encoded"].astype(str) + ) + data_copy["season_league_encoded"] = encoder.fit_transform(data_copy["season_league"]) + + return data_copy + + +def _add_venue_league_context(data): + """Add venue-league context features capturing venue effects within 
leagues.""" + if "venue_encoded" not in data.columns or "league_encoded" not in data.columns: + return data + + encoder = LabelEncoder() + data_copy = data.copy() + + data_copy["venue_league"] = ( + data_copy["venue_encoded"].astype(str) + "_" + data_copy["league_encoded"].astype(str) + ) + data_copy["venue_league_encoded"] = encoder.fit_transform(data_copy["venue_league"]) + + return data_copy + + +def _add_temporal_features(data): + """Add temporal features for seasonality.""" + if "year" not in data.columns or "month" not in data.columns: + return data + + encoder = LabelEncoder() + data_copy = data.copy() + + data_copy["year_month"] = data_copy["year"].astype(str) + "_" + data_copy["month"].astype(str) + data_copy["year_month_encoded"] = encoder.fit_transform(data_copy["year_month"]) + + return data_copy + + +def _get_strength_tier(position): + """Convert league position to strength tier.""" + if pd.isna(position): + return 0 # Unknown/missing position + elif position <= 4: + return 1 # Elite (top 4) + elif position <= 8: + return 2 # Strong (5-8) + elif position <= 12: + return 3 # Medium (9-12) + else: + return 4 # Weak (13+) + + +def _add_strength_tier_features(data): + """Add team strength tier features based on league positions.""" + if "home_league_position" not in data.columns or "away_league_position" not in data.columns: + return data + + data_copy = data.copy() + + data_copy["home_strength_tier"] = data_copy["home_league_position"].apply(_get_strength_tier) + data_copy["away_strength_tier"] = data_copy["away_league_position"].apply(_get_strength_tier) + + # Create strength difference feature (useful for predicting outcomes) + data_copy["strength_tier_difference"] = abs( + data_copy["home_strength_tier"] - data_copy["away_strength_tier"] + ) + + logger.info( + f"Home strength tier distribution: {data_copy['home_strength_tier'].value_counts().sort_index().to_dict()}" + ) + logger.info( + f"Away strength tier distribution: 
{data_copy['away_strength_tier'].value_counts().sort_index().to_dict()}" + ) + logger.info( + f"Strength difference distribution: {data_copy['strength_tier_difference'].value_counts().sort_index().to_dict()}" + ) + + return data_copy + + +def _add_competitiveness_features(data): + """Add match competitiveness level features.""" + if "home_strength_tier" not in data.columns or "away_strength_tier" not in data.columns: + return data + + data_copy = data.copy() + # Create competitiveness as average of both team tiers: (home_tier + away_tier) / 2 + data_copy["match_competitiveness"] = ( + data_copy["home_strength_tier"] + data_copy["away_strength_tier"] + ) / 2 + + return data_copy + + +def _log_new_features(data): + """Log information about newly created features.""" + new_features = [ + "home_team_league_encoded", + "away_team_league_encoded", + "team_pair_encoded", + "season_league_encoded", + "venue_league_encoded", + "year_month_encoded", + "home_strength_tier", + "away_strength_tier", + "match_competitiveness", + ] + logger.info("Enhanced categorical features created:") - new_features = ['home_team_league_encoded', 'away_team_league_encoded', 'team_pair_encoded', 'season_league_encoded', - 'venue_league_encoded', 'year_month_encoded', 'home_strength_tier', 'away_strength_tier', - 'match_competitiveness'] - for feature in new_features: - if feature in data_enh.columns: - logger.info(f" - {feature}: {data_enh[feature].nunique()} unique values") - + if feature in data.columns: + logger.info(f" - {feature}: {data[feature].nunique()} unique values") + + +def create_enhanced_categorical_features(data): + """ + Create enhanced categorical features that leverage team, league, and temporal interactions. 
+ + Args: + data (pd.DataFrame): Input dataset + + Returns: + pd.DataFrame: Dataset with enhanced categorical features + """ + logger.info("Creating enhanced categorical features for better model performance") + + # Clean base encoded columns + data_enh = _clean_base_encoded_columns(data) + + # Add various categorical feature enhancements + data_enh = _add_team_league_interactions(data_enh) + data_enh = _add_team_matchup_features(data_enh) + data_enh = _add_season_league_context(data_enh) + data_enh = _add_venue_league_context(data_enh) + data_enh = _add_temporal_features(data_enh) + data_enh = _add_strength_tier_features(data_enh) + data_enh = _add_competitiveness_features(data_enh) + + # Log feature creation summary + _log_new_features(data_enh) + return data_enh + def calculate_league_draw_k_factors(data): """Calculate draw-specific K-factors for all leagues and add to data.""" league_draw_k_factors = {} - + unique_leagues = data["league_encoded"].unique() for league in unique_leagues: league_data = data[data["league_encoded"] == league] draw_k_factor = calculate_draw_k_factor(league_data, logger) league_draw_k_factors[league] = draw_k_factor - + logger.info(f"League {league} - Draw K-factor: {draw_k_factor}") - + # Add k_factor column to data based on league_encoded - data['k_factor'] = data['league_encoded'].map(league_draw_k_factors) - + data["k_factor"] = data["league_encoded"].map(league_draw_k_factors) + return data + @retry_on_error(max_retries=3, delay=1.0) def import_training_data_ensemble_date_stratified(): """ @@ -1876,13 +2214,15 @@ def import_training_data_ensemble_date_stratified(): are selected as the validation set, ensuring temporal consistency and preventing data leakage while maintaining representative samples across all dates. 
""" - parquet_path = os.path.join(project_root, "data", "new_api_training_final.parquet") - parquet_path_new = os.path.join(project_root, "data", "prediction", "new_api_prediction_eval.parquet") - data_path = os.path.join(project_root, "data", "new_api_training_final.xlsx") - data_path_new = os.path.join(project_root, "data", "prediction", "new_api_prediction_eval.xlsx") + parquet_path = os.path.join(project_root, FilePaths.NEW_API_TRAINING_FINAL_PARQUET) + parquet_path_new = os.path.join( + project_root, FilePaths.NEW_API_PREDICTION_EVAL_PARQUET + ) + data_path = os.path.join(project_root, FilePaths.NEW_API_TRAINING_FINAL_XLSX) + data_path_new = os.path.join(project_root, FilePaths.NEW_API_PREDICTION_EVAL_XLSX) logger.info("Starting date-stratified training data import") - + # Check if parquet file exists and is valid if os.path.exists(parquet_path): try: @@ -1890,26 +2230,27 @@ def import_training_data_ensemble_date_stratified(): data_val = pd.read_parquet(parquet_path_new) logger.info(f"Loaded training data from parquet: {parquet_path}") if "is_draw" not in data.columns: - logger.info("is_draw column not found in parquet file, creating target variable") + logger.info(Messages.IS_DRAW_COLUMN_NOT_FOUND) data["is_draw"] = (data["match_outcome"] == 2).astype(int) if "is_draw" not in data_val.columns: - logger.info("is_draw column not found in parquet file, creating target variable") + logger.info(Messages.IS_DRAW_COLUMN_NOT_FOUND) data_val["is_draw"] = (data_val["match_outcome"] == 2).astype(int) except Exception as e: logger.info(f"Failed to load parquet file, falling back to Excel: {str(e)}") data = pd.read_excel(data_path) + data_val = pd.read_excel(data_path_new) else: data = pd.read_excel(data_path) data_val = pd.read_excel(data_path_new) logger.info(f"Loaded training data from Excel: {data_path}") - + # Create target variable data["is_draw"] = (data["match_outcome"] == 2).astype(int) - + # Select features and target - columns_to_drop = [ + columns_to_drop: 
list[str] = [ "match_outcome", - "home_goals", + "home_goals", "away_goals", "total_goals", "score", @@ -1917,7 +2258,7 @@ def import_training_data_ensemble_date_stratified(): "draw", "venue_name", "Home", - "Away", + "Away", "away_win", "Date", "date", @@ -1932,7 +2273,7 @@ def import_training_data_ensemble_date_stratified(): "mid_season_factor", ] data = data.drop(columns=columns_to_drop, errors="ignore") - + # Convert all numeric-like columns data = convert_numeric_columns( data=data, @@ -1944,7 +2285,7 @@ def import_training_data_ensemble_date_stratified(): # Define integer columns that should remain as int64 int_columns = [ "h2h_draws", - "home_h2h_wins", + "home_h2h_wins", "h2h_matches", "Away_points_cum", "Home_points_cum", @@ -1953,7 +2294,7 @@ def import_training_data_ensemble_date_stratified(): "venue_encoded", "date_encoded", ] - + # Convert integer columns back to int64 for col in int_columns: data[col] = data[col].astype("int64") @@ -1961,19 +2302,23 @@ def import_training_data_ensemble_date_stratified(): common_columns = list(set(data.columns) & set(data_val.columns)) data = data[common_columns] data_val = data_val[common_columns] - + data = pd.concat([data, data_val], ignore_index=True) + # Ensure data is a DataFrame after concat + assert isinstance(data, pd.DataFrame), "data must be DataFrame after concat" logger.info(f"Merged training and validation data, total shape: {data.shape}") # Validate date_encoded column exists if "date_encoded" not in data.columns: logger.info( "date_encoded column not found in data", - error_code=DataProcessingError.MISSING_REQUIRED_COLUMNS + error_code=DataProcessingError.MISSING_REQUIRED_COLUMNS, ) raise ValueError("date_encoded column is required for date-stratified splitting") - + # Analyze date distribution - date_counts = data['date_encoded'].value_counts().sort_index() + date_encoded_series = data["date_encoded"] + assert isinstance(date_encoded_series, pd.Series), "date_encoded must be a Series" + date_counts = 
date_encoded_series.value_counts().sort_index() logger.info(f"Found {len(date_counts)} unique dates in dataset") logger.info(f"Date range: {date_counts.index.min()} to {date_counts.index.max()}") logger.info(f"Average matches per date: {date_counts.mean():.1f}") @@ -1981,49 +2326,64 @@ def import_training_data_ensemble_date_stratified(): # Drop last 3 dates to prevent data leakage sorted_dates = sorted(date_counts.index) dates_to_drop = sorted_dates[-7:] # Get the last 7 dates - + if dates_to_drop: logger.info(f"Dropping last 3 dates to prevent data leakage: {dates_to_drop}") - data = data[~data['date_encoded'].isin(dates_to_drop)] - + data = data[~date_encoded_series.isin(dates_to_drop)] + # Update date counts after dropping - date_counts = data['date_encoded'].value_counts().sort_index() + date_encoded_series = data["date_encoded"] + assert isinstance(date_encoded_series, pd.Series), "date_encoded must be a Series" + date_counts = date_encoded_series.value_counts().sort_index() logger.info(f"After dropping last 3 dates: {len(date_counts)} unique dates remaining") logger.info(f"New date range: {date_counts.index.min()} to {date_counts.index.max()}") - logger.info(f"Dropped {len(data) - data.shape[0] if 'original_shape' in locals() else 'unknown'} samples") - + logger.info( + f"Dropped {len(data) - data.shape[0] if 'original_shape' in locals() else 'unknown'} samples" + ) + # Check for dates with insufficient samples min_samples_per_date = 10 # Minimum to ensure at least 1 validation sample (15% of 7 = 1.05) insufficient_dates = date_counts[date_counts < min_samples_per_date] if len(insufficient_dates) > 0: - logger.info(f"Warning: {len(insufficient_dates)} dates have fewer than {min_samples_per_date} samples") - + logger.info( + f"Warning: {len(insufficient_dates)} dates have fewer than {min_samples_per_date} samples" + ) + # Perform date-stratified split train_indices = [] val_indices = [] error_date_count = 0 for date_encoded in date_counts.index: # Get all 
samples for this date - date_mask = data['date_encoded'] == date_encoded + date_mask = data["date_encoded"] == date_encoded date_data = data[date_mask] - + # Ensure date_data is DataFrame + assert isinstance(date_data, pd.DataFrame), "date_data must be DataFrame" + if len(date_data) < min_samples_per_date: # If too few samples, put all in training set train_indices.extend(date_data.index.tolist()) continue - + # Stratified split within this date to maintain draw rate try: date_train_idx, date_val_idx = train_test_split( date_data.index, test_size=0.30, # 15% for validation random_state=42, - stratify=date_data['is_draw'] + stratify=date_data["is_draw"], ) - train_indices.extend(date_train_idx.tolist()) - val_indices.extend(date_val_idx.tolist()) - - except ValueError as e: + # Convert indices to list format for extending + try: + train_indices.extend(date_train_idx.tolist()) # type: ignore + except AttributeError: + train_indices.extend(list(date_train_idx)) + try: + val_indices.extend(date_val_idx.tolist()) # type: ignore + except AttributeError: + val_indices.extend(list(date_val_idx)) + + except ValueError: # logger.info(f"Stratification failed for date {date_encoded}") error_date_count += 1 data_train_idx = date_data.index @@ -2033,43 +2393,188 @@ def import_training_data_ensemble_date_stratified(): # Create train and validation sets train_data = data.loc[train_indices] val_data = data.loc[val_indices] - + # Split training data further to create test set (20% of training data) - X_train, X_test, y_train, y_test = train_test_split( - train_data.drop(columns="is_draw", errors="ignore"), - train_data["is_draw"], + x_train_full = train_data.drop(columns="is_draw", errors="ignore") + y_train_full = train_data["is_draw"] + + x_train_split, x_test_split, y_train_split, y_test_split = train_test_split( + x_train_full, + y_train_full, test_size=0.2, random_state=42, - stratify=train_data["is_draw"] + stratify=y_train_full, ) - + + # Convert numpy arrays back to 
DataFrames/Series + x_train_split = pd.DataFrame(x_train_split, columns=x_train_full.columns) + x_test_split = pd.DataFrame(x_test_split, columns=x_train_full.columns) + y_train_split = pd.Series(y_train_split, name="is_draw") + y_test_split = pd.Series(y_test_split, name="is_draw") + # Update train_data to be the reduced training set - train_data = pd.concat([X_train, y_train], axis=1) - test_data = pd.concat([X_test, y_test], axis=1) + train_data = pd.concat([x_train_split, y_train_split], axis=1) + test_data = pd.concat([x_test_split, y_test_split], axis=1) # Prepare features and targets + # Select features and target X_train = train_data.drop(columns="is_draw", errors="ignore") y_train = train_data["is_draw"] X_test = test_data.drop(columns="is_draw", errors="ignore") y_test = test_data["is_draw"] - X_val = val_data.drop(columns="is_draw", errors="ignore") + x_val = val_data.drop(columns="is_draw", errors="ignore") y_val = val_data["is_draw"] - + # Log split statistics logger.info("Date-stratified split completed:") - logger.info(f"Training set: {len(X_train)} samples ({len(X_train)/len(data)*100:.1f}%)") - logger.info(f"Validation set: {len(X_val)} samples ({len(X_val)/len(data)*100:.1f}%)") + logger.info(f"Training set: {len(X_train)} samples ({len(X_train) / len(data) * 100:.1f}%)") + logger.info(f"Validation set: {len(x_val)} samples ({len(x_val) / len(data) * 100:.1f}%)") logger.info(f"Training draw rate: {y_train.mean():.2%}") logger.info(f"Validation draw rate: {y_val.mean():.2%}") - + # Final validation - if len(X_val) == 0: + if len(x_val) == 0: logger.info( "Validation set is empty after date-stratified split", - error_code=DataProcessingError.INSUFFICIENT_SAMPLES + error_code=DataProcessingError.INSUFFICIENT_SAMPLES, ) - raise ValueError("Validation set is empty - check date distribution and minimum sample requirements") - - return X_train, y_train, X_test, y_test, X_val, y_val + raise ValueError( + "Validation set is empty - check date distribution 
and minimum sample requirements" + ) + + return X_train, y_train, X_test, y_test, x_val, y_val + + +def _load_and_validate_featuretools_data(): + """Load training data and validate featuretools features are present.""" + logger.info("Loading training data for feature evaluation...") + X_train, y_train, _, _, _, _ = import_training_data_ensemble_date_stratified() + + ft_features = [col for col in X_train.columns if col.startswith("ft_")] + + if not ft_features: + logger.warning("No featuretools features found in training data") + logger.info("Run update_api_data_new_for_draws() first to generate featuretools features") + return None, None, None + + logger.info(f"Found {len(ft_features)} featuretools features in training data") + return X_train, y_train, ft_features + + +def _evaluate_featuretools_features(X_train, y_train, ft_features): + """Evaluate featuretools features using multiple importance methods.""" + from src.utils.featuretools_automated_features import SoccerFeaturetoolsEngineer + + logger.info("Evaluating featuretools features using multiple methods...") + + ft_engineer = SoccerFeaturetoolsEngineer(experiment_logger=logger) + + # Get top features using different evaluation methods + top_features_mi = ft_engineer.evaluate_feature_importance( + X=X_train, y=y_train, new_feature_names=ft_features, method="mutual_info", top_k=50 + ) + + top_features_corr = ft_engineer.evaluate_feature_importance( + X=X_train, y=y_train, new_feature_names=ft_features, method="correlation", top_k=50 + ) + + top_features_combined = ft_engineer.evaluate_feature_importance( + X=X_train, y=y_train, new_feature_names=ft_features, method="combined", top_k=100 + ) + + logger.info("Evaluation complete:") + logger.info(f" - Top features by mutual info: {len(top_features_mi)}") + logger.info(f" - Top features by correlation: {len(top_features_corr)}") + logger.info(f" - Top features by combined score: {len(top_features_combined)}") + + return top_features_mi, top_features_corr, 
top_features_combined + + +def _get_model_feature_strategies(top_features_mi, top_features_corr, top_features_combined): + """Define model-specific feature selection strategies.""" + return { + "xgb": top_features_combined[:25], # XGBoost handles many features well + "catboost": top_features_mi[:20], # CatBoost + mutual info for categorical handling + "lgbm": top_features_combined[:20], # LightGBM similar to XGBoost + "rf": top_features_corr[:15], # Random Forest + correlation + "tabnet": top_features_combined[:30], # TabNet can handle complex interactions + "mlp": top_features_corr[:15], # Neural networks + correlation for linear relationships + "pytorch": top_features_combined[:25], # PyTorch can handle complex interactions + "svm": top_features_corr[:10], # SVM + correlation for linear relationships + } + + +def _update_model_selections(updated_selections, model_feature_strategies): + """Update feature selections for each model type.""" + for model_name, selected_ft_features in model_feature_strategies.items(): + if model_name in updated_selections: + # Remove duplicates while preserving order + new_model_features = [] + existing_model_features = set(updated_selections[model_name]) + + for feature in selected_ft_features: + if feature not in existing_model_features: + new_model_features.append(feature) + existing_model_features.add(feature) + + # Add new featuretools features to existing selection + updated_selections[model_name].extend(new_model_features) + logger.info(f"Added {len(new_model_features)} featuretools features to {model_name}") + + if new_model_features: + logger.info(f" {model_name} examples: {new_model_features[:3]}") + + +def _update_all_selection(updated_selections, top_features_combined): + """Update the 'all' selection with top featuretools features.""" + all_top_ft_features = list(dict.fromkeys(top_features_combined)) # Remove duplicates + existing_all_features = set(updated_selections.get("all", [])) + new_all_features = [f for f in 
all_top_ft_features if f not in existing_all_features] + + if "all" not in updated_selections: + updated_selections["all"] = [] + updated_selections["all"].extend(new_all_features) + + return new_all_features + + +def _create_integration_report(ft_features, new_all_features): + """Create and log feature integration analysis report.""" + logger.info("Integration summary:") + logger.info(f" - Total featuretools features available: {len(ft_features)}") + logger.info(f" - Featuretools features selected for 'all': {len(new_all_features)}") + logger.info( + f" - Selection efficiency: {len(new_all_features)}/{len(ft_features)} = {len(new_all_features) / len(ft_features) * 100:.1f}%" + ) + + # Create feature analysis report + feature_analysis = { + "temporal_features": [f for f in ft_features if "ft_temporal_" in f], + "relational_features": [f for f in ft_features if "ft_relational_" in f], + "interaction_features": [ + f + for f in ft_features + if f.startswith("ft_") and "temporal" not in f and "relational" not in f + ], + "selected_temporal": [f for f in new_all_features if "ft_temporal_" in f], + "selected_relational": [f for f in new_all_features if "ft_relational_" in f], + "selected_interaction": [ + f + for f in new_all_features + if f.startswith("ft_") and "temporal" not in f and "relational" not in f + ], + } + + logger.info("Feature type analysis:") + for feature_type, features in feature_analysis.items(): + if "selected_" in feature_type: + original_type = feature_type.replace("selected_", "") + if original_type in feature_analysis: + original_count = len(feature_analysis[original_type]) + selected_count = len(features) + if original_count > 0: + logger.info( + f" {feature_type}: {selected_count}/{original_count} ({selected_count / original_count * 100:.1f}%)" + ) def integrate_featuretools_features_to_selection(): @@ -2079,197 +2584,105 @@ def integrate_featuretools_features_to_selection(): using intelligent evaluation methods. 
""" try: - from src.utils.featuretools_automated_features import SoccerFeaturetoolsEngineer - logger.info("=== Integrating Featuretools Features to Selection ===") - - # Load training data to evaluate featuretools features - logger.info("Loading training data for feature evaluation...") - X_train, y_train, _, _, _, _ = import_training_data_ensemble_date_stratified() - - # Check if featuretools features are present - ft_features = [col for col in X_train.columns if col.startswith('ft_')] - - if not ft_features: - logger.warning("No featuretools features found in training data") - logger.info("Run update_api_data_new_for_draws() first to generate featuretools features") + + # Load and validate data + X_train, y_train, ft_features = _load_and_validate_featuretools_data() + if X_train is None: return - - logger.info(f"Found {len(ft_features)} featuretools features in training data") - - # Initialize featuretools engineer for feature evaluation - ft_engineer = SoccerFeaturetoolsEngineer(logger=logger, mlflow_tracking=False) - - # Evaluate featuretools features using multiple methods - logger.info("Evaluating featuretools features using multiple methods...") - - # Get top features using different evaluation methods - top_features_mi = ft_engineer.evaluate_feature_importance( - X=X_train, - y=y_train, - new_feature_names=ft_features, - method='mutual_info', - top_k=50 - ) - - top_features_corr = ft_engineer.evaluate_feature_importance( - X=X_train, - y=y_train, - new_feature_names=ft_features, - method='correlation', - top_k=50 - ) - - top_features_combined = ft_engineer.evaluate_feature_importance( - X=X_train, - y=y_train, - new_feature_names=ft_features, - method='combined', - top_k=100 + + # Evaluate features + top_features_mi, top_features_corr, top_features_combined = _evaluate_featuretools_features( + X_train, y_train, ft_features ) - - logger.info("Evaluation complete:") - logger.info(f" - Top features by mutual info: {len(top_features_mi)}") - logger.info(f" - 
Top features by correlation: {len(top_features_corr)}") - logger.info(f" - Top features by combined score: {len(top_features_combined)}") - + # Load existing feature selections json_path = project_root / "src" / "utils" / "selected_features_ensemble_new.json" - + if not json_path.exists(): - logger.error(f"Feature selection file not found: {json_path}") + logger.info(f"Feature selection file not found: {json_path}") return - + with open(json_path) as f: existing_selections = json.load(f) - + logger.info("Loaded existing feature selections") - - # Model-specific feature selection strategy for featuretools features - model_feature_strategies = { - 'xgb': top_features_combined[:25], # XGBoost handles many features well - 'catboost': top_features_mi[:20], # CatBoost + mutual info for categorical handling - 'lgbm': top_features_combined[:20], # LightGBM similar to XGBoost - 'rf': top_features_corr[:15], # Random Forest + correlation - 'tabnet': top_features_combined[:30], # TabNet can handle complex interactions - 'mlp': top_features_corr[:15], # Neural networks + correlation for linear relationships - 'pytorch': top_features_combined[:25], # PyTorch can handle complex interactions - 'svm': top_features_corr[:10] # SVM + correlation for linear relationships - } - - # Create updated feature selections + + # Get model strategies and update selections + model_feature_strategies = _get_model_feature_strategies( + top_features_mi, top_features_corr, top_features_combined + ) + updated_selections = existing_selections.copy() - - # Add strategically selected featuretools features to each model - for model_name, selected_ft_features in model_feature_strategies.items(): - if model_name in updated_selections: - # Remove duplicates while preserving order - new_model_features = [] - existing_model_features = set(updated_selections[model_name]) - - for feature in selected_ft_features: - if feature not in existing_model_features: - new_model_features.append(feature) - 
existing_model_features.add(feature) - - # Add new featuretools features to existing selection - updated_selections[model_name].extend(new_model_features) - logger.info(f"Added {len(new_model_features)} featuretools features to {model_name}") - - if new_model_features: - logger.info(f" {model_name} examples: {new_model_features[:3]}") - - # Update 'all' selection with all top featuretools features - all_top_ft_features = list(dict.fromkeys(top_features_combined)) # Remove duplicates - existing_all_features = set(updated_selections.get('all', [])) - new_all_features = [f for f in all_top_ft_features if f not in existing_all_features] - - if 'all' not in updated_selections: - updated_selections['all'] = [] - updated_selections['all'].extend(new_all_features) - + _update_model_selections(updated_selections, model_feature_strategies) + + # Update 'all' selection + new_all_features = _update_all_selection(updated_selections, top_features_combined) + # Ensure new_all_features is a list (type guard for linter) + if not isinstance(new_all_features, list): + raise TypeError("new_all_features must be a list") + # Cast to help linter understand the type + new_all_features = cast(list[str], new_all_features) + # Save updated selections with metadata output_data = { - 'feature_selections': updated_selections, - 'metadata': { - 'last_updated': pd.Timestamp.now().isoformat(), - 'featuretools_integration': { - 'total_ft_features_available': len(ft_features), - 'ft_features_selected': len(new_all_features), - 'selection_methods': ['mutual_info', 'correlation', 'combined'], - 'model_strategies': { - 'xgb': 'Combined score (handles many features)', - 'catboost': 'Mutual information (categorical handling)', - 'lgbm': 'Combined score (tree-based)', - 'rf': 'Correlation (ensemble method)', - 'tabnet': 'Combined score (deep tabular)', - 'mlp': 'Correlation (linear relationships)', - 'pytorch': 'Combined score (complex interactions)', - 'svm': 'Correlation (linear classifier)' - } - } - } 
+ "feature_selections": updated_selections, + "metadata": { + "last_updated": pd.Timestamp.now().isoformat(), + "featuretools_integration": { + "total_ft_features_available": len(ft_features or []), # type: ignore + "ft_features_selected": len(new_all_features or []), # type: ignore + "selection_methods": ["mutual_info", "correlation", "combined"], + "model_strategies": { + "xgb": "Combined score (handles many features)", + "catboost": "Mutual information (categorical handling)", + "lgbm": "Combined score (tree-based)", + "rf": "Correlation (ensemble method)", + "tabnet": "Combined score (deep tabular)", + "mlp": "Correlation (linear relationships)", + "pytorch": "Combined score (complex interactions)", + "svm": "Correlation (linear classifier)", + }, + }, + }, } - - # Backup original file - backup_path = json_path.with_suffix('.backup.json') - with open(backup_path, 'w') as f: + + # Backup and save + backup_path = json_path.with_suffix(".backup.json") + with open(backup_path, "w") as f: json.dump(existing_selections, f, indent=2) logger.info(f"Backed up original selections to: {backup_path}") - - # Save updated selections - with open(json_path, 'w') as f: + + with open(json_path, "w") as f: json.dump(output_data, f, indent=2) - + logger.info(f"Updated feature selections saved to: {json_path}") - logger.info("Integration summary:") - logger.info(f" - Total featuretools features available: {len(ft_features)}") - logger.info(f" - Featuretools features selected for 'all': {len(new_all_features)}") - logger.info(f" - Selection efficiency: {len(new_all_features)}/{len(ft_features)} = {len(new_all_features)/len(ft_features)*100:.1f}%") - - # Create feature analysis report - feature_analysis = { - 'temporal_features': [f for f in ft_features if 'ft_temporal_' in f], - 'relational_features': [f for f in ft_features if 'ft_relational_' in f], - 'interaction_features': [f for f in ft_features if f.startswith('ft_') and 'temporal' not in f and 'relational' not in f], - 
'selected_temporal': [f for f in new_all_features if 'ft_temporal_' in f], - 'selected_relational': [f for f in new_all_features if 'ft_relational_' in f], - 'selected_interaction': [f for f in new_all_features if f.startswith('ft_') and 'temporal' not in f and 'relational' not in f] - } - - logger.info("Feature type analysis:") - for feature_type, features in feature_analysis.items(): - if 'selected_' in feature_type: - original_type = feature_type.replace('selected_', '') - if original_type in feature_analysis: - original_count = len(feature_analysis[original_type]) - selected_count = len(features) - if original_count > 0: - logger.info(f" {feature_type}: {selected_count}/{original_count} ({selected_count/original_count*100:.1f}%)") - + + # Create and log integration report + _create_integration_report(ft_features, new_all_features) + logger.info("=== Featuretools Integration Complete ===") - + except Exception as e: - logger.error(f"Error integrating featuretools features: {str(e)}") + logger.info(f"Error integrating featuretools features: {str(e)}") import traceback + traceback.print_exc() def add_featuretools_features(updated_data): """ Add featuretools automated features to the dataset. 
- + Args: updated_data (pd.DataFrame): Input data to enhance with featuretools features - + Returns: pd.DataFrame: Data enhanced with featuretools features """ # Import the featuretools engineer from src.utils.featuretools_automated_features import SoccerFeaturetoolsEngineer - - - + # Initialize with central logger logger = ExperimentLogger("soccer_features") # FEATURETOOLS INTEGRATION - Add automated feature engineering @@ -2278,15 +2691,12 @@ def add_featuretools_features(updated_data): experiment_logger=logger, max_depth=2, chunk_size=5000, # Automatically calculated if None - memory_limit_gb=4.0 + memory_limit_gb=4.0, ) # Run optimized feature engineering - enhanced_df, feature_defs = engineer.run_hybrid_feature_engineering( - df=updated_data, - include_temporal=True, - include_relational=True, - include_interactions=True + enhanced_df, _ = engineer.run_hybrid_feature_engineering( + df=updated_data, include_temporal=True, include_relational=True, include_interactions=True ) # Get performance statistics @@ -2294,15 +2704,14 @@ def add_featuretools_features(updated_data): logger.info("Feature engineering stats", extra=stats) return enhanced_df + if __name__ == "__main__": - # update_api_training_data_for_draws() - # logger.info("Training data updated successfully") - # update_api_data_for_draws() - # logger.info("Prediction data updated successfully") update_api_data_new_for_draws() logger.info("New prediction data updated successfully") - X_train, y_train, X_test, y_test, X_val, y_val = import_training_data_ensemble_date_stratified() - logger.info(f"Ensemble evaluation set created with columns: {X_train.shape} and {y_train.shape}") + X_train, y_train, X_test, y_test, x_val, y_val = import_training_data_ensemble_date_stratified() + logger.info( + f"Ensemble evaluation set created with columns: {X_train.shape} and {y_train.shape}" + ) logger.info(f"Ensemble evaluation set created with columns: {X_test.shape} and {y_test.shape}") - logger.info(f"Ensemble evaluation 
set created with columns: {X_val.shape} and {y_val.shape}") + logger.info(f"Ensemble evaluation set created with columns: {x_val.shape} and {y_val.shape}") diff --git a/src/utils/delete_small_logs.py b/src/utils/delete_small_logs.py index 1bca867..91a2a00 100644 --- a/src/utils/delete_small_logs.py +++ b/src/utils/delete_small_logs.py @@ -3,49 +3,50 @@ Script to delete .log files that are 5KB or smaller from the logs folder. """ -import os import pathlib def delete_small_log_files(logs_dir: str = "logs", max_size_kb: int = 5) -> None: """ Delete .log files that are smaller than or equal to the specified size. - + Args: logs_dir: Path to the logs directory (default: "logs") max_size_kb: Maximum file size in KB to delete (default: 5) """ logs_path = pathlib.Path(logs_dir) - + # Check if logs directory exists if not logs_path.exists(): print(f"❌ Logs directory '{logs_dir}' does not exist.") return - + if not logs_path.is_dir(): print(f"❌ '{logs_dir}' is not a directory.") return - + max_size_bytes = max_size_kb * 1024 # Convert KB to bytes deleted_files: list[str] = [] skipped_files: list[str] = [] - - print(f"🔍 Scanning for .log files in '{logs_dir}' and subfolders that are {max_size_kb}KB or smaller...") - + + print( + f"🔍 Scanning for .log files in '{logs_dir}' and subfolders that are {max_size_kb}KB or smaller..." 
+ ) + # Find all .log files recursively (including subfolders) log_files = list(logs_path.rglob("*.log")) - + if not log_files: print(f"ℹ️ No .log files found in '{logs_dir}'") return - + print(f"📁 Found {len(log_files)} .log file(s)") - + for log_file in log_files: try: file_size = log_file.stat().st_size file_size_kb = file_size / 1024 - + if file_size <= max_size_bytes: log_file.unlink() # Delete the file relative_path = log_file.relative_to(logs_path) @@ -54,17 +55,19 @@ def delete_small_log_files(logs_dir: str = "logs", max_size_kb: int = 5) -> None else: relative_path = log_file.relative_to(logs_path) skipped_files.append(f"{relative_path} ({file_size_kb:.2f}KB)") - print(f"⏭️ Skipped: {relative_path} ({file_size_kb:.2f}KB) - larger than {max_size_kb}KB") - + print( + f"⏭️ Skipped: {relative_path} ({file_size_kb:.2f}KB) - larger than {max_size_kb}KB" + ) + except OSError as e: relative_path = log_file.relative_to(logs_path) print(f"❌ Error processing {relative_path}: {e}") - + # Summary print("\n📊 Summary:") print(f" • Deleted: {len(deleted_files)} file(s)") print(f" • Skipped: {len(skipped_files)} file(s)") - + if deleted_files: print("\n🗑️ Deleted files:") for file_info in deleted_files: @@ -75,21 +78,23 @@ def main(): """Main function with safety confirmation.""" logs_dir = "logs" max_size_kb = 5 - - print(f"⚠️ This script will delete .log files {max_size_kb}KB or smaller from '{logs_dir}' folder.") - + + print( + f"⚠️ This script will delete .log files {max_size_kb}KB or smaller from '{logs_dir}' folder." + ) + # Safety confirmation try: confirm = input("Do you want to continue? 
(y/N): ").strip().lower() - if confirm not in ['y', 'yes']: + if confirm not in ["y", "yes"]: print("❌ Operation cancelled.") return except KeyboardInterrupt: print("\n❌ Operation cancelled.") return - + delete_small_log_files(logs_dir, max_size_kb) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/utils/feature_selection_ensemble.py b/src/utils/feature_selection_ensemble.py index baac2f2..8e9cd5a 100644 --- a/src/utils/feature_selection_ensemble.py +++ b/src/utils/feature_selection_ensemble.py @@ -21,20 +21,20 @@ import numpy as np import pandas as pd import torch -import torch.optim as optim from catboost import CatBoostClassifier from lightgbm import LGBMClassifier from pytorch_tabnet.metrics import Metric from pytorch_tabnet.tab_model import TabNetClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.inspection import permutation_importance from sklearn.metrics import precision_score, recall_score -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler from sklearn.utils.multiclass import type_of_target -from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR, ReduceLROnPlateau from xgboost import XGBClassifier +from utils.create_evaluation_set import ( + create_ensemble_evaluation_set, + import_selected_features_ensemble, + import_training_data_ensemble, +) + # Add project root to Python path try: project_root = Path(__file__).parent.parent @@ -45,8 +45,8 @@ print(f"Project root feature_selection_ensemble: {project_root}") except Exception as e: print(f"Error setting project root path: {e}") - sys.path.append(os.getcwd().parent) - print(f"Current directory feature_selection_ensemble: {os.getcwd().parent}") + sys.path.append(str(Path(os.getcwd()).parent)) + print(f"Current directory feature_selection_ensemble: {Path(os.getcwd()).parent}") # Local imports from utils.logger import ExperimentLogger @@ -56,12 +56,6 @@ 
experiment_name=experiment_name, log_dir="./logs/feature_selection_ensemble" ) -from utils.create_evaluation_set import ( - create_ensemble_evaluation_set, - import_selected_features_ensemble, - import_training_data_ensemble, -) - # Set fixed seed and hash seed for determinism SEED = 19 os.environ["PYTHONHASHSEED"] = str(SEED) @@ -79,6 +73,7 @@ min_recall = 0.3 + class PrecisionFocusedMetric(Metric): def __init__(self, beta=0.5): self._name = "precision_focused" @@ -96,37 +91,44 @@ def __call__(self, y_true, y_score): # Convert back to 1D: take the argmax along the class axis (axis=1) y_true_flat = np.argmax(y_true, axis=1) elif y_true_type == "binary": - y_true_flat = y_true.astype(int) # Ensure integer type + y_true_flat = y_true.astype(int) # Ensure integer type else: # Handle unexpected types or raise an error - logger.warning(f"Unexpected y_true type '{y_true_type}' in PrecisionFocusedMetric. Attempting to flatten.") + logger.warning( + f"Unexpected y_true type '{y_true_type}' in PrecisionFocusedMetric. Attempting to flatten." 
+ ) try: - y_true_flat = y_true.astype(int).ravel() # General attempt to flatten + y_true_flat = y_true.astype(int).ravel() # General attempt to flatten except Exception as e: logger.error(f"Could not convert y_true to 1D array: {e}") - return 0.0 # Return 0 score if conversion fails + return 0.0 # Return 0 score if conversion fails # Ensure y_score handling is robust # Check if y_score has 2 columns (expected for binary probabilities) if y_score.ndim == 2 and y_score.shape[1] == 2: - pred = (y_score[:, 1] > 0.5).astype(int) # Use probability of positive class - elif y_score.ndim == 1: # If y_score is already 1D predictions/scores - pred = (y_score > 0.5).astype(int) # Threshold directly + pred = (y_score[:, 1] > 0.5).astype(int) # Use probability of positive class + elif y_score.ndim == 1: # If y_score is already 1D predictions/scores + pred = (y_score > 0.5).astype(int) # Threshold directly else: logger.error(f"Unexpected y_score shape {y_score.shape} in PrecisionFocusedMetric.") - return 0.0 # Return 0 score if y_score format is wrong + return 0.0 # Return 0 score if y_score format is wrong # Calculate precision and recall safely try: # Check target types again just before sklearn call for debugging # logger.debug(f"y_true_flat type: {type_of_target(y_true_flat)}, pred type: {type_of_target(pred)}") - precision = precision_score(y_true_flat, pred, zero_division=0) - recall = recall_score(y_true_flat, pred, zero_division=0) + precision = precision_score(y_true_flat, pred, zero_division="warn") + recall = recall_score(y_true_flat, pred, zero_division="warn") except ValueError as e: logger.error(f"Error calculating scores in PrecisionFocusedMetric: {e}") - logger.error(f"y_true_flat sample: {y_true_flat[:5]}, shape: {y_true_flat.shape}, type: {type_of_target(y_true_flat)}") - logger.error(f"pred sample: {pred[:5]}, shape: {pred.shape}, type: {type_of_target(pred)}") - return 0.0 # Return 0 score if scikit-learn metric fails + logger.error( + f"y_true_flat sample: 
{y_true_flat[:5]}, shape: {y_true_flat.shape}, " + f"type: {type_of_target(y_true_flat)}" + ) + logger.error( + f"pred sample: {pred[:5]}, shape: {pred.shape}, type: {type_of_target(pred)}" + ) + return 0.0 # Return 0 score if scikit-learn metric fails # If recall below threshold, return 0 if recall < min_recall: @@ -140,7 +142,7 @@ def __call__(self, y_true, y_score): def select_features( - X: pd.DataFrame, y: pd.Series, top_k: Optional[int] = 120, verbose: bool = True + x: pd.DataFrame, y: pd.Series, top_k: Optional[int] = 120, verbose: bool = True ) -> list[str]: """ Select features based on composite importance scores from XGBoost, CatBoost, and LightGBM. @@ -149,7 +151,7 @@ def select_features( Finally, it returns the names of the top features as specified by `top_k` or those whose composite score exceeds the median output. Args: - X (pd.DataFrame): The input features. + x (pd.DataFrame): The input features. y (pd.Series): The target variable. top_k (Optional[int], optional): Number of top features to select. If None, features with composite importance above the median are selected. @@ -233,10 +235,10 @@ def select_features( ) models = {"xgb": xgb_model, "cat": cat_model, "lgbm": lgbm_model, "tabnet": tabnet_model} # DataFrame to store importance scores for each feature from each model. - importance_df = pd.DataFrame(index=X.columns) + importance_df = pd.DataFrame(index=x.columns) for name, model in models.items(): - model.fit(X, y) + model.fit(x, y) if name == "xgb": imp = model.feature_importances_ elif name == "cat": @@ -246,7 +248,7 @@ def select_features( elif name == "tabnet": imp = model.feature_importances_ else: - imp = np.zeros(X.shape[1]) + imp = np.zeros(x.shape[1]) # Normalize the scores so that they sum to 1. 
norm_imp = imp / np.sum(imp) if np.sum(imp) > 0 else imp importance_df[name] = norm_imp @@ -255,11 +257,11 @@ def select_features( # Sort the features by the composite score in descending order importance_df = importance_df.sort_values(by="composite", ascending=False) if top_k is not None: - selected_features = importance_df.head(top_k).index.tolist() + selected_features = [str(col) for col in importance_df.head(top_k).index] else: # Otherwise select features with composite importance above the median value. median_value = importance_df["composite"].median() - selected_features = importance_df[importance_df["composite"] > median_value].index.tolist() + selected_features = [str(col) for col in importance_df[importance_df["composite"] > median_value].index] if verbose: print("Selected Features:") print(selected_features) @@ -269,9 +271,9 @@ def select_features( def select_features_differentiated( - X: pd.DataFrame, + x: pd.DataFrame, y: pd.Series, - X_val: pd.DataFrame, + x_val: pd.DataFrame, y_val: pd.Series, top_k_per_model: int = 65, fixed_features: Optional[list[str]] = None, @@ -281,7 +283,7 @@ def select_features_differentiated( Select features separately for XGBoost, CatBoost, LightGBM and Random Forest, then provide the union of the selected features with the fixed features always included. Args: - X (pd.DataFrame): Input feature dataframe. + x (pd.DataFrame): Input feature dataframe. y (pd.Series): Target variable. top_k_per_model (int): Number of top features to select for each model. fixed_features (Optional[List[str]]): Features that will be included in all sets. 
@@ -292,35 +294,11 @@ def select_features_differentiated( fixed_features = fixed_features or [] from src.models.StackedEnsemble.base.neural.mlp_model import create_model as create_model_mlp - # from src.models.StackedEnsemble.base.neural.tabnet_model import ( - # create_model as create_model_tabnet, - # ) + models = { - # "tabnet": create_model_tabnet({ - # "fit_weights": 1, - # "scheduler_type": "none", - # "learning_rate": 0.0025663935806495273, - # "eps": 4.4331373706050046e-07, - # "n_d": 75, - # "n_a": 48, - # "n_steps": 6, - # "gamma": 2.4000000000000004, - # "lambda_sparse": 0.00045184291660952525, - # "momentum": 0.9249999999999999, - # "patience": 38, - # "max_epochs": 180, - # "batch_size": 768, - # "virtual_batch_size": 384, - # "verbose": 0, - # "n_independent": 4, - # "n_shared": 3, - # "weight_decay": 0.00037739593920011535, - # "scheduler_pct_start": 0.25, - # "scheduler_final_div_factor": 1100.0 - # }), "xgb": XGBClassifier( tree_method="hist", - device="cuda", + device="cuda", nthread=8, objective="binary:logistic", eval_metric=["aucpr", "error", "logloss"], @@ -337,27 +315,9 @@ def select_features_differentiated( scale_pos_weight=2.36, seed=19, ), - # "cat": CatBoostClassifier( - # learning_rate=0.055, - # depth=7, - # min_data_in_leaf=165, - # subsample=0.5900000000000001, - # colsample_bylevel=0.5800000000000001, - # reg_lambda=0.6540483398088304, - # leaf_estimation_iterations=12, - # bagging_temperature=9.3, - # scale_pos_weight=4.7, - # early_stopping_rounds=700, - # loss_function="Logloss", - # eval_metric="AUC", - # custom_metric=["Precision", "Recall"], - # task_type="CPU", - # thread_count=4, - # verbose=-1, - # ), "lgbm": LGBMClassifier( objective="binary", - metric=["aucpr", "binary_logloss"], + metric=["aucpr", "binary_logloss"], verbose=-1, n_jobs=8, random_state=19, @@ -377,77 +337,46 @@ def select_features_differentiated( cat_smooth=23.400000000000002, max_bin=250, ), - # "rf": RandomForestClassifier( - # n_estimators=1060, - # 
max_depth=6, - # min_samples_split=70, - # min_samples_leaf=24, - # max_features=1.0, - # bootstrap=True, - # class_weight={0: 1.0, 1: 2.0}, - # criterion="entropy", - # random_state=19, - # n_jobs=8, - # verbose=0 - # ), - "mlp": create_model_mlp({ - 'input_dim': X.shape[1], - 'hidden_layers': 3, - 'neurons_per_layer': 62, - 'dropout_rate': 0.04, - 'activation': 'tanh', - 'l1_regularization': 0.004681388569246714, - 'l2_regularization': 0.0001459323364245875, - 'learning_rate': 0.04014677776290513, - 'batch_size': 423, - 'epochs': 173, - 'patience': 23, - 'class_weight_multiplier': 2.2 - }), + "mlp": create_model_mlp( + { + "input_dim": x.shape[1], + "hidden_layers": 3, + "neurons_per_layer": 62, + "dropout_rate": 0.04, + "activation": "tanh", + "l1_regularization": 0.004681388569246714, + "l2_regularization": 0.0001459323364245875, + "learning_rate": 0.04014677776290513, + "batch_size": 423, + "epochs": 173, + "patience": 23, + "class_weight_multiplier": 2.2, + } + ), } selected = {} # For each model, fit on the entire dataset and get sorted features by importance. 
for name, model in models.items(): if name == "xgb": - model.fit(X, y, eval_set=[(X_val, y_val)], verbose=False) + model.fit(x, y, eval_set=[(x_val, y_val)], verbose=False) imp = np.array(model.feature_importances_) - elif name == "cat": - model.fit(X, y, eval_set=[(X_val, y_val)], verbose=False) - imp = model.get_feature_importance() elif name == "lgbm": - model.fit(X, y, eval_set=[(X_val, y_val)]) - imp = np.array(model.feature_importances_) - elif name == "rf": - model.fit(X, y) - imp = np.array(model.feature_importances_) - elif name == "tabnet": - model.fit( - X.values, - y.values, - eval_set=[(X_val.values, y_val.values)], - eval_metric=[PrecisionFocusedMetric], - patience=30, - max_epochs=110, - batch_size=1858, - virtual_batch_size=2736, - weights=1, - drop_last=False, - ) + model.fit(x, y, eval_set=[(x_val, y_val)]) imp = np.array(model.feature_importances_) elif name == "mlp": scaler = pickle.load(open("src/models/scalers/scaler_mlp.pkl", "rb")) - X_scaled = scaler.fit_transform(X) - X_val_scaled = scaler.transform(X_val) - model.fit(X_scaled, y) + x_scaled = scaler.fit_transform(x) + scaler.transform(x_val) + model.fit(x_scaled, y) # Get weight matrix of first Dense layer - W = model.layers[0].get_weights()[0] - # Sum abs(weights) across neurons → one score per input feature - imp = np.abs(W).sum(axis=1) + w = model.layers[0].get_weights()[0] + # Sum abs(weights) across neurons → one score per input feature + imp = np.abs(w).sum(axis=1) else: - imp = np.zeros(X.shape[1]) + imp = np.zeros(x.shape[1]) # Create a DataFrame mapping features to their importance - imp_df = pd.DataFrame({"feature": X.columns, "importance": imp}) + imp_df = pd.DataFrame({"feature": x.columns, "importance": imp}) imp_df = imp_df.sort_values(by="importance", ascending=False) # Select the top_k features @@ -471,10 +400,7 @@ def select_features_differentiated( # Return a dictionary with details for each model and the overall union. 
return { "xgb": selected["xgb"], - # "cat": selected["cat"], "lgbm": selected["lgbm"], - # "rf": selected["rf"], - # "tabnet": selected["tabnet"], "mlp": selected["mlp"], "union": union_features, } @@ -519,34 +445,34 @@ def sync_columns(train_df, val_df, logger): logger.info("Starting feature selection...") features_train, features_val = sync_columns(features_train, features_val, logger) # Merge training and test features while maintaining column consistency - features_combined = pd.concat([features_train, features_test], axis=0) + features_combined = pd.concat([features_train, features_test], axis=0) # type: ignore # Ensure consistent column order and alignment - features_combined = features_combined[features_train.columns] - target_combined = pd.concat([target_train, target_test], axis=0) + features_combined = features_combined[features_train.columns] # type: ignore + target_combined = pd.concat([target_train, target_test], axis=0) # type: ignore # Handle NaN values by filling with column means for numeric columns logger.info("Handling NaN values in features") numeric_cols = features_combined.select_dtypes(include=np.number).columns - features_combined[numeric_cols] = features_combined[numeric_cols].fillna( - features_combined[numeric_cols].mean() + features_combined[numeric_cols] = features_combined[numeric_cols].fillna( # type: ignore + features_combined[numeric_cols].mean() # type: ignore ) - features_val[numeric_cols] = features_val[numeric_cols].fillna( - features_combined[numeric_cols].mean() + features_val[numeric_cols] = features_val[numeric_cols].fillna( # type: ignore + features_combined[numeric_cols].mean() # type: ignore ) # For categorical columns, fill with mode categorical_cols = features_combined.select_dtypes(include=["object", "category"]).columns for col in categorical_cols: - mode_val = features_combined[col].mode()[0] - features_combined[col] = features_combined[col].fillna(mode_val) - features_val[col] = features_val[col].fillna(mode_val) + 
mode_val = features_combined[col].mode()[0] # type: ignore + features_combined[col] = features_combined[col].fillna(mode_val) # type: ignore + features_val[col] = features_val[col].fillna(mode_val) # type: ignore # Verify no NaN values remain - if features_combined.isna().any().any() or features_val.isna().any().any(): + if features_combined.isna().any().any() or features_val.isna().any().any(): # type: ignore logger.error("NaN values still present after imputation") raise ValueError("Failed to handle all NaN values") # Log the merge operation logger.info(f"Merged training and test features. Combined shape: {features_combined.shape}") selected_features = select_features_differentiated( - features_combined, target_combined, features_val, target_val, verbose=True + features_combined, target_combined, features_val, target_val, verbose=True # type: ignore ) logger.info(f"Selected features: {selected_features}") diff --git a/src/utils/featuretools_automated_features.py b/src/utils/featuretools_automated_features.py index e6364e2..5510156 100644 --- a/src/utils/featuretools_automated_features.py +++ b/src/utils/featuretools_automated_features.py @@ -6,9 +6,8 @@ Optimized for memory efficiency and performance with central logger integration. """ -import os import warnings -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union import featuretools as ft import numpy as np @@ -36,7 +35,7 @@ class SoccerFeaturetoolsEngineer: """ Automated feature engineering for soccer prediction using Featuretools. - + This class implements a hybrid approach that combines: 1. Temporal feature synthesis for time-aware patterns 2. Relational feature generation across entities (teams, matches, venues) @@ -53,7 +52,7 @@ def __init__( ): """ Initialize the automated feature engineer with optimizations. 
- + Args: experiment_logger: Central ExperimentLogger instance max_depth: Maximum depth for deep feature synthesis @@ -69,25 +68,27 @@ def __init__( self.entityset = None self.feature_matrix = None self.feature_defs = None - + # Performance monitoring self._feature_generation_stats = { - 'temporal_features': 0, - 'relational_features': 0, - 'interaction_features': 0, - 'memory_usage_mb': 0 + "temporal_features": 0, + "relational_features": 0, + "interaction_features": 0, + "memory_usage_mb": 0, } - + # Define primitives for different feature types self._setup_primitives() - - self.logger.info("SoccerFeaturetoolsEngineer initialized with optimizations", - extra={ - 'max_depth': max_depth, - 'chunk_size': self.chunk_size, - 'memory_limit_gb': memory_limit_gb - }) - + + self.logger.info( + "SoccerFeaturetoolsEngineer initialized with optimizations", + extra={ + "max_depth": max_depth, + "chunk_size": self.chunk_size, + "memory_limit_gb": memory_limit_gb, + }, + ) + def _calculate_optimal_chunk_size(self) -> int: """Calculate optimal chunk size based on available memory.""" try: @@ -96,347 +97,410 @@ def _calculate_optimal_chunk_size(self) -> int: return max(1000, min(optimal_size, 10000)) # Between 1K and 10K rows except: return 5000 # Safe default - + def _optimize_data_types(self, df: pd.DataFrame) -> pd.DataFrame: """Optimize data types for memory efficiency.""" try: df_optimized = df.copy() - + # Optimize integer columns - for col in df_optimized.select_dtypes(include=['int64']).columns: + for col in df_optimized.select_dtypes(include=["int64"]).columns: col_min, col_max = df_optimized[col].min(), df_optimized[col].max() if col_min >= 0: if col_max < 255: - df_optimized[col] = df_optimized[col].astype('uint8') + df_optimized[col] = df_optimized[col].astype("uint8") elif col_max < 65535: - df_optimized[col] = df_optimized[col].astype('uint16') + df_optimized[col] = df_optimized[col].astype("uint16") elif col_max < 4294967295: - df_optimized[col] = 
df_optimized[col].astype('uint32') + df_optimized[col] = df_optimized[col].astype("uint32") else: if col_min > -128 and col_max < 127: - df_optimized[col] = df_optimized[col].astype('int8') + df_optimized[col] = df_optimized[col].astype("int8") elif col_min > -32768 and col_max < 32767: - df_optimized[col] = df_optimized[col].astype('int16') + df_optimized[col] = df_optimized[col].astype("int16") elif col_min > -2147483648 and col_max < 2147483647: - df_optimized[col] = df_optimized[col].astype('int32') - + df_optimized[col] = df_optimized[col].astype("int32") + # Optimize float columns - for col in df_optimized.select_dtypes(include=['float64']).columns: - df_optimized[col] = pd.to_numeric(df_optimized[col], downcast='float') - + for col in df_optimized.select_dtypes(include=["float64"]).columns: + df_optimized[col] = pd.to_numeric(df_optimized[col], downcast="float") + return df_optimized except Exception as e: self.logger.warning(f"Data type optimization failed: {str(e)}") return df - + def _setup_primitives(self) -> None: """Setup primitive collections for different feature engineering approaches.""" - + # Soccer-specific feature combinations self.soccer_feature_groups = { - 'attack_features': [ - 'home_attack_strength', 'away_attack_strength', - 'home_goal_rollingaverage', 'away_goal_rollingaverage', - 'home_xG_rolling_rollingaverage', 'away_xG_rolling_rollingaverage', - 'home_shot_on_target_rollingaverage', 'away_shot_on_target_rollingaverage' + "attack_features": [ + "home_attack_strength", + "away_attack_strength", + "home_goal_rollingaverage", + "away_goal_rollingaverage", + "home_xG_rolling_rollingaverage", + "away_xG_rolling_rollingaverage", + "home_shot_on_target_rollingaverage", + "away_shot_on_target_rollingaverage", ], - 'defense_features': [ - 'home_defense_weakness', 'away_defense_weakness', - 'home_saves_rollingaverage', 'away_saves_rollingaverage', - 'defensive_stability' + "defense_features": [ + "home_defense_weakness", + 
"away_defense_weakness", + "home_saves_rollingaverage", + "away_saves_rollingaverage", + "defensive_stability", ], - 'form_features': [ - 'home_form_momentum', 'away_form_momentum', - 'form_stability', 'form_difference', - 'home_xg_form', 'away_xg_form' + "form_features": [ + "home_form_momentum", + "away_form_momentum", + "form_stability", + "form_difference", + "home_xg_form", + "away_xg_form", ], - 'tactical_features': [ - 'Home_possession_mean', 'away_possession_mean', - 'home_corners_rollingaverage', 'away_corners_rollingaverage', - 'home_fouls_rollingaverage', 'away_fouls_rollingaverage' + "tactical_features": [ + "Home_possession_mean", + "away_possession_mean", + "home_corners_rollingaverage", + "away_corners_rollingaverage", + "home_fouls_rollingaverage", + "away_fouls_rollingaverage", ], - 'elo_features': [ - 'home_team_elo', 'away_team_elo', - 'elo_difference', 'elo_similarity' - ] + "elo_features": ["home_team_elo", "away_team_elo", "elo_difference", "elo_similarity"], } - def prepare_data_for_featuretools(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]: + def prepare_data_for_featuretools(self, df: pd.DataFrame) -> dict[str, pd.DataFrame]: """ Prepare and structure data for featuretools entity creation with optimizations. 
- + Args: df: Input dataframe with soccer match data - + Returns: Dictionary of dataframes for different entities """ try: initial_memory = df.memory_usage(deep=True).sum() / 1024**2 - self.logger.info("Preparing data for featuretools entity creation", - extra={'input_shape': df.shape, 'input_memory_mb': initial_memory}) - + self.logger.info( + "Preparing data for featuretools entity creation", + extra={"input_shape": df.shape, "input_memory_mb": initial_memory}, + ) + # OPTIMIZATION: Data type optimization first df_optimized = self._optimize_data_types(df.copy()) - + # OPTIMIZATION: Categorical encoding for low-cardinality columns - categorical_candidates = ['home_encoded', 'away_encoded', 'venue_encoded', 'league_encoded'] + categorical_candidates = [ + "home_encoded", + "away_encoded", + "venue_encoded", + "league_encoded", + ] for col in categorical_candidates: if col in df_optimized.columns: unique_ratio = df_optimized[col].nunique() / len(df_optimized) if unique_ratio < 0.5: # Less than 50% unique values - df_optimized[col] = df_optimized[col].astype('category') - + df_optimized[col] = df_optimized[col].astype("category") + # Continue with existing logic using optimized dataframe matches_df = df_optimized - + # Ensure required columns exist - required_cols = ['fixture_id', 'date_encoded', 'home_encoded', 'away_encoded'] + required_cols = ["fixture_id", "date_encoded", "home_encoded", "away_encoded"] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: raise ValueError(f"Missing required columns: {missing_cols}") - + # Create main matches entity matches_df = df.copy() - + # Create unique fixture_id to handle potential duplicates from combined data splits - original_fixture_count = len(matches_df['fixture_id'].unique()) + original_fixture_count = len(matches_df["fixture_id"].unique()) total_rows = len(matches_df) - + if original_fixture_count != total_rows: - self.logger.warning(f"Non-unique fixture_ids detected: 
{original_fixture_count} unique vs {total_rows} total") - self.logger.info("Creating unique fixture_id sequence for featuretools compatibility") - + self.logger.warning( + f"Non-unique fixture_ids detected: {original_fixture_count} unique vs {total_rows} total" + ) + self.logger.info( + "Creating unique fixture_id sequence for featuretools compatibility" + ) + # Store original fixture_id for reference - matches_df['original_fixture_id'] = matches_df['fixture_id'].copy() - + matches_df["original_fixture_id"] = matches_df["fixture_id"].copy() + # Create unique sequential fixture_id - matches_df['fixture_id'] = range(len(matches_df)) - - self.logger.info(f"Generated unique fixture_id sequence: 0 to {len(matches_df)-1}") + matches_df["fixture_id"] = range(len(matches_df)) + + self.logger.info( + f"Generated unique fixture_id sequence: 0 to {len(matches_df) - 1}" + ) else: - self.logger.info(f"Fixture_id column is already unique ({original_fixture_count} matches)") - + self.logger.info( + f"Fixture_id column is already unique ({original_fixture_count} matches)" + ) + # Ensure proper data types - matches_df['fixture_id'] = matches_df['fixture_id'].astype(int) - matches_df['home_encoded'] = matches_df['home_encoded'].astype(int) - matches_df['away_encoded'] = matches_df['away_encoded'].astype(int) - + matches_df["fixture_id"] = matches_df["fixture_id"].astype(int) + matches_df["home_encoded"] = matches_df["home_encoded"].astype(int) + matches_df["away_encoded"] = matches_df["away_encoded"].astype(int) + # Convert date if needed with proper format handling - if 'date_encoded' in matches_df.columns: + if "date_encoded" in matches_df.columns: try: # Check if date_encoded contains integer days since reference date - if matches_df['date_encoded'].dtype in ['int64', 'int32', 'float64']: + if matches_df["date_encoded"].dtype in ["int64", "int32", "float64"]: # Convert integer days to actual dates (assuming reference date 2020-08-11) - reference_date = 
pd.to_datetime('2020-08-11', format='%Y-%m-%d') - matches_df['match_date'] = reference_date + pd.to_timedelta(matches_df['date_encoded'], unit='D') - self.logger.info("Converted date_encoded (integer days) to match_date successfully") - elif matches_df['date_encoded'].dtype == 'object': + reference_date = pd.to_datetime("2020-08-11", format="%Y-%m-%d") + matches_df["match_date"] = reference_date + pd.to_timedelta( + matches_df["date_encoded"], unit="D" + ) + self.logger.info( + "Converted date_encoded (integer days) to match_date successfully" + ) + elif matches_df["date_encoded"].dtype == "object": # Try common date formats explicitly to avoid Woodwork warnings - date_formats = ['%Y-%m-%d', '%Y/%m/%d', '%d/%m/%Y', '%m/%d/%Y', '%Y-%m-%d %H:%M:%S'] - matches_df['match_date'] = None - + date_formats = [ + "%Y-%m-%d", + "%Y/%m/%d", + "%d/%m/%Y", + "%m/%d/%Y", + "%Y-%m-%d %H:%M:%S", + ] + matches_df["match_date"] = None + for fmt in date_formats: try: # Suppress warnings during format attempts with warnings.catch_warnings(): warnings.simplefilter("ignore") - parsed_dates = pd.to_datetime(matches_df['date_encoded'], format=fmt, errors='coerce') + parsed_dates = pd.to_datetime( + matches_df["date_encoded"], format=fmt, errors="coerce" + ) if not parsed_dates.isna().all(): - matches_df['match_date'] = parsed_dates - self.logger.info(f"Successfully parsed dates using format: {fmt}") + matches_df["match_date"] = parsed_dates + self.logger.info( + f"Successfully parsed dates using format: {fmt}" + ) break except: continue - + # If no format worked, try general parsing with warnings suppressed - if matches_df['match_date'].isna().all(): + if matches_df["match_date"].isna().all(): with warnings.catch_warnings(): warnings.simplefilter("ignore") - matches_df['match_date'] = pd.to_datetime(matches_df['date_encoded'], errors='coerce') + matches_df["match_date"] = pd.to_datetime( + matches_df["date_encoded"], errors="coerce" + ) self.logger.info("Used general date parsing as 
fallback") else: # If it's already datetime, use as is - matches_df['match_date'] = pd.to_datetime(matches_df['date_encoded']) - + matches_df["match_date"] = pd.to_datetime(matches_df["date_encoded"]) + except Exception as e: - self.logger.warning(f"Date conversion failed: {str(e)}, creating sequential dates") + self.logger.warning( + f"Date conversion failed: {str(e)}, creating sequential dates" + ) # If all else fails, create a simple date sequence - matches_df['match_date'] = pd.date_range( - start='2020-01-01', - periods=len(matches_df), - freq='D' + matches_df["match_date"] = pd.date_range( + start="2020-01-01", periods=len(matches_df), freq="D" ) - + # Ensure no NaT values remain - if matches_df['match_date'].isna().any(): - self.logger.warning("Some dates could not be parsed, filling with sequential dates") + if matches_df["match_date"].isna().any(): + self.logger.warning( + "Some dates could not be parsed, filling with sequential dates" + ) # Fill NaT values with sequential dates - base_date = pd.to_datetime('2020-01-01', format='%Y-%m-%d') - mask = matches_df['match_date'].isna() + base_date = pd.to_datetime("2020-01-01", format="%Y-%m-%d") + mask = matches_df["match_date"].isna() # Use vectorized operations instead of iterating - sequential_dates = pd.date_range(start=base_date, periods=mask.sum(), freq='D') - matches_df.loc[mask, 'match_date'] = sequential_dates - + sequential_dates = pd.date_range(start=base_date, periods=mask.sum(), freq="D") + matches_df.loc[mask, "match_date"] = sequential_dates + # Create teams entity with validation - home_teams = matches_df[['home_encoded']].rename(columns={'home_encoded': 'team_id'}) - away_teams = matches_df[['away_encoded']].rename(columns={'away_encoded': 'team_id'}) + home_teams = matches_df[["home_encoded"]].rename(columns={"home_encoded": "team_id"}) + away_teams = matches_df[["away_encoded"]].rename(columns={"away_encoded": "team_id"}) teams_df = pd.concat([home_teams, 
away_teams]).drop_duplicates().reset_index(drop=True) - + # Validate teams entity - if len(teams_df['team_id'].unique()) != len(teams_df): - self.logger.warning("Duplicate team_ids detected in teams entity, removing duplicates") - teams_df = teams_df.drop_duplicates(subset=['team_id']).reset_index(drop=True) - + if len(teams_df["team_id"].unique()) != len(teams_df): + self.logger.warning( + "Duplicate team_ids detected in teams entity, removing duplicates" + ) + teams_df = teams_df.drop_duplicates(subset=["team_id"]).reset_index(drop=True) + self.logger.info(f"Created teams entity with {len(teams_df)} unique teams") - + # Create venues entity if venue data exists venues_df = None - if 'venue_encoded' in matches_df.columns: - venue_cols = ['venue_encoded'] - if 'venue_capacity' in matches_df.columns: - venue_cols.append('venue_capacity') - if 'venue_draw_rate' in matches_df.columns: - venue_cols.append('venue_draw_rate') - + if "venue_encoded" in matches_df.columns: + venue_cols = ["venue_encoded"] + if "venue_capacity" in matches_df.columns: + venue_cols.append("venue_capacity") + if "venue_draw_rate" in matches_df.columns: + venue_cols.append("venue_draw_rate") + venues_df = matches_df[venue_cols].drop_duplicates() - venues_df = venues_df.rename(columns={'venue_encoded': 'venue_id'}) + venues_df = venues_df.rename(columns={"venue_encoded": "venue_id"}) venues_df = venues_df.dropna().reset_index(drop=True) - + # Validate venue entity uniqueness - if len(venues_df) > 0 and len(venues_df['venue_id'].unique()) != len(venues_df): + if len(venues_df) > 0 and len(venues_df["venue_id"].unique()) != len(venues_df): self.logger.warning("Duplicate venue_ids detected, removing duplicates") - venues_df = venues_df.drop_duplicates(subset=['venue_id']).reset_index(drop=True) - + venues_df = venues_df.drop_duplicates(subset=["venue_id"]).reset_index( + drop=True + ) + self.logger.info(f"Created venues entity with {len(venues_df)} unique venues") - + # Create leagues entity if 
league data exists leagues_df = None - if 'league_encoded' in matches_df.columns: - league_cols = ['league_encoded'] - if 'league_competitiveness' in matches_df.columns: - league_cols.append('league_competitiveness') - if 'league_draw_rate' in matches_df.columns: - league_cols.append('league_draw_rate') - + if "league_encoded" in matches_df.columns: + league_cols = ["league_encoded"] + if "league_competitiveness" in matches_df.columns: + league_cols.append("league_competitiveness") + if "league_draw_rate" in matches_df.columns: + league_cols.append("league_draw_rate") + leagues_df = matches_df[league_cols].drop_duplicates() - leagues_df = leagues_df.rename(columns={'league_encoded': 'league_id'}) + leagues_df = leagues_df.rename(columns={"league_encoded": "league_id"}) leagues_df = leagues_df.dropna().reset_index(drop=True) - + # Validate league entity uniqueness - if len(leagues_df) > 0 and len(leagues_df['league_id'].unique()) != len(leagues_df): + if len(leagues_df) > 0 and len(leagues_df["league_id"].unique()) != len(leagues_df): self.logger.warning("Duplicate league_ids detected, removing duplicates") - leagues_df = leagues_df.drop_duplicates(subset=['league_id']).reset_index(drop=True) - + leagues_df = leagues_df.drop_duplicates(subset=["league_id"]).reset_index( + drop=True + ) + self.logger.info(f"Created leagues entity with {len(leagues_df)} unique leagues") - + entities = { - 'matches': matches_df, - 'teams': teams_df, + "matches": matches_df, + "teams": teams_df, } - + if venues_df is not None and len(venues_df) > 0: - entities['venues'] = venues_df - + entities["venues"] = venues_df + if leagues_df is not None and len(leagues_df) > 0: - entities['leagues'] = leagues_df - + entities["leagues"] = leagues_df + # OPTIMIZATION: Report memory savings - final_memory = sum(entity_df.memory_usage(deep=True).sum() for entity_df in entities.values()) / 1024**2 + final_memory = ( + sum(entity_df.memory_usage(deep=True).sum() for entity_df in entities.values()) + / 
1024**2 + ) memory_saved = initial_memory - final_memory - - self.logger.info(f"Created {len(entities)} entities for featuretools", - extra={ - 'final_memory_mb': final_memory, - 'memory_saved_mb': memory_saved, - 'savings_percent': (memory_saved / initial_memory) * 100 if initial_memory > 0 else 0 - }) + + self.logger.info( + f"Created {len(entities)} entities for featuretools", + extra={ + "final_memory_mb": final_memory, + "memory_saved_mb": memory_saved, + "savings_percent": (memory_saved / initial_memory) * 100 + if initial_memory > 0 + else 0, + }, + ) return entities - + except Exception as e: - self.logger.error(f"Error preparing data for featuretools: {str(e)}", - extra={'function': 'prepare_data_for_featuretools'}) + self.logger.error( + f"Error preparing data for featuretools: {str(e)}", + extra={"function": "prepare_data_for_featuretools"}, + ) raise - def create_entityset(self, entities: Dict[str, pd.DataFrame]) -> ft.EntitySet: + def create_entityset(self, entities: dict[str, pd.DataFrame]) -> ft.EntitySet: """ Create featuretools EntitySet with proper relationships. 
- + Args: entities: Dictionary of prepared dataframes - + Returns: Configured EntitySet """ try: self.logger.info("Creating featuretools EntitySet") - + # Suppress Woodwork warnings during EntitySet creation with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) warnings.filterwarnings("ignore", message="Could not infer format") - + es = ft.EntitySet(id="soccer_matches") - + # Ensure match_date is properly formatted for Woodwork - matches_df = entities['matches'].copy() + matches_df = entities["matches"].copy() if "match_date" in matches_df.columns: # Ensure datetime format is standardized - matches_df['match_date'] = pd.to_datetime(matches_df['match_date'], errors='coerce') + matches_df["match_date"] = pd.to_datetime( + matches_df["match_date"], errors="coerce" + ) # Remove any remaining NaT values - if matches_df['match_date'].isna().any(): - self.logger.warning("Removing rows with invalid dates for EntitySet creation") - matches_df = matches_df.dropna(subset=['match_date']) - + if matches_df["match_date"].isna().any(): + self.logger.warning( + "Removing rows with invalid dates for EntitySet creation" + ) + matches_df = matches_df.dropna(subset=["match_date"]) + # Add matches entity (primary entity) es = es.add_dataframe( dataframe=matches_df, dataframe_name="matches", index="fixture_id", time_index="match_date" if "match_date" in matches_df.columns else None, - make_index=False + make_index=False, ) - + # Add teams entity es = es.add_dataframe( - dataframe=entities['teams'], + dataframe=entities["teams"], dataframe_name="teams", index="team_id", - make_index=False + make_index=False, ) - + # Add relationships # Home team relationship es = es.add_relationship("teams", "team_id", "matches", "home_encoded") - # Away team relationship + # Away team relationship es = es.add_relationship("teams", "team_id", "matches", "away_encoded") - + # Add venues entity and relationship if available - if 'venues' in entities and 
len(entities['venues']) > 0: + if "venues" in entities and len(entities["venues"]) > 0: es = es.add_dataframe( - dataframe=entities['venues'], + dataframe=entities["venues"], dataframe_name="venues", index="venue_id", - make_index=False + make_index=False, ) - if 'venue_encoded' in matches_df.columns: + if "venue_encoded" in matches_df.columns: es = es.add_relationship("venues", "venue_id", "matches", "venue_encoded") - + # Add leagues entity and relationship if available - if 'leagues' in entities and len(entities['leagues']) > 0: + if "leagues" in entities and len(entities["leagues"]) > 0: es = es.add_dataframe( - dataframe=entities['leagues'], - dataframe_name="leagues", + dataframe=entities["leagues"], + dataframe_name="leagues", index="league_id", - make_index=False + make_index=False, ) - if 'league_encoded' in matches_df.columns: - es = es.add_relationship("leagues", "league_id", "matches", "league_encoded") - + if "league_encoded" in matches_df.columns: + es = es.add_relationship( + "leagues", "league_id", "matches", "league_encoded" + ) + self.entityset = es self.logger.info(f"EntitySet created with {len(es.dataframes)} entities") return es - + except Exception as e: self.logger.error(f"Error creating EntitySet: {str(e)}") raise @@ -444,69 +508,69 @@ def create_entityset(self, entities: Dict[str, pd.DataFrame]) -> ft.EntitySet: def generate_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame: """ Generate feature interactions specific to soccer prediction. 
- + Args: df: Input dataframe with existing features - + Returns: Dataframe with new interaction features """ try: self.logger.info("Generating soccer-specific interaction features") - + result_df = df.copy() - + # Attack vs Defense interactions - if all(col in df.columns for col in ['home_attack_strength', 'away_defense_weakness']): - result_df['ft_home_attack_vs_away_defense'] = ( - df['home_attack_strength'] * (1 - df['away_defense_weakness']) + if all(col in df.columns for col in ["home_attack_strength", "away_defense_weakness"]): + result_df["ft_home_attack_vs_away_defense"] = df["home_attack_strength"] * ( + 1 - df["away_defense_weakness"] ) - - if all(col in df.columns for col in ['away_attack_strength', 'home_defense_weakness']): - result_df['ft_away_attack_vs_home_defense'] = ( - df['away_attack_strength'] * (1 - df['home_defense_weakness']) + + if all(col in df.columns for col in ["away_attack_strength", "home_defense_weakness"]): + result_df["ft_away_attack_vs_home_defense"] = df["away_attack_strength"] * ( + 1 - df["home_defense_weakness"] ) - + # Form momentum interactions - if all(col in df.columns for col in ['home_form_momentum', 'away_form_momentum']): - result_df['ft_form_momentum_difference'] = ( - df['home_form_momentum'] - df['away_form_momentum'] + if all(col in df.columns for col in ["home_form_momentum", "away_form_momentum"]): + result_df["ft_form_momentum_difference"] = ( + df["home_form_momentum"] - df["away_form_momentum"] ) - result_df['ft_form_momentum_product'] = ( - df['home_form_momentum'] * df['away_form_momentum'] + result_df["ft_form_momentum_product"] = ( + df["home_form_momentum"] * df["away_form_momentum"] ) - + # ELO-based interactions - if all(col in df.columns for col in ['elo_difference', 'form_difference']): - result_df['ft_elo_form_interaction'] = df['elo_difference'] * df['form_difference'] - + if all(col in df.columns for col in ["elo_difference", "form_difference"]): + result_df["ft_elo_form_interaction"] = 
df["elo_difference"] * df["form_difference"] + # Possession-based interactions - if all(col in df.columns for col in ['Home_possession_mean', 'away_possession_mean']): - result_df['ft_possession_dominance'] = ( - df['Home_possession_mean'] - df['away_possession_mean'] + if all(col in df.columns for col in ["Home_possession_mean", "away_possession_mean"]): + result_df["ft_possession_dominance"] = ( + df["Home_possession_mean"] - df["away_possession_mean"] ) - + # xG momentum interactions - if all(col in df.columns for col in ['home_xg_momentum', 'away_xg_momentum']): - result_df['ft_xg_momentum_ratio'] = ( - df['home_xg_momentum'] / (df['away_xg_momentum'] + 1e-10) + if all(col in df.columns for col in ["home_xg_momentum", "away_xg_momentum"]): + result_df["ft_xg_momentum_ratio"] = df["home_xg_momentum"] / ( + df["away_xg_momentum"] + 1e-10 ) - + # League context interactions - if all(col in df.columns for col in ['league_competitiveness', 'elo_difference']): - result_df['ft_competitive_elo_interaction'] = ( - df['league_competitiveness'] * abs(df['elo_difference']) + if all(col in df.columns for col in ["league_competitiveness", "elo_difference"]): + result_df["ft_competitive_elo_interaction"] = df["league_competitiveness"] * abs( + df["elo_difference"] ) - + # Rest advantage interactions - if all(col in df.columns for col in ['home_rest_days', 'away_rest_days']): - result_df['ft_rest_advantage'] = df['home_rest_days'] - df['away_rest_days'] - + if all(col in df.columns for col in ["home_rest_days", "away_rest_days"]): + result_df["ft_rest_advantage"] = df["home_rest_days"] - df["away_rest_days"] + new_features = len(result_df.columns) - len(df.columns) self.logger.info(f"Generated {new_features} interaction features") - + return result_df - + except Exception as e: self.logger.error(f"Error generating interaction features: {str(e)}") return df @@ -519,50 +583,61 @@ def run_hybrid_feature_engineering( include_interactions: bool = True, temporal_window: int = 5, 
temporal_gap: int = 1, - ) -> Tuple[pd.DataFrame, Dict[str, List]]: + ) -> tuple[pd.DataFrame, dict[str, list]]: """ Run the complete hybrid feature engineering pipeline. - + Args: df: Input dataframe with existing features include_temporal: Whether to generate temporal features - include_relational: Whether to generate relational features + include_relational: Whether to generate relational features include_interactions: Whether to generate interaction features temporal_window: Window length for temporal features temporal_gap: Gap for temporal features - + Returns: Tuple of (augmented_dataframe, feature_definitions_dict) """ try: # OPTIMIZATION: Memory monitoring and performance tracking initial_memory = df.memory_usage(deep=True).sum() / 1024**2 - - self.logger.info("Starting optimized hybrid feature engineering pipeline", - extra={ - 'include_temporal': include_temporal, - 'include_relational': include_relational, - 'include_interactions': include_interactions, - 'temporal_window': temporal_window, - 'temporal_gap': temporal_gap, - 'max_depth': self.max_depth, - 'input_features': len(df.columns), - 'initial_memory_mb': initial_memory, - 'chunk_size': self.chunk_size - }) - + + self.logger.info( + "Starting optimized hybrid feature engineering pipeline", + extra={ + "include_temporal": include_temporal, + "include_relational": include_relational, + "include_interactions": include_interactions, + "temporal_window": temporal_window, + "temporal_gap": temporal_gap, + "max_depth": self.max_depth, + "input_features": len(df.columns), + "initial_memory_mb": initial_memory, + "chunk_size": self.chunk_size, + }, + ) + # OPTIMIZATION: Chunking for large datasets if len(df) > self.chunk_size: - self.logger.info(f"Dataset size ({len(df)}) exceeds chunk size ({self.chunk_size}), using chunked processing") - return self._run_chunked_feature_engineering(df, include_temporal, include_relational, include_interactions, temporal_window, temporal_gap) - + self.logger.info( + 
f"Dataset size ({len(df)}) exceeds chunk size ({self.chunk_size}), using chunked processing" + ) + return self._run_chunked_feature_engineering( + df, + include_temporal, + include_relational, + include_interactions, + temporal_window, + temporal_gap, + ) + # Step 1: Prepare data and create EntitySet entities = self.prepare_data_for_featuretools(df) entityset = self.create_entityset(entities) - + result_df = df.copy() all_feature_defs = {} - + # Step 2: Generate temporal features using rolling primitives if include_temporal: try: @@ -574,12 +649,12 @@ def run_hybrid_feature_engineering( Lag(periods=temporal_gap), Lag(periods=temporal_gap + 2), ] - + # Generate temporal features with warning suppression with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) warnings.filterwarnings("ignore", message="Could not infer format") - + temporal_fm, temporal_defs = ft.dfs( entityset=entityset, target_dataframe_name="matches", @@ -589,25 +664,35 @@ def run_hybrid_feature_engineering( n_jobs=self.n_jobs, verbose=False, ) - + if not temporal_fm.empty: # Select only new numeric features - new_temporal_cols = [col for col in temporal_fm.columns if col not in df.columns] + new_temporal_cols = [ + col for col in temporal_fm.columns if col not in df.columns + ] if new_temporal_cols: - temporal_features = temporal_fm[new_temporal_cols].select_dtypes(include=[np.number]) - + temporal_features = temporal_fm[new_temporal_cols].select_dtypes( + include=[np.number] + ) + # Add prefix to new features - temporal_features.columns = [f"ft_temporal_{col}" for col in temporal_features.columns] - - result_df = result_df.join(temporal_features, how='left') - all_feature_defs['temporal'] = temporal_defs - - self.logger.info(f"Added {len(temporal_features.columns)} temporal features") - + temporal_features.columns = [ + f"ft_temporal_{col}" for col in temporal_features.columns + ] + + result_df = result_df.join(temporal_features, how="left") + 
all_feature_defs["temporal"] = temporal_defs + + self.logger.info( + f"Added {len(temporal_features.columns)} temporal features" + ) + except Exception as e: self.logger.warning(f"Temporal feature generation failed: {str(e)}") - self.logger.info("Skipping temporal features due to error - this is normal for small datasets or insufficient temporal data") - + self.logger.info( + "Skipping temporal features due to error - this is normal for small datasets or insufficient temporal data" + ) + # Step 3: Generate relational features if include_relational and len(entityset.dataframes) > 1: try: @@ -615,7 +700,7 @@ def run_hybrid_feature_engineering( with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) warnings.filterwarnings("ignore", message="Could not infer format") - + relational_fm, relational_defs = ft.dfs( entityset=entityset, target_dataframe_name="matches", @@ -625,86 +710,104 @@ def run_hybrid_feature_engineering( n_jobs=self.n_jobs, verbose=False, ) - + if not relational_fm.empty: # Select only new numeric features - new_relational_cols = [col for col in relational_fm.columns if col not in df.columns] + new_relational_cols = [ + col for col in relational_fm.columns if col not in df.columns + ] if new_relational_cols: - relational_features = relational_fm[new_relational_cols].select_dtypes(include=[np.number]) - + relational_features = relational_fm[new_relational_cols].select_dtypes( + include=[np.number] + ) + # Add prefix to new features - relational_features.columns = [f"ft_relational_{col}" for col in relational_features.columns] - - result_df = result_df.join(relational_features, how='left') - all_feature_defs['relational'] = relational_defs - - self.logger.info(f"Added {len(relational_features.columns)} relational features") - + relational_features.columns = [ + f"ft_relational_{col}" for col in relational_features.columns + ] + + result_df = result_df.join(relational_features, how="left") + all_feature_defs["relational"] = 
relational_defs + + self.logger.info( + f"Added {len(relational_features.columns)} relational features" + ) + except Exception as e: self.logger.warning(f"Relational feature generation failed: {str(e)}") - self.logger.info("Skipping relational features due to error - this is normal for datasets without sufficient entity relationships") - + self.logger.info( + "Skipping relational features due to error - this is normal for datasets without sufficient entity relationships" + ) + # Step 4: Generate interaction features if include_interactions: result_df = self.generate_interaction_features(result_df) - all_feature_defs['interactions'] = ['soccer_specific_interactions'] - + all_feature_defs["interactions"] = ["soccer_specific_interactions"] + # Step 5: Clean up and validate result_df = self._clean_generated_features(result_df, df) - + # OPTIMIZATION: Performance and memory reporting final_memory = result_df.memory_usage(deep=True).sum() / 1024**2 new_features = len(result_df.columns) - len(df.columns) memory_increase = final_memory - initial_memory - + # Update performance stats - self._feature_generation_stats.update({ - 'total_new_features': new_features, - 'memory_usage_mb': final_memory, - 'memory_increase_mb': memory_increase - }) - - self.logger.info(f"Hybrid feature engineering completed. Added {new_features} new features", - extra={ - 'new_features': new_features, - 'final_memory_mb': final_memory, - 'memory_increase_mb': memory_increase, - 'features_per_mb': new_features / max(memory_increase, 0.1), - 'output_features': len(result_df.columns) - }) - + self._feature_generation_stats.update( + { + "total_new_features": new_features, + "memory_usage_mb": final_memory, + "memory_increase_mb": memory_increase, + } + ) + + self.logger.info( + f"Hybrid feature engineering completed. 
Added {new_features} new features", + extra={ + "new_features": new_features, + "final_memory_mb": final_memory, + "memory_increase_mb": memory_increase, + "features_per_mb": new_features / max(memory_increase, 0.1), + "output_features": len(result_df.columns), + }, + ) + return result_df, all_feature_defs - + except Exception as e: - self.logger.error(f"Error in hybrid feature engineering: {str(e)}", - extra={'function': 'run_hybrid_feature_engineering'}) + self.logger.error( + f"Error in hybrid feature engineering: {str(e)}", + extra={"function": "run_hybrid_feature_engineering"}, + ) raise - def _clean_generated_features(self, result_df: pd.DataFrame, original_df: pd.DataFrame) -> pd.DataFrame: + def _clean_generated_features( + self, result_df: pd.DataFrame, original_df: pd.DataFrame + ) -> pd.DataFrame: """ Clean and validate generated features with improved filtering. - + Args: result_df: Dataframe with generated features original_df: Original input dataframe - + Returns: Cleaned dataframe """ try: # Handle infinite values result_df = result_df.replace([np.inf, -np.inf], np.nan) - + # Fill NaN values with 0 for new features only new_columns = [col for col in result_df.columns if col not in original_df.columns] result_df[new_columns] = result_df[new_columns].fillna(0) - + # Enhanced feature filtering - be less aggressive features_to_remove = [] - + for col in new_columns: col_data = result_df[col] - + # Remove only truly problematic features if col_data.std() == 0 and col_data.nunique() <= 1: # Only remove if completely constant (all same value) @@ -715,51 +818,57 @@ def _clean_generated_features(self, result_df: pd.DataFrame, original_df: pd.Dat elif abs(col_data.max()) < 1e-10 and abs(col_data.min()) < 1e-10: # Remove if all values are essentially zero features_to_remove.append(col) - + if features_to_remove: result_df = result_df.drop(columns=features_to_remove) - self.logger.info(f"Removed {len(features_to_remove)} problematic features 
(constant/missing/zero)") + self.logger.info( + f"Removed {len(features_to_remove)} problematic features (constant/missing/zero)" + ) else: self.logger.info("No problematic features found - all generated features retained") - + return result_df - + except Exception as e: self.logger.error(f"Error cleaning generated features: {str(e)}") return result_df def _run_chunked_feature_engineering( - self, - df: pd.DataFrame, - include_temporal: bool, - include_relational: bool, - include_interactions: bool, - temporal_window: int, - temporal_gap: int - ) -> Tuple[pd.DataFrame, Dict[str, List]]: + self, + df: pd.DataFrame, + include_temporal: bool, + include_relational: bool, + include_interactions: bool, + temporal_window: int, + temporal_gap: int, + ) -> tuple[pd.DataFrame, dict[str, list]]: """Run feature engineering in chunks for large datasets.""" try: self.logger.info(f"Processing {len(df)} rows in chunks of {self.chunk_size}") - - chunks = [df[i:i + self.chunk_size] for i in range(0, len(df), self.chunk_size)] + + chunks = [df[i : i + self.chunk_size] for i in range(0, len(df), self.chunk_size)] processed_chunks = [] all_feature_defs = {} - + for i, chunk in enumerate(chunks): - self.logger.debug(f"Processing chunk {i+1}/{len(chunks)}") - + self.logger.debug(f"Processing chunk {i + 1}/{len(chunks)}") + # Temporarily disable chunking for recursive call original_chunk_size = self.chunk_size self.chunk_size = len(chunk) + 1 # Ensure no further chunking - + try: # Process chunk with regular method chunk_result, chunk_defs = self.run_hybrid_feature_engineering( - chunk, include_temporal, include_relational, include_interactions, - temporal_window, temporal_gap + chunk, + include_temporal, + include_relational, + include_interactions, + temporal_window, + temporal_gap, ) processed_chunks.append(chunk_result) - + # Merge feature definitions for key, value in chunk_defs.items(): if key not in all_feature_defs: @@ -767,79 +876,87 @@ def _run_chunked_feature_engineering( 
finally: # Restore original chunk size self.chunk_size = original_chunk_size - + # Combine all chunks result_df = pd.concat(processed_chunks, ignore_index=True) - - self.logger.info(f"Chunked processing completed. Total features: {len(result_df.columns)}") + + self.logger.info( + f"Chunked processing completed. Total features: {len(result_df.columns)}" + ) return result_df, all_feature_defs - + except Exception as e: self.logger.error(f"Error in chunked feature engineering: {str(e)}") raise def evaluate_feature_importance( - self, - X: pd.DataFrame, - y: pd.Series, - new_feature_names: List[str], - method: str = 'combined', - top_k: int = 50 - ) -> List[str]: + self, + X: pd.DataFrame, + y: pd.Series, + new_feature_names: list[str], + method: str = "combined", + top_k: int = 50, + ) -> list[str]: """ Optimized feature importance evaluation using multiple methods. - + Args: X: Feature matrix y: Target variable new_feature_names: List of new feature names to evaluate method: Evaluation method ('mutual_info', 'correlation', 'variance', 'combined') top_k: Number of top features to return - + Returns: List of top feature names ranked by importance """ try: from sklearn.feature_selection import mutual_info_classif from sklearn.preprocessing import StandardScaler - + if len(new_feature_names) == 0: return [] - - self.logger.info("Starting optimized feature importance evaluation", - extra={'num_features': len(new_feature_names), 'method': method}) - + + self.logger.info( + "Starting optimized feature importance evaluation", + extra={"num_features": len(new_feature_names), "method": method}, + ) + # OPTIMIZATION: Memory-efficient feature selection X_new = X[new_feature_names].copy() X_new = self._optimize_data_types(X_new) # Apply data type optimization - + # Remove any remaining NaN/inf values X_new = X_new.replace([np.inf, -np.inf], np.nan).fillna(0) - + feature_scores = {} - - if method in ['mutual_info', 'combined']: + + if method in ["mutual_info", "combined"]: # 
Mutual information for classification try: mi_scores = mutual_info_classif(X_new, y, random_state=42) for i, feature in enumerate(new_feature_names): feature_scores[f"{feature}_mi"] = mi_scores[i] - self.logger.info(f"Calculated mutual information scores for {len(new_feature_names)} features") + self.logger.info( + f"Calculated mutual information scores for {len(new_feature_names)} features" + ) except Exception as e: self.logger.warning(f"Mutual information calculation failed: {str(e)}") - - if method in ['correlation', 'combined']: + + if method in ["correlation", "combined"]: # Correlation with target try: for feature in new_feature_names: corr = abs(X_new[feature].corr(y)) if not np.isnan(corr): feature_scores[f"{feature}_corr"] = corr - self.logger.info(f"Calculated correlation scores for {len(new_feature_names)} features") + self.logger.info( + f"Calculated correlation scores for {len(new_feature_names)} features" + ) except Exception as e: self.logger.warning(f"Correlation calculation failed: {str(e)}") - - if method in ['variance', 'combined']: + + if method in ["variance", "combined"]: # Variance-based scoring (higher variance = more informative) try: scaler = StandardScaler() @@ -847,14 +964,16 @@ def evaluate_feature_importance( for i, feature in enumerate(new_feature_names): variance = np.var(X_scaled[:, i]) feature_scores[f"{feature}_var"] = variance - self.logger.info(f"Calculated variance scores for {len(new_feature_names)} features") + self.logger.info( + f"Calculated variance scores for {len(new_feature_names)} features" + ) except Exception as e: self.logger.warning(f"Variance calculation failed: {str(e)}") - + if not feature_scores: self.logger.warning("No feature scores calculated, returning original list") return new_feature_names[:top_k] - + # Combine scores by feature combined_scores = {} for feature in new_feature_names: @@ -865,46 +984,52 @@ def evaluate_feature_importance( scores.append(feature_scores[f"{feature}_corr"]) if 
f"{feature}_var" in feature_scores: scores.append(feature_scores[f"{feature}_var"]) - + if scores: # Use mean of available scores combined_scores[feature] = np.mean(scores) else: combined_scores[feature] = 0.0 - + # Sort by combined score ranked_features = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) top_features = [feature for feature, score in ranked_features[:top_k]] - - self.logger.info(f"Feature importance evaluation completed", - extra={ - 'selected_features': len(top_features), - 'method': method, - 'top_feature_score': max(combined_scores.values()) if combined_scores else 0, - 'avg_feature_score': np.mean(list(combined_scores.values())) if combined_scores else 0 - }) - + + self.logger.info( + "Feature importance evaluation completed", + extra={ + "selected_features": len(top_features), + "method": method, + "top_feature_score": max(combined_scores.values()) if combined_scores else 0, + "avg_feature_score": np.mean(list(combined_scores.values())) + if combined_scores + else 0, + }, + ) + # Log top feature scores for debugging if ranked_features: self.logger.debug(f"Top 5 feature scores: {ranked_features[:5]}") - + return top_features - + except Exception as e: - self.logger.error(f"Error in feature importance evaluation: {str(e)}", - extra={'function': 'evaluate_feature_importance'}) + self.logger.error( + f"Error in feature importance evaluation: {str(e)}", + extra={"function": "evaluate_feature_importance"}, + ) return new_feature_names[:top_k] - - def get_performance_stats(self) -> Dict[str, Union[int, float]]: + + def get_performance_stats(self) -> dict[str, Union[int, float]]: """Get current performance statistics.""" return self._feature_generation_stats.copy() - + def reset_performance_stats(self) -> None: """Reset performance statistics.""" self._feature_generation_stats = { - 'temporal_features': 0, - 'relational_features': 0, - 'interaction_features': 0, - 'memory_usage_mb': 0 + "temporal_features": 0, + "relational_features": 
0, + "interaction_features": 0, + "memory_usage_mb": 0, } - self.logger.debug("Performance statistics reset") \ No newline at end of file + self.logger.debug("Performance statistics reset") diff --git a/src/utils/featuretools_demo.py b/src/utils/featuretools_demo.py index 75e1626..367f7e0 100644 --- a/src/utils/featuretools_demo.py +++ b/src/utils/featuretools_demo.py @@ -23,95 +23,92 @@ logger = ExperimentLogger(experiment_name="featuretools_demo") - def load_real_data() -> pd.DataFrame: """ Load real soccer match data using the project's DataLoader. This provides actual match data with all ~300 engineered features. """ logger.info("Loading real soccer match data using DataLoader...") - + # Initialize the data loader data_loader = DataLoader(experiment_name="featuretools_real_data") - + # Load all data splits X_train, y_train, X_test, y_test, X_val, y_val = data_loader.load_data() - + # Combine all data for feature engineering demonstration # In production, you would typically only use training data for feature engineering logger.info("Combining all data splits for comprehensive feature engineering demo...") - + # Add target variable to each split for identification X_train_with_target = X_train.copy() - X_train_with_target['target'] = y_train - X_train_with_target['split'] = 'train' - + X_train_with_target["target"] = y_train + X_train_with_target["split"] = "train" + X_test_with_target = X_test.copy() - X_test_with_target['target'] = y_test - X_test_with_target['split'] = 'test' - + X_test_with_target["target"] = y_test + X_test_with_target["split"] = "test" + X_val_with_target = X_val.copy() - X_val_with_target['target'] = y_val - X_val_with_target['split'] = 'validation' - + X_val_with_target["target"] = y_val + X_val_with_target["split"] = "validation" + # Combine all splits - combined_df = pd.concat([ - X_train_with_target, - X_test_with_target, - X_val_with_target - ], ignore_index=True) - + combined_df = pd.concat( + [X_train_with_target, 
X_test_with_target, X_val_with_target], ignore_index=True + ) + # Add required columns for featuretools if they don't exist - if 'fixture_id' not in combined_df.columns: - combined_df['fixture_id'] = range(len(combined_df)) + if "fixture_id" not in combined_df.columns: + combined_df["fixture_id"] = range(len(combined_df)) logger.info("Added fixture_id column (generated sequence)") - - if 'date_encoded' not in combined_df.columns: + + if "date_encoded" not in combined_df.columns: # Create a date sequence based on row order (assuming chronological order) - combined_df['date_encoded'] = pd.date_range( - start='2020-01-01', - periods=len(combined_df), - freq='D' + combined_df["date_encoded"] = pd.date_range( + start="2020-01-01", periods=len(combined_df), freq="D" ) logger.info("Added date_encoded column (generated date sequence)") - + # Ensure proper data types for featuretools - combined_df['fixture_id'] = combined_df['fixture_id'].astype(int) - if 'home_encoded' in combined_df.columns: - combined_df['home_encoded'] = combined_df['home_encoded'].astype(int) - if 'away_encoded' in combined_df.columns: - combined_df['away_encoded'] = combined_df['away_encoded'].astype(int) - if 'venue_encoded' in combined_df.columns: - combined_df['venue_encoded'] = combined_df['venue_encoded'].astype(int) - if 'league_encoded' in combined_df.columns: - combined_df['league_encoded'] = combined_df['league_encoded'].astype(int) - - logger.info(f"Loaded real dataset with {len(combined_df)} rows and {len(combined_df.columns)} features") + combined_df["fixture_id"] = combined_df["fixture_id"].astype(int) + if "home_encoded" in combined_df.columns: + combined_df["home_encoded"] = combined_df["home_encoded"].astype(int) + if "away_encoded" in combined_df.columns: + combined_df["away_encoded"] = combined_df["away_encoded"].astype(int) + if "venue_encoded" in combined_df.columns: + combined_df["venue_encoded"] = combined_df["venue_encoded"].astype(int) + if "league_encoded" in 
combined_df.columns: + combined_df["league_encoded"] = combined_df["league_encoded"].astype(int) + + logger.info( + f"Loaded real dataset with {len(combined_df)} rows and {len(combined_df.columns)} features" + ) logger.info(f"Data splits: Train={len(X_train)}, Test={len(X_test)}, Validation={len(X_val)}") logger.info(f"Draw rate: {combined_df['target'].mean():.2%}") - + # Log feature categories feature_names = data_loader.get_feature_names() logger.info(f"Available features from DataLoader: {len(feature_names)}") - + return combined_df def load_sample_real_data(sample_size: int = 1000) -> pd.DataFrame: """ Load a sample of real data for faster demonstration. - + Args: sample_size: Number of samples to use for demo - + Returns: Sampled dataframe with real features """ logger.info(f"Loading sample of {sample_size} rows from real data for demo...") - + # Load full real data full_df = load_real_data() - + # Sample for faster processing if len(full_df) > sample_size: sampled_df = full_df.sample(n=sample_size, random_state=42).reset_index(drop=True) @@ -119,44 +116,46 @@ def load_sample_real_data(sample_size: int = 1000) -> pd.DataFrame: else: sampled_df = full_df logger.info(f"Using all {len(full_df)} rows (less than requested sample size)") - + return sampled_df def demonstrate_featuretools_pipeline(): """Demonstrate the complete featuretools feature engineering pipeline.""" - + logger.info("=== Featuretools Automated Feature Engineering Demo ===") - + # Step 1: Load real data (sampled for demo performance) logger.info("Step 1: Loading real soccer match data (sampled for demo)...") df = load_sample_real_data(sample_size=2000) # Use 2000 samples for demo original_feature_count = len(df.columns) - + # Step 2: Initialize the feature engineer logger.info("Step 2: Initializing SoccerFeaturetoolsEngineer...") engineer = SoccerFeaturetoolsEngineer( logger=logger, mlflow_tracking=True, max_depth=2, - n_jobs=1 # CPU-only constraint from project requirements + n_jobs=1, # 
CPU-only constraint from project requirements ) - + # Step 3: Run hybrid feature engineering logger.info("Step 3: Running hybrid feature engineering...") - + try: # Start MLflow experiment mlflow.set_experiment("featuretools_soccer_demo") - + with mlflow.start_run(run_name="hybrid_feature_engineering"): # Log original data info - mlflow.log_params({ - 'original_features': original_feature_count, - 'data_samples': len(df), - 'approach': 'hybrid_temporal_relational_interactions' - }) - + mlflow.log_params( + { + "original_features": original_feature_count, + "data_samples": len(df), + "approach": "hybrid_temporal_relational_interactions", + } + ) + # Run feature engineering augmented_df, feature_defs = engineer.run_hybrid_feature_engineering( df=df, @@ -164,45 +163,53 @@ def demonstrate_featuretools_pipeline(): include_relational=True, include_interactions=True, temporal_window=5, - temporal_gap=1 + temporal_gap=1, ) - + # Calculate results new_feature_count = len(augmented_df.columns) added_features = new_feature_count - original_feature_count - + # Log results - mlflow.log_metrics({ - 'original_features': original_feature_count, - 'new_features': new_feature_count, - 'added_features': added_features, - 'feature_increase_ratio': added_features / original_feature_count - }) - - logger.info(f" Feature engineering completed successfully!") - logger.info(f" Original features: {original_feature_count} (real engineered features)") + mlflow.log_metrics( + { + "original_features": original_feature_count, + "new_features": new_feature_count, + "added_features": added_features, + "feature_increase_ratio": added_features / original_feature_count, + } + ) + + logger.info(" Feature engineering completed successfully!") + logger.info( + f" Original features: {original_feature_count} (real engineered features)" + ) logger.info(f" New features: {new_feature_count}") logger.info(f" Added features: {added_features}") logger.info(f" Increase ratio: {added_features / 
original_feature_count:.2%}") - logger.info(f" Data source: Real soccer match data from DataLoader") - + logger.info(" Data source: Real soccer match data from DataLoader") + # Step 4: Analyze new features logger.info("Step 4: Analyzing new features...") - + new_feature_names = [col for col in augmented_df.columns if col not in df.columns] - + if new_feature_names: logger.info("New features generated:") - + # Group by feature type - temporal_features = [f for f in new_feature_names if 'ft_temporal_' in f] - relational_features = [f for f in new_feature_names if 'ft_relational_' in f] - interaction_features = [f for f in new_feature_names if f.startswith('ft_') and 'temporal' not in f and 'relational' not in f] - + temporal_features = [f for f in new_feature_names if "ft_temporal_" in f] + relational_features = [f for f in new_feature_names if "ft_relational_" in f] + interaction_features = [ + f + for f in new_feature_names + if f.startswith("ft_") and "temporal" not in f and "relational" not in f + ] + logger.info(f" - Temporal features: {len(temporal_features)}") logger.info(f" - Relational features: {len(relational_features)}") logger.info(f" - Interaction features: {len(interaction_features)}") - + # Log feature examples if temporal_features: logger.info(f" Temporal examples: {temporal_features[:3]}") @@ -210,47 +217,51 @@ def demonstrate_featuretools_pipeline(): logger.info(f" Relational examples: {relational_features[:3]}") if interaction_features: logger.info(f" Interaction examples: {interaction_features[:3]}") - + # Save feature definitions feature_def_path = "featuretools_feature_definitions.json" engineer.save_feature_definitions(feature_defs, feature_def_path) - + # Log artifact mlflow.log_artifact(feature_def_path) - + else: logger.warning("No new features were generated") - + # Step 5: Data quality checks logger.info("Step 5: Performing data quality checks...") - + # Check for missing values in new features if new_feature_names: missing_stats = 
augmented_df[new_feature_names].isnull().sum() high_missing = missing_stats[missing_stats > len(augmented_df) * 0.1] - + if len(high_missing) > 0: logger.warning(f"Features with >10% missing values: {len(high_missing)}") else: logger.info("No features with excessive missing values") - + # Check for infinite values - inf_count = np.isinf(augmented_df[new_feature_names].select_dtypes(include=[np.number])).sum().sum() + inf_count = ( + np.isinf(augmented_df[new_feature_names].select_dtypes(include=[np.number])) + .sum() + .sum() + ) if inf_count > 0: logger.warning(f"Found {inf_count} infinite values in new features") else: logger.info("No infinite values found in new features") - + # Basic statistics logger.info("Feature statistics summary:") stats = augmented_df[new_feature_names].describe() logger.info(f" Mean std: {stats.loc['std'].mean():.4f}") logger.info(f" Zero variance features: {(stats.loc['std'] == 0).sum()}") - + logger.info("=== Demo completed successfully! ===") - + return augmented_df, feature_defs - + except Exception as e: logger.error(f"Error in feature engineering pipeline: {str(e)}") raise @@ -258,162 +269,182 @@ def demonstrate_featuretools_pipeline(): def demonstrate_feature_integration(): """Demonstrate how to integrate new features with existing ensemble models using intelligent selection.""" - + logger.info("=== Enhanced Feature Integration Demo ===") - + # Load only training data for feature engineering (best practice) logger.info("Loading training data only for feature engineering...") data_loader = DataLoader(experiment_name="featuretools_integration") X_train, y_train, _, _, _, _ = data_loader.load_data() - + # Add required columns for featuretools if they don't exist - if 'fixture_id' not in X_train.columns: - X_train['fixture_id'] = range(len(X_train)) - if 'date_encoded' not in X_train.columns: - X_train['date_encoded'] = pd.date_range(start='2020-01-01', periods=len(X_train), freq='D') - + if "fixture_id" not in X_train.columns: + 
X_train["fixture_id"] = range(len(X_train)) + if "date_encoded" not in X_train.columns: + X_train["date_encoded"] = pd.date_range(start="2020-01-01", periods=len(X_train), freq="D") + # Add team and venue IDs if missing (for demonstration) - for col, max_val in [('home_encoded', 21), ('away_encoded', 21), ('venue_encoded', 11), ('league_encoded', 6)]: + for col, max_val in [ + ("home_encoded", 21), + ("away_encoded", 21), + ("venue_encoded", 11), + ("league_encoded", 6), + ]: if col not in X_train.columns: X_train[col] = np.random.randint(1, max_val, len(X_train)) - + # Ensure proper data types - for col in ['fixture_id', 'home_encoded', 'away_encoded', 'venue_encoded', 'league_encoded']: + for col in ["fixture_id", "home_encoded", "away_encoded", "venue_encoded", "league_encoded"]: if col in X_train.columns: X_train[col] = X_train[col].astype(int) - + logger.info(f"Training data prepared: {X_train.shape}") - + # Run feature engineering on training data engineer = SoccerFeaturetoolsEngineer(logger=logger, mlflow_tracking=False) - + augmented_df, feature_defs = engineer.run_hybrid_feature_engineering( - df=X_train, - include_temporal=True, - include_relational=True, - include_interactions=True + df=X_train, include_temporal=True, include_relational=True, include_interactions=True ) - + # Get new features new_features = [col for col in augmented_df.columns if col not in X_train.columns] - + if new_features: logger.info(f"Generated {len(new_features)} new features for ensemble integration") - + # Enhanced feature selection using multiple evaluation methods logger.info("Evaluating feature importance using multiple methods...") - + # Evaluate features using different methods top_features_mi = engineer.evaluate_feature_importance( - X=augmented_df, - y=y_train, + X=augmented_df, + y=y_train, new_feature_names=new_features, - method='mutual_info', - top_k=50 + method="mutual_info", + top_k=50, ) - + top_features_corr = engineer.evaluate_feature_importance( - 
X=augmented_df, - y=y_train, + X=augmented_df, + y=y_train, new_feature_names=new_features, - method='correlation', - top_k=50 + method="correlation", + top_k=50, ) - + top_features_combined = engineer.evaluate_feature_importance( - X=augmented_df, - y=y_train, + X=augmented_df, + y=y_train, new_feature_names=new_features, - method='combined', - top_k=100 # Get more for model-specific selection + method="combined", + top_k=100, # Get more for model-specific selection ) - + logger.info(f"Top features by mutual info: {len(top_features_mi)}") logger.info(f"Top features by correlation: {len(top_features_corr)}") logger.info(f"Top features by combined score: {len(top_features_combined)}") - + # Load existing feature selections - with open('src/utils/selected_features_ensemble_new.json', 'r') as f: + with open("src/utils/selected_features_ensemble_new.json") as f: existing_selections = json.load(f) - + # Create updated feature selections with intelligent selection updated_selections = existing_selections.copy() - + # Model-specific feature selection strategy model_feature_strategies = { - 'xgb': top_features_combined[:30], # XGBoost handles many features well - 'catboost': top_features_mi[:25], # CatBoost + mutual info for categorical handling - 'mlp': top_features_corr[:20], # Neural networks + correlation for linear relationships - 'pytorch': top_features_combined[:35] # PyTorch can handle complex interactions + "xgb": top_features_combined[:30], # XGBoost handles many features well + "catboost": top_features_mi[:25], # CatBoost + mutual info for categorical handling + "mlp": top_features_corr[:20], # Neural networks + correlation for linear relationships + "pytorch": top_features_combined[:35], # PyTorch can handle complex interactions } - + # Add strategically selected features to each model for model_name, selected_features in model_feature_strategies.items(): if model_name in updated_selections: # Remove duplicates while preserving order new_model_features = [] 
existing_model_features = set(updated_selections[model_name]) - + for feature in selected_features: if feature not in existing_model_features: new_model_features.append(feature) existing_model_features.add(feature) - + updated_selections[model_name].extend(new_model_features) - logger.info(f"Added {len(new_model_features)} strategically selected features to {model_name}") + logger.info( + f"Added {len(new_model_features)} strategically selected features to {model_name}" + ) logger.info(f" {model_name} examples: {new_model_features[:3]}") - + # Update 'all' selection with all top features (removing duplicates) - all_top_features = list(dict.fromkeys(top_features_combined)) # Remove duplicates while preserving order - updated_selections['all'].extend(all_top_features) - + all_top_features = list( + dict.fromkeys(top_features_combined) + ) # Remove duplicates while preserving order + updated_selections["all"].extend(all_top_features) + # Save updated selections with metadata output_data = { - 'feature_selections': updated_selections, - 'metadata': { - 'generation_timestamp': pd.Timestamp.now().isoformat(), - 'total_generated_features': len(new_features), - 'features_selected_for_models': len(all_top_features), - 'selection_methods': ['mutual_info', 'correlation', 'combined'], - 'model_strategies': { - 'xgb': 'Combined score (handles many features)', - 'catboost': 'Mutual information (categorical handling)', - 'mlp': 'Correlation (linear relationships)', - 'pytorch': 'Combined score (complex interactions)' - } - } + "feature_selections": updated_selections, + "metadata": { + "generation_timestamp": pd.Timestamp.now().isoformat(), + "total_generated_features": len(new_features), + "features_selected_for_models": len(all_top_features), + "selection_methods": ["mutual_info", "correlation", "combined"], + "model_strategies": { + "xgb": "Combined score (handles many features)", + "catboost": "Mutual information (categorical handling)", + "mlp": "Correlation (linear 
relationships)", + "pytorch": "Combined score (complex interactions)", + }, + }, } - + output_path = "enhanced_selected_features_with_featuretools.json" - with open(output_path, 'w') as f: + with open(output_path, "w") as f: json.dump(output_data, f, indent=2) - + logger.info(f"Enhanced feature selections saved to {output_path}") logger.info(f"Total features in 'all': {len(updated_selections['all'])}") - logger.info(f"Selection efficiency: {len(all_top_features)}/{len(new_features)} = {len(all_top_features)/len(new_features)*100:.1f}%") - + logger.info( + f"Selection efficiency: {len(all_top_features)}/{len(new_features)} = {len(all_top_features) / len(new_features) * 100:.1f}%" + ) + # Create feature analysis report feature_analysis = { - 'temporal_features': [f for f in new_features if 'ft_temporal_' in f], - 'relational_features': [f for f in new_features if 'ft_relational_' in f], - 'interaction_features': [f for f in new_features if f.startswith('ft_') and 'temporal' not in f and 'relational' not in f], - 'selected_temporal': [f for f in all_top_features if 'ft_temporal_' in f], - 'selected_relational': [f for f in all_top_features if 'ft_relational_' in f], - 'selected_interaction': [f for f in all_top_features if f.startswith('ft_') and 'temporal' not in f and 'relational' not in f] + "temporal_features": [f for f in new_features if "ft_temporal_" in f], + "relational_features": [f for f in new_features if "ft_relational_" in f], + "interaction_features": [ + f + for f in new_features + if f.startswith("ft_") and "temporal" not in f and "relational" not in f + ], + "selected_temporal": [f for f in all_top_features if "ft_temporal_" in f], + "selected_relational": [f for f in all_top_features if "ft_relational_" in f], + "selected_interaction": [ + f + for f in all_top_features + if f.startswith("ft_") and "temporal" not in f and "relational" not in f + ], } - + logger.info("Feature type analysis:") for feature_type, features in feature_analysis.items(): - if 
'selected_' in feature_type: - original_type = feature_type.replace('selected_', '') + if "selected_" in feature_type: + original_type = feature_type.replace("selected_", "") if original_type in feature_analysis: original_count = len(feature_analysis[original_type]) selected_count = len(features) if original_count > 0: - logger.info(f" {feature_type}: {selected_count}/{original_count} ({selected_count/original_count*100:.1f}%)") + logger.info( + f" {feature_type}: {selected_count}/{original_count} ({selected_count / original_count * 100:.1f}%)" + ) else: - logger.info(f" {feature_type}: {selected_count}/0 (no original features of this type)") - + logger.info( + f" {feature_type}: {selected_count}/0 (no original features of this type)" + ) + else: logger.warning("No new features generated for integration") @@ -423,19 +454,19 @@ def demonstrate_feature_integration(): try: # Main feature engineering demo augmented_df, feature_defs = demonstrate_featuretools_pipeline() - + # Feature integration demo demonstrate_feature_integration() - - print("\n" + "="*60) + + print("\n" + "=" * 60) print("🎉 Featuretools demonstration completed successfully!") - print("="*60) + print("=" * 60) print("\nNext steps:") print("1. Integrate new features into your ensemble models") print("2. Evaluate model performance with augmented features") print("3. Use feature importance to select best new features") print("4. 
Update your training pipeline with the new features") - + except Exception as e: logger.error(f"Demo failed: {str(e)}") - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/src/utils/logger.py b/src/utils/logger.py index 563f3e7..52a8001 100644 --- a/src/utils/logger.py +++ b/src/utils/logger.py @@ -5,7 +5,6 @@ import logging import os import sys -import unicodedata from logging.handlers import RotatingFileHandler from pathlib import Path from typing import Any, Optional @@ -19,80 +18,83 @@ def sanitize_unicode_for_console(text: str) -> str: """ Sanitize Unicode text for console output, replacing problematic characters. - + Args: text: Input text that may contain Unicode characters - + Returns: Sanitized text safe for console output """ # Dictionary of common emoji replacements for console-safe alternatives emoji_replacements = { - '📊': '[CHART]', - '🎯': '[TARGET]', - '🚀': '[ROCKET]', - '✓': '[OK]', - '✗': '[FAIL]', - '⚠': '[WARN]', - '🎉': '[SUCCESS]', - '📈': '[GRAPH]', - '🔍': '[SEARCH]', - '⭐': '[STAR]', - '🔧': '[TOOL]', - '📦': '[PACKAGE]', - '⚡': '[FAST]', - '🎲': '[DICE]', - '🧠': '[BRAIN]', - '🏆': '[TROPHY]', - '🔥': '[FIRE]', - '💡': '[IDEA]', - '📝': '[NOTE]', - '🎪': '[CIRCUS]', - '🎨': '[ART]', - '🌟': '[SPARKLE]', - '🎵': '[MUSIC]', - '🎭': '[THEATER]', - '🎬': '[MOVIE]', - '🎮': '[GAME]', - '🎸': '[GUITAR]', - '🎤': '[MIC]', - '🎧': '[HEADPHONE]', - '🎺': '[TRUMPET]', - '🎻': '[VIOLIN]', - '🥁': '[DRUM]', - '🎹': '[PIANO]', + "📊": "[CHART]", + "🎯": "[TARGET]", + "🚀": "[ROCKET]", + "✓": "[OK]", + "✗": "[FAIL]", + "⚠": "[WARN]", + "🎉": "[SUCCESS]", + "📈": "[GRAPH]", + "🔍": "[SEARCH]", + "⭐": "[STAR]", + "🔧": "[TOOL]", + "📦": "[PACKAGE]", + "⚡": "[FAST]", + "🎲": "[DICE]", + "🧠": "[BRAIN]", + "🏆": "[TROPHY]", + "🔥": "[FIRE]", + "💡": "[IDEA]", + "📝": "[NOTE]", + "🎪": "[CIRCUS]", + "🎨": "[ART]", + "🌟": "[SPARKLE]", + "🎵": "[MUSIC]", + "🎭": "[THEATER]", + "🎬": "[MOVIE]", + "🎮": "[GAME]", + "🎸": "[GUITAR]", + "🎤": "[MIC]", + "🎧": "[HEADPHONE]", + "🎺": "[TRUMPET]", + 
"🎻": "[VIOLIN]", + "🥁": "[DRUM]", + "🎹": "[PIANO]", } - + # Replace known emojis first sanitized = text for emoji, replacement in emoji_replacements.items(): sanitized = sanitized.replace(emoji, replacement) - + # Handle any remaining problematic Unicode characters try: # Try to encode with the system's default encoding - sanitized.encode(sys.stdout.encoding or 'utf-8', errors='strict') + sanitized.encode(sys.stdout.encoding or "utf-8", errors="strict") return sanitized except (UnicodeEncodeError, LookupError): # If that fails, replace problematic characters try: # Try UTF-8 first - sanitized.encode('utf-8', errors='strict') + sanitized.encode("utf-8", errors="strict") return sanitized except UnicodeEncodeError: # Last resort: replace all non-ASCII characters - return ''.join(char if ord(char) < 128 else f'[U+{ord(char):04X}]' for char in sanitized) + return "".join( + char if ord(char) < 128 else f"[U+{ord(char):04X}]" for char in sanitized + ) class UnicodeAwareFormatter(logging.Formatter): """ Custom formatter that handles Unicode characters safely. """ + def format(self, record): # Sanitize the message for console output - if hasattr(record, 'msg') and isinstance(record.msg, str): + if hasattr(record, "msg") and isinstance(record.msg, str): record.msg = sanitize_unicode_for_console(record.msg) - + # Format the basic message record.extra_fields = "" if hasattr(record, "extra"): @@ -118,18 +120,19 @@ class UnicodeAwareStreamHandler(logging.StreamHandler): """ Stream handler that properly handles Unicode encoding issues. """ + def __init__(self, stream=None): super().__init__(stream) - + if stream is None: stream = sys.stdout - - if hasattr(stream, 'reconfigure'): + + if hasattr(stream, "reconfigure"): try: - stream.reconfigure(encoding='utf-8', errors='replace') + stream.reconfigure(encoding="utf-8", errors="replace") except (AttributeError, OSError): pass - + def emit(self, record): """ Emit a record with proper Unicode handling. 
@@ -146,7 +149,7 @@ def emit(self, record): except Exception: # Last resort: print a simple error message try: - self.stream.write(f"[UNICODE ERROR] Log message could not be displayed\n") + self.stream.write("[UNICODE ERROR] Log message could not be displayed\n") self.stream.flush() except Exception: pass @@ -154,6 +157,7 @@ def emit(self, record): class ReadableFormatter(UnicodeAwareFormatter): """Legacy formatter name for backward compatibility.""" + pass @@ -239,19 +243,19 @@ def _configure_logging(self) -> None: # Create handlers with Unicode support file_handler = RotatingFileHandler( - self.log_file, - maxBytes=max_bytes, + self.log_file, + maxBytes=max_bytes, backupCount=backup_count, - encoding='utf-8' # Ensure UTF-8 encoding for file + encoding="utf-8", # Ensure UTF-8 encoding for file ) - + # Use Unicode-aware stream handler for console console_handler = UnicodeAwareStreamHandler(sys.stdout) # Configure formatters with Unicode support file_formatter = UnicodeAwareFormatter(log_format) console_formatter = UnicodeAwareFormatter(log_format) - + file_handler.setFormatter(file_formatter) console_handler.setFormatter(console_formatter) @@ -262,7 +266,7 @@ def _configure_logging(self) -> None: # Add handlers to the instance logger self.logger.addHandler(file_handler) self.logger.addHandler(console_handler) - + # CRITICAL: Disable propagation to prevent double logging self.logger.propagate = False @@ -288,17 +292,17 @@ def _ensure_stream_handler(self) -> None: """Ensure that the logger has at least one Unicode-aware StreamHandler attached.""" # Check if we already have a StreamHandler to avoid duplication has_stream_handler = any( - isinstance(handler, (logging.StreamHandler, UnicodeAwareStreamHandler)) + isinstance(handler, (logging.StreamHandler, UnicodeAwareStreamHandler)) for handler in self.logger.handlers ) - + if not has_stream_handler: console_handler = UnicodeAwareStreamHandler(sys.stdout) log_format = "%(asctime)s | %(levelname)-8s | %(name)s | 
%(message)s%(extra_fields)s" formatter = UnicodeAwareFormatter(log_format) console_handler.setFormatter(formatter) self.logger.addHandler(console_handler) - + # Maintain propagation setting to prevent double logging self.logger.propagate = False @@ -312,7 +316,7 @@ def _log(self, level: str, msg: str, extra: Optional[dict[str, Any]] = None) -> """ # Sanitize the message for Unicode safety safe_msg = sanitize_unicode_for_console(msg) - + if self.structured_logger: log_method = getattr(self.structured_logger, level.lower()) log_method(safe_msg, **(extra or {})) @@ -359,11 +363,11 @@ def error( def debug(self, msg: str, extra: Optional[dict[str, Any]] = None) -> None: """Log a debug message with Unicode support.""" self._log("DEBUG", msg, extra) - + def log_unicode_safe(self, level: str, msg: str, **kwargs) -> None: """ Explicitly Unicode-safe logging method. - + Args: level: Log level (INFO, WARNING, ERROR, DEBUG) msg: Message to log (will be sanitized) diff --git a/src/utils/mlflow_integration.py b/src/utils/mlflow_integration.py index 3e7bc98..3e458fb 100644 --- a/src/utils/mlflow_integration.py +++ b/src/utils/mlflow_integration.py @@ -23,6 +23,8 @@ # Initialize logger logger = ExperimentLogger(experiment_name="mlflow_integration", log_dir="logs/mlflow_integration") project_root = Path(__file__).parent.parent + + class MLflowIntegration: """MLflow integration for experiment tracking and model management.""" @@ -335,31 +337,31 @@ def cleanup_deleted_runs(mlruns_dir="mlruns"): print(f"Error processing experiment {exp.name} (ID: {exp.experiment_id}): {str(e)}") continue + def cleanup_empty_experiments(mlruns_dir="mlruns"): """Clean up experiments that have no runs. - + Args: mlruns_dir: Path to mlruns directory. Defaults to 'mlruns'. 
""" client = MlflowClient() experiments = client.search_experiments() - + for exp in experiments: print(f"Checking Experiment: {exp.name} (ID: {exp.experiment_id})") try: # Search for all runs (active and deleted) runs = client.search_runs( - [exp.experiment_id], - run_view_type=mlflow.entities.ViewType.ALL + [exp.experiment_id], run_view_type=mlflow.entities.ViewType.ALL ) - + if len(runs) == 0: print(f"Experiment {exp.name} has no runs - deleting...") exp_path = os.path.join(mlruns_dir, exp.experiment_id) - + # Delete from tracking server client.delete_experiment(exp.experiment_id) - + # Remove experiment directory if it exists if os.path.exists(exp_path): print(f"Removing experiment directory at {exp_path}") diff --git a/src/utils/outlier_detection.py b/src/utils/outlier_detection.py index 9f369ac..40b0483 100644 --- a/src/utils/outlier_detection.py +++ b/src/utils/outlier_detection.py @@ -24,7 +24,7 @@ def remove_outliers_isolation_forest( ) -> tuple[pd.DataFrame, pd.Series]: """ Remove outliers from training data using Isolation Forest with pre-fitted scaler. 
- + Args: X_train: Training features DataFrame y_train: Training labels Series @@ -32,21 +32,21 @@ def remove_outliers_isolation_forest( contamination: Expected proportion of outliers (default: 0.01 = 1%) random_state: Random state for reproducibility (default: 19) logger: Logger instance for tracking - + Returns: Tuple of cleaned datasets: (X_train_clean, y_train_clean) """ if logger is None: logger = ExperimentLogger(experiment_name="outlier_detection") - + logger.info("Starting outlier detection with Isolation Forest") logger.info(f"Training data shape before outlier removal: {X_train.shape}") - logger.info(f"Contamination rate: {contamination} ({contamination*100:.1f}%)") - + logger.info(f"Contamination rate: {contamination} ({contamination * 100:.1f}%)") + # Record original shapes and class distribution original_train_size = len(X_train) original_positive_rate = y_train.mean() - + try: # Step 1: Use pre-fitted scaler or create new one if scaler is not None: @@ -56,7 +56,7 @@ def remove_outliers_isolation_forest( logger.info("Creating new StandardScaler for outlier detection") scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) - + # Step 2: Fit Isolation Forest on scaled training data logger.info("Fitting Isolation Forest model") iso_forest = IsolationForest( @@ -65,33 +65,37 @@ def remove_outliers_isolation_forest( n_jobs=-1, # Use all available cores bootstrap=False, # For reproducibility ) - + # Predict outliers (-1 for outliers, 1 for inliers) outlier_predictions = iso_forest.fit_predict(X_train_scaled) - + # Step 3: Identify inliers and outliers inlier_mask = outlier_predictions == 1 outlier_count = np.sum(~inlier_mask) outlier_percentage = (outlier_count / original_train_size) * 100 - - logger.info(f"Detected {outlier_count} outliers ({outlier_percentage:.2f}% of training data)") - + + logger.info( + f"Detected {outlier_count} outliers ({outlier_percentage:.2f}% of training data)" + ) + # Step 4: Filter training data to keep only 
inliers X_train_clean = X_train.loc[inlier_mask].copy() y_train_clean = y_train.loc[inlier_mask].copy() - + # Reset indices to ensure clean sequential indexing X_train_clean.reset_index(drop=True, inplace=True) y_train_clean.reset_index(drop=True, inplace=True) - + # Log results new_train_size = len(X_train_clean) new_positive_rate = y_train_clean.mean() - + logger.info(f"Training data shape after outlier removal: {X_train_clean.shape}") logger.info(f"Removed {original_train_size - new_train_size} samples") - logger.info(f"Class distribution - Before: {original_positive_rate:.3f}, After: {new_positive_rate:.3f}") - + logger.info( + f"Class distribution - Before: {original_positive_rate:.3f}, After: {new_positive_rate:.3f}" + ) + # Check if class distribution changed significantly distribution_change = abs(new_positive_rate - original_positive_rate) if distribution_change > 0.05: # 5% threshold @@ -99,9 +103,9 @@ def remove_outliers_isolation_forest( f"Significant class distribution change detected: {distribution_change:.3f}. " "Consider adjusting contamination rate." ) - + return X_train_clean, y_train_clean - + except Exception as e: logger.error(f"Error during outlier detection: {str(e)}") logger.warning("Returning original datasets without outlier removal") @@ -117,63 +121,71 @@ def analyze_outlier_impact( ) -> dict: """ Analyze the impact of outlier removal on dataset characteristics. 
- + Args: X_before: Features before outlier removal y_before: Labels before outlier removal X_after: Features after outlier removal y_after: Labels after outlier removal logger: Logger instance - + Returns: Dictionary with analysis results """ if logger is None: logger = ExperimentLogger(experiment_name="outlier_analysis") - + analysis = {} - + # Basic statistics - analysis['samples_before'] = len(X_before) - analysis['samples_after'] = len(X_after) - analysis['samples_removed'] = analysis['samples_before'] - analysis['samples_after'] - analysis['removal_percentage'] = (analysis['samples_removed'] / analysis['samples_before']) * 100 - + analysis["samples_before"] = len(X_before) + analysis["samples_after"] = len(X_after) + analysis["samples_removed"] = analysis["samples_before"] - analysis["samples_after"] + analysis["removal_percentage"] = ( + analysis["samples_removed"] / analysis["samples_before"] + ) * 100 + # Class distribution - analysis['positive_rate_before'] = y_before.mean() - analysis['positive_rate_after'] = y_after.mean() - analysis['class_distribution_change'] = abs(analysis['positive_rate_after'] - analysis['positive_rate_before']) - + analysis["positive_rate_before"] = y_before.mean() + analysis["positive_rate_after"] = y_after.mean() + analysis["class_distribution_change"] = abs( + analysis["positive_rate_after"] - analysis["positive_rate_before"] + ) + # Feature statistics changes - analysis['feature_stats'] = {} + analysis["feature_stats"] = {} for col in X_before.columns: if col in X_after.columns: before_stats = { - 'mean': X_before[col].mean(), - 'std': X_before[col].std(), - 'min': X_before[col].min(), - 'max': X_before[col].max() + "mean": X_before[col].mean(), + "std": X_before[col].std(), + "min": X_before[col].min(), + "max": X_before[col].max(), } after_stats = { - 'mean': X_after[col].mean(), - 'std': X_after[col].std(), - 'min': X_after[col].min(), - 'max': X_after[col].max() + "mean": X_after[col].mean(), + "std": X_after[col].std(), 
+ "min": X_after[col].min(), + "max": X_after[col].max(), } - - analysis['feature_stats'][col] = { - 'before': before_stats, - 'after': after_stats, - 'mean_change': abs(after_stats['mean'] - before_stats['mean']), - 'std_change': abs(after_stats['std'] - before_stats['std']) + + analysis["feature_stats"][col] = { + "before": before_stats, + "after": after_stats, + "mean_change": abs(after_stats["mean"] - before_stats["mean"]), + "std_change": abs(after_stats["std"] - before_stats["std"]), } - + # Log summary logger.info("Outlier Removal Impact Analysis:") - logger.info(f" Samples removed: {analysis['samples_removed']} ({analysis['removal_percentage']:.2f}%)") + logger.info( + f" Samples removed: {analysis['samples_removed']} ({analysis['removal_percentage']:.2f}%)" + ) logger.info(f" Class distribution change: {analysis['class_distribution_change']:.4f}") - logger.info(f" Positive rate: {analysis['positive_rate_before']:.3f} -> {analysis['positive_rate_after']:.3f}") - + logger.info( + f" Positive rate: {analysis['positive_rate_before']:.3f} -> {analysis['positive_rate_after']:.3f}" + ) + return analysis @@ -186,22 +198,22 @@ def get_outlier_scores( ) -> np.ndarray: """ Get outlier scores for data samples without removing them. 
- + Args: X: Features DataFrame scaler: Pre-fitted scaler (optional, will create new one if None) contamination: Expected proportion of outliers random_state: Random state for reproducibility logger: Logger instance - + Returns: Array of outlier scores (lower scores indicate more outlier-like behavior) """ if logger is None: logger = ExperimentLogger(experiment_name="outlier_scoring") - + logger.info(f"Computing outlier scores for {X.shape[0]} samples") - + try: # Use pre-fitted scaler or create new one if scaler is not None: @@ -209,7 +221,7 @@ def get_outlier_scores( else: scaler = StandardScaler() X_scaled = scaler.fit_transform(X) - + # Fit Isolation Forest iso_forest = IsolationForest( contamination=contamination, @@ -217,14 +229,14 @@ def get_outlier_scores( n_jobs=-1, bootstrap=False, ) - + # Get outlier scores scores = iso_forest.fit(X_scaled).decision_function(X_scaled) - + logger.info(f"Outlier scores computed. Range: [{scores.min():.3f}, {scores.max():.3f}]") - + return scores - + except Exception as e: logger.error(f"Error computing outlier scores: {str(e)}") - return np.zeros(len(X)) \ No newline at end of file + return np.zeros(len(X))