# Utility functions for filtering the dataframe
import pandas as pd

import assets.text_content as tc


def filter_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the columns that are shown in the output table.

    Args:
        df: DataFrame containing at least the columns named by the ``tc``
            constants listed below.

    Returns:
        A DataFrame restricted to those columns, in the listed order.
    """
    return df[[
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]]


def _month_start(year: "int | str", month: "int | str") -> pd.Timestamp:
    """Return the UTC timestamp of the first day of the given month."""
    return pd.Timestamp(year=int(year), month=int(month), day=1, tz="UTC")


def convert_date_components_to_timestamp(year: "int | str",
                                         month: "int | str") -> int:
    """Convert a year and a month to epoch seconds.

    Args:
        year: Year as an int or a numeric string (e.g. ``2023`` or ``"2023"``).
        month: Month number as an int or a numeric string (``1``-``12``).

    Returns:
        Seconds since the epoch for midnight UTC on the first day of that
        month.

    Raises:
        ValueError: If the components are not numeric or do not form a
            valid date.
    """
    # Coerce with int() so numeric strings work too; the previous
    # ``{month:02d}`` formatting raised for str input despite the annotation.
    return int(_month_start(year, month).timestamp())


def filter_by_date(df: pd.DataFrame,
                   start_year: str, start_month: str,
                   end_year: str, end_month: str,
                   date_column: str) -> pd.DataFrame:
    """
    Filter DataFrame by date range using separate year and month components.

    Both bounds are inclusive and both are the *first day* of the given
    month.

    Args:
        df: DataFrame to filter
        start_year: Starting year (e.g., "2023")
        start_month: Starting month (e.g., "1" for January)
        end_year: Ending year (e.g., "2024")
        end_month: Ending month (e.g., "12" for December)
        date_column: Name of the date column to filter on

    Returns:
        The rows of ``df`` whose ``date_column`` value lies within the range.
        Rows whose date is missing (NaT) are dropped.
    """
    range_start = _month_start(start_year, start_month)
    # NOTE(review): the end bound is the first day of the end month, so dates
    # later in that month are excluded - confirm this is the intended range.
    range_end = _month_start(end_year, end_month)

    # Compare timestamps directly instead of per-row epoch integers: this is
    # vectorised, works on an empty frame and treats NaT as "not in range".
    # utc=True interprets naive dates as UTC and converts aware ones to UTC.
    dates = pd.to_datetime(df[date_column], utc=True)

    in_range = (dates >= range_start) & (dates <= range_end)
    return df[in_range]


def filter(df, language_list, parameters, input_price, output_price,
           multimodal, context, open_weight,
           start_year, start_month, end_year, end_month, license):
    """Apply all user-selected filters and return the table to display.

    Filters are applied in sequence; each one is skipped once the frame has
    become empty. The result is then restricted by release date, reduced to
    the display columns and sorted by ``tc.CLEMSCORE`` in descending order.

    Args:
        df: DataFrame to filter.
        language_list: Languages that must all be contained (``in``) in a
            row's ``tc.LANGS`` value.
        parameters: ``(min, max)`` pair for ``tc.PARAMS``. Only applied to
            rows whose ``tc.OPEN_WEIGHT`` is True; when ``max`` reaches the
            largest value present, no upper bound is applied.
        input_price: ``(min, max)`` inclusive range for ``tc.INPUT``.
        output_price: ``(min, max)`` inclusive range for ``tc.OUTPUT``.
        multimodal: Container of selected modality column names
            (``tc.SINGLE_IMG``, ``tc.MULT_IMG``, ``tc.AUDIO``, ``tc.VIDEO``);
            every selected one must be True in a row.
        context: Currently unused; accepted for interface compatibility.
        open_weight: Container that may hold ``tc.OPEN`` and/or ``tc.COMM``.
            Selecting neither yields an empty result; selecting both keeps
            all rows.
        start_year: Start year of the date range.
        start_month: Start month of the date range.
        end_year: End year of the date range.
        end_month: End month of the date range.
        license: Licenses of which at least one must be contained (``in``)
            in a row's ``tc.LICENSE_NAME`` value.

    Returns:
        The filtered DataFrame with the display columns only.
    """
    # Languages: keep rows that support every requested language.
    if not df.empty:
        df = df[df[tc.LANGS].apply(
            lambda langs: all(lang in langs for lang in language_list)
        )]

    # Parameter size: applies to open-weight rows only.
    if not df.empty:
        open_weight_true = df[df[tc.OPEN_WEIGHT] == True]  # noqa: E712
        open_weight_false = df[df[tc.OPEN_WEIGHT] == False]  # noqa: E712

        # Without open-weight rows there is nothing to filter here; leave df
        # untouched so no undefined intermediate is ever referenced.
        if not open_weight_true.empty:
            max_parameter_size = open_weight_true[tc.PARAMS].max()
            above_min = open_weight_true[tc.PARAMS] >= parameters[0]
            if parameters[1] >= max_parameter_size:
                # Upper handle at (or beyond) the largest model: no upper cap.
                filtered_open = open_weight_true[above_min]
            else:
                below_max = open_weight_true[tc.PARAMS] <= parameters[1]
                filtered_open = open_weight_true[above_min & below_max]

            # Combine filtered open weight models with the unfiltered
            # non-open-weight models.
            df = pd.concat([filtered_open, open_weight_false])

    # Price ranges (inclusive on both ends).
    if not df.empty:
        df = df[(df[tc.INPUT] >= input_price[0])
                & (df[tc.INPUT] <= input_price[1])]

    if not df.empty:
        df = df[(df[tc.OUTPUT] >= output_price[0])
                & (df[tc.OUTPUT] <= output_price[1])]

    # Modalities: every selected modality must be supported.
    if not df.empty:
        for modality in (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO):
            if modality in multimodal:
                df = df[df[modality] == True]  # noqa: E712

    # Open-weight / commercial selection.
    if not df.empty:
        wants_open = tc.OPEN in open_weight
        wants_commercial = tc.COMM in open_weight
        if wants_open and not wants_commercial:
            df = df[df[tc.OPEN_WEIGHT] == True]  # noqa: E712
        elif wants_commercial and not wants_open:
            df = df[df[tc.OPEN_WEIGHT] == False]  # noqa: E712
        elif not wants_open and not wants_commercial:
            # Nothing selected: return empty DataFrame with same columns.
            df = pd.DataFrame(columns=df.columns)

    # Licenses: keep rows matching at least one selected license.
    if not df.empty:
        df = df[df[tc.LICENSE_NAME].apply(
            lambda name: any(lic in name for lic in license)
        )]

    df = filter_by_date(df, start_year, start_month, end_year, end_month,
                        tc.TEMP_DATE)

    df = filter_cols(df)
    df = df.sort_values(by=tc.CLEMSCORE, ascending=False)

    return df  # Return the filtered dataframe