Spaces:

Koshti10
/

LLMCalc

Runtime error

File size: 5,397 Bytes

# Utility functions for filtering the dataframe

import pandas as pd
import assets.text_content as tc

def filter_cols(df):
    """Return ``df`` restricted to the columns shown in the results table.

    The selected columns, in order, are: model name, clemscore, input price,
    output price, latency, context size, parameter count, release date and
    license (all looked up by the names defined in ``assets.text_content``).
    """
    display_columns = [
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]
    return df[display_columns]

def convert_date_components_to_timestamp(year: "int | str", month: "int | str") -> int:
    """Convert a year and month to the epoch timestamp of that month's first day.

    Args:
        year: Calendar year, as an int or a numeric string (e.g. 2023 or "2023").
        month: Calendar month 1-12, as an int or a numeric string (e.g. 1 or "1").

    Returns:
        Whole seconds since the Unix epoch for midnight on the first day of
        the given month.

    Raises:
        ValueError: If ``year`` or ``month`` cannot be converted to an int, or
            if the resulting date is not valid.
    """
    # Coerce to int first: the zero-padding format spec only accepts numbers,
    # so a string month such as "1" would otherwise raise ValueError.
    date_str = f"{int(year):04d}-{int(month):02d}-01"
    return int(pd.to_datetime(date_str).timestamp())

def filter_by_date(df: pd.DataFrame,
                   start_year: str,
                   start_month: str,
                   end_year: str,
                   end_month: str,
                   date_column: str) -> pd.DataFrame:
    """
    Filter DataFrame by date range using separate year and month components.

    The range is inclusive of both the whole start month and the whole end
    month: a row is kept when its date falls on or after the first day of the
    start month and before the first day of the month following the end month.
    Rows whose date is missing (NaT) are dropped. An empty DataFrame is
    returned unchanged (still empty).

    Args:
        df: DataFrame to filter
        start_year: Starting year (e.g., "2023")
        start_month: Starting month (e.g., "1" for January)
        end_year: Ending year (e.g., "2024")
        end_month: Ending month (e.g., "12" for December)
        date_column: Name of the date column to filter on

    Returns:
        The rows of ``df`` whose ``date_column`` value lies in the range.
    """
    range_start = pd.Timestamp(
        year=int(start_year), month=int(start_month), day=1
    ).tz_localize("UTC")

    # Exclusive upper bound: the first instant of the month after the end
    # month, so every day of the end month is included.
    range_end = pd.Timestamp(
        year=int(end_year), month=int(end_month), day=1
    ).tz_localize("UTC") + pd.DateOffset(months=1)

    # Compare datetimes directly instead of per-row integer timestamps: this
    # works on empty frames and tolerates NaT (comparisons with NaT are
    # False). utc=True treats naive dates as UTC and converts aware ones.
    dates = pd.to_datetime(df[date_column], utc=True)

    return df[(dates >= range_start) & (dates < range_end)]
    


def filter(df, language_list, parameters, input_price, output_price, multimodal,
           context, open_weight, start_year, start_month, end_year, end_month, license):
    """
    Apply all user-selected filters to ``df`` and return the result table.

    Each filter is skipped once the frame has become empty. The surviving rows
    are reduced to the display columns via ``filter_cols`` and sorted by
    ``tc.CLEMSCORE`` in descending order.

    Args:
        df: DataFrame holding one row per model.
        language_list: Languages that must all be contained in a row's
            ``tc.LANGS`` value.
        parameters: Pair ``(min, max)`` for ``tc.PARAMS``; applied to
            open-weight rows only.
        input_price: Pair ``(min, max)`` for ``tc.INPUT`` (inclusive).
        output_price: Pair ``(min, max)`` for ``tc.OUTPUT`` (inclusive).
        multimodal: Collection of modality column names (``tc.SINGLE_IMG``,
            ``tc.MULT_IMG``, ``tc.AUDIO``, ``tc.VIDEO``); each one present
            requires that column to be True.
        context: Accepted for interface compatibility; no context-size filter
            is currently applied.
        open_weight: Collection that may contain ``tc.OPEN`` and/or
            ``tc.COMM``, selecting which model categories to keep.
        start_year: Start year of the release-date range.
        start_month: Start month of the release-date range.
        end_year: End year of the release-date range.
        end_month: End month of the release-date range.
        license: License strings; a row is kept if any of them is contained
            in its ``tc.LICENSE_NAME`` value.

    Returns:
        The filtered DataFrame, limited to the display columns and sorted by
        ``tc.CLEMSCORE`` descending.
    """
    if not df.empty:
        df = df[df[tc.LANGS].apply(lambda x: all(lang in x for lang in language_list))]

    if not df.empty:
        # Split by open weight: the parameter-size range only applies to
        # open-weight models; commercial models pass through unfiltered.
        # `== True` / `== False` are element-wise pandas comparisons.
        open_weight_true = df[df[tc.OPEN_WEIGHT] == True]  # noqa: E712
        open_weight_false = df[df[tc.OPEN_WEIGHT] == False]  # noqa: E712

        if not open_weight_true.empty:
            max_parameter_size = open_weight_true[tc.PARAMS].max()
            size_mask = open_weight_true[tc.PARAMS] >= parameters[0]
            # The upper bound is only enforced when it lies below the largest
            # open-weight model present.
            if not (parameters[1] >= max_parameter_size):
                size_mask = size_mask & (open_weight_true[tc.PARAMS] <= parameters[1])
            filtered_open = open_weight_true[size_mask]

            # Combine filtered open weight models with unfiltered commercial models
            df = pd.concat([filtered_open, open_weight_false])

    if not df.empty:
        df = df[(df[tc.INPUT] >= input_price[0]) & (df[tc.INPUT] <= input_price[1])]

    if not df.empty:
        df = df[(df[tc.OUTPUT] >= output_price[0]) & (df[tc.OUTPUT] <= output_price[1])]

    if not df.empty:
        # Every selected modality must be supported by the model.
        for modality in (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO):
            if modality in multimodal:
                df = df[df[modality] == True]  # noqa: E712

    if not df.empty:
        wants_open = tc.OPEN in open_weight
        wants_commercial = tc.COMM in open_weight
        if wants_open and not wants_commercial:
            df = df[df[tc.OPEN_WEIGHT] == True]  # noqa: E712
        elif wants_commercial and not wants_open:
            df = df[df[tc.OPEN_WEIGHT] == False]  # noqa: E712
        elif not wants_open and not wants_commercial:
            # Neither category selected: empty DataFrame with same columns
            df = pd.DataFrame(columns=df.columns)

    if not df.empty:
        df = df[df[tc.LICENSE_NAME].apply(lambda x: any(lic in x for lic in license))]

    # Guard the date filter like every other step: there is nothing to filter
    # in an empty frame, and skipping the call avoids date parsing and
    # comparison on a frame with no rows.
    if not df.empty:
        df = filter_by_date(df, start_year, start_month, end_year, end_month, tc.TEMP_DATE)

    df = filter_cols(df)
    df = df.sort_values(by=tc.CLEMSCORE, ascending=False)

    return df  # Return the filtered dataframe