File size: 5,397 Bytes
68e6513 fbdc657 144f299 fbdc657 68e6513 144f299 68e6513 1580227 68e6513 fd62121 68e6513 fbdc657 fd62121 fbdc657 ee6a180 3433b65 144f299 ee6a180 144f299 1580227 ee6a180 144f299 ee6a180 144f299 ee6a180 144f299 ee6a180 3433b65 144f299 3433b65 144f299 3433b65 144f299 3433b65 ee6a180 3433b65 144f299 7ce5480 fbdc657 3433b65 144f299 fbdc657 fd62121 fbdc657 fd62121 fbdc657 fd62121 68e6513 144f299 ee6a180 68e6513 fd62121 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# Utility functions for filtering the dataframe
import pandas as pd
import assets.text_content as tc
def filter_cols(df):
    """Return ``df`` restricted to a fixed list of columns, in a fixed order.

    The column names are read from the ``tc`` text-content module. Indexing
    with a list of labels raises ``KeyError`` if any of them is missing.
    """
    selected_columns = [
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]
    return df[selected_columns]
def convert_date_components_to_timestamp(year: "int | str", month: "int | str") -> int:
    """Return the epoch seconds for the first day of the given month.

    Args:
        year: Calendar year, as an int or a numeric string (e.g. 2023 or "2023").
        month: Month number 1-12, as an int or a numeric string (e.g. 1 or "1").

    Returns:
        Whole seconds since the Unix epoch for ``year-month-01 00:00:00``.
        The timestamp is timezone-naive, which pandas interprets as UTC.

    Raises:
        ValueError: If a component is not numeric or the month is out of range.
    """
    # Coerce both components with int() so that string inputs (as the
    # signature originally advertised) work as well as integers; the previous
    # ``{month:02d}`` format spec raised ValueError for str months.
    first_of_month = pd.Timestamp(year=int(year), month=int(month), day=1)
    return int(first_of_month.timestamp())
def filter_by_date(df: pd.DataFrame,
                   start_year: str,
                   start_month: str,
                   end_year: str,
                   end_month: str,
                   date_column: str) -> pd.DataFrame:
    """
    Filter DataFrame by date range using separate year and month components.

    The range is inclusive of both the whole start month and the whole end
    month: a row is kept when its date falls on or after the first day of the
    start month and before the first day of the month following the end month.

    Args:
        df: DataFrame to filter
        start_year: Starting year (e.g., "2023")
        start_month: Starting month (e.g., "1" for January)
        end_year: Ending year (e.g., "2024")
        end_month: Ending month (e.g., "12" for December)
        date_column: Name of the date column to filter on

    Returns:
        The rows of ``df`` whose ``date_column`` lies in the range. Rows whose
        date is missing (NaT) are dropped. An empty ``df`` yields an empty
        result with the same columns.
    """
    range_start = pd.Timestamp(
        year=int(start_year), month=int(start_month), day=1, tz="UTC"
    )
    # Exclusive upper bound: the first instant of the month after the end
    # month. Comparing against the first day of the end month itself would
    # drop every row dated later within that month.
    range_end = pd.Timestamp(
        year=int(end_year), month=int(end_month), day=1, tz="UTC"
    ) + pd.DateOffset(months=1)

    # utc=True treats naive dates as UTC and converts aware ones to UTC, so
    # the column is always comparable with the UTC bounds above. Comparing
    # datetimes directly (rather than per-row integer timestamps) also works
    # on an empty column and tolerates NaT values.
    dates = pd.to_datetime(df[date_column], utc=True)
    in_range = (dates >= range_start) & (dates < range_end)
    return df[in_range]
def filter(df, language_list, parameters, input_price, output_price, multimodal,
           context, open_weight, start_year, start_month, end_year, end_month, license):
    """Apply the selected filters to ``df`` and return the matching rows.

    Filters are applied in sequence; each one is skipped once the frame has
    become empty. The result is reduced to the columns chosen by
    ``filter_cols`` and sorted by ``tc.CLEMSCORE`` in descending order.

    Args:
        df: DataFrame to filter.
        language_list: Languages that must all be contained in a row's
            ``tc.LANGS`` value.
        parameters: ``(low, high)`` bounds on ``tc.PARAMS``. Applied to
            open-weight rows only; when ``high`` is at or above the largest
            open-weight value, no upper bound is applied.
        input_price: ``(low, high)`` inclusive bounds on ``tc.INPUT``.
        output_price: ``(low, high)`` inclusive bounds on ``tc.OUTPUT``.
        multimodal: Collection of modality column names (``tc.SINGLE_IMG``,
            ``tc.MULT_IMG``, ``tc.AUDIO``, ``tc.VIDEO``); each one present
            requires that column to be True.
        context: Accepted for interface compatibility; currently not applied.
        open_weight: Collection that may contain ``tc.OPEN`` and/or
            ``tc.COMM``. Only one selects that kind of model, both keep
            everything, neither yields an empty result.
        start_year, start_month, end_year, end_month: Date range passed to
            ``filter_by_date`` on the ``tc.TEMP_DATE`` column.
        license: Collection of license names; a row is kept when any of them
            is contained in its ``tc.LICENSE_NAME`` value.

    Returns:
        The filtered, column-reduced and sorted DataFrame.
    """
    if not df.empty:
        df = df[df[tc.LANGS].apply(
            lambda langs: all(lang in langs for lang in language_list)
        )]

    if not df.empty:
        # Only open-weight models are filtered by parameter count; the other
        # models pass through this step untouched.
        open_models = df[df[tc.OPEN_WEIGHT] == True]
        closed_models = df[df[tc.OPEN_WEIGHT] == False]
        if open_models.empty:
            # Nothing to size-filter. Handling this case explicitly avoids
            # referencing a filtered frame that was never created.
            df = closed_models
        else:
            sizes = open_models[tc.PARAMS]
            size_mask = sizes >= parameters[0]
            # An upper bound at or above the largest model means "no upper limit".
            if not parameters[1] >= sizes.max():
                size_mask = size_mask & (sizes <= parameters[1])
            df = pd.concat([open_models[size_mask], closed_models])

    if not df.empty:
        df = df[(df[tc.INPUT] >= input_price[0]) & (df[tc.INPUT] <= input_price[1])]

    if not df.empty:
        df = df[(df[tc.OUTPUT] >= output_price[0]) & (df[tc.OUTPUT] <= output_price[1])]

    if not df.empty:
        # Every requested modality must be supported by the model.
        for modality in (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO):
            if modality in multimodal:
                df = df[df[modality] == True]

    if not df.empty:
        wants_open = tc.OPEN in open_weight
        wants_commercial = tc.COMM in open_weight
        if wants_open and not wants_commercial:
            df = df[df[tc.OPEN_WEIGHT] == True]
        elif wants_commercial and not wants_open:
            df = df[df[tc.OPEN_WEIGHT] == False]
        elif not wants_open and not wants_commercial:
            # Neither kind selected: empty DataFrame with the same columns.
            df = pd.DataFrame(columns=df.columns)

    if not df.empty:
        df = df[df[tc.LICENSE_NAME].apply(
            lambda name: any(lic in name for lic in license)
        )]

    if not df.empty:
        # Guarded like the other steps so the date comparison never runs on a
        # frame that an earlier filter already emptied.
        df = filter_by_date(df, start_year, start_month, end_year, end_month, tc.TEMP_DATE)

    df = filter_cols(df)
    df = df.sort_values(by=tc.CLEMSCORE, ascending=False)
    return df
|