File size: 5,397 Bytes
68e6513
 
fbdc657
144f299
fbdc657
68e6513
 
 
144f299
 
 
 
 
 
 
 
 
68e6513
1580227
68e6513
 
fd62121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68e6513
fbdc657
fd62121
fbdc657
ee6a180
3433b65
144f299
ee6a180
 
 
144f299
 
1580227
ee6a180
144f299
ee6a180
 
 
 
 
144f299
ee6a180
 
 
144f299
 
ee6a180
 
 
 
 
3433b65
144f299
3433b65
 
144f299
3433b65
 
144f299
 
 
 
 
 
 
 
3433b65
ee6a180
 
 
3433b65
144f299
 
 
 
 
7ce5480
 
fbdc657
3433b65
144f299
fbdc657
fd62121
 
 
fbdc657
fd62121
 
 
fbdc657
fd62121
 
 
 
 
68e6513
 
144f299
ee6a180
68e6513
 
fd62121
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Utility functions for filtering the dataframe

import pandas as pd
import assets.text_content as tc

def filter_cols(df):
    """Return a view of *df* restricted to the leaderboard display columns.

    The columns are selected (and ordered) by the names held in
    ``assets.text_content``; a ``KeyError`` is raised by pandas if any of
    them is missing from *df*.
    """
    display_columns = [
        tc.MODEL_NAME,
        tc.CLEMSCORE,
        tc.INPUT,
        tc.OUTPUT,
        tc.LATENCY,
        tc.CONTEXT,
        tc.PARAMS,
        tc.RELEASE_DATE,
        tc.LICENSE,
    ]
    return df[display_columns]

def convert_date_components_to_timestamp(year: "int | str", month: "int | str") -> int:
    """Convert a year and month to the epoch timestamp of that month's first day.

    Args:
        year: Calendar year, as an int or a numeric string (e.g. ``2023`` or ``"2023"``).
        month: Calendar month 1-12, as an int or a numeric string (e.g. ``1`` or ``"1"``).

    Returns:
        Whole seconds since the Unix epoch for 00:00 on the first day of the
        month. The timestamp is timezone-naive, which pandas interprets as UTC
        when computing ``.timestamp()``.

    Raises:
        ValueError: If a component is not numeric or does not form a valid date.
    """
    # Normalise to int first: the previous "{month:02d}" formatting raised
    # ValueError when the (annotated) string form was actually passed in.
    first_of_month = pd.Timestamp(year=int(year), month=int(month), day=1)
    return int(first_of_month.timestamp())

def filter_by_date(df: pd.DataFrame,
                   start_year: str,
                   start_month: str,
                   end_year: str,
                   end_month: str,
                   date_column: str) -> pd.DataFrame:
    """
    Filter DataFrame by date range using separate year and month components.

    The range covers whole months on both ends: it starts at the first
    instant of the start month and runs through the last instant of the end
    month. Rows whose date parses to ``NaT`` are dropped.

    Args:
        df: DataFrame to filter
        start_year: Starting year (e.g., "2023"); ints are accepted too
        start_month: Starting month (e.g., "1" for January); ints are accepted too
        end_year: Ending year (e.g., "2024"); ints are accepted too
        end_month: Ending month (e.g., "12" for December); ints are accepted too
        date_column: Name of the date column to filter on

    Returns:
        The rows of ``df`` whose ``date_column`` falls inside the range. An
        empty ``df`` yields an empty result with the same columns.
    """
    # Bounds are built in UTC so that naive dates are compared on the same
    # basis as epoch seconds would be (pandas treats naive timestamps as UTC).
    range_start = pd.Timestamp(
        year=int(start_year), month=int(start_month), day=1
    ).tz_localize("UTC")

    # Exclusive upper bound = first instant of the month AFTER the end month,
    # so every day of the end month is included (not just its first day).
    range_end = pd.Timestamp(
        year=int(end_year), month=int(end_month), day=1
    ).tz_localize("UTC") + pd.DateOffset(months=1)

    # Vectorised datetime comparison: works on empty columns and tolerates
    # NaT, unlike a per-row ``x.timestamp()`` conversion.
    dates = pd.to_datetime(df[date_column], utc=True)
    in_range = (dates >= range_start) & (dates < range_end)

    return df[in_range]
    


def filter(df, language_list, parameters, input_price, output_price, multimodal,
           context, open_weight, start_year, start_month, end_year, end_month, license ):
    """Apply all leaderboard filters to *df* and return the display table.

    Each stage is skipped once the frame has become empty, so an empty
    selection flows through to an empty (but correctly shaped) result.

    Args:
        df: DataFrame of models; must contain the columns named in
            ``assets.text_content`` that are referenced below.
        language_list: Languages a model must ALL support (tested with ``in``
            against each row's ``tc.LANGS`` value).
        parameters: Indexable ``(min, max)`` parameter-size bounds. Applied to
            open-weight models only; when ``max`` is at or above the largest
            open-weight size present, no upper bound is applied.
        input_price: Indexable ``(min, max)`` bounds on ``tc.INPUT``.
        output_price: Indexable ``(min, max)`` bounds on ``tc.OUTPUT``.
        multimodal: Container of capability column names (``tc.SINGLE_IMG``,
            ``tc.MULT_IMG``, ``tc.AUDIO``, ``tc.VIDEO``) that must be True.
        context: Currently unused; kept so existing callers keep working.
        open_weight: Container that may hold ``tc.OPEN`` and/or ``tc.COMM``;
            holding neither yields an empty result.
        start_year, start_month, end_year, end_month: Date range components
            forwarded to ``filter_by_date`` on the ``tc.TEMP_DATE`` column.
        license: Iterable of license names; a row is kept when ANY of them is
            contained (``in``) in its ``tc.LICENSE_NAME`` value.

    Returns:
        DataFrame reduced to the ``filter_cols`` columns, sorted by
        ``tc.CLEMSCORE`` in descending order.
    """
    # Languages: a model must support every requested language.
    if not df.empty:
        df = df[df[tc.LANGS].apply(lambda x: all(lang in x for lang in language_list))]

    # Parameter size: only meaningful for open-weight models, so commercial
    # models bypass this filter and are re-attached afterwards.
    if not df.empty:
        # Element-wise comparison against the literal booleans is intentional:
        # it builds a row mask and ignores values that are neither True nor False.
        open_models = df[df[tc.OPEN_WEIGHT] == True]  # noqa: E712
        closed_models = df[df[tc.OPEN_WEIGHT] == False]  # noqa: E712

        if not open_models.empty:
            sizes = open_models[tc.PARAMS]
            keep = sizes >= parameters[0]
            # An upper bound at (or beyond) the largest size present means
            # "no upper limit", so it is only enforced when it is lower.
            if parameters[1] < sizes.max():
                keep &= sizes <= parameters[1]

            df = pd.concat([open_models[keep], closed_models])

    # Price ranges (inclusive on both ends).
    if not df.empty:
        df = df[(df[tc.INPUT] >= input_price[0]) & (df[tc.INPUT] <= input_price[1])]

    if not df.empty:
        df = df[(df[tc.OUTPUT] >= output_price[0]) & (df[tc.OUTPUT] <= output_price[1])]

    # Multimodal capabilities: every requested capability must be True.
    if not df.empty:
        for capability in (tc.SINGLE_IMG, tc.MULT_IMG, tc.AUDIO, tc.VIDEO):
            if capability in multimodal:
                df = df[df[capability] == True]  # noqa: E712

    # Open-weight vs. commercial selection.
    if not df.empty:
        wants_open = tc.OPEN in open_weight
        wants_commercial = tc.COMM in open_weight
        if wants_open and not wants_commercial:
            df = df[df[tc.OPEN_WEIGHT] == True]  # noqa: E712
        elif wants_commercial and not wants_open:
            df = df[df[tc.OPEN_WEIGHT] == False]  # noqa: E712
        elif not wants_open and not wants_commercial:
            # Nothing selected: return an empty DataFrame with the same columns.
            df = pd.DataFrame(columns=df.columns)

    # License: keep rows matching any of the selected licenses.
    if not df.empty:
        df = df[df[tc.LICENSE_NAME].apply(lambda x: any(lic in x for lic in license))]

    # Release date: guarded like every other stage, so an already-empty
    # selection never reaches date parsing/comparison.
    if not df.empty:
        df = filter_by_date(df, start_year, start_month, end_year, end_month, tc.TEMP_DATE)

    df = filter_cols(df)
    df = df.sort_values(by=tc.CLEMSCORE, ascending=False)

    return df  # Return the filtered dataframe