File size: 6,414 Bytes
ef818ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import pandas as pd
import json
import os

from src.collect_data import fetch_version_metadata, fetch_registry_data
from assets.text_content import LANG_MAPPING
PRICING_PATH = os.path.join('assets', 'pricing.json')

# Convert parameters to float, handling both B and T suffixes
def convert_parameters(param):
    """Convert a parameter-count value like '7B' or '1.5T' to a float in billions.

    Accepts a 'B' (billions) or 'T' (trillions) suffix, case-insensitively
    and ignoring surrounding whitespace; plain numbers are treated as
    billions. Returns None for missing/empty values.
    """
    if pd.isna(param) or param == '':
        return None
    # Normalize so '70b' / ' 1.5T ' parse the same as '70B' / '1.5T'.
    text = str(param).strip().upper()
    if 'T' in text:
        # Trillions -> billions.
        return float(text.replace('T', '')) * 1000
    return float(text.replace('B', ''))

# Clean price strings by removing '$' and handling empty strings
def clean_price(price):
    """Parse a price such as '$2.50' into a float.

    Returns None for missing/empty values. Coerces via str() so that
    already-numeric entries in pricing.json (which would crash
    str.replace) are handled too.
    """
    if pd.isna(price) or price == '':
        return None
    return float(str(price).replace('$', ''))

# Handle language mapping for both string and list inputs
def map_languages(languages):
    """Return a comma-separated string of display names for language codes.

    Accepts a list, a comma-separated string, or any other iterable of
    language codes; codes missing from LANG_MAPPING pass through
    unchanged. Returns None for missing values (None or NaN) — the
    original returned the literal string 'None' for a None input.
    """
    if languages is None or (isinstance(languages, float) and pd.isna(languages)):
        return None
    # Already a list of codes.
    if isinstance(languages, list):
        return ', '.join(LANG_MAPPING.get(str(lang), str(lang)) for lang in languages)
    # A comma-separated string of codes.
    if isinstance(languages, str):
        return ', '.join(
            LANG_MAPPING.get(code.strip(), code.strip())
            for code in languages.split(',')
        )
    # Fall back for other iterables (e.g. numpy arrays). Catch only
    # TypeError (non-iterable input) instead of the original bare except,
    # which silently swallowed every error.
    try:
        return ', '.join(str(lang) for lang in languages)
    except TypeError:
        return str(languages)
    
# Extract multimodality fields
def get_multimodality_field(model_data, field):
    """Return model_config.multimodality.<field> from a registry entry.

    Defaults to False when the key chain is absent or a level of the
    structure is not a mapping (e.g. model_config is None).
    """
    try:
        return model_data.get('model_config', {}).get('multimodality', {}).get(field, False)
    except AttributeError:
        # A level of the nested structure was not a dict; treat the
        # capability as unsupported rather than raising (narrowed from
        # the original bare except).
        return False


def merge_data():
    """Build the combined leaderboard DataFrame.

    Pulls latency/result CSVs and registry metadata via src.collect_data,
    joins them with local pricing data, and returns one row per model with
    human-readable column names, sorted by Clemscore (descending).
    """
    # Multimodal and text-only benchmark runs are fetched separately.
    mm_latency_df, mm_result_df, text_latency_df, text_result_df = fetch_version_metadata()
    registry_data = fetch_registry_data()
    with open(PRICING_PATH, 'r') as f:
        pricing_data = json.load(f)

    # Ensure the unnamed column is renamed to 'model'
    mm_result_df.rename(columns={'Unnamed: 0': 'model', '-, clemscore': 'clemscore'}, inplace=True)
    text_result_df.rename(columns={'Unnamed: 0': 'model', '-, clemscore': 'clemscore'}, inplace=True)
    # Strip the temperature/run suffix so result rows join on the bare model name.
    mm_result_df['model'] = mm_result_df['model'].str.split('-t0.0--').str[0]
    text_result_df['model'] = text_result_df['model'].str.split('-t0.0--').str[0] # Bug in get_latency.py, split by -t0.0 instead of -t (gpt-3.5-turbo/gpt-4-turbo breaks) 

    # Merge datasets to compute average values across the mm and text runs.
    avg_latency_df = pd.concat([mm_latency_df, text_latency_df], axis=0).groupby('model')['latency'].mean().reset_index()
    avg_clemscore_df = pd.concat([mm_result_df, text_result_df], axis=0).groupby('model')['clemscore'].mean().reset_index()

    # Merge latency, clemscore, registry, and pricing data.
    # Outer join: keep models that have only latency or only clemscore data.
    lat_clem_df = pd.merge(avg_latency_df, avg_clemscore_df, on='model', how='outer')

    # Convert registry_data to DataFrame for easier merging
    registry_df = pd.DataFrame(registry_data)
    
    # Extract license info (each registry entry carries a {'name', 'url'} dict).
    registry_df['license_name'] = registry_df['license'].apply(lambda x: x['name'])
    registry_df['license_url'] = registry_df['license'].apply(lambda x: x['url'])

    # Add individual multimodality columns (default False when absent).
    registry_df['single_image'] = registry_df.apply(lambda x: get_multimodality_field(x, 'single_image'), axis=1)
    registry_df['multiple_images'] = registry_df.apply(lambda x: get_multimodality_field(x, 'multiple_images'), axis=1)
    registry_df['audio'] = registry_df.apply(lambda x: get_multimodality_field(x, 'audio'), axis=1)
    registry_df['video'] = registry_df.apply(lambda x: get_multimodality_field(x, 'video'), axis=1)

    # Update columns list to include new multimodality fields
    registry_df = registry_df[[
        'model_name', 'parameters', 'release_date', 'open_weight',
        'languages', 'context_size', 'license_name', 'license_url',
        'single_image', 'multiple_images', 'audio', 'video'
    ]]
    
    # Merge with previous data.
    # Inner join: only models present in the registry make it to the board.
    merged_df = pd.merge(
        lat_clem_df,
        registry_df,
        left_on='model',
        right_on='model_name',
        how='inner'
    )
    
    # Update column renaming to the display names used by the UI.
    merged_df = merged_df.rename(columns={
        'model': 'Model Name',
        'latency': 'Latency (s)',
        'clemscore': 'Clemscore',
        'parameters': 'Parameters (B)',
        'release_date': 'Release Date',
        'open_weight': 'Open Weight',
        'languages': 'Languages',
        'context_size': 'Context Size (k)',
        'license_name': 'License Name',
        'license_url': 'License URL',
        'single_image': 'Single Image',
        'multiple_images': 'Multiple Images',
        'audio': 'Audio',
        'video': 'Video'
    })
    
    # Convert pricing_data list to DataFrame
    pricing_df = pd.DataFrame(pricing_data)
    pricing_df['input'] = pricing_df['input'].apply(clean_price)
    pricing_df['output'] = pricing_df['output'].apply(clean_price)
    
    # Merge pricing data with the existing dataframe.
    # Left join: models without a pricing entry are kept (filled with 0.0 below).
    merged_df = pd.merge(
        merged_df,
        pricing_df,
        left_on='Model Name',
        right_on='model_id',
        how='left'
    )
    
    # Drop duplicate model column and rename price columns
    merged_df = merged_df.drop('model_id', axis=1)
    merged_df = merged_df.rename(columns={
        'input': 'Input $/1M tokens',
        'output': 'Output $/1M tokens'
    })
    
    # Fill NaN values with 0.0 for pricing columns
    merged_df['Input $/1M tokens'] = merged_df['Input $/1M tokens'].fillna(0.0)
    merged_df['Output $/1M tokens'] = merged_df['Output $/1M tokens'].fillna(0.0)
    
    # Convert parameters and set to None for commercial (closed-weight) models,
    # whose parameter counts are not public.
    merged_df['Parameters (B)'] = merged_df.apply(
        lambda row: None if not row['Open Weight'] else convert_parameters(row['Parameters (B)']), 
        axis=1
    )

    # Render the license as an HTML link for the UI table.
    merged_df['License'] = merged_df.apply(lambda row: f'<a href="{row["License URL"]}" style="color: blue;">{row["License Name"]}</a>', axis=1)
    # Duplicate of Release Date — presumably consumed downstream for sorting/filtering; confirm against caller.
    merged_df['Temp Date'] = merged_df['Release Date']

    merged_df['Languages'] = merged_df['Languages'].apply(map_languages)

    # Sort by Clemscore in descending order
    merged_df = merged_df.sort_values(by='Clemscore', ascending=False)
    
    # Drop model_name column (redundant with 'Model Name' after the inner merge).
    merged_df.drop(columns=['model_name'], inplace=True)
    
    return merged_df

if __name__ == '__main__':
    # Build the merged leaderboard table and persist it as CSV next to the
    # other assets.
    table = merge_data()
    table.to_csv(os.path.join('assets', 'merged_data.csv'), index=False)