|
import pandas as pd |
|
import json |
|
import os |
|
|
|
from src.collect_data import fetch_version_metadata, fetch_registry_data |
|
from assets.text_content import LANG_MAPPING |
|
PRICING_PATH = os.path.join('assets', 'pricing.json') |
|
|
|
|
|
def convert_parameters(param): |
|
if pd.isna(param) or param == '': |
|
return None |
|
param = str(param) |
|
if 'T' in param: |
|
return float(param.replace('T', '')) * 1000 |
|
return float(param.replace('B', '')) |
|
|
|
|
|
def clean_price(price): |
|
if pd.isna(price) or price == '': |
|
return None |
|
return float(price.replace('$', '')) |
|
|
|
|
|
def map_languages(languages): |
|
if isinstance(languages, float) and pd.isna(languages): |
|
return None |
|
|
|
if isinstance(languages, list): |
|
return ', '.join([LANG_MAPPING.get(str(lang), str(lang)) for lang in languages]) |
|
|
|
if isinstance(languages, str): |
|
return ', '.join([LANG_MAPPING.get(lang.strip(), lang.strip()) for lang in languages.split(',')]) |
|
|
|
try: |
|
return ', '.join([str(lang) for lang in languages]) |
|
except: |
|
return str(languages) |
|
|
|
|
|
def get_multimodality_field(model_data, field): |
|
try: |
|
return model_data.get('model_config', {}).get('multimodality', {}).get(field, False) |
|
except: |
|
return False |
|
|
|
|
|
def merge_data(): |
|
|
|
mm_latency_df, mm_result_df, text_latency_df, text_result_df = fetch_version_metadata() |
|
registry_data = fetch_registry_data() |
|
with open(PRICING_PATH, 'r') as f: |
|
pricing_data = json.load(f) |
|
|
|
|
|
mm_result_df.rename(columns={'Unnamed: 0': 'model', '-, clemscore': 'clemscore'}, inplace=True) |
|
text_result_df.rename(columns={'Unnamed: 0': 'model', '-, clemscore': 'clemscore'}, inplace=True) |
|
mm_result_df['model'] = mm_result_df['model'].str.split('-t0.0--').str[0] |
|
text_result_df['model'] = text_result_df['model'].str.split('-t0.0--').str[0] |
|
|
|
|
|
avg_latency_df = pd.concat([mm_latency_df, text_latency_df], axis=0).groupby('model')['latency'].mean().reset_index() |
|
avg_clemscore_df = pd.concat([mm_result_df, text_result_df], axis=0).groupby('model')['clemscore'].mean().reset_index() |
|
|
|
|
|
lat_clem_df = pd.merge(avg_latency_df, avg_clemscore_df, on='model', how='outer') |
|
|
|
|
|
registry_df = pd.DataFrame(registry_data) |
|
|
|
|
|
registry_df['license_name'] = registry_df['license'].apply(lambda x: x['name']) |
|
registry_df['license_url'] = registry_df['license'].apply(lambda x: x['url']) |
|
|
|
|
|
registry_df['single_image'] = registry_df.apply(lambda x: get_multimodality_field(x, 'single_image'), axis=1) |
|
registry_df['multiple_images'] = registry_df.apply(lambda x: get_multimodality_field(x, 'multiple_images'), axis=1) |
|
registry_df['audio'] = registry_df.apply(lambda x: get_multimodality_field(x, 'audio'), axis=1) |
|
registry_df['video'] = registry_df.apply(lambda x: get_multimodality_field(x, 'video'), axis=1) |
|
|
|
|
|
registry_df = registry_df[[ |
|
'model_name', 'parameters', 'release_date', 'open_weight', |
|
'languages', 'context_size', 'license_name', 'license_url', |
|
'single_image', 'multiple_images', 'audio', 'video' |
|
]] |
|
|
|
|
|
merged_df = pd.merge( |
|
lat_clem_df, |
|
registry_df, |
|
left_on='model', |
|
right_on='model_name', |
|
how='inner' |
|
) |
|
|
|
|
|
merged_df = merged_df.rename(columns={ |
|
'model': 'Model Name', |
|
'latency': 'Latency (s)', |
|
'clemscore': 'Clemscore', |
|
'parameters': 'Parameters (B)', |
|
'release_date': 'Release Date', |
|
'open_weight': 'Open Weight', |
|
'languages': 'Languages', |
|
'context_size': 'Context Size (k)', |
|
'license_name': 'License Name', |
|
'license_url': 'License URL', |
|
'single_image': 'Single Image', |
|
'multiple_images': 'Multiple Images', |
|
'audio': 'Audio', |
|
'video': 'Video' |
|
}) |
|
|
|
|
|
pricing_df = pd.DataFrame(pricing_data) |
|
pricing_df['input'] = pricing_df['input'].apply(clean_price) |
|
pricing_df['output'] = pricing_df['output'].apply(clean_price) |
|
|
|
|
|
merged_df = pd.merge( |
|
merged_df, |
|
pricing_df, |
|
left_on='Model Name', |
|
right_on='model_id', |
|
how='left' |
|
) |
|
|
|
|
|
merged_df = merged_df.drop('model_id', axis=1) |
|
merged_df = merged_df.rename(columns={ |
|
'input': 'Input $/1M tokens', |
|
'output': 'Output $/1M tokens' |
|
}) |
|
|
|
|
|
merged_df['Input $/1M tokens'] = merged_df['Input $/1M tokens'].fillna(0.0) |
|
merged_df['Output $/1M tokens'] = merged_df['Output $/1M tokens'].fillna(0.0) |
|
|
|
merged_df['Parameters (B)'] = merged_df['Parameters (B)'].apply(convert_parameters) |
|
|
|
merged_df['License'] = merged_df.apply(lambda row: f'<a href="{row["License URL"]}" style="color: blue;">{row["License Name"]}</a>', axis=1) |
|
merged_df['Temp Date'] = merged_df['Release Date'] |
|
|
|
merged_df['Languages'] = merged_df['Languages'].apply(map_languages) |
|
|
|
|
|
|
|
merged_df.drop(columns=['model_name'], inplace=True) |
|
|
|
return merged_df |
|
|
|
if __name__=='__main__': |
|
merged_df = merge_data() |
|
|
|
output_path = os.path.join('assets', 'merged_data.csv') |
|
merged_df.to_csv(output_path, index=False) |