Spaces:
Sleeping
Sleeping
File size: 6,414 Bytes
ef818ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import pandas as pd
import json
import os
from src.collect_data import fetch_version_metadata, fetch_registry_data
from assets.text_content import LANG_MAPPING
# Static pricing table: a JSON list of records with 'model_id', 'input',
# and 'output' price fields (see merge_data, where it is joined on model_id).
PRICING_PATH = os.path.join('assets', 'pricing.json')
# Convert parameters to float, handling both B and T suffixes
def convert_parameters(param):
    """Convert a parameter-count value to a float in billions.

    Accepts strings like '7B' (billions) or '1.5T' (trillions, scaled
    x1000 into billions). Suffix matching is case-insensitive and
    surrounding whitespace is ignored.

    Args:
        param: Raw parameter count (string, number, or NaN/None).

    Returns:
        float | None: Parameter count in billions, or None when the
        value is missing/empty.

    Raises:
        ValueError: If the value has no parseable numeric part.
    """
    if pd.isna(param) or param == '':
        return None
    # Normalize once: string form, trimmed, upper-cased suffix.
    param = str(param).strip().upper()
    # Check 'T' first: trillions must be scaled to billions (x1000).
    if 'T' in param:
        return float(param.replace('T', '')) * 1000
    return float(param.replace('B', ''))
# Clean price strings by removing '$' and handling empty strings
def clean_price(price):
    """Normalize a price value to a float.

    Accepts strings like '$1.25' (dollar sign and whitespace stripped)
    as well as values that are already numeric — the original version
    raised AttributeError on non-string input.

    Args:
        price: Raw price (string, number, or NaN/None).

    Returns:
        float | None: The price, or None when the value is missing/empty.
    """
    if pd.isna(price) or price == '':
        return None
    if isinstance(price, str):
        return float(price.replace('$', '').strip())
    # Already numeric (int/float) — just coerce.
    return float(price)
# Handle language mapping for both string and list inputs
def map_languages(languages):
    """Map language codes to display names via LANG_MAPPING.

    Handles three input shapes:
      * a list of codes,
      * a comma-separated string of codes,
      * any other iterable (e.g. a numpy array), stringified per item.
    Unknown codes fall back to their original value.

    Args:
        languages: Language code(s) in any of the shapes above, or NaN.

    Returns:
        str | None: Comma-joined display names, or None for NaN input.
    """
    if isinstance(languages, float) and pd.isna(languages):
        return None
    # Already a list of codes
    if isinstance(languages, list):
        return ', '.join(LANG_MAPPING.get(str(lang), str(lang)) for lang in languages)
    # Comma-separated string of codes
    if isinstance(languages, str):
        return ', '.join(
            LANG_MAPPING.get(code.strip(), code.strip())
            for code in languages.split(',')
        )
    # Any other iterable type — stringify each element.
    try:
        return ', '.join(str(lang) for lang in languages)
    except TypeError:
        # Not iterable at all; fall back to its string representation.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        return str(languages)
# Extract multimodality fields
def get_multimodality_field(model_data, field):
    """Return the boolean multimodality flag `field` for a model record.

    Looks up model_data['model_config']['multimodality'][field] via
    chained dict .get() calls, defaulting to False when any level is
    missing or not dict-like.

    Args:
        model_data: Mapping-like model record (dict or pandas row).
        field: Flag name, e.g. 'audio' or 'single_image'.

    Returns:
        The flag value, or False when absent/malformed.
    """
    try:
        return model_data.get('model_config', {}).get('multimodality', {}).get(field, False)
    except (AttributeError, TypeError):
        # An intermediate value exists but is not dict-like (e.g. None);
        # narrowed from a bare `except:` which hid unrelated errors.
        return False
def merge_data():
    """Assemble the model leaderboard DataFrame.

    Combines three data sources:
      1. Benchmark results: latency and clemscore for both multimodal
         and text runs (fetch_version_metadata), averaged per model.
      2. Model registry metadata (fetch_registry_data): parameters,
         release date, languages, license, multimodality flags.
      3. Pricing data loaded from PRICING_PATH.

    Returns:
        pd.DataFrame: one row per model with display-ready column
        names, sorted by 'Clemscore' in descending order.
    """
    mm_latency_df, mm_result_df, text_latency_df, text_result_df = fetch_version_metadata()
    registry_data = fetch_registry_data()
    with open(PRICING_PATH, 'r') as f:
        pricing_data = json.load(f)
    # Ensure the unnamed index column is renamed to 'model'
    mm_result_df.rename(columns={'Unnamed: 0': 'model', '-, clemscore': 'clemscore'}, inplace=True)
    text_result_df.rename(columns={'Unnamed: 0': 'model', '-, clemscore': 'clemscore'}, inplace=True)
    # Strip the temperature/run suffix so model names match the registry.
    mm_result_df['model'] = mm_result_df['model'].str.split('-t0.0--').str[0]
    text_result_df['model'] = text_result_df['model'].str.split('-t0.0--').str[0]  # Bug in get_latency.py, split by -t0.0 instead of -t (gpt-3.5-turbo/gpt-4-turbo breaks)
    # Average latency and clemscore over the combined mm + text runs.
    avg_latency_df = pd.concat([mm_latency_df, text_latency_df], axis=0).groupby('model')['latency'].mean().reset_index()
    avg_clemscore_df = pd.concat([mm_result_df, text_result_df], axis=0).groupby('model')['clemscore'].mean().reset_index()
    # Outer merge: keep models that have only latency or only clemscore.
    lat_clem_df = pd.merge(avg_latency_df, avg_clemscore_df, on='model', how='outer')
    # Convert registry_data (list of records) to DataFrame for merging
    registry_df = pd.DataFrame(registry_data)
    # Flatten the nested license dict into name/url columns
    registry_df['license_name'] = registry_df['license'].apply(lambda x: x['name'])
    registry_df['license_url'] = registry_df['license'].apply(lambda x: x['url'])
    # Add one boolean column per multimodality capability
    registry_df['single_image'] = registry_df.apply(lambda x: get_multimodality_field(x, 'single_image'), axis=1)
    registry_df['multiple_images'] = registry_df.apply(lambda x: get_multimodality_field(x, 'multiple_images'), axis=1)
    registry_df['audio'] = registry_df.apply(lambda x: get_multimodality_field(x, 'audio'), axis=1)
    registry_df['video'] = registry_df.apply(lambda x: get_multimodality_field(x, 'video'), axis=1)
    # Keep only the columns the leaderboard needs
    registry_df = registry_df[[
        'model_name', 'parameters', 'release_date', 'open_weight',
        'languages', 'context_size', 'license_name', 'license_url',
        'single_image', 'multiple_images', 'audio', 'video'
    ]]
    # Inner join: only models present in both the results and the
    # registry make it onto the leaderboard.
    merged_df = pd.merge(
        lat_clem_df,
        registry_df,
        left_on='model',
        right_on='model_name',
        how='inner'
    )
    # Rename to display-ready column headers
    merged_df = merged_df.rename(columns={
        'model': 'Model Name',
        'latency': 'Latency (s)',
        'clemscore': 'Clemscore',
        'parameters': 'Parameters (B)',
        'release_date': 'Release Date',
        'open_weight': 'Open Weight',
        'languages': 'Languages',
        'context_size': 'Context Size (k)',
        'license_name': 'License Name',
        'license_url': 'License URL',
        'single_image': 'Single Image',
        'multiple_images': 'Multiple Images',
        'audio': 'Audio',
        'video': 'Video'
    })
    # Convert pricing_data list to DataFrame and normalize prices
    pricing_df = pd.DataFrame(pricing_data)
    pricing_df['input'] = pricing_df['input'].apply(clean_price)
    pricing_df['output'] = pricing_df['output'].apply(clean_price)
    # Left join keeps models that have no pricing entry (NaN, filled below)
    merged_df = pd.merge(
        merged_df,
        pricing_df,
        left_on='Model Name',
        right_on='model_id',
        how='left'
    )
    # Drop duplicate model column and rename price columns
    merged_df = merged_df.drop('model_id', axis=1)
    merged_df = merged_df.rename(columns={
        'input': 'Input $/1M tokens',
        'output': 'Output $/1M tokens'
    })
    # Models without pricing are shown as 0.0 rather than NaN
    merged_df['Input $/1M tokens'] = merged_df['Input $/1M tokens'].fillna(0.0)
    merged_df['Output $/1M tokens'] = merged_df['Output $/1M tokens'].fillna(0.0)
    # Convert parameter strings to floats; closed-weight (commercial)
    # models get None since their parameter counts are not public.
    merged_df['Parameters (B)'] = merged_df.apply(
        lambda row: None if not row['Open Weight'] else convert_parameters(row['Parameters (B)']),
        axis=1
    )
    # Render the license as an HTML link for the UI table
    merged_df['License'] = merged_df.apply(lambda row: f'<a href="{row["License URL"]}" style="color: blue;">{row["License Name"]}</a>', axis=1)
    # Duplicate of 'Release Date' — presumably consumed downstream for
    # date filtering/sorting; TODO confirm against the UI code.
    merged_df['Temp Date'] = merged_df['Release Date']
    merged_df['Languages'] = merged_df['Languages'].apply(map_languages)
    # Sort by Clemscore in descending order
    merged_df = merged_df.sort_values(by='Clemscore', ascending=False)
    # Drop the now-redundant join key from the registry side
    merged_df.drop(columns=['model_name'], inplace=True)
    return merged_df
if __name__ == '__main__':
    # Build the merged leaderboard and persist it for downstream use.
    merged_df = merge_data()
    output_path = os.path.join('assets', 'merged_data.csv')
    merged_df.to_csv(output_path, index=False)