# NOTE(review): extraction artifacts (file-size banner, commit-hash gutter,
# line-number gutter) commented out so the file parses as Python.
# ... existing code ...
import pandas as pd
import json
# Read the combined model metadata and flatten each nested entry into a
# single-level record so it can be loaded into a tabular DataFrame.
with open('src/combined_data.json') as f:
    data = json.load(f)

flattened_data = []
for entry in data:
    pricing = entry["pricing"]
    # Optional nested block; missing keys simply yield None columns.
    extras = pricing.get("additional_prices", {})
    record = {
        "model_name": entry["model_name"],
        "input_price": pricing["input_price"],
        "output_price": pricing["output_price"],
        "multimodality_image": entry["multimodality"]["image"],
        "multimodality_multiple_image": entry["multimodality"]["multiple_image"],
        "multimodality_audio": entry["multimodality"]["audio"],
        "multimodality_video": entry["multimodality"]["video"],
        "source": pricing["source"],
        "license_name": entry["license"]["name"],
        "license_url": entry["license"]["url"],
        "languages": ", ".join(entry["languages"]),
        "release_date": entry["release_date"],
        "parameter_size": entry["parameters"]["size"],
        "estimated": entry["parameters"]["estimated"],
        "open_weight": entry["open_weight"],
        "context_size": entry["context_size"],
        # Per-modality surcharge prices (absent -> None).
        "additional_prices_context_caching": extras.get("context_caching"),
        "additional_prices_context_storage": extras.get("context_storage"),
        "additional_prices_image_input": extras.get("image_input"),
        "additional_prices_image_output": extras.get("image_output"),
        "additional_prices_video_input": extras.get("video_input"),
        "additional_prices_video_output": extras.get("video_output"),
        "additional_prices_audio_input": extras.get("audio_input"),
        "additional_prices_audio_output": extras.get("audio_output"),
    }
    flattened_data.append(record)

# Column order follows the insertion order of the record dict above.
df = pd.DataFrame(flattened_data)
# Attach one clemscore column per benchmark run. Each results CSV is
# headerless: column 0 is the run name, column 1 the clemscore.
clemscore_sources = {
    'clemscore_v1.6.5_multimodal': 'src/results_1.6.5_multimodal.csv',
    'clemscore_v1.6.5_ascii': 'src/results_1.6.5_ascii.csv',
    'clemscore_v1.6': 'src/results_1.6.csv',
}
for column, path in clemscore_sources.items():
    scores = pd.read_csv(path, header=None)
    # Run names look like "<model>-t0.0..."; keep only the model part so
    # they join against df['model_name'].
    scores[0] = scores[0].str.split('-t0.0').str[0]
    score_by_model = dict(zip(scores[0], scores[1]))
    # Models absent from a run default to a score of 0.
    df[column] = df['model_name'].map(score_by_model).fillna(0).astype(float)
# Attach one latency column per benchmark run (headerless CSVs:
# column 0 = model name, column 1 = latency). Unlike the clemscore files,
# these already use plain model names, so no suffix splitting is needed.
latency_sources = {
    'latency_v1.6': 'src/v1.6_latency.csv',
    'latency_v1.6.5_multimodal': 'src/v1.6.5_multimodal_latency.csv',
    'latency_v1.6.5_ascii': 'src/v1.6.5_ascii_latency.csv',
}
for column, path in latency_sources.items():
    latencies = pd.read_csv(path, header=None)
    latency_by_model = dict(zip(latencies[0], latencies[1]))
    df[column] = df['model_name'].map(latency_by_model).fillna(0).astype(float)

# Row-wise averages over the three runs.
# NOTE(review): models missing from a run were filled with 0 above, which
# drags these averages down — confirm that is the intended treatment.
df['average_clemscore'] = df[['clemscore_v1.6.5_multimodal', 'clemscore_v1.6.5_ascii', 'clemscore_v1.6']].mean(axis=1)
df['average_latency'] = df[['latency_v1.6', 'latency_v1.6.5_multimodal', 'latency_v1.6.5_ascii']].mean(axis=1)
# --- Clean-up: normalise price/size strings into numeric dtypes ---

def _strip_to_float(series, pattern):
    """Strip *pattern* (a regex) from string cells, map empty cells to NaN,
    and cast the series to float.

    Fixes vs. the previous version: the regex strings are raw (``'\\$'`` was
    an invalid escape sequence, deprecated in modern Python), and the
    empty-string mapping uses an exact-match dict replace — with
    ``regex=True`` the pattern ``''`` matches inside *every* string, so it
    must not be handed to the regex engine.
    """
    cleaned = series.replace(pattern, '', regex=True)
    # Dict form = exact value match: only genuinely empty cells become NaN.
    return cleaned.replace({'': None}).astype(float)

# Dollar-denominated prices, e.g. "$1.50" -> 1.5.
df['input_price'] = _strip_to_float(df['input_price'], r'\$')
df['output_price'] = _strip_to_float(df['output_price'], r'\$')

# The optional per-modality surcharge columns get the same treatment.
additional_price_columns = [
    'additional_prices_context_caching',
    'additional_prices_context_storage',
    'additional_prices_image_input',
    'additional_prices_image_output',
    'additional_prices_video_input',
    'additional_prices_video_output',
    'additional_prices_audio_input',
    'additional_prices_audio_output',
]
for col in additional_price_columns:
    df[col] = _strip_to_float(df[col], r'\$')

# Context sizes carry a "k" suffix; drop it before the integer cast.
# NOTE(review): the value stays in k-units (e.g. "128k" -> 128, not 128000)
# — confirm downstream consumers expect thousands rather than raw tokens.
df['context_size'] = df['context_size'].replace({'k': ''}, regex=True).astype(int)
# Parameter sizes carry a "B" (billions) suffix, e.g. "70B" -> 70.0.
df['parameter_size'] = _strip_to_float(df['parameter_size'], r'B')
# Project down to the published columns, in presentation order, then persist.
output_columns = [
    'model_name',
    'input_price',
    'output_price',
    'multimodality_image',
    'multimodality_multiple_image',
    'multimodality_audio',
    'multimodality_video',
    'source',
    'license_name',
    'license_url',
    'languages',
    'release_date',
    'open_weight',
    'context_size',
    'average_clemscore',
    'average_latency',
    'parameter_size',
    'estimated',
]
df = df[output_columns]

# Write the consolidated table without the index column.
df.to_csv('src/main_df.csv', index=False)