Spaces:

ai2lumos
/

lumos_data_demo

Runtime error

File size: 6,047 Bytes

from datasets import load_dataset, Dataset
import os
from datasets import load_dataset
from datasets.utils.logging import disable_progress_bar
from constants import column_names, all_task_types
from utils_display import make_clickable_model
import random
import json
# Turn off the progress bars of the `datasets` library (imported above from
# datasets.utils.logging).
disable_progress_bar()

# Module-level lookup of benchmark records. It starts out empty and is rebound
# at the bottom of this module to the dict returned by load_benchdata_dict().
id_to_data = None 
# NOTE(review): never assigned anywhere else in the visible part of this
# module; post_processing() takes a parameter of the same name instead.
# Confirm whether this global is still needed.
model_len_info = None 

def estimated_win_rate(elo_a, elo_b):
    """
    Turn a pair of Elo ratings into player B's expected win rate against player A.

    The standard Elo expected score of A is computed first and then
    complemented, so the value handed back belongs to B (the second argument),
    expressed as a percentage rather than a probability.

    :param elo_a: Elo rating of player A (the reference opponent)
    :param elo_b: Elo rating of player B
    :return: Expected win rate of B against A, on a 0-100 scale
    """
    scaled_gap = (elo_b - elo_a) / 400
    expected_score_a = 1 / (1 + 10 ** scaled_gap)
    return (1 - expected_score_a) * 100



# Formats the columns
def formatter(x):
    """
    Format a single table cell: strings pass through, numbers are rounded.

    :param x: cell value, either a string or something ``round()`` accepts
    :return: ``x`` unchanged when it is a string, otherwise ``round(x, 2)``
    :raises TypeError: if ``x`` is neither a string nor roundable (e.g. ``None``)
    """
    # isinstance instead of an exact type comparison: subclasses of str (for
    # example numpy.str_) must take the pass-through branch, otherwise they
    # fall into round() and raise TypeError.
    if isinstance(x, str):
        return x
    return round(x, 2)


def add_winrates(current_df):
    """
    Return a copy of ``current_df`` with estimated win-rate columns added.

    Two columns are appended, 'Win% vs GPT-4' and 'Win% vs GPT-3.5T'. Each holds
    the row's estimated win rate (in percent, formatted with ``formatter``)
    against the reference model, derived from the "Overall Elo" column.
    Afterwards the "# battles" and "Length" columns are moved to the end.

    The reference rows are the first rows whose "Model" value contains the
    substring "gpt-4" / "gpt-3.5" respectively.

    :param current_df: DataFrame with at least the columns "Model",
        "Overall Elo", "# battles" and "Length"; it is not modified
    :return: a new DataFrame with the extra columns and the reordered layout
    :raises IndexError: if no "Model" value contains one of the reference names
    :raises ValueError: if "# battles" or "Length" is not a column
    """
    df = current_df.copy()
    elo_column = "Overall Elo"

    def reference_elo(name_fragment):
        # Literal substring match: with the default regex=True the "." in
        # "gpt-3.5" would match any character. na=False keeps missing model
        # names from turning the mask into a non-boolean one.
        is_reference = df["Model"].str.contains(name_fragment, regex=False, na=False)
        return df[is_reference][elo_column].iloc[0]

    gpt4_elo = reference_elo("gpt-4")
    gpt35_elo = reference_elo("gpt-3.5")

    # estimated_win_rate(ref, x) yields the win rate of the model rated x
    # against the reference, i.e. every row's win rate versus GPT-4 / GPT-3.5.
    df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(gpt4_elo, x)).apply(formatter)
    df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(gpt35_elo, x)).apply(formatter)

    # Push the bookkeeping columns behind the freshly added win-rate columns.
    cols = list(df.columns)
    for trailing in ("# battles", "Length"):
        cols.remove(trailing)
        cols.append(trailing)
    return df[cols]

def add_winrates_tasks(current_df, ref="gpt-4"):
    """
    Replace every per-task column with the estimated win rate against ``ref``.

    For each task type in ``all_task_types`` the matching display column is
    looked up through ``column_names``. The values in that column are treated
    as Elo ratings and swapped for the formatted win rate (in percent) of each
    row against the first row whose "Model" value matches ``ref``.

    :param current_df: DataFrame with a "Model" column and one column per task
        type; it is not modified
    :param ref: pattern identifying the reference model; it is handed to
        ``Series.str.contains``, which treats it as a regular expression by default
    :return: a copy of ``current_df`` with the task columns overwritten
    :raises IndexError: if no "Model" value matches ``ref``
    """
    result = current_df.copy()
    for task in all_task_types:
        task_col = column_names[task]
        ref_rows = current_df[current_df["Model"].str.contains(ref)]
        ref_elo = ref_rows[task_col].iloc[0]
        raw_rates = current_df[task_col].apply(lambda elo: estimated_win_rate(ref_elo, elo))
        result[task_col] = raw_rates.apply(formatter)
    return result
        

def post_processing(df, model_len_info):
    """
    Format a raw results table for display.

    Steps, in order:
      1. if ``model_len_info`` is truthy, add a "Length" column looked up from
         it by the value in the "model name " column;
      2. turn the "model name " column into clickable links via
         ``make_clickable_model`` and run every other column through ``formatter``;
      3. rename columns according to ``column_names``;
      4. sort by "Overall Elo", highest first;
      5. move "Model", "Overall Elo" and "Task-Avg Elo" to the front.

    Steps 1-4 modify ``df`` in place, so the caller's frame is changed; only the
    final column reordering produces a new object, which is what gets returned.

    :param df: results DataFrame; after renaming it must contain "Model",
        "Overall Elo" and "Task-Avg Elo" (presumably supplied by the
        ``column_names`` mapping -- verify against constants)
    :param model_len_info: mapping from model name to length, or a falsy value
        to skip adding the "Length" column
    :return: the reordered DataFrame
    :raises KeyError: if a model name is missing from ``model_len_info`` or a
        required column is absent
    """
    name_col = "model name "
    if model_len_info:
        df["Length"] = df[name_col].apply(lambda x: model_len_info[x])

    for col in df.columns:
        if col == name_col:
            # The original wrapped this in x.replace(x, ...), which just
            # evaluates to the replacement; call the helper directly.
            df[col] = df[col].apply(make_clickable_model)
        else:
            df[col] = df[col].apply(formatter)  # rounds numbers, keeps strings
    df.rename(columns=column_names, inplace=True)
    df.sort_values(by="Overall Elo", inplace=True, ascending=False)

    # Headline columns first, everything else in its existing order.
    leading = ["Model", "Overall Elo", "Task-Avg Elo"]
    df = df[leading + [col for col in df.columns if col not in leading]]
    return df

def apply_length_penalty(original_df, ablation_df, length_penalty=0.2):
    """
    Subtract a length penalty from every score column of ``original_df``.

    Each score ``x`` becomes ``x - y * length_penalty``, where ``y`` is the
    value in the same column of the ``ablation_df`` row that has the same
    "Model". Rows are matched by model name, so the two frames may be ordered
    differently. The "Model", "# battles" and "Length" columns are left alone.
    The result is then passed through ``post_processing`` (without length info).

    Neither input frame is modified.

    :param original_df: DataFrame with a "Model" column plus score columns
    :param ablation_df: DataFrame with a "Model" column and the same score
        columns; if a model appears more than once its first row is used
    :param length_penalty: factor applied to the ablation value before it is
        subtracted
    :return: the penalised, post-processed DataFrame
    :raises IndexError: if a model in ``original_df`` has no row in ``ablation_df``
    """
    original_df = original_df.copy()
    untouched = ("Model", "# battles", "Length")
    score_cols = [col for col in original_df.columns if col not in untouched]

    # Cast each score column once up front; the cast does not depend on the
    # row being processed, so repeating it per cell was wasted work.
    for col in score_cols:
        original_df[col] = original_df[col].astype(float)

    for i, row in original_df.iterrows():
        model = row["Model"]
        # One lookup per model instead of two per cell.
        ablation_rows = ablation_df[ablation_df["Model"] == model]
        if ablation_rows.empty:
            # Explicit check rather than an assert, which is stripped under -O.
            raise IndexError(f"model {model!r} has no row in ablation_df")
        for col in score_cols:
            penalty = ablation_rows[col].values[0] * length_penalty
            original_df.at[i, col] = original_df.at[i, col] - penalty

    original_df = post_processing(original_df, model_len_info=None)
    return original_df


def load_benchdata_dict():
    """
    Load ``data_dir/predictions_logs.jsonl`` into a dict keyed by record id.

    The file is read as UTF-8 JSON Lines, one JSON object per line; blank
    lines are ignored. The path is relative to the current working directory.

    :return: dict mapping each record's "idx" value to the full record; when
        several records share an "idx", the last one in the file wins
    :raises FileNotFoundError: if the file does not exist
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    :raises KeyError: if a record has no "idx" field
    """
    by_idx = {}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open("data_dir/predictions_logs.jsonl", "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # e.g. an empty line at the end of the file
                continue
            item = json.loads(line)
            by_idx[item["idx"]] = item
    return by_idx

def load_eval_results():
    """
    Load every record of ``data_dir/predictions_logs.jsonl`` as a list.

    The file is read as UTF-8 JSON Lines, one JSON object per line; blank
    lines are ignored. The path is relative to the current working directory.

    :return: list of decoded records, in file order
    :raises FileNotFoundError: if the file does not exist
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open("data_dir/predictions_logs.jsonl", "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]



def sample_an_eval_result(eval_results, model_list=None):
    """
    Pick one random evaluation record and reshape it for display.

    The records are shuffled (on a copy, the input is left untouched) and the
    first one whose "model" is in ``model_list`` is converted.

    :param eval_results: iterable of record dicts; a chosen record must provide
        the keys "model", "task_type", "plan_prompts", "ground_prompts",
        "question", "idx", "pred", "answer" and "correctness", and may
        provide "image"
    :param model_list: collection of model names to sample from; ``None``
        (the default) places no restriction on the model, an empty collection
        matches nothing
    :return: dict with the keys "session_id", "task", "task_type",
        "plan_history", "ground_history", "pred", "answer", "correctness" and
        "image" (``None`` when the record has no image), or ``None`` when no
        record qualifies
    :raises KeyError: if the chosen record lacks one of the required keys
    """
    candidates = list(eval_results)
    random.shuffle(candidates)

    for eval_item in candidates:
        if model_list is not None and eval_item['model'] not in model_list:
            continue

        image = eval_item.get("image")
        if image is not None:
            # Map the evaluation-time image location onto the path the demo
            # serves files from.
            image = image.replace("eval/aokvqa/images/val2017/", "file/data_dir/test_images/")

        return {
            "session_id": eval_item['idx'],
            "task": eval_item['question'],
            "task_type": eval_item['task_type'],  # primary task type
            "plan_history": eval_item['plan_prompts'],
            "ground_history": eval_item['ground_prompts'],
            "pred": eval_item['pred'],
            "answer": eval_item['answer'],
            "correctness": eval_item['correctness'],
            "image": image,
        }

    # Nothing matched: return None explicitly instead of failing on an
    # unbound local variable.
    return None

# Fill the module-level lookup as soon as the module is imported. This reads
# data_dir/predictions_logs.jsonl relative to the current working directory,
# so importing the module fails if that file is missing.
id_to_data = load_benchdata_dict()