MMLU-by-task-Leaderboard / details_data_processor.py
Corey Morris
WIP commit. Finding files can be identical to the method in results_data_processor.
c32735e
raw
history blame
8.28 kB
import pandas as pd
import os
import fnmatch
import json
import re
import numpy as np
import requests
class DetailsDataProcessor:
    """Download per-task "details" JSON files and load them as DataFrames.

    Example of a details URL:
    https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
    """

    # Pieces of the (currently fixed) details URL built by ``generate_url``.
    BASE_URL = 'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/'
    DEFAULT_ORGANIZATION = '64bits'
    DEFAULT_MODEL = 'LexPodLM-13B'
    DEFAULT_TASK_CHUNK = 'details_harness%7ChendrycksTest-moral_scenarios%7C5'
    DEFAULT_FILENAME_SUFFIX = '_2023-07-25T13%3A41%3A51.227672.json'

    def __init__(self, directory='results', pattern='moral*.json'):
        """Store where to look for files and which file names to match.

        Args:
            directory: Root directory that ``pipeline`` walks recursively.
            pattern: ``fnmatch``-style pattern applied to each file's base name.
        """
        self.directory = directory
        self.pattern = pattern
        # self.data = self.process_data()
        # self.ranked_data = self.rank_data()

    @staticmethod
    def download_file(url, filename, timeout=30):
        """Download a single URL and save the response body to ``filename``.

        Args:
            url: Address to fetch (redirects are followed).
            filename: Local path the raw response bytes are written to.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the caller forever.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
                Raised before anything is written, so an error page is never
                saved in place of the expected JSON.
        """
        response = requests.get(url, allow_redirects=True, timeout=timeout)
        response.raise_for_status()
        # Context manager guarantees the handle is flushed and closed.
        with open(filename, 'wb') as out_file:
            out_file.write(response.content)

    @staticmethod
    def single_file_pipeline(url, filename):
        """Download ``url`` to ``filename``, parse it as JSON, return a DataFrame.

        Args:
            url: Address of the JSON document to download.
            filename: Local path used both to store and to re-read the file.

        Returns:
            ``pandas.DataFrame`` built directly from the parsed JSON object.
        """
        DetailsDataProcessor.download_file(url, filename)
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(filename, encoding='utf-8') as json_file:
            data = json.load(json_file)
        return pd.DataFrame(data)

    @staticmethod
    def generate_url(file_path):
        """Return the details URL for ``file_path``.

        NOTE: ``file_path`` is currently ignored; the organization, model,
        task and timestamp are fixed class constants, so every call returns
        the same URL.

        TODO: extract organization, model, and filename from ``file_path``
        instead of hardcoding them.

        Args:
            file_path: Path of the locally discovered file (unused for now).

        Returns:
            The constructed URL as a string.
        """
        cls = DetailsDataProcessor
        return (
            f'{cls.BASE_URL}{cls.DEFAULT_ORGANIZATION}/{cls.DEFAULT_MODEL}/'
            f'{cls.DEFAULT_TASK_CHUNK}{cls.DEFAULT_FILENAME_SUFFIX}'
        )

    def _find_files(self, directory, pattern):
        """Recursively collect files under ``directory`` matching ``pattern``.

        Args:
            directory: Root directory to walk.
            pattern: ``fnmatch``-style pattern tested against each base name.

        Returns:
            List of matching paths (each is ``os.path.join(root, basename)``),
            in ``os.walk`` order. Empty if nothing matches or the directory
            does not exist.
        """
        matching_files = []
        for root, _dirs, files in os.walk(directory):
            matching_files.extend(
                os.path.join(root, basename)
                for basename in fnmatch.filter(files, pattern)
            )
        return matching_files

    def pipeline(self):
        """Download and load one DataFrame per file found in ``self.directory``.

        For each file matching ``self.pattern`` the path is printed, a URL is
        generated, and the download is saved in the current working directory
        under the file's base name.

        Returns:
            List of ``pandas.DataFrame`` objects, one per matching file.
        """
        dataframes = []
        for file_path in self._find_files(self.directory, self.pattern):
            print(file_path)
            url = self.generate_url(file_path)
            # basename handles the platform's path separator, unlike split('/').
            local_name = os.path.basename(file_path)
            dataframes.append(self.single_file_pipeline(url, local_name))
        return dataframes
# @staticmethod
# def _find_files(directory, pattern):
# for root, dirs, files in os.walk(directory):
# for basename in files:
# if fnmatch.fnmatch(basename, pattern):
# filename = os.path.join(root, basename)
# yield filename
# def _read_and_transform_data(self, filename):
# with open(filename) as f:
# data = json.load(f)
# df = pd.DataFrame(data['results']).T
# return df
# def _cleanup_dataframe(self, df, model_name):
# df = df.rename(columns={'acc': model_name})
# df.index = (df.index.str.replace('hendrycksTest-', 'MMLU_', regex=True)
# .str.replace('harness\|', '', regex=True)
# .str.replace('\|5', '', regex=True))
# return df[[model_name]]
# def _extract_mc1(self, df, model_name):
# df = df.rename(columns={'mc1': model_name})
# # rename row harness|truthfulqa:mc|0 to truthfulqa:mc1
# df.index = (df.index.str.replace('mc\|0', 'mc1', regex=True))
# # just return the harness|truthfulqa:mc1 row
# df = df.loc[['harness|truthfulqa:mc1']]
# return df[[model_name]]
# def _extract_mc2(self, df, model_name):
# # rename row harness|truthfulqa:mc|0 to truthfulqa:mc2
# df = df.rename(columns={'mc2': model_name})
# df.index = (df.index.str.replace('mc\|0', 'mc2', regex=True))
# df = df.loc[['harness|truthfulqa:mc2']]
# return df[[model_name]]
# # remove extreme outliers from column harness|truthfulqa:mc1
# def _remove_mc1_outliers(self, df):
# mc1 = df['harness|truthfulqa:mc1']
# # Identify the outliers
# # outliers_condition = mc1 > mc1.quantile(.95)
# outliers_condition = mc1 == 1.0
# # Replace the outliers with NaN
# df.loc[outliers_condition, 'harness|truthfulqa:mc1'] = np.nan
# return df
# @staticmethod
# def _extract_parameters(model_name):
# """
# Function to extract parameters from model name.
# It handles names with 'b/B' for billions and 'm/M' for millions.
# """
# # pattern to match a number followed by 'b' (representing billions) or 'm' (representing millions)
# pattern = re.compile(r'(\d+\.?\d*)([bBmM])')
# match = pattern.search(model_name)
# if match:
# num, magnitude = match.groups()
# num = float(num)
# # convert millions to billions
# if magnitude.lower() == 'm':
# num /= 1000
# return num
# # return NaN if no match
# return np.nan
# def process_data(self):
# dataframes = []
# organization_names = []
# for filename in self._find_files(self.directory, self.pattern):
# raw_data = self._read_and_transform_data(filename)
# split_path = filename.split('/')
# model_name = split_path[2]
# organization_name = split_path[1]
# cleaned_data = self._cleanup_dataframe(raw_data, model_name)
# mc1 = self._extract_mc1(raw_data, model_name)
# mc2 = self._extract_mc2(raw_data, model_name)
# cleaned_data = pd.concat([cleaned_data, mc1])
# cleaned_data = pd.concat([cleaned_data, mc2])
# organization_names.append(organization_name)
# dataframes.append(cleaned_data)
# data = pd.concat(dataframes, axis=1).transpose()
# # Add organization column
# data['organization'] = organization_names
# # Add Model Name and rearrange columns
# data['Model Name'] = data.index
# cols = data.columns.tolist()
# cols = cols[-1:] + cols[:-1]
# data = data[cols]
# # Remove the 'Model Name' column
# data = data.drop(columns=['Model Name'])
# # Add average column
# data['MMLU_average'] = data.filter(regex='MMLU').mean(axis=1)
# # Reorder columns to move 'MMLU_average' to the third position
# cols = data.columns.tolist()
# cols = cols[:2] + cols[-1:] + cols[2:-1]
# data = data[cols]
# # Drop specific columns
# data = data.drop(columns=['all', 'truthfulqa:mc|0'])
# # Add parameter count column using extract_parameters function
# data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
# # move the parameters column to the front of the dataframe
# cols = data.columns.tolist()
# cols = cols[-1:] + cols[:-1]
# data = data[cols]
# # remove extreme outliers from column harness|truthfulqa:mc1
# data = self._remove_mc1_outliers(data)
# return data
# def rank_data(self):
# # add rank for each column to the dataframe
# # copy the data dataframe to avoid modifying the original dataframe
# rank_data = self.data.copy()
# for col in list(rank_data.columns):
# rank_data[col + "_rank"] = rank_data[col].rank(ascending=False, method='min')
# return rank_data
# def get_data(self, selected_models):
# return self.data[self.data.index.isin(selected_models)]