import os
import random
import re
import time
import zipfile

import numpy as np
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi, snapshot_download
from datasets import load_dataset

from src.utils import load_all_data
from src.md import ABOUT_TEXT, TOP_TEXT, SUBMIT_TEXT
from src.css import custom_css

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
eval_set_repo_id = "KwaiVGI/VideoGen-RewardBench"
eval_set_dir = "dataset"
eval_results_dir = "evals"


def restart_space():
    """Ask the Hub to restart the Space identified by ``eval_set_repo_id``."""
    api.restart_space(repo_id=eval_set_repo_id, token=COLLAB_TOKEN)


# Background colour used for each value of the 'Model Type' column.
color_map = {
    "Generative": "#7497db",
    "Custom Classifiers": "#E8ECF2",
    "Seq. Classifiers": "#ffcd75",
    "DPO": "#75809c",
}

# Checkbox labels that hide rows (matched against 'Model Type') ...
_MODEL_TYPE_FILTERS = ("Seq. Classifiers", "Custom Classifiers", "Generative")
# ... and checkbox labels that hide columns (matched against the column name).
_COLUMN_FILTERS = ("w/o Ties", "w/ Ties")


def color_model_type_column(df, color_map):
    """
    Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
    color_map (dict): A dictionary mapping model types to colors.

    Returns:
    pandas Styler: The styled DataFrame. Every column other than 'Model' and
    'Model Type' is formatted with two decimals, the '' (rank) column as an
    integer, and missing values are rendered as empty strings.
    """
    def apply_color(val):
        # Model types missing from color_map fall back to the literal "default".
        color = color_map.get(val, "default")
        return f'background-color: {color}'

    # Per-column format strings.
    format_dict = {
        col: "{:.2f}"
        for col in df.columns
        if col not in ('Avg.', 'Model', 'Model Type')
    }
    format_dict['Avg.'] = "{:.2f}"
    format_dict[''] = "{:d}"

    styler = df.style
    # Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name
    # is deprecated; use the new one when the installed pandas provides it.
    cellwise = getattr(styler, "map", None) or styler.applymap
    return cellwise(apply_color, subset=['Model Type']).format(format_dict, na_rep='')


def regex_table(dataframe, regex, filter_button, style=True):
    """
    Filter the leaderboard by model name and by the selected checkboxes.

    Parameters:
    dataframe (pd.DataFrame): Leaderboard with at least 'Model' and
        'Model Type' columns.
    regex (str): Comma-separated patterns; a row is kept when its 'Model'
        value matches any of them (case-insensitive). Empty fragments (e.g.
        from a trailing comma) are ignored, and an empty search keeps all rows.
    filter_button (list | str): Selected filter labels. Rows whose
        'Model Type' matches an unselected model-type label are dropped, and
        columns whose name contains an unselected ties label are dropped.
        Any other type disables this filtering.
    style (bool): Unused; kept so existing callers keep working.

    Returns:
    pandas Styler: The filtered table with a leading '' rank column, styled
    by ``color_model_type_column``.
    """
    # Split on commas, trim whitespace and drop empty fragments: an empty
    # alternative ("foo|") would otherwise match every row.
    patterns = [piece for piece in (raw.strip() for raw in regex.split(",")) if piece]
    combined_regex = "|".join(patterns)
    try:
        re.compile(combined_regex)
    except re.error:
        # The search box is evaluated while the user is still typing, so a
        # half-written pattern such as "(" is matched literally, not raised.
        combined_regex = "|".join(re.escape(piece) for piece in patterns)

    if isinstance(filter_button, (list, str)):
        for model_type in _MODEL_TYPE_FILTERS:
            if model_type not in filter_button:
                hidden = dataframe["Model Type"].str.contains(model_type, case=False, na=False)
                dataframe = dataframe[~hidden]
        for tag in _COLUMN_FILTERS:
            if tag not in filter_button:
                dataframe = dataframe[[col for col in dataframe.columns if tag not in col]]

    # Keep rows whose 'Model' matches any pattern; rebuild the index so the
    # rank column inserted below starts at 0.
    matches = dataframe["Model"].str.contains(combined_regex, case=False, na=False)
    data = dataframe[matches].reset_index(drop=True)
    data.insert(0, '', range(len(data)))

    return color_model_type_column(data, color_map)


# NOTE(review): `use_auth_token` is the older spelling of `token` in
# huggingface_hub; left unchanged here -- confirm the pinned version before renaming.
repo = snapshot_download(
    local_dir=eval_set_dir,
    repo_id=eval_set_repo_id,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)

# Unpack the bundled videos next to the CSV inside the local dataset directory.
with zipfile.ZipFile(os.path.join(eval_set_dir, 'videos.zip'), 'r') as zip_ref:
    zip_ref.extractall(eval_set_dir)

rewardbench_data = load_all_data(eval_results_dir).sort_values(by='Avg.', ascending=False)
col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)

# for showing random samples
eval_set = pd.read_csv(os.path.join(eval_set_dir, 'videogen-rewardbench.csv'))
subsets = list(eval_set['prompt'].unique())
# N=20
# if len(subsets) > N:
#     random.seed(time.time())
#     subsets =
#         random.sample(subsets, N)

# NOTE(review): this part of the file was damaged in extraction -- every span
# between a "<" and the next ">" was stripped. Items marked NOTE(review) below
# were reconstructed and must be checked against the deployed app.


def _preference_label(choice):
    """Render one annotation value as an A-versus-B comparison string."""
    if choice == 'A':
        return 'A>B'
    if choice == 'B':
        return 'A<B'
    # NOTE(review): only the "== 'A'" test and the 'A>B' / 'A<...' fragments
    # survive in the source; the tie label 'A=B' is presumed -- TODO confirm.
    return 'A=B'


def random_sample(selected_prompts):
    """
    Pick one random annotated pair for the selected prompt.

    Parameters:
    selected_prompts (str | None): Value of the prompt dropdown.

    Returns:
    tuple: ``(markdown_text, video_a, video_b)``, one value for each of the
    three outputs wired to the button. When no row matches, the message is
    returned with ``None`` for both videos; returning the bare message, as
    before, gave Gradio one value for three outputs.
    """
    # Filter the eval_set based on the selected prompt
    filtered_data = eval_set[eval_set['prompt'] == selected_prompts]
    if filtered_data.empty:
        return "No data available for the selected prompt(s).", None, None

    # Randomly select a sample from the filtered data
    sample = filtered_data.sample(n=1, random_state=int(time.time())).iloc[0]

    # Prepare the markdown text with the required fields
    markdown_text = f"**Prompt**: {sample['prompt']}\n\n\n"
    markdown_text += f"**Preference**: \n"
    markdown_text += "| **Visual Quality** | **Motion Quality** | **Text Alignment** | **Overall** | **A_model** | **B_model** |\n"
    markdown_text += "|:------------------:|:------------------:|:------------------:|:-----------:|:-----------:|:-----------:|\n"

    # One cell per header column above.
    # NOTE(review): the 'A_model' / 'B_model' cells are inferred from the table
    # header; the original row-building code after 'Overall' was lost.
    cells = [_preference_label(sample[key]) for key in ('VQ', 'MQ', 'TA', 'Overall')]
    cells += [str(sample['A_model']), str(sample['B_model'])]
    markdown_text += "| " + " | ".join(cells) + " |\n"

    # NOTE(review): presumably the CSV stores the clip locations in 'path_A' /
    # 'path_B', relative to the extracted dataset directory -- TODO confirm.
    video_a = os.path.join(eval_set_dir, sample['path_A'])
    video_b = os.path.join(eval_set_dir, sample['path_B'])
    return markdown_text, video_a, video_b


# NOTE(review): the original raw string had its newlines collapsed to spaces;
# the conventional BibTeX line layout is restored here.
_CITATION_BIBTEX = r"""@article{liu2025improving,
  title={Improving Video Generation with Human Feedback},
  author={Liu, Jie and Liu, Gongye and Liang, Jiajun and Yuan, Ziyang and Liu, Xiaokun and Zheng, Mingwu and Wu, Xiele and Wang, Qiulin and Qin, Wenyu and Xia, Menghan and others},
  journal={arXiv preprint arXiv:2501.13918},
  year={2025}
}"""

# NOTE(review): the Blocks header was lost. `app` and `custom_css` come from
# `app.queue(...)` below and from the file's imports; the original header
# markup (a triple-quoted gr.Markdown) is replaced by TOP_TEXT as a
# placeholder -- TODO restore the original banner.
with gr.Blocks(css=custom_css) as app:
    gr.Markdown(TOP_TEXT)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 VideoGen-RewardBench Leaderboard"):
            with gr.Row():
                with gr.Column(scale=4):
                    search_1 = gr.Textbox(
                        label="Model Search (delimit with , )",
                        placeholder="Model Search (delimit with , )",
                        show_label=False,
                    )
                with gr.Column(scale=6):
                    model_types_1 = gr.CheckboxGroup(
                        ["Seq. Classifiers", "Custom Classifiers", "Generative", "w/o Ties", "w/ Ties"],
                        value=["Seq. Classifiers", "Custom Classifiers", "Generative", "w/o Ties", "w/ Ties"],
                        label="Model Types",
                        show_label=False,
                    )
            with gr.Row():
                # reference data: hidden, unfiltered copy that the change
                # handlers below read from when they re-filter
                rewardbench_table_hidden = gr.Dataframe(
                    rewardbench_data,
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    visible=False,
                )
                rewardbench_table = gr.Dataframe(
                    regex_table(
                        rewardbench_data.copy(),
                        "",
                        ["Seq. Classifiers", "Custom Classifiers", "Generative", "Others", "w/o Ties", "w/ Ties"],
                    ),
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    elem_id="rewardbench_dataframe_avg",
                    # height=1000,
                )
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("📤 How to Submit"):
            with gr.Row():
                gr.Markdown(SUBMIT_TEXT)

        with gr.TabItem("🔍 Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("""## Random Dataset Sample Viewer""")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=False)
                button = gr.Button("Show Random Sample")
            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")
            with gr.Row():
                video_a_display = gr.Video()
                video_b_display = gr.Video()
            button.click(
                fn=random_sample,
                inputs=[subset_selector],
                outputs=[sample_display, video_a_display, video_b_display],
            )

    # Re-filter the visible table whenever the search text or checkboxes change.
    search_1.change(
        regex_table,
        inputs=[rewardbench_table_hidden, search_1, model_types_1],
        outputs=rewardbench_table,
    )
    model_types_1.change(
        regex_table,
        inputs=[rewardbench_table_hidden, search_1, model_types_1],
        outputs=rewardbench_table,
    )

    with gr.Row():
        with gr.Accordion("📚 Citation", open=False):
            citation_button = gr.Textbox(
                value=_CITATION_BIBTEX,
                lines=5,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )

# Call restart_space every 1800 seconds from a background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

app.queue(default_concurrency_limit=40).launch()