v1
Browse files
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,64 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import gradio as gr
         | 
| 2 | 
            +
            import pandas as pd
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Data for Table 1: Robustness Results
         | 
| 5 | 
            +
            robustness_data = {
                # One entry per evaluated model; every other column is index-aligned
                # with this list (24 entries each).
                "Model Name": [
                    "Gemini 2.0 Flash Exp", "Gemini 1.5 Pro 002", "OpenAI GPT-4o", "OpenAI o1", "OpenAI o3-mini",
                    "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-14B", "DeepSeek-R1-Distill-Qwen-32B",
                    "DeepSeek-R1-Distill-Llama-70B", "DeepSeek-R1", "Meta-Llama-3.1-8B-Instruct",
                    "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.3-70B-Instruct", "Qwen2.5-7B-Instruct",
                    "Qwen2.5-14B-Instruct", "Qwen2.5-32B-Instruct", "Qwen2.5-72B-Instruct", "Qwen2.5-7B-Instruct-1M",
                    "Qwen2.5-14B-Instruct-1M", "Nemotron-70B-Instruct-HF", "Phi-3-mini-128k-Instruct",
                    "Phi-3-small-128k-Instruct", "Phi-3-medium-128k-Instruct", "Palmyra-Fin-128k-Instruct",
                ],
                # Plain numeric score; the "(Δ)" columns below are expressed relative to it.
                "Baseline": [
                    0.95, 0.96, 0.95, 0.97, 0.98, 0.83, 0.95, 0.95,
                    0.96, 0.94, 0.91, 0.94, 0.95, 0.92, 0.95, 0.95,
                    0.94, 0.91, 0.95, 0.94, 0.86, 0.88, 0.89, 0.96,
                ],
                # The "(Δ)" columns are display strings of the form "score (change)",
                # where the arrow gives the direction of the change versus Baseline
                # (↑ higher, ↓ lower) and "(0.0)" carries no arrow.
                # NOTE(review): the headers and arrows were restored from mis-encoded
                # text ("Ξ" -> "Δ", "β" -> arrow picked by comparing against Baseline);
                # confirm against the table these numbers were taken from.
                # NOTE(review): a couple of "(0.0)" cells differ from Baseline by 0.01;
                # presumably rounding in the source figures -- left as found.
                "Misspelled (Δ)": [
                    "0.95 (0.0)", "0.95 (0.0)", "0.94 (↓0.01)", "0.95 (↓0.02)",
                    "0.96 (↓0.02)", "0.85 (↑0.02)", "0.90 (↓0.05)", "0.97 (↑0.02)",
                    "0.97 (↑0.01)", "0.94 (0.0)", "0.90 (↓0.01)", "0.92 (↓0.02)",
                    "0.92 (↓0.03)", "0.91 (↓0.01)", "0.94 (↓0.01)", "0.94 (0.0)",
                    "0.94 (0.0)", "0.91 (0.0)", "0.92 (↓0.03)", "0.94 (0.0)",
                    "0.85 (↓0.01)", "0.84 (↓0.04)", "0.84 (↓0.05)", "0.93 (↓0.03)",
                ],
                "Incomplete (Δ)": [
                    "0.95 (0.0)", "0.94 (↓0.02)", "0.94 (↓0.01)", "0.94 (↓0.03)",
                    "0.96 (↓0.02)", "0.82 (↓0.01)", "0.92 (↓0.03)", "0.95 (0.0)",
                    "0.95 (↓0.01)", "0.93 (↓0.01)", "0.86 (↓0.05)", "0.94 (0.0)",
                    "0.93 (↓0.02)", "0.90 (↓0.02)", "0.94 (↓0.01)", "0.93 (↓0.02)",
                    "0.93 (↓0.01)", "0.91 (0.0)", "0.91 (↓0.04)", "0.93 (↓0.01)",
                    "0.78 (↓0.08)", "0.78 (↓0.10)", "0.84 (↓0.05)", "0.92 (↓0.04)",
                ],
                "Out-of-Domain (Δ)": [
                    "0.88 (↓0.07)", "0.92 (↓0.04)", "0.92 (↓0.03)", "0.89 (↓0.08)",
                    "0.95 (↓0.03)", "0.87 (↑0.04)", "0.93 (↓0.02)", "0.92 (↓0.03)",
                    "0.94 (↓0.02)", "0.91 (↓0.03)", "0.82 (↓0.09)", "0.87 (↓0.07)",
                    "0.90 (↓0.05)", "0.85 (↓0.07)", "0.94 (↓0.01)", "0.92 (↓0.03)",
                    "0.92 (↓0.02)", "0.86 (↓0.05)", "0.91 (↓0.04)", "0.90 (↓0.04)",
                    "0.79 (↓0.07)", "0.83 (↓0.05)", "0.81 (↓0.08)", "0.90 (↓0.06)",
                ],
                "OCR Context (Δ)": [
                    "0.91 (↓0.04)", "0.92 (↓0.04)", "0.95 (0.0)", "0.94 (↓0.03)",
                    "0.90 (↓0.08)", "0.72 (↓0.11)", "0.86 (↓0.09)", "0.89 (↓0.06)",
                    "0.93 (↓0.03)", "0.88 (↓0.06)", "0.80 (↓0.11)", "0.88 (↓0.06)",
                    "0.89 (↓0.06)", "0.80 (↓0.12)", "0.88 (↓0.07)", "0.92 (↓0.03)",
                    "0.91 (↓0.03)", "0.77 (↓0.14)", "0.89 (↓0.06)", "0.91 (↓0.03)",
                    "0.69 (↓0.17)", "0.78 (↓0.10)", "0.72 (↓0.17)", "0.89 (↓0.07)",
                ],
                "Robustness (Δ)": [
                    "0.83 (↓0.12)", "0.84 (↓0.12)", "0.85 (↓0.10)", "0.81 (↓0.16)",
                    "0.90 (↓0.08)", "0.64 (↓0.19)", "0.82 (↓0.13)", "0.86 (↓0.09)",
                    "0.89 (↓0.07)", "0.80 (↓0.14)", "0.70 (↓0.21)", "0.80 (↓0.14)",
                    "0.82 (↓0.13)", "0.75 (↓0.17)", "0.86 (↓0.09)", "0.85 (↓0.10)",
                    "0.84 (↓0.10)", "0.74 (↓0.17)", "0.80 (↓0.15)", "0.82 (↓0.12)",
                    "0.58 (↓0.28)", "0.70 (↓0.18)", "0.63 (↓0.26)", "0.83 (↓0.13)",
                ],
            }
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            # Data for Table 2: Context Grounding Results
         | 
| 24 | 
            +
            context_grounding_data = {
                # One entry per evaluated model; every metric column below is
                # index-aligned with this list (24 entries each).
                "Model Name": [
                    "Gemini 2.0 Flash Exp",
                    "Gemini 1.5 Pro 002",
                    "OpenAI GPT-4o",
                    "OpenAI o1",
                    "OpenAI o3-mini",
                    "DeepSeek-R1-Distill-Llama-8B",
                    "DeepSeek-R1-Distill-Qwen-14B",
                    "DeepSeek-R1-Distill-Qwen-32B",
                    "DeepSeek-R1-Distill-Llama-70B",
                    "DeepSeek-R1",
                    "Meta-Llama-3.1-8B-Instruct",
                    "Meta-Llama-3.1-70B-Instruct",
                    "Meta-Llama-3.3-70B-Instruct",
                    "Qwen2.5-7B-Instruct",
                    "Qwen2.5-14B-Instruct",
                    "Qwen2.5-32B-Instruct",
                    "Qwen2.5-72B-Instruct",
                    "Qwen2.5-7B-Instruct-1M",
                    "Qwen2.5-14B-Instruct-1M",
                    "Nemotron-70B-Instruct-HF",
                    "Phi-3-mini-128k-Instruct",
                    "Phi-3-small-128k-Instruct",
                    "Phi-3-medium-128k-Instruct",
                    "Palmyra-Fin-128k-Instruct",
                ],
                # Numeric scores, eight models per source line.
                "Irrelevant Ctx": [
                    0.81, 0.74, 0.52, 0.56, 0.67, 0.32, 0.49, 0.54,
                    0.50, 0.51, 0.67, 0.46, 0.50, 0.75, 0.75, 0.89,
                    0.69, 0.63, 0.78, 0.52, 0.54, 0.37, 0.36, 0.95,
                ],
                "No Ctx": [
                    0.66, 0.64, 0.43, 0.55, 0.51, 0.27, 0.21, 0.24,
                    0.27, 0.22, 0.63, 0.37, 0.40, 0.64, 0.61, 0.68,
                    0.60, 0.58, 0.53, 0.48, 0.34, 0.26, 0.25, 0.66,
                ],
                "Ctx Grounding QA": [
                    0.77, 0.72, 0.50, 0.57, 0.63, 0.30, 0.36, 0.40,
                    0.41, 0.39, 0.70, 0.48, 0.47, 0.75, 0.70, 0.82,
                    0.68, 0.65, 0.69, 0.52, 0.47, 0.34, 0.33, 0.83,
                ],
                "Ctx Grounding TG": [
                    0.46, 0.52, 0.25, 0.45, 0.27, 0.25, 0.27, 0.35,
                    0.22, 0.20, 0.27, 0.37, 0.31, 0.31, 0.55, 0.55,
                    0.39, 0.29, 0.37, 0.39, 0.24, 0.10, 0.14, 0.65,
                ],
                "Ctx Grounding": [
                    0.74, 0.69, 0.47, 0.55, 0.59, 0.30, 0.35, 0.39,
                    0.38, 0.37, 0.65, 0.47, 0.45, 0.70, 0.68, 0.79,
                    0.64, 0.60, 0.65, 0.50, 0.44, 0.31, 0.30, 0.80,
                ],
                "Robustness": [
                    0.83, 0.84, 0.85, 0.81, 0.90, 0.64, 0.82, 0.86,
                    0.89, 0.80, 0.70, 0.80, 0.82, 0.75, 0.86, 0.85,
                    0.84, 0.74, 0.80, 0.82, 0.58, 0.70, 0.63, 0.83,
                ],
                "Compliance": [
                    0.76, 0.72, 0.52, 0.59, 0.63, 0.34, 0.40, 0.44,
                    0.43, 0.41, 0.66, 0.51, 0.49, 0.71, 0.71, 0.80,
                    0.67, 0.62, 0.68, 0.54, 0.46, 0.35, 0.34, 0.81,
                ],
            }
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            # Function to create the Gradio interface
         | 
| 44 | 
            +
            def create_leaderboard():
                """Build the two-tab leaderboard UI and return it without launching.

                Each module-level results table is converted to a pandas DataFrame
                and shown in its own tab of a ``gr.Blocks`` app, under a Markdown
                page heading.

                Returns:
                    The assembled ``gr.Blocks`` app; the caller is responsible for
                    calling ``launch()`` on it.
                """
                # (caption, source table) pairs in display order. The caption is
                # used both as the tab title and as the table label.
                sections = (
                    ("Robustness Results", robustness_data),
                    ("Context Grounding Results", context_grounding_data),
                )
                # Build the frames up front, before any UI component is created.
                frames = [(caption, pd.DataFrame(table)) for caption, table in sections]

                with gr.Blocks(title="Model Performance Leaderboard") as app:
                    gr.Markdown("# Model Performance Leaderboard")

                    for caption, frame in frames:
                        with gr.Tab(caption):
                            gr.DataFrame(value=frame, label=caption, wrap=True)

                return app
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            # Launch the Gradio app
         | 
| 62 | 
            +
            if __name__ == "__main__":
                # Only start the app when this file is executed directly, not when
                # it is imported. launch() is called without arguments, so Gradio's
                # default settings apply.
                demo = create_leaderboard()
                demo.launch()
         | 

