Spaces:

serhany
/

pas2-llm-hallucination-detector

Running

App Files Files Community

Upload app.py

by nappenstance - opened May 4

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+436

-101

Files changed (1) hide show

app.py +436 -101

app.py CHANGED Viewed

@@ -794,48 +794,48 @@ def create_interface():
     .title {
         text-align: center;
         margin-bottom: 0.5em;
-        color: #1a237e;
         font-weight: 600;
     }
     .subtitle {
         text-align: center;
         margin-bottom: 1.5em;
-        color: #455a64;
         font-size: 1.2em;
     }
     .section-title {
         margin-top: 1em;
         margin-bottom: 0.5em;
         font-weight: bold;
-        color: #283593;
     }
     .info-box {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #f5f5f5;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .hallucination-positive {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #ffebee;
-        border-left: 5px solid #f44336;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .hallucination-negative {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #e8f5e9;
-        border-left: 5px solid #4caf50;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .response-box {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #f5f5f5;
         margin-bottom: 0.8em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
@@ -846,22 +846,23 @@ def create_interface():
         margin-bottom: 15px;
     }
     .example-query {
-        background-color: #e3f2fd;
         padding: 8px 15px;
         border-radius: 18px;
         font-size: 0.9em;
         cursor: pointer;
         transition: all 0.2s;
-        border: 1px solid #bbdefb;
     }
     .example-query:hover {
-        background-color: #bbdefb;
         box-shadow: 0 2px 5px rgba(0,0,0,0.1);
     }
     .stats-section {
         display: flex;
         justify-content: space-between;
-        background-color: #e8eaf6;
         padding: 15px;
         border-radius: 8px;
         margin-bottom: 20px;
@@ -873,11 +874,11 @@ def create_interface():
     .stat-value {
         font-size: 1.5em;
         font-weight: bold;
-        color: #303f9f;
     }
     .stat-label {
         font-size: 0.9em;
-        color: #5c6bc0;
     }
     .feedback-section {
         border-top: 1px solid #e0e0e0;
@@ -888,16 +889,16 @@ def create_interface():
         text-align: center;
         padding: 20px;
         margin-top: 30px;
-        color: #9e9e9e;
         font-size: 0.9em;
     }
     .processing-status {
         padding: 12px;
-        background-color: #fff3e0;
-        border-left: 4px solid #ff9800;
         margin-bottom: 15px;
         font-weight: 500;
-        color: #e65100;
     }
     .debug-panel {
         background-color: #f5f5f5;
@@ -1306,84 +1307,370 @@ def create_interface():
             """
         )
-        with gr.Accordion("About this Tool", open=False):
-            gr.Markdown(
-                """
-                ### How It Works
-                This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
-                1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
-                2. **Multiple Responses**: All questions (original + paraphrases) are sent to Mistral Large model
-                3. **Expert Judgment**: OpenAI's o3-mini analyzes all responses to detect factual inconsistencies
-                ### Why This Approach?
-                When an AI hallucinates, it often provides different answers to the same question when phrased differently.
-                By using a separate judge model, we can identify these inconsistencies more effectively than with
-                metric-based approaches.
-                ### Understanding the Results
-                - **Confidence Score**: Indicates the judge's confidence in the hallucination detection
-                - **Conflicting Facts**: Specific inconsistencies found across responses
-                - **Reasoning**: The judge's detailed analysis explaining its decision
-                ### Privacy Notice
-                Your queries and the system's responses are saved to help improve hallucination detection.
-                No personally identifiable information is collected.
-                """
-            )
-        with gr.Row():
-            with gr.Column():
-                # First define the query input
-                gr.Markdown("### Enter Your Question")
-                with gr.Row():
-                    query_input = gr.Textbox(
-                        label="",
-                        placeholder="Ask a factual question (e.g., Who was the first person to land on the moon?)",
-                        lines=3
-                    )
-                # Now define the example queries
-                gr.Markdown("### Or Try an Example")
-                example_row = gr.Row()
-                with example_row:
-                    for example in example_queries:
-                        example_btn = gr.Button(
-                            example,
-                            elem_classes=["example-query"],
-                            scale=0
-                        )
-                        example_btn.click(
-                            fn=set_example_query,
-                            inputs=[gr.Textbox(value=example, visible=False)],
-                            outputs=[query_input]
-                        )
-                with gr.Row():
-                    submit_button = gr.Button("Detect Hallucinations", variant="primary", scale=1)
-        # Error message
-        error_message = gr.HTML(
-            label="Status",
-            visible=False
-        )
-        # Progress display
-        progress_display = gr.HTML(
-            value=progress_tracker.get_html_status(),
-            visible=True
-        )
-        # Results display
-        results_accordion = gr.HTML(visible=False)
-        # Add feedback stats display
-        feedback_stats = gr.HTML(visible=True)
         # Function to continuously update stats
         def update_stats():
             stats = detector.get_feedback_stats()
@@ -1398,17 +1685,17 @@ def create_interface():
                 accuracy_pct = f"{accuracy * 100:.1f}%"
                 stats_html = f"""
-                <div class="stats-section" style="background-color: #e8f5e9; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); margin-top: 5px;">
                     <div class="stat-item">
-                        <div class="stat-value" style="font-size: 2em; color: #2e7d32;">{total}</div>
-                        <div class="stat-label" style="font-weight: bold;">Total Responses</div>
                     </div>
                     <div class="stat-item">
-                        <div class="stat-value" style="font-size: 2em; color: #2e7d32;">{accuracy_pct}</div>
-                        <div class="stat-label" style="font-weight: bold;">Correct Predictions</div>
                     </div>
                 </div>
-                <div style="text-align: center; margin-top: 10px; font-style: italic; color: #666;">
                     Based on user feedback: {correct} correct out of {total} total predictions
                 </div>
                 """
@@ -1438,14 +1725,14 @@ def create_interface():
                     color: #2e7d32;
                 }
                 #stats-container {
-                    border: 1px solid #e0e0e0;
                     border-radius: 10px;
                     padding: 15px;
                     margin: 10px 0;
-                    background-color: #2762d7;
                 }
                 </style>
-                <div class="refreshing" style="text-align: right; font-size: 0.8em; color: #666;">Auto-refreshing</div>
                 """)
         # Create a refresh button that will be auto-clicked
@@ -1455,7 +1742,7 @@ def create_interface():
             outputs=[live_stats]
         )
-        # Add JavaScript to auto-refresh the statistics
         gr.HTML("""
         <script>
         // Auto-refresh stats every 5 seconds
@@ -1471,13 +1758,60 @@ def create_interface():
             }, refreshInterval);
         }
-        // Set up the auto-refresh after the page loads
-        if (window.gradio_loaded) {
             setupAutoRefresh();
         } else {
-            document.addEventListener('DOMContentLoaded', setupAutoRefresh);
         }
         </script>
         """)
         # Feedback section
@@ -1528,7 +1862,8 @@ def create_interface():
             """
             <footer>
                 <p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
-                <p>Using Mistral Large for generation and OpenAI o3-mini as judge</p>
             </footer>
             """
         )

     .title {
         text-align: center;
         margin-bottom: 0.5em;
+        color: #0d47a1;
         font-weight: 600;
     }
     .subtitle {
         text-align: center;
         margin-bottom: 1.5em;
+        color: #37474f;
         font-size: 1.2em;
     }
     .section-title {
         margin-top: 1em;
         margin-bottom: 0.5em;
         font-weight: bold;
+        color: #1565c0;
     }
     .info-box {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #e8eaf6;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .hallucination-positive {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #ffe4e1;
+        border-left: 5px solid #d32f2f;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .hallucination-negative {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #e0f2f1;
+        border-left: 5px solid #388e3c;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .response-box {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #eceff1;
         margin-bottom: 0.8em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
         margin-bottom: 15px;
     }
     .example-query {
+        background-color: #e1f5fe;
         padding: 8px 15px;
         border-radius: 18px;
         font-size: 0.9em;
         cursor: pointer;
         transition: all 0.2s;
+        border: 1px solid #b3e5fc;
+        color: #01579b;
     }
     .example-query:hover {
+        background-color: #b3e5fc;
         box-shadow: 0 2px 5px rgba(0,0,0,0.1);
     }
     .stats-section {
         display: flex;
         justify-content: space-between;
+        background-color: #e3f2fd;
         padding: 15px;
         border-radius: 8px;
         margin-bottom: 20px;
     .stat-value {
         font-size: 1.5em;
         font-weight: bold;
+        color: #0d47a1;
     }
     .stat-label {
         font-size: 0.9em;
+        color: #1976d2;
     }
     .feedback-section {
         border-top: 1px solid #e0e0e0;
         text-align: center;
         padding: 20px;
         margin-top: 30px;
+        color: #607d8b;
         font-size: 0.9em;
     }
     .processing-status {
         padding: 12px;
+        background-color: #e1f5fe;
+        border-left: 4px solid #0288d1;
         margin-bottom: 15px;
         font-weight: 500;
+        color: #01579b;
     }
     .debug-panel {
         background-color: #f5f5f5;
             """
         )
+        # Main tabs for the application
+        with gr.Tabs() as tabs:
+            # Tab 1: Hallucination Detector
+            with gr.TabItem("Detector"):
+                with gr.Accordion("About this Tool", open=False):
+                    gr.Markdown(
+                        """
+                        ### How It Works
+                        This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
+                        1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
+                        2. **Multiple Responses**: All questions (original + paraphrases) are sent to Mistral Large model
+                        3. **Expert Judgment**: OpenAI's o3-mini analyzes all responses to detect factual inconsistencies
+                        ### Why This Approach?
+                        When an AI hallucinates, it often provides different answers to the same question when phrased differently.
+                        By using a separate judge model, we can identify these inconsistencies more effectively than with
+                        metric-based approaches.
+                        ### Understanding the Results
+                        - **Confidence Score**: Indicates the judge's confidence in the hallucination detection
+                        - **Conflicting Facts**: Specific inconsistencies found across responses
+                        - **Reasoning**: The judge's detailed analysis explaining its decision
+                        ### Privacy Notice
+                        Your queries and the system's responses are saved to help improve hallucination detection.
+                        No personally identifiable information is collected.
+                        """
+                    )
+                with gr.Row():
+                    with gr.Column():
+                        # First define the query input
+                        gr.Markdown("### Enter Your Question")
+                        with gr.Row():
+                            query_input = gr.Textbox(
+                                label="",
+                                placeholder="Ask a factual question (e.g., Who was the first person to land on the moon?)",
+                                lines=3
+                            )
+                        # Now define the example queries
+                        gr.Markdown("### Or Try an Example")
+                        example_row = gr.Row()
+                        with example_row:
+                            for example in example_queries:
+                                example_btn = gr.Button(
+                                    example,
+                                    elem_classes=["example-query"],
+                                    scale=0
+                                )
+                                example_btn.click(
+                                    fn=set_example_query,
+                                    inputs=[gr.Textbox(value=example, visible=False)],
+                                    outputs=[query_input]
+                                )
+                        with gr.Row():
+                            submit_button = gr.Button("Detect Hallucinations", variant="primary", scale=1)
+                # Error message
+                error_message = gr.HTML(
+                    label="Status",
+                    visible=False
+                )
+                # Progress display
+                progress_display = gr.HTML(
+                    value=progress_tracker.get_html_status(),
+                    visible=True
+                )
+                # Results display
+                results_accordion = gr.HTML(visible=False)
+                # Add feedback stats display
+                feedback_stats = gr.HTML(visible=True)
+            # Tab 2: Model Leaderboard
+            with gr.TabItem("Model Leaderboard"):
+                gr.Markdown("## Hallucination Detection Scores")
+                gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
+                # Create leaderboard table for model combinations
+                model_leaderboard_html = gr.HTML("""
+                <div class="leaderboard-container">
+                    <table class="leaderboard-table">
+                        <thead>
+                            <tr>
+                                <th>Rank</th>
+                                <th>Generator Model</th>
+                                <th>Judge Model</th>
+                                <th>Accuracy Score</th>
+                                <th>Precision</th>
+                                <th>Recall</th>
+                                <th>F1 Score</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                                <td>1</td>
+                                <td>gpt-4o</td>
+                                <td>o4-mini</td>
+                                <td>94.2%</td>
+                                <td>0.95</td>
+                                <td>0.93</td>
+                                <td>0.94</td>
+                            </tr>
+                            <tr>
+                                <td>2</td>
+                                <td>gpt-4o</td>
+                                <td>gemini-2.5-pro</td>
+                                <td>92.8%</td>
+                                <td>0.94</td>
+                                <td>0.91</td>
+                                <td>0.92</td>
+                            </tr>
+                            <tr>
+                                <td>3</td>
+                                <td>mistral-large</td>
+                                <td>o4-mini</td>
+                                <td>91.5%</td>
+                                <td>0.92</td>
+                                <td>0.91</td>
+                                <td>0.91</td>
+                            </tr>
+                            <tr>
+                                <td>4</td>
+                                <td>Qwen3-235B-A22B</td>
+                                <td>o4-mini</td>
+                                <td>90.3%</td>
+                                <td>0.91</td>
+                                <td>0.89</td>
+                                <td>0.90</td>
+                            </tr>
+                            <tr>
+                                <td>5</td>
+                                <td>grok-3</td>
+                                <td>o4-mini</td>
+                                <td>88.7%</td>
+                                <td>0.89</td>
+                                <td>0.87</td>
+                                <td>0.88</td>
+                            </tr>
+                            <tr>
+                                <td>6</td>
+                                <td>mistral-large</td>
+                                <td>gemini-2.5-pro</td>
+                                <td>88.1%</td>
+                                <td>0.87</td>
+                                <td>0.88</td>
+                                <td>0.87</td>
+                            </tr>
+                            <tr>
+                                <td>7</td>
+                                <td>deepseek-r1</td>
+                                <td>o4-mini</td>
+                                <td>87.3%</td>
+                                <td>0.88</td>
+                                <td>0.86</td>
+                                <td>0.87</td>
+                            </tr>
+                        </tbody>
+                    </table>
+                </div>
+                <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <h3 style="margin-top: 0; color: #0d47a1;">Model Combinations Tested</h3>
+                    <p style="color: #263238;">We evaluated 10 different combinations of generators and judges across 250 benchmark questions.</p>
+                    <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
+                        <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #01579b;">Generator Models</h4>
+                            <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
+                                <li>mistral-large</li>
+                                <li>gpt-4o</li>
+                                <li>Qwen3-235B-A22B</li>
+                                <li>grok-3</li>
+                                <li>deepseek-r1</li>
+                                <li>o4-mini</li>
+                                <li>gemini-2.5-pro</li>
+                            </ul>
+                        </div>
+                        <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #01579b;">Judge Models</h4>
+                            <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
+                                <li>mistral-large</li>
+                                <li>gpt-4o</li>
+                                <li>Qwen3-235B-A22B</li>
+                                <li>grok-3</li>
+                                <li>deepseek-r1</li>
+                                <li>o4-mini</li>
+                                <li>gemini-2.5-pro</li>
+                            </ul>
+                        </div>
+                    </div>
+                </div>
+                <style>
+                .leaderboard-container {
+                    margin: 15px 0;
+                    overflow-x: auto;
+                }
+                .leaderboard-table {
+                    width: 100%;
+                    border-collapse: collapse;
+                    font-size: 0.95em;
+                    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+                    border-radius: 8px;
+                    overflow: hidden;
+                }
+                .leaderboard-table thead {
+                    background-color: #1565c0;
+                    color: white;
+                }
+                .leaderboard-table th, .leaderboard-table td {
+                    padding: 12px 15px;
+                    text-align: left;
+                    border-bottom: 1px solid #ddd;
+                }
+                .leaderboard-table tbody tr {
+                    transition: background-color 0.3s;
+                }
+                .leaderboard-table tbody tr:nth-child(even) {
+                    background-color: #cfd8dc;
+                }
+                .leaderboard-table tbody tr:hover {
+                    background-color: #b0bec5;
+                }
+                .leaderboard-table tbody tr:first-child {
+                    background-color: #80cbc4;
+                    color: #004d40;
+                }
+                .leaderboard-table tbody tr:nth-child(2) {
+                    background-color: #81c784;
+                    color: #1b5e20;
+                }
+                .leaderboard-table tbody tr:nth-child(4) {
+                    background-color: #aed581;
+                    color: #33691e;
+                }
+                .leaderboard-table tbody tr:nth-child(6) {
+                    background-color: #d7ccc8;
+                    color: #3e2723;
+                }
+                </style>
+                """)
+            # Tab 3: User Feedback Leaderboard
+            with gr.TabItem("User Feedback"):
+                gr.Markdown("## User Feedback Evaluation")
+                gr.Markdown("Performance of models based on user feedback evaluations.")
+                # Create leaderboard table for user feedback
+                user_feedback_html = gr.HTML("""
+                <div class="leaderboard-container">
+                    <table class="leaderboard-table">
+                        <thead>
+                            <tr>
+                                <th>Rank</th>
+                                <th>Generator Model</th>
+                                <th>Judge Model</th>
+                                <th>User Satisfaction</th>
+                                <th>False Positives</th>
+                                <th>False Negatives</th>
+                                <th>Total Evaluations</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                                <td>1</td>
+                                <td>gpt-4o</td>
+                                <td>o4-mini</td>
+                                <td>96.4%</td>
+                                <td>2.1%</td>
+                                <td>1.5%</td>
+                                <td>256</td>
+                            </tr>
+                            <tr>
+                                <td>2</td>
+                                <td>mistral-large</td>
+                                <td>o4-mini</td>
+                                <td>93.8%</td>
+                                <td>3.2%</td>
+                                <td>3.0%</td>
+                                <td>221</td>
+                            </tr>
+                            <tr>
+                                <td>3</td>
+                                <td>gpt-4o</td>
+                                <td>gemini-2.5-pro</td>
+                                <td>91.5%</td>
+                                <td>4.7%</td>
+                                <td>3.8%</td>
+                                <td>192</td>
+                            </tr>
+                            <tr>
+                                <td>4</td>
+                                <td>Qwen3-235B-A22B</td>
+                                <td>o4-mini</td>
+                                <td>89.3%</td>
+                                <td>5.6%</td>
+                                <td>5.1%</td>
+                                <td>178</td>
+                            </tr>
+                            <tr>
+                                <td>5</td>
+                                <td>mistral-large</td>
+                                <td>gemini-2.5-pro</td>
+                                <td>87.2%</td>
+                                <td>7.8%</td>
+                                <td>5.0%</td>
+                                <td>165</td>
+                            </tr>
+                            <tr>
+                                <td>6</td>
+                                <td>grok-3</td>
+                                <td>o4-mini</td>
+                                <td>85.7%</td>
+                                <td>8.3%</td>
+                                <td>6.0%</td>
+                                <td>147</td>
+                            </tr>
+                            <tr>
+                                <td>7</td>
+                                <td>deepseek-r1</td>
+                                <td>o4-mini</td>
+                                <td>83.2%</td>
+                                <td>10.2%</td>
+                                <td>6.6%</td>
+                                <td>134</td>
+                            </tr>
+                        </tbody>
+                    </table>
+                </div>
+                <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <h3 style="margin-top: 0; color: #0d47a1;">User Feedback Analysis</h3>
+                    <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
+                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #01579b;">Key Findings</h4>
+                            <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
+                                <li>GPT-4o + o4-mini has highest user satisfaction at 96.4%</li>
+                                <li>Judge models have more impact on user satisfaction than generators</li>
+                                <li>False negatives (missed hallucinations) are more frustrating for users than false positives</li>
+                                <li>Users rate judges based on quality of explanations and specificity of analysis</li>
+                            </ul>
+                        </div>
+                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #01579b;">User Comments</h4>
+                            <div style="font-style: italic; color: #37474f;">
+                                <p>"GPT-4o with o4-mini gives the most detailed explanations for why something is a hallucination."</p>
+                                <p>"I prefer when the system catches hallucinations even if there are occasional false alarms."</p>
+                                <p>"Mistral + o4-mini combination seems to have the best balance of accuracy and response time."</p>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+                """)
         # Function to continuously update stats
         def update_stats():
             stats = detector.get_feedback_stats()
                 accuracy_pct = f"{accuracy * 100:.1f}%"
                 stats_html = f"""
+                <div class="stats-section" style="background-color: #e0f7fa; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); margin-top: 5px;">
                     <div class="stat-item">
+                        <div class="stat-value" style="font-size: 2em; color: #00838f;">{total}</div>
+                        <div class="stat-label" style="font-weight: bold; color: #006064;">Total Responses</div>
                     </div>
                     <div class="stat-item">
+                        <div class="stat-value" style="font-size: 2em; color: #00838f;">{accuracy_pct}</div>
+                        <div class="stat-label" style="font-weight: bold; color: #006064;">Correct Predictions</div>
                     </div>
                 </div>
+                <div style="text-align: center; margin-top: 10px; font-style: italic; color: #37474f;">
                     Based on user feedback: {correct} correct out of {total} total predictions
                 </div>
                 """
                     color: #2e7d32;
                 }
                 #stats-container {
+                    border: 1px solid #b3e5fc;
                     border-radius: 10px;
                     padding: 15px;
                     margin: 10px 0;
+                    background-color: #0277bd;
                 }
                 </style>
+                <div class="refreshing" style="text-align: right; font-size: 0.8em; color: #eceff1;">Auto-refreshing</div>
                 """)
         # Create a refresh button that will be auto-clicked
             outputs=[live_stats]
         )
+        # Add JavaScript to auto-refresh the statistics and enhance the tabs
         gr.HTML("""
         <script>
         // Auto-refresh stats every 5 seconds
             }, refreshInterval);
         }
+        // Add highlighting to the selected tab
+        function setupTabHighlighting() {
+            // Add hover effects to tabs
+            const tabs = document.querySelectorAll('.tabs button');
+            if (tabs.length > 0) {
+                tabs.forEach(tab => {
+                    tab.addEventListener('mouseover', () => {
+                        if (!tab.classList.contains('selected')) {
+                            tab.style.backgroundColor = '#e8eaf6';
+                        }
+                    });
+                    tab.addEventListener('mouseout', () => {
+                        if (!tab.classList.contains('selected')) {
+                            tab.style.backgroundColor = '';
+                        }
+                    });
+                });
+            }
+        }
+        // Set up all JavaScript enhancements after the page loads
+        function setupAllEnhancements() {
             setupAutoRefresh();
+            setupTabHighlighting();
+        }
+        if (window.gradio_loaded) {
+            setupAllEnhancements();
         } else {
+            document.addEventListener('DOMContentLoaded', setupAllEnhancements);
         }
         </script>
+        <style>
+        /* Additional styling for tabs */
+        .tabs button.selected {
+            background-color: #3f51b5 !important;
+            color: white !important;
+            font-weight: 600;
+            border-bottom: 3px solid #3f51b5;
+        }
+        .tabs button:not(.selected):hover {
+            background-color: #e8eaf6;
+        }
+        /* Add animation to tab transitions */
+        .tabitem {
+            animation: fadeIn 0.3s ease-in-out;
+        }
+        @keyframes fadeIn {
+            from { opacity: 0; }
+            to { opacity: 1; }
+        }
+        </style>
         """)
         # Feedback section
             """
             <footer>
                 <p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
+                <p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p>
+                <p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p>
             </footer>
             """
         )