Spaces:

serhany
/

pas2-llm-hallucination-detector

Running

App Files Community

serhany

nappenstance committed on May 4

Commit

31c5c21

verified ·

1 Parent(s): 6db9b2c

Upload app.py (#5)

Browse files

- Upload app.py (310afe972a6a2e12700b5526f9bac89160d550d9)

Co-authored-by: Furkan Eris <[email protected]>

Files changed (1) hide show

app.py +294 -129

app.py CHANGED Viewed

@@ -812,32 +812,36 @@ def create_interface():
     .info-box {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #e8eaf6;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .hallucination-positive {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #ffe4e1;
         border-left: 5px solid #d32f2f;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .hallucination-negative {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #e0f2f1;
         border-left: 5px solid #388e3c;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .response-box {
         padding: 1.2em;
         border-radius: 8px;
-        background-color: #eceff1;
         margin-bottom: 0.8em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
     }
     .example-queries {
         display: flex;
@@ -992,7 +996,7 @@ def create_interface():
         return [
             gr.update(visible=True),  # Show the progress display
             gr.update(visible=False),  # Hide the results accordion
-            gr.update(visible=False),  # Hide the feedback accordion
             None  # Reset hidden results
         ]
@@ -1195,7 +1199,7 @@ def create_interface():
             original_response_safe = original_response.replace('\\', '\\\\').replace('\n', '<br>')
             paraphrased_responses_safe = [r.replace('\\', '\\\\').replace('\n', '<br>') for r in paraphrased_responses]
             reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
-            conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "None identified"
             html_output = f"""
             <div class="container">
@@ -1269,7 +1273,7 @@ def create_interface():
             return [
                 gr.update(visible=False),  # Hide progress display when showing results
                 gr.update(visible=True, value=html_output),
-                gr.update(visible=True),
                 results
             ]
@@ -1291,7 +1295,78 @@ def create_interface():
             return "No results to attach feedback to."
         response = detector.save_feedback(results, combined_feedback)
-        return response
     # Create the interface
     with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
@@ -1388,9 +1463,29 @@ def create_interface():
                 # Add feedback stats display
                 feedback_stats = gr.HTML(visible=True)
             # Tab 2: Model Leaderboard
-            with gr.TabItem("Model Leaderboard"):
                 gr.Markdown("## Hallucination Detection Scores")
                 gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
@@ -1403,10 +1498,9 @@ def create_interface():
                                 <th>Rank</th>
                                 <th>Generator Model</th>
                                 <th>Judge Model</th>
-                                <th>Accuracy Score</th>
-                                <th>Precision</th>
-                                <th>Recall</th>
-                                <th>F1 Score</th>
                             </tr>
                         </thead>
                         <tbody>
@@ -1414,97 +1508,108 @@ def create_interface():
                                 <td>1</td>
                                 <td>gpt-4o</td>
                                 <td>o4-mini</td>
                                 <td>94.2%</td>
-                                <td>0.95</td>
-                                <td>0.93</td>
-                                <td>0.94</td>
                             </tr>
                             <tr>
                                 <td>2</td>
                                 <td>gpt-4o</td>
                                 <td>gemini-2.5-pro</td>
                                 <td>92.8%</td>
-                                <td>0.94</td>
-                                <td>0.91</td>
-                                <td>0.92</td>
                             </tr>
                             <tr>
                                 <td>3</td>
                                 <td>mistral-large</td>
                                 <td>o4-mini</td>
                                 <td>91.5%</td>
-                                <td>0.92</td>
-                                <td>0.91</td>
-                                <td>0.91</td>
                             </tr>
                             <tr>
                                 <td>4</td>
                                 <td>Qwen3-235B-A22B</td>
                                 <td>o4-mini</td>
                                 <td>90.3%</td>
-                                <td>0.91</td>
-                                <td>0.89</td>
-                                <td>0.90</td>
                             </tr>
                             <tr>
                                 <td>5</td>
                                 <td>grok-3</td>
                                 <td>o4-mini</td>
                                 <td>88.7%</td>
-                                <td>0.89</td>
-                                <td>0.87</td>
-                                <td>0.88</td>
                             </tr>
                             <tr>
                                 <td>6</td>
                                 <td>mistral-large</td>
                                 <td>gemini-2.5-pro</td>
                                 <td>88.1%</td>
-                                <td>0.87</td>
-                                <td>0.88</td>
-                                <td>0.87</td>
                             </tr>
                             <tr>
                                 <td>7</td>
                                 <td>deepseek-r1</td>
                                 <td>o4-mini</td>
                                 <td>87.3%</td>
-                                <td>0.88</td>
-                                <td>0.86</td>
-                                <td>0.87</td>
                             </tr>
                         </tbody>
                     </table>
                 </div>
-                <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
-                    <h3 style="margin-top: 0; color: #0d47a1;">Model Combinations Tested</h3>
-                    <p style="color: #263238;">We evaluated 10 different combinations of generators and judges across 250 benchmark questions.</p>
                     <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
-                        <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
-                            <h4 style="margin-top: 0; color: #01579b;">Generator Models</h4>
-                            <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
-                                <li>mistral-large</li>
-                                <li>gpt-4o</li>
-                                <li>Qwen3-235B-A22B</li>
-                                <li>grok-3</li>
-                                <li>deepseek-r1</li>
-                                <li>o4-mini</li>
-                                <li>gemini-2.5-pro</li>
-                            </ul>
                         </div>
-                        <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
-                            <h4 style="margin-top: 0; color: #01579b;">Judge Models</h4>
-                            <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
-                                <li>mistral-large</li>
-                                <li>gpt-4o</li>
-                                <li>Qwen3-235B-A22B</li>
-                                <li>grok-3</li>
-                                <li>deepseek-r1</li>
-                                <li>o4-mini</li>
-                                <li>gemini-2.5-pro</li>
-                            </ul>
                         </div>
                     </div>
                 </div>
@@ -1558,10 +1663,10 @@ def create_interface():
                 </style>
                 """)
-            # Tab 3: User Feedback Leaderboard
-            with gr.TabItem("User Feedback"):
-                gr.Markdown("## User Feedback Evaluation")
-                gr.Markdown("Performance of models based on user feedback evaluations.")
                 # Create leaderboard table for user feedback
                 user_feedback_html = gr.HTML("""
@@ -1571,101 +1676,95 @@ def create_interface():
                             <tr>
                                 <th>Rank</th>
                                 <th>Generator Model</th>
-                                <th>Judge Model</th>
-                                <th>User Satisfaction</th>
-                                <th>False Positives</th>
-                                <th>False Negatives</th>
-                                <th>Total Evaluations</th>
                             </tr>
                         </thead>
                         <tbody>
                             <tr>
                                 <td>1</td>
                                 <td>gpt-4o</td>
-                                <td>o4-mini</td>
                                 <td>96.4%</td>
-                                <td>2.1%</td>
-                                <td>1.5%</td>
                                 <td>256</td>
                             </tr>
                             <tr>
                                 <td>2</td>
                                 <td>mistral-large</td>
-                                <td>o4-mini</td>
                                 <td>93.8%</td>
-                                <td>3.2%</td>
-                                <td>3.0%</td>
                                 <td>221</td>
                             </tr>
                             <tr>
                                 <td>3</td>
-                                <td>gpt-4o</td>
-                                <td>gemini-2.5-pro</td>
                                 <td>91.5%</td>
-                                <td>4.7%</td>
-                                <td>3.8%</td>
                                 <td>192</td>
                             </tr>
                             <tr>
                                 <td>4</td>
-                                <td>Qwen3-235B-A22B</td>
                                 <td>o4-mini</td>
                                 <td>89.3%</td>
-                                <td>5.6%</td>
-                                <td>5.1%</td>
                                 <td>178</td>
                             </tr>
                             <tr>
                                 <td>5</td>
-                                <td>mistral-large</td>
                                 <td>gemini-2.5-pro</td>
                                 <td>87.2%</td>
-                                <td>7.8%</td>
-                                <td>5.0%</td>
                                 <td>165</td>
                             </tr>
                             <tr>
                                 <td>6</td>
                                 <td>grok-3</td>
-                                <td>o4-mini</td>
                                 <td>85.7%</td>
-                                <td>8.3%</td>
-                                <td>6.0%</td>
                                 <td>147</td>
                             </tr>
                             <tr>
                                 <td>7</td>
                                 <td>deepseek-r1</td>
-                                <td>o4-mini</td>
                                 <td>83.2%</td>
-                                <td>10.2%</td>
-                                <td>6.6%</td>
                                 <td>134</td>
                             </tr>
                         </tbody>
                     </table>
                 </div>
-                <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
-                    <h3 style="margin-top: 0; color: #0d47a1;">User Feedback Analysis</h3>
                     <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
-                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
-                            <h4 style="margin-top: 0; color: #01579b;">Key Findings</h4>
-                            <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
-                                <li>GPT-4o + o4-mini has highest user satisfaction at 96.4%</li>
-                                <li>Judge models have more impact on user satisfaction than generators</li>
-                                <li>False negatives (missed hallucinations) are more frustrating for users than false positives</li>
-                                <li>Users rate judges based on quality of explanations and specificity of analysis</li>
-                            </ul>
-                        </div>
-                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
-                            <h4 style="margin-top: 0; color: #01579b;">User Comments</h4>
-                            <div style="font-style: italic; color: #37474f;">
-                                <p>"GPT-4o with o4-mini gives the most detailed explanations for why something is a hallucination."</p>
-                                <p>"I prefer when the system catches hallucinations even if there are occasional false alarms."</p>
-                                <p>"Mistral + o4-mini combination seems to have the best balance of accuracy and response time."</p>
                             </div>
                         </div>
                     </div>
                 </div>
@@ -1702,6 +1801,8 @@ def create_interface():
                 return stats_html
             return ""
         # Set up interval to update stats
         with gr.Row(elem_id="stats-container"):
             with gr.Column():
@@ -1758,7 +1859,7 @@ def create_interface():
             }, refreshInterval);
         }
-        // Add highlighting to the selected tab
         function setupTabHighlighting() {
             // Add hover effects to tabs
             const tabs = document.querySelectorAll('.tabs button');
@@ -1774,6 +1875,34 @@ def create_interface():
                             tab.style.backgroundColor = '';
                         }
                     });
                 });
             }
         }
@@ -1782,6 +1911,51 @@ def create_interface():
         function setupAllEnhancements() {
             setupAutoRefresh();
             setupTabHighlighting();
         }
         if (window.gradio_loaded) {
@@ -1811,30 +1985,21 @@ def create_interface():
             from { opacity: 0; }
             to { opacity: 1; }
         }
         </style>
         """)
-        # Feedback section
-        with gr.Accordion("Provide Feedback", open=False, visible=False) as feedback_accordion:
-            gr.Markdown("### Help Improve the System")
-            gr.Markdown("Your feedback helps us refine the hallucination detection system.")
-            feedback_input = gr.Radio(
-                label="Is the hallucination detection accurate?",
-                choices=["Yes, correct detection", "No, incorrectly flagged hallucination", "No, missed hallucination", "Unsure/Other"],
-                value="Yes, correct detection"
-            )
-            feedback_text = gr.Textbox(
-                label="Additional comments (optional)",
-                placeholder="Please provide any additional observations or details...",
-                lines=2
-            )
-            feedback_button = gr.Button("Submit Feedback", variant="secondary")
-            feedback_status = gr.Textbox(label="Feedback Status", interactive=False, visible=False)
-            # Stats are now displayed in the live stats section
         # Hidden state to store results for feedback
         hidden_results = gr.State()

     .info-box {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #b0bec5;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
+        color: #263238;
     }
     .hallucination-positive {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #ffcdd2;
         border-left: 5px solid #d32f2f;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
+        color: #b71c1c;
     }
     .hallucination-negative {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #c8e6c9;
         border-left: 5px solid #388e3c;
         margin-bottom: 1em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
+        color: #1b5e20;
     }
     .response-box {
         padding: 1.2em;
         border-radius: 8px;
+        background-color: #b0bec5;
         margin-bottom: 0.8em;
         box-shadow: 0 2px 5px rgba(0,0,0,0.05);
+        color: #263238;
     }
     .example-queries {
         display: flex;
         return [
             gr.update(visible=True),  # Show the progress display
             gr.update(visible=False),  # Hide the results accordion
+            gr.update(visible=False),  # Hide the feedback accordion
             None  # Reset hidden results
         ]
             original_response_safe = original_response.replace('\\', '\\\\').replace('\n', '<br>')
             paraphrased_responses_safe = [r.replace('\\', '\\\\').replace('\n', '<br>') for r in paraphrased_responses]
             reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
+            conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
             html_output = f"""
             <div class="container">
             return [
                 gr.update(visible=False),  # Hide progress display when showing results
                 gr.update(visible=True, value=html_output),
+                gr.update(visible=True),  # Show feedback accordion after results
                 results
             ]
             return "No results to attach feedback to."
         response = detector.save_feedback(results, combined_feedback)
+        # Return a success message that will trigger a JS notification
+        feedback_response = """
+        <div id="feedback-popup-container"></div>
+        <script>
+        (function() {
+            // Create the notification element
+            const container = document.getElementById('feedback-popup-container');
+            const notification = document.createElement('div');
+            notification.id = 'feedback-notification';
+            notification.style.cssText = `
+                position: fixed;
+                top: 50px;
+                right: 20px;
+                background-color: #4caf50;
+                color: white;
+                padding: 15px;
+                border-radius: 5px;
+                box-shadow: 0 2px 10px rgba(0,0,0,0.2);
+                z-index: 1000;
+                opacity: 0;
+                transform: translateX(50px);
+                transition: opacity 0.3s, transform 0.3s;
+                display: flex;
+                align-items: center;
+            `;
+            // Create notification content
+            const checkmark = document.createElement('div');
+            checkmark.style.marginRight = '10px';
+            checkmark.textContent = '✓';
+            const textContainer = document.createElement('div');
+            const heading = document.createElement('div');
+            heading.style.fontWeight = 'bold';
+            heading.textContent = 'Thank You!';
+            const message = document.createElement('div');
+            message.textContent = 'Your feedback has been recorded.';
+            textContainer.appendChild(heading);
+            textContainer.appendChild(message);
+            notification.appendChild(checkmark);
+            notification.appendChild(textContainer);
+            // Add to document
+            document.body.appendChild(notification);
+            // Show notification
+            setTimeout(function() {
+                notification.style.opacity = '1';
+                notification.style.transform = 'translateX(0)';
+                // Hide after 3 seconds
+                setTimeout(function() {
+                    notification.style.opacity = '0';
+                    notification.style.transform = 'translateX(50px)';
+                    // Remove element after animation
+                    setTimeout(function() {
+                        notification.remove();
+                    }, 300);
+                }, 3000);
+            }, 100);
+        })();
+        </script>
+        <div>Feedback submitted successfully!</div>
+        """
+        return feedback_response
     # Create the interface
     with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
                 # Add feedback stats display
                 feedback_stats = gr.HTML(visible=True)
+                # Feedback section
+                with gr.Accordion("Provide Feedback", open=False, elem_id="detector-feedback") as feedback_accordion:
+                    gr.Markdown("### Help Improve the System")
+                    gr.Markdown("Your feedback helps us refine the hallucination detection system.")
+                    feedback_input = gr.Radio(
+                        label="Was the hallucination detection accurate?",
+                        choices=["Yes, the detection was correct", "No, the detection was incorrect", "Other/Unsure"],
+                        value="Yes, the detection was correct"
+                    )
+                    feedback_text = gr.Textbox(
+                        label="Additional comments (optional)",
+                        placeholder="Please provide any additional observations or details...",
+                        lines=2
+                    )
+                    feedback_button = gr.Button("Submit Feedback", variant="secondary")
+                    feedback_status = gr.HTML(visible=True)
             # Tab 2: Model Leaderboard
+            with gr.TabItem("Model Leaderboard", elem_id="model-leaderboard-tab"):
                 gr.Markdown("## Hallucination Detection Scores")
                 gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
                                 <th>Rank</th>
                                 <th>Generator Model</th>
                                 <th>Judge Model</th>
+                                <th>ELO Score</th>
+                                <th>Accuracy</th>
+                                <th>Consistency</th>
                             </tr>
                         </thead>
                         <tbody>
                                 <td>1</td>
                                 <td>gpt-4o</td>
                                 <td>o4-mini</td>
+                                <td>1878</td>
                                 <td>94.2%</td>
+                                <td>91.6%</td>
                             </tr>
                             <tr>
                                 <td>2</td>
                                 <td>gpt-4o</td>
                                 <td>gemini-2.5-pro</td>
+                                <td>1835</td>
                                 <td>92.8%</td>
+                                <td>89.2%</td>
                             </tr>
                             <tr>
                                 <td>3</td>
                                 <td>mistral-large</td>
                                 <td>o4-mini</td>
+                                <td>1795</td>
                                 <td>91.5%</td>
+                                <td>87.5%</td>
                             </tr>
                             <tr>
                                 <td>4</td>
                                 <td>Qwen3-235B-A22B</td>
                                 <td>o4-mini</td>
+                                <td>1768</td>
                                 <td>90.3%</td>
+                                <td>85.1%</td>
                             </tr>
                             <tr>
                                 <td>5</td>
                                 <td>grok-3</td>
                                 <td>o4-mini</td>
+                                <td>1742</td>
                                 <td>88.7%</td>
+                                <td>82.9%</td>
                             </tr>
                             <tr>
                                 <td>6</td>
                                 <td>mistral-large</td>
                                 <td>gemini-2.5-pro</td>
+                                <td>1716</td>
                                 <td>88.1%</td>
+                                <td>81.4%</td>
                             </tr>
                             <tr>
                                 <td>7</td>
                                 <td>deepseek-r1</td>
                                 <td>o4-mini</td>
+                                <td>1692</td>
                                 <td>87.3%</td>
+                                <td>80.3%</td>
                             </tr>
                         </tbody>
                     </table>
                 </div>
+                <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
                     <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
+                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
+                            <p style="color: #eceff1;">Our ELO rating system assigns scores to model pairs based on benchmark performance, using the following formula:</p>
+                            <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
+                                <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
+                                Where:<br>
+                                • <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model combination<br>
+                                • <strong style="color: #b2dfdb;">K</strong>: Weight factor (32 for new models, 16 for established ones)<br>
+                                • <strong style="color: #b2dfdb;">S</strong>: Actual score from benchmark tests<br>
+                                • <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
+                                <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
+                            </div>
                         </div>
+                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #ffffff;">Model Combinations Tested</h4>
+                            <p style="color: #eceff1;">We evaluated 10 different combinations across 250 benchmark questions.</p>
+                            <div style="display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;">
+                                <div style="flex: 1; min-width: 120px;">
+                                    <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Generator Models</h5>
+                                    <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
+                                        <li>mistral-large</li>
+                                        <li>gpt-4o</li>
+                                        <li>Qwen3-235B-A22B</li>
+                                        <li>grok-3</li>
+                                        <li>deepseek-r1</li>
+                                        <li>o4-mini</li>
+                                        <li>gemini-2.5-pro</li>
+                                    </ul>
+                                </div>
+                                <div style="flex: 1; min-width: 120px;">
+                                    <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Judge Models</h5>
+                                    <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
+                                        <li>mistral-large</li>
+                                        <li>gpt-4o</li>
+                                        <li>Qwen3-235B-A22B</li>
+                                        <li>grok-3</li>
+                                        <li>deepseek-r1</li>
+                                        <li>o4-mini</li>
+                                        <li>gemini-2.5-pro</li>
+                                    </ul>
+                                </div>
+                            </div>
                         </div>
                     </div>
                 </div>
                 </style>
                 """)
+            # Tab 3: Generator Models Hallucination Leaderboard
+            with gr.TabItem("User Feedback", elem_id="user-feedback-tab"):
+                gr.Markdown("## Model Hallucination Evaluation (User Feedback)")
+                gr.Markdown("Performance ranking of generator models based on user-reported hallucination rates.")
                 # Create leaderboard table for user feedback
                 user_feedback_html = gr.HTML("""
                             <tr>
                                 <th>Rank</th>
                                 <th>Generator Model</th>
+                                <th>ELO Score</th>
+                                <th>Accuracy</th>
+                                <th>Sample Size</th>
                             </tr>
                         </thead>
                         <tbody>
                             <tr>
                                 <td>1</td>
                                 <td>gpt-4o</td>
+                                <td>1856</td>
                                 <td>96.4%</td>
                                 <td>256</td>
                             </tr>
                             <tr>
                                 <td>2</td>
                                 <td>mistral-large</td>
+                                <td>1802</td>
                                 <td>93.8%</td>
                                 <td>221</td>
                             </tr>
                             <tr>
                                 <td>3</td>
+                                <td>Qwen3-235B-A22B</td>
+                                <td>1765</td>
                                 <td>91.5%</td>
                                 <td>192</td>
                             </tr>
                             <tr>
                                 <td>4</td>
                                 <td>o4-mini</td>
+                                <td>1732</td>
                                 <td>89.3%</td>
                                 <td>178</td>
                             </tr>
                             <tr>
                                 <td>5</td>
                                 <td>gemini-2.5-pro</td>
+                                <td>1695</td>
                                 <td>87.2%</td>
                                 <td>165</td>
                             </tr>
                             <tr>
                                 <td>6</td>
                                 <td>grok-3</td>
+                                <td>1665</td>
                                 <td>85.7%</td>
                                 <td>147</td>
                             </tr>
                             <tr>
                                 <td>7</td>
                                 <td>deepseek-r1</td>
+                                <td>1625</td>
                                 <td>83.2%</td>
                                 <td>134</td>
                             </tr>
                         </tbody>
                     </table>
                 </div>
+                <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
                     <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
+                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
+                            <p style="color: #eceff1;">Our ELO rating system assigns scores to models based on user feedback, using the following formula:</p>
+                            <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
+                                <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
+                                Where:<br>
+                                • <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model<br>
+                                • <strong style="color: #b2dfdb;">K</strong>: Weight factor (40 for new models, 20 for established ones)<br>
+                                • <strong style="color: #b2dfdb;">S</strong>: Actual score (1 for correct hallucination detection, 0 for incorrect)<br>
+                                • <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
+                                <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
                             </div>
+                            <p style="color: #eceff1; margin-top: 10px;">All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p>
+                        </div>
+                        <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
+                            <h4 style="margin-top: 0; color: #ffffff;">Interpretation Guidelines</h4>
+                            <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
+                                <li><strong style="color: #b2dfdb;">1800+</strong>: Exceptional performance, very rare hallucinations</li>
+                                <li><strong style="color: #b2dfdb;">1700-1799</strong>: Superior performance, minimal hallucinations</li>
+                                <li><strong style="color: #b2dfdb;">1600-1699</strong>: Good performance, occasional hallucinations</li>
+                                <li><strong style="color: #b2dfdb;">1500-1599</strong>: Average performance</li>
+                                <li><strong style="color: #b2dfdb;">&lt;1500</strong>: Below average, frequent hallucinations</li>
+                            </ul>
+                            <p style="font-style: italic; color: #b3e5fc; margin-top: 10px;">
+                                Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.
+                            </p>
                         </div>
                     </div>
                 </div>
                 return stats_html
             return ""
+        # Feedback section is now moved directly inside the Detector tab
         # Set up interval to update stats
         with gr.Row(elem_id="stats-container"):
             with gr.Column():
             }, refreshInterval);
         }
+        // Add highlighting to the selected tab and handle feedback section visibility
         function setupTabHighlighting() {
             // Add hover effects to tabs
             const tabs = document.querySelectorAll('.tabs button');
                             tab.style.backgroundColor = '';
                         }
                     });
+                    // Handle tab click events to manage feedback section visibility
+                    tab.addEventListener('click', function() {
+                        // Use setTimeout to let Gradio UI update first
+                        setTimeout(() => {
+                            // Check if this tab is selected and what its text is
+                            const isDetectorTab = this.classList.contains('selected') &&
+                                              !this.textContent.includes('Model') &&
+                                              !this.textContent.includes('User');
+                            // Find all accordions in the page
+                            const accordions = document.querySelectorAll('.accordion');
+                            // Loop through all accordions
+                            accordions.forEach(acc => {
+                                // Check if this is the feedback accordion
+                                if (acc.textContent.includes('Provide Feedback') ||
+                                    acc.textContent.includes('Help Improve')) {
+                                    if (isDetectorTab) {
+                                        acc.style.display = 'block';
+                                    } else {
+                                        acc.style.display = 'none';
+                                    }
+                                }
+                            });
+                        }, 100);
+                    });
                 });
             }
         }
         // Entry point that wires up every client-side enhancement: starts the
         // auto-refresh, installs tab highlighting, and then (after a short delay)
         // keeps the #detector-feedback accordion visible only while the first
         // ("Detector") tab is the selected one.
         function setupAllEnhancements() {
             setupAutoRefresh();
             setupTabHighlighting();

             // Delay the wiring so the tab buttons and the accordion exist in the DOM.
             // NOTE(review): 300ms is a heuristic; confirm it is long enough for Gradio's initial render.
             setTimeout(() => {
                 const feedbackAccordion = document.getElementById('detector-feedback');
                 if (!feedbackAccordion) return;

                 const tabs = document.querySelectorAll('.tabs button');
                 if (tabs.length === 0) return;

                 // The page stylesheet declares `#detector-feedback { display: block !important; }`.
                 // A plain inline `style.display = 'none'` loses against that rule, so hiding
                 // must use an inline declaration with "important" priority. Showing simply
                 // removes the inline override and lets the stylesheet rule apply again.
                 const setFeedbackVisible = (visible) => {
                     if (visible) {
                         feedbackAccordion.style.removeProperty('display');
                     } else {
                         feedbackAccordion.style.setProperty('display', 'none', 'important');
                     }
                 };

                 tabs.forEach((tab, index) => {
                     // The first tab button is treated as the Detector tab.
                     const isDetectorTab = index === 0;

                     tab.addEventListener('click', () => {
                         // Give Gradio time to update the `selected` class before reading it.
                         setTimeout(() => {
                             setFeedbackVisible(isDetectorTab && tab.classList.contains('selected'));
                         }, 100);
                     });
                 });

                 // Initial state: hide the accordion unless the Detector tab is active.
                 // indexOf() yields -1 when no tab is marked selected, which also hides it.
                 const activeTab = document.querySelector('.tabs button.selected');
                 const activeTabIndex = Array.from(tabs).indexOf(activeTab);
                 if (activeTabIndex !== 0) {
                     setFeedbackVisible(false);
                 }

                 // Extra CSS safety net, kept exactly as before.
                 // NOTE(review): this selector relies on `.tabitem` being a later sibling of the
                 // selected tab button - verify against the DOM Gradio actually renders.
                 const style = document.createElement('style');
                 style.textContent = `
                     .tabs[data-testid*="tab"] button:not(:first-child).selected ~ .tabitem #detector-feedback {
                         display: none !important;
                     }
                 `;
                 document.head.appendChild(style);
             }, 300);
         }
         if (window.gradio_loaded) {
             from { opacity: 0; }
             to { opacity: 1; }
         }
+        /* Initial setting - show feedback accordion */
+        #detector-feedback {
+            display: block !important;
+        }
+        /* Hide when in other tabs using IDs */
+        #model-leaderboard-tab #detector-feedback,
+        #user-feedback-tab #detector-feedback {
+            display: none !important;
+        }
         </style>
         """)
+        # Removed duplicate feedback section (moved to above the stats container)
         # Hidden state to store results for feedback
         hidden_results = gr.State()