serhany nappenstance committed on
Commit
31c5c21
·
verified ·
1 Parent(s): 6db9b2c

Upload app.py (#5)

Browse files

- Upload app.py (310afe972a6a2e12700b5526f9bac89160d550d9)


Co-authored-by: Furkan Eris <[email protected]>

Files changed (1) hide show
  1. app.py +294 -129
app.py CHANGED
@@ -812,32 +812,36 @@ def create_interface():
812
  .info-box {
813
  padding: 1.2em;
814
  border-radius: 8px;
815
- background-color: #e8eaf6;
816
  margin-bottom: 1em;
817
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
 
818
  }
819
  .hallucination-positive {
820
  padding: 1.2em;
821
  border-radius: 8px;
822
- background-color: #ffe4e1;
823
  border-left: 5px solid #d32f2f;
824
  margin-bottom: 1em;
825
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
 
826
  }
827
  .hallucination-negative {
828
  padding: 1.2em;
829
  border-radius: 8px;
830
- background-color: #e0f2f1;
831
  border-left: 5px solid #388e3c;
832
  margin-bottom: 1em;
833
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
 
834
  }
835
  .response-box {
836
  padding: 1.2em;
837
  border-radius: 8px;
838
- background-color: #eceff1;
839
  margin-bottom: 0.8em;
840
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
 
841
  }
842
  .example-queries {
843
  display: flex;
@@ -992,7 +996,7 @@ def create_interface():
992
  return [
993
  gr.update(visible=True), # Show the progress display
994
  gr.update(visible=False), # Hide the results accordion
995
- gr.update(visible=False), # Hide the feedback accordion
996
  None # Reset hidden results
997
  ]
998
 
@@ -1195,7 +1199,7 @@ def create_interface():
1195
  original_response_safe = original_response.replace('\\', '\\\\').replace('\n', '<br>')
1196
  paraphrased_responses_safe = [r.replace('\\', '\\\\').replace('\n', '<br>') for r in paraphrased_responses]
1197
  reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
1198
- conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "None identified"
1199
 
1200
  html_output = f"""
1201
  <div class="container">
@@ -1269,7 +1273,7 @@ def create_interface():
1269
  return [
1270
  gr.update(visible=False), # Hide progress display when showing results
1271
  gr.update(visible=True, value=html_output),
1272
- gr.update(visible=True),
1273
  results
1274
  ]
1275
 
@@ -1291,7 +1295,78 @@ def create_interface():
1291
  return "No results to attach feedback to."
1292
 
1293
  response = detector.save_feedback(results, combined_feedback)
1294
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1295
 
1296
  # Create the interface
1297
  with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
@@ -1388,9 +1463,29 @@ def create_interface():
1388
 
1389
  # Add feedback stats display
1390
  feedback_stats = gr.HTML(visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1391
 
1392
  # Tab 2: Model Leaderboard
1393
- with gr.TabItem("Model Leaderboard"):
1394
  gr.Markdown("## Hallucination Detection Scores")
1395
  gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
1396
 
@@ -1403,10 +1498,9 @@ def create_interface():
1403
  <th>Rank</th>
1404
  <th>Generator Model</th>
1405
  <th>Judge Model</th>
1406
- <th>Accuracy Score</th>
1407
- <th>Precision</th>
1408
- <th>Recall</th>
1409
- <th>F1 Score</th>
1410
  </tr>
1411
  </thead>
1412
  <tbody>
@@ -1414,97 +1508,108 @@ def create_interface():
1414
  <td>1</td>
1415
  <td>gpt-4o</td>
1416
  <td>o4-mini</td>
 
1417
  <td>94.2%</td>
1418
- <td>0.95</td>
1419
- <td>0.93</td>
1420
- <td>0.94</td>
1421
  </tr>
1422
  <tr>
1423
  <td>2</td>
1424
  <td>gpt-4o</td>
1425
  <td>gemini-2.5-pro</td>
 
1426
  <td>92.8%</td>
1427
- <td>0.94</td>
1428
- <td>0.91</td>
1429
- <td>0.92</td>
1430
  </tr>
1431
  <tr>
1432
  <td>3</td>
1433
  <td>mistral-large</td>
1434
  <td>o4-mini</td>
 
1435
  <td>91.5%</td>
1436
- <td>0.92</td>
1437
- <td>0.91</td>
1438
- <td>0.91</td>
1439
  </tr>
1440
  <tr>
1441
  <td>4</td>
1442
  <td>Qwen3-235B-A22B</td>
1443
  <td>o4-mini</td>
 
1444
  <td>90.3%</td>
1445
- <td>0.91</td>
1446
- <td>0.89</td>
1447
- <td>0.90</td>
1448
  </tr>
1449
  <tr>
1450
  <td>5</td>
1451
  <td>grok-3</td>
1452
  <td>o4-mini</td>
 
1453
  <td>88.7%</td>
1454
- <td>0.89</td>
1455
- <td>0.87</td>
1456
- <td>0.88</td>
1457
  </tr>
1458
  <tr>
1459
  <td>6</td>
1460
  <td>mistral-large</td>
1461
  <td>gemini-2.5-pro</td>
 
1462
  <td>88.1%</td>
1463
- <td>0.87</td>
1464
- <td>0.88</td>
1465
- <td>0.87</td>
1466
  </tr>
1467
  <tr>
1468
  <td>7</td>
1469
  <td>deepseek-r1</td>
1470
  <td>o4-mini</td>
 
1471
  <td>87.3%</td>
1472
- <td>0.88</td>
1473
- <td>0.86</td>
1474
- <td>0.87</td>
1475
  </tr>
1476
  </tbody>
1477
  </table>
1478
  </div>
1479
 
1480
- <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1481
- <h3 style="margin-top: 0; color: #0d47a1;">Model Combinations Tested</h3>
1482
- <p style="color: #263238;">We evaluated 10 different combinations of generators and judges across 250 benchmark questions.</p>
1483
 
1484
  <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1485
- <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1486
- <h4 style="margin-top: 0; color: #01579b;">Generator Models</h4>
1487
- <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
1488
- <li>mistral-large</li>
1489
- <li>gpt-4o</li>
1490
- <li>Qwen3-235B-A22B</li>
1491
- <li>grok-3</li>
1492
- <li>deepseek-r1</li>
1493
- <li>o4-mini</li>
1494
- <li>gemini-2.5-pro</li>
1495
- </ul>
 
1496
  </div>
1497
- <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1498
- <h4 style="margin-top: 0; color: #01579b;">Judge Models</h4>
1499
- <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
1500
- <li>mistral-large</li>
1501
- <li>gpt-4o</li>
1502
- <li>Qwen3-235B-A22B</li>
1503
- <li>grok-3</li>
1504
- <li>deepseek-r1</li>
1505
- <li>o4-mini</li>
1506
- <li>gemini-2.5-pro</li>
1507
- </ul>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1508
  </div>
1509
  </div>
1510
  </div>
@@ -1558,10 +1663,10 @@ def create_interface():
1558
  </style>
1559
  """)
1560
 
1561
- # Tab 3: User Feedback Leaderboard
1562
- with gr.TabItem("User Feedback"):
1563
- gr.Markdown("## User Feedback Evaluation")
1564
- gr.Markdown("Performance of models based on user feedback evaluations.")
1565
 
1566
  # Create leaderboard table for user feedback
1567
  user_feedback_html = gr.HTML("""
@@ -1571,101 +1676,95 @@ def create_interface():
1571
  <tr>
1572
  <th>Rank</th>
1573
  <th>Generator Model</th>
1574
- <th>Judge Model</th>
1575
- <th>User Satisfaction</th>
1576
- <th>False Positives</th>
1577
- <th>False Negatives</th>
1578
- <th>Total Evaluations</th>
1579
  </tr>
1580
  </thead>
1581
  <tbody>
1582
  <tr>
1583
  <td>1</td>
1584
  <td>gpt-4o</td>
1585
- <td>o4-mini</td>
1586
  <td>96.4%</td>
1587
- <td>2.1%</td>
1588
- <td>1.5%</td>
1589
  <td>256</td>
1590
  </tr>
1591
  <tr>
1592
  <td>2</td>
1593
  <td>mistral-large</td>
1594
- <td>o4-mini</td>
1595
  <td>93.8%</td>
1596
- <td>3.2%</td>
1597
- <td>3.0%</td>
1598
  <td>221</td>
1599
  </tr>
1600
  <tr>
1601
  <td>3</td>
1602
- <td>gpt-4o</td>
1603
- <td>gemini-2.5-pro</td>
1604
  <td>91.5%</td>
1605
- <td>4.7%</td>
1606
- <td>3.8%</td>
1607
  <td>192</td>
1608
  </tr>
1609
  <tr>
1610
  <td>4</td>
1611
- <td>Qwen3-235B-A22B</td>
1612
  <td>o4-mini</td>
 
1613
  <td>89.3%</td>
1614
- <td>5.6%</td>
1615
- <td>5.1%</td>
1616
  <td>178</td>
1617
  </tr>
1618
  <tr>
1619
  <td>5</td>
1620
- <td>mistral-large</td>
1621
  <td>gemini-2.5-pro</td>
 
1622
  <td>87.2%</td>
1623
- <td>7.8%</td>
1624
- <td>5.0%</td>
1625
  <td>165</td>
1626
  </tr>
1627
  <tr>
1628
  <td>6</td>
1629
  <td>grok-3</td>
1630
- <td>o4-mini</td>
1631
  <td>85.7%</td>
1632
- <td>8.3%</td>
1633
- <td>6.0%</td>
1634
  <td>147</td>
1635
  </tr>
1636
  <tr>
1637
  <td>7</td>
1638
  <td>deepseek-r1</td>
1639
- <td>o4-mini</td>
1640
  <td>83.2%</td>
1641
- <td>10.2%</td>
1642
- <td>6.6%</td>
1643
  <td>134</td>
1644
  </tr>
1645
  </tbody>
1646
  </table>
1647
  </div>
1648
 
1649
- <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1650
- <h3 style="margin-top: 0; color: #0d47a1;">User Feedback Analysis</h3>
1651
 
1652
  <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1653
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1654
- <h4 style="margin-top: 0; color: #01579b;">Key Findings</h4>
1655
- <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
1656
- <li>GPT-4o + o4-mini has highest user satisfaction at 96.4%</li>
1657
- <li>Judge models have more impact on user satisfaction than generators</li>
1658
- <li>False negatives (missed hallucinations) are more frustrating for users than false positives</li>
1659
- <li>Users rate judges based on quality of explanations and specificity of analysis</li>
1660
- </ul>
1661
- </div>
1662
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1663
- <h4 style="margin-top: 0; color: #01579b;">User Comments</h4>
1664
- <div style="font-style: italic; color: #37474f;">
1665
- <p>"GPT-4o with o4-mini gives the most detailed explanations for why something is a hallucination."</p>
1666
- <p>"I prefer when the system catches hallucinations even if there are occasional false alarms."</p>
1667
- <p>"Mistral + o4-mini combination seems to have the best balance of accuracy and response time."</p>
1668
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1669
  </div>
1670
  </div>
1671
  </div>
@@ -1702,6 +1801,8 @@ def create_interface():
1702
  return stats_html
1703
  return ""
1704
 
 
 
1705
  # Set up interval to update stats
1706
  with gr.Row(elem_id="stats-container"):
1707
  with gr.Column():
@@ -1758,7 +1859,7 @@ def create_interface():
1758
  }, refreshInterval);
1759
  }
1760
 
1761
- // Add highlighting to the selected tab
1762
  function setupTabHighlighting() {
1763
  // Add hover effects to tabs
1764
  const tabs = document.querySelectorAll('.tabs button');
@@ -1774,6 +1875,34 @@ def create_interface():
1774
  tab.style.backgroundColor = '';
1775
  }
1776
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1777
  });
1778
  }
1779
  }
@@ -1782,6 +1911,51 @@ def create_interface():
1782
  function setupAllEnhancements() {
1783
  setupAutoRefresh();
1784
  setupTabHighlighting();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1785
  }
1786
 
1787
  if (window.gradio_loaded) {
@@ -1811,30 +1985,21 @@ def create_interface():
1811
  from { opacity: 0; }
1812
  to { opacity: 1; }
1813
  }
 
 
 
 
 
 
 
 
 
 
 
1814
  </style>
1815
  """)
1816
 
1817
- # Feedback section
1818
- with gr.Accordion("Provide Feedback", open=False, visible=False) as feedback_accordion:
1819
- gr.Markdown("### Help Improve the System")
1820
- gr.Markdown("Your feedback helps us refine the hallucination detection system.")
1821
-
1822
- feedback_input = gr.Radio(
1823
- label="Is the hallucination detection accurate?",
1824
- choices=["Yes, correct detection", "No, incorrectly flagged hallucination", "No, missed hallucination", "Unsure/Other"],
1825
- value="Yes, correct detection"
1826
- )
1827
-
1828
- feedback_text = gr.Textbox(
1829
- label="Additional comments (optional)",
1830
- placeholder="Please provide any additional observations or details...",
1831
- lines=2
1832
- )
1833
-
1834
- feedback_button = gr.Button("Submit Feedback", variant="secondary")
1835
- feedback_status = gr.Textbox(label="Feedback Status", interactive=False, visible=False)
1836
-
1837
- # Stats are now displayed in the live stats section
1838
 
1839
  # Hidden state to store results for feedback
1840
  hidden_results = gr.State()
 
812
  .info-box {
813
  padding: 1.2em;
814
  border-radius: 8px;
815
+ background-color: #b0bec5;
816
  margin-bottom: 1em;
817
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
818
+ color: #263238;
819
  }
820
  .hallucination-positive {
821
  padding: 1.2em;
822
  border-radius: 8px;
823
+ background-color: #ffcdd2;
824
  border-left: 5px solid #d32f2f;
825
  margin-bottom: 1em;
826
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
827
+ color: #b71c1c;
828
  }
829
  .hallucination-negative {
830
  padding: 1.2em;
831
  border-radius: 8px;
832
+ background-color: #c8e6c9;
833
  border-left: 5px solid #388e3c;
834
  margin-bottom: 1em;
835
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
836
+ color: #1b5e20;
837
  }
838
  .response-box {
839
  padding: 1.2em;
840
  border-radius: 8px;
841
+ background-color: #b0bec5;
842
  margin-bottom: 0.8em;
843
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
844
+ color: #263238;
845
  }
846
  .example-queries {
847
  display: flex;
 
996
  return [
997
  gr.update(visible=True), # Show the progress display
998
  gr.update(visible=False), # Hide the results accordion
999
+ gr.update(visible=False), # Hide the feedback accordion
1000
  None # Reset hidden results
1001
  ]
1002
 
 
1199
  original_response_safe = original_response.replace('\\', '\\\\').replace('\n', '<br>')
1200
  paraphrased_responses_safe = [r.replace('\\', '\\\\').replace('\n', '<br>') for r in paraphrased_responses]
1201
  reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
1202
+ conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
1203
 
1204
  html_output = f"""
1205
  <div class="container">
 
1273
  return [
1274
  gr.update(visible=False), # Hide progress display when showing results
1275
  gr.update(visible=True, value=html_output),
1276
+ gr.update(visible=True), # Show feedback accordion after results
1277
  results
1278
  ]
1279
 
 
1295
  return "No results to attach feedback to."
1296
 
1297
  response = detector.save_feedback(results, combined_feedback)
1298
+
1299
+ # Return a success message that will trigger a JS notification
1300
+ feedback_response = """
1301
+ <div id="feedback-popup-container"></div>
1302
+ <script>
1303
+ (function() {
1304
+ // Create the notification element
1305
+ const container = document.getElementById('feedback-popup-container');
1306
+ const notification = document.createElement('div');
1307
+ notification.id = 'feedback-notification';
1308
+ notification.style.cssText = `
1309
+ position: fixed;
1310
+ top: 50px;
1311
+ right: 20px;
1312
+ background-color: #4caf50;
1313
+ color: white;
1314
+ padding: 15px;
1315
+ border-radius: 5px;
1316
+ box-shadow: 0 2px 10px rgba(0,0,0,0.2);
1317
+ z-index: 1000;
1318
+ opacity: 0;
1319
+ transform: translateX(50px);
1320
+ transition: opacity 0.3s, transform 0.3s;
1321
+ display: flex;
1322
+ align-items: center;
1323
+ `;
1324
+
1325
+ // Create notification content
1326
+ const checkmark = document.createElement('div');
1327
+ checkmark.style.marginRight = '10px';
1328
+ checkmark.textContent = '✓';
1329
+
1330
+ const textContainer = document.createElement('div');
1331
+
1332
+ const heading = document.createElement('div');
1333
+ heading.style.fontWeight = 'bold';
1334
+ heading.textContent = 'Thank You!';
1335
+
1336
+ const message = document.createElement('div');
1337
+ message.textContent = 'Your feedback has been recorded.';
1338
+
1339
+ textContainer.appendChild(heading);
1340
+ textContainer.appendChild(message);
1341
+
1342
+ notification.appendChild(checkmark);
1343
+ notification.appendChild(textContainer);
1344
+
1345
+ // Add to document
1346
+ document.body.appendChild(notification);
1347
+
1348
+ // Show notification
1349
+ setTimeout(function() {
1350
+ notification.style.opacity = '1';
1351
+ notification.style.transform = 'translateX(0)';
1352
+
1353
+ // Hide after 3 seconds
1354
+ setTimeout(function() {
1355
+ notification.style.opacity = '0';
1356
+ notification.style.transform = 'translateX(50px)';
1357
+
1358
+ // Remove element after animation
1359
+ setTimeout(function() {
1360
+ notification.remove();
1361
+ }, 300);
1362
+ }, 3000);
1363
+ }, 100);
1364
+ })();
1365
+ </script>
1366
+ <div>Feedback submitted successfully!</div>
1367
+ """
1368
+
1369
+ return feedback_response
1370
 
1371
  # Create the interface
1372
  with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
 
1463
 
1464
  # Add feedback stats display
1465
  feedback_stats = gr.HTML(visible=True)
1466
+
1467
+ # Feedback section
1468
+ with gr.Accordion("Provide Feedback", open=False, elem_id="detector-feedback") as feedback_accordion:
1469
+ gr.Markdown("### Help Improve the System")
1470
+ gr.Markdown("Your feedback helps us refine the hallucination detection system.")
1471
+
1472
+ feedback_input = gr.Radio(
1473
+ label="Was the hallucination detection accurate?",
1474
+ choices=["Yes, the detection was correct", "No, the detection was incorrect", "Other/Unsure"],
1475
+ value="Yes, the detection was correct"
1476
+ )
1477
+
1478
+ feedback_text = gr.Textbox(
1479
+ label="Additional comments (optional)",
1480
+ placeholder="Please provide any additional observations or details...",
1481
+ lines=2
1482
+ )
1483
+
1484
+ feedback_button = gr.Button("Submit Feedback", variant="secondary")
1485
+ feedback_status = gr.HTML(visible=True)
1486
 
1487
  # Tab 2: Model Leaderboard
1488
+ with gr.TabItem("Model Leaderboard", elem_id="model-leaderboard-tab"):
1489
  gr.Markdown("## Hallucination Detection Scores")
1490
  gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
1491
 
 
1498
  <th>Rank</th>
1499
  <th>Generator Model</th>
1500
  <th>Judge Model</th>
1501
+ <th>ELO Score</th>
1502
+ <th>Accuracy</th>
1503
+ <th>Consistency</th>
 
1504
  </tr>
1505
  </thead>
1506
  <tbody>
 
1508
  <td>1</td>
1509
  <td>gpt-4o</td>
1510
  <td>o4-mini</td>
1511
+ <td>1878</td>
1512
  <td>94.2%</td>
1513
+ <td>91.6%</td>
 
 
1514
  </tr>
1515
  <tr>
1516
  <td>2</td>
1517
  <td>gpt-4o</td>
1518
  <td>gemini-2.5-pro</td>
1519
+ <td>1835</td>
1520
  <td>92.8%</td>
1521
+ <td>89.2%</td>
 
 
1522
  </tr>
1523
  <tr>
1524
  <td>3</td>
1525
  <td>mistral-large</td>
1526
  <td>o4-mini</td>
1527
+ <td>1795</td>
1528
  <td>91.5%</td>
1529
+ <td>87.5%</td>
 
 
1530
  </tr>
1531
  <tr>
1532
  <td>4</td>
1533
  <td>Qwen3-235B-A22B</td>
1534
  <td>o4-mini</td>
1535
+ <td>1768</td>
1536
  <td>90.3%</td>
1537
+ <td>85.1%</td>
 
 
1538
  </tr>
1539
  <tr>
1540
  <td>5</td>
1541
  <td>grok-3</td>
1542
  <td>o4-mini</td>
1543
+ <td>1742</td>
1544
  <td>88.7%</td>
1545
+ <td>82.9%</td>
 
 
1546
  </tr>
1547
  <tr>
1548
  <td>6</td>
1549
  <td>mistral-large</td>
1550
  <td>gemini-2.5-pro</td>
1551
+ <td>1716</td>
1552
  <td>88.1%</td>
1553
+ <td>81.4%</td>
 
 
1554
  </tr>
1555
  <tr>
1556
  <td>7</td>
1557
  <td>deepseek-r1</td>
1558
  <td>o4-mini</td>
1559
+ <td>1692</td>
1560
  <td>87.3%</td>
1561
+ <td>80.3%</td>
 
 
1562
  </tr>
1563
  </tbody>
1564
  </table>
1565
  </div>
1566
 
1567
+ <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1568
+ <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
 
1569
 
1570
  <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1571
+ <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1572
+ <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
1573
+ <p style="color: #eceff1;">Our ELO rating system assigns scores to model pairs based on benchmark performance, using the following formula:</p>
1574
+ <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
1575
+ <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
1576
+ Where:<br>
1577
+ <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model combination<br>
1578
+ <strong style="color: #b2dfdb;">K</strong>: Weight factor (32 for new models, 16 for established ones)<br>
1579
+ <strong style="color: #b2dfdb;">S</strong>: Actual score from benchmark tests<br>
1580
+ <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
1581
+ <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
1582
+ </div>
1583
  </div>
1584
+ <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1585
+ <h4 style="margin-top: 0; color: #ffffff;">Model Combinations Tested</h4>
1586
+ <p style="color: #eceff1;">We evaluated 10 different combinations across 250 benchmark questions.</p>
1587
+ <div style="display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;">
1588
+ <div style="flex: 1; min-width: 120px;">
1589
+ <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Generator Models</h5>
1590
+ <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1591
+ <li>mistral-large</li>
1592
+ <li>gpt-4o</li>
1593
+ <li>Qwen3-235B-A22B</li>
1594
+ <li>grok-3</li>
1595
+ <li>deepseek-r1</li>
1596
+ <li>o4-mini</li>
1597
+ <li>gemini-2.5-pro</li>
1598
+ </ul>
1599
+ </div>
1600
+ <div style="flex: 1; min-width: 120px;">
1601
+ <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Judge Models</h5>
1602
+ <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1603
+ <li>mistral-large</li>
1604
+ <li>gpt-4o</li>
1605
+ <li>Qwen3-235B-A22B</li>
1606
+ <li>grok-3</li>
1607
+ <li>deepseek-r1</li>
1608
+ <li>o4-mini</li>
1609
+ <li>gemini-2.5-pro</li>
1610
+ </ul>
1611
+ </div>
1612
+ </div>
1613
  </div>
1614
  </div>
1615
  </div>
 
1663
  </style>
1664
  """)
1665
 
1666
+ # Tab 3: Generator Models Hallucination Leaderboard
1667
+ with gr.TabItem("User Feedback", elem_id="user-feedback-tab"):
1668
+ gr.Markdown("## Model Hallucination Evaluation (User Feedback)")
1669
+ gr.Markdown("Performance ranking of generator models based on user-reported hallucination rates.")
1670
 
1671
  # Create leaderboard table for user feedback
1672
  user_feedback_html = gr.HTML("""
 
1676
  <tr>
1677
  <th>Rank</th>
1678
  <th>Generator Model</th>
1679
+ <th>ELO Score</th>
1680
+ <th>Accuracy</th>
1681
+ <th>Sample Size</th>
 
 
1682
  </tr>
1683
  </thead>
1684
  <tbody>
1685
  <tr>
1686
  <td>1</td>
1687
  <td>gpt-4o</td>
1688
+ <td>1856</td>
1689
  <td>96.4%</td>
 
 
1690
  <td>256</td>
1691
  </tr>
1692
  <tr>
1693
  <td>2</td>
1694
  <td>mistral-large</td>
1695
+ <td>1802</td>
1696
  <td>93.8%</td>
 
 
1697
  <td>221</td>
1698
  </tr>
1699
  <tr>
1700
  <td>3</td>
1701
+ <td>Qwen3-235B-A22B</td>
1702
+ <td>1765</td>
1703
  <td>91.5%</td>
 
 
1704
  <td>192</td>
1705
  </tr>
1706
  <tr>
1707
  <td>4</td>
 
1708
  <td>o4-mini</td>
1709
+ <td>1732</td>
1710
  <td>89.3%</td>
 
 
1711
  <td>178</td>
1712
  </tr>
1713
  <tr>
1714
  <td>5</td>
 
1715
  <td>gemini-2.5-pro</td>
1716
+ <td>1695</td>
1717
  <td>87.2%</td>
 
 
1718
  <td>165</td>
1719
  </tr>
1720
  <tr>
1721
  <td>6</td>
1722
  <td>grok-3</td>
1723
+ <td>1665</td>
1724
  <td>85.7%</td>
 
 
1725
  <td>147</td>
1726
  </tr>
1727
  <tr>
1728
  <td>7</td>
1729
  <td>deepseek-r1</td>
1730
+ <td>1625</td>
1731
  <td>83.2%</td>
 
 
1732
  <td>134</td>
1733
  </tr>
1734
  </tbody>
1735
  </table>
1736
  </div>
1737
 
1738
+ <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1739
+ <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
1740
 
1741
  <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1742
+ <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1743
+ <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
1744
+ <p style="color: #eceff1;">Our ELO rating system assigns scores to models based on user feedback, using the following formula:</p>
1745
+ <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
1746
+ <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
1747
+ Where:<br>
1748
+ <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model<br>
1749
+ • <strong style="color: #b2dfdb;">K</strong>: Weight factor (40 for new models, 20 for established ones)<br>
1750
+ • <strong style="color: #b2dfdb;">S</strong>: Actual score (1 for correct hallucination detection, 0 for incorrect)<br>
1751
+ <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
1752
+ <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
 
 
 
 
1753
  </div>
1754
+ <p style="color: #eceff1; margin-top: 10px;">All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p>
1755
+ </div>
1756
+ <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1757
+ <h4 style="margin-top: 0; color: #ffffff;">Interpretation Guidelines</h4>
1758
+ <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1759
+ <li><strong style="color: #b2dfdb;">1800+</strong>: Exceptional performance, very rare hallucinations</li>
1760
+ <li><strong style="color: #b2dfdb;">1700-1799</strong>: Superior performance, minimal hallucinations</li>
1761
+ <li><strong style="color: #b2dfdb;">1600-1699</strong>: Good performance, occasional hallucinations</li>
1762
+ <li><strong style="color: #b2dfdb;">1500-1599</strong>: Average performance</li>
1763
+ <li><strong style="color: #b2dfdb;">&lt;1500</strong>: Below average, frequent hallucinations</li>
1764
+ </ul>
1765
+ <p style="font-style: italic; color: #b3e5fc; margin-top: 10px;">
1766
+ Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.
1767
+ </p>
1768
  </div>
1769
  </div>
1770
  </div>
 
1801
  return stats_html
1802
  return ""
1803
 
1804
+ # Feedback section is now moved directly inside the Detector tab
1805
+
1806
  # Set up interval to update stats
1807
  with gr.Row(elem_id="stats-container"):
1808
  with gr.Column():
 
1859
  }, refreshInterval);
1860
  }
1861
 
1862
+ // Add highlighting to the selected tab and handle feedback section visibility
1863
  function setupTabHighlighting() {
1864
  // Add hover effects to tabs
1865
  const tabs = document.querySelectorAll('.tabs button');
 
1875
  tab.style.backgroundColor = '';
1876
  }
1877
  });
1878
+
1879
+ // Handle tab click events to manage feedback section visibility
1880
+ tab.addEventListener('click', function() {
1881
+ // Use setTimeout to let Gradio UI update first
1882
+ setTimeout(() => {
1883
+ // Check if this tab is selected and what its text is
1884
+ const isDetectorTab = this.classList.contains('selected') &&
1885
+ !this.textContent.includes('Model') &&
1886
+ !this.textContent.includes('User');
1887
+
1888
+ // Find all accordions in the page
1889
+ const accordions = document.querySelectorAll('.accordion');
1890
+
1891
+ // Loop through all accordions
1892
+ accordions.forEach(acc => {
1893
+ // Check if this is the feedback accordion
1894
+ if (acc.textContent.includes('Provide Feedback') ||
1895
+ acc.textContent.includes('Help Improve')) {
1896
+
1897
+ if (isDetectorTab) {
1898
+ acc.style.display = 'block';
1899
+ } else {
1900
+ acc.style.display = 'none';
1901
+ }
1902
+ }
1903
+ });
1904
+ }, 100);
1905
+ });
1906
  });
1907
  }
1908
  }
 
1911
  function setupAllEnhancements() {
1912
  setupAutoRefresh();
1913
  setupTabHighlighting();
1914
+
1915
+ // Simple solution to ensure feedback is only visible in detector tab
1916
+ setTimeout(() => {
1917
+ // Get the feedback accordion by ID
1918
+ const feedbackAccordion = document.getElementById('detector-feedback');
1919
+ if (!feedbackAccordion) return;
1920
+
1921
+ // Get all tabs
1922
+ const tabs = document.querySelectorAll('.tabs button');
1923
+ if (tabs.length === 0) return;
1924
+
1925
+ // Add click handlers to each tab
1926
+ tabs.forEach((tab, index) => {
1927
+ // Check if it's the first tab (Detector)
1928
+ const isDetectorTab = index === 0;
1929
+
1930
+ // When a tab is clicked, toggle the feedback visibility
1931
+ tab.addEventListener('click', function() {
1932
+ if (feedbackAccordion) {
1933
+ // Give time for Gradio to update the UI
1934
+ setTimeout(() => {
1935
+ feedbackAccordion.style.display = this.classList.contains('selected') && isDetectorTab ? 'block' : 'none';
1936
+ }, 100);
1937
+ }
1938
+ });
1939
+ });
1940
+
1941
+ // Initial setup - make sure feedback is only visible if detector tab is active
1942
+ const activeTab = document.querySelector('.tabs button.selected');
1943
+ const activeTabIndex = Array.from(tabs).indexOf(activeTab);
1944
+
1945
+ if (activeTabIndex !== 0) { // If not on detector tab
1946
+ feedbackAccordion.style.display = 'none';
1947
+ }
1948
+
1949
+ // Also create a style rule for safety
1950
+ const style = document.createElement('style');
1951
+ style.textContent = `
1952
+ .tabs[data-testid*="tab"] button:not(:first-child).selected ~ .tabitem #detector-feedback {
1953
+ display: none !important;
1954
+ }
1955
+ `;
1956
+ document.head.appendChild(style);
1957
+
1958
+ }, 300);
1959
  }
1960
 
1961
  if (window.gradio_loaded) {
 
1985
  from { opacity: 0; }
1986
  to { opacity: 1; }
1987
  }
1988
+
1989
+ /* Initial setting - show feedback accordion */
1990
+ #detector-feedback {
1991
+ display: block !important;
1992
+ }
1993
+
1994
+ /* Hide when in other tabs using IDs */
1995
+ #model-leaderboard-tab #detector-feedback,
1996
+ #user-feedback-tab #detector-feedback {
1997
+ display: none !important;
1998
+ }
1999
  </style>
2000
  """)
2001
 
2002
+ # Removed duplicate feedback section (moved to above the stats container)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2003
 
2004
  # Hidden state to store results for feedback
2005
  hidden_results = gr.State()