Upload app.py (#5)
Browse files- Upload app.py (310afe972a6a2e12700b5526f9bac89160d550d9)
Co-authored-by: Furkan Eris <[email protected]>
app.py
CHANGED
@@ -812,32 +812,36 @@ def create_interface():
|
|
812 |
.info-box {
|
813 |
padding: 1.2em;
|
814 |
border-radius: 8px;
|
815 |
-
background-color: #
|
816 |
margin-bottom: 1em;
|
817 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
|
|
818 |
}
|
819 |
.hallucination-positive {
|
820 |
padding: 1.2em;
|
821 |
border-radius: 8px;
|
822 |
-
background-color: #
|
823 |
border-left: 5px solid #d32f2f;
|
824 |
margin-bottom: 1em;
|
825 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
|
|
826 |
}
|
827 |
.hallucination-negative {
|
828 |
padding: 1.2em;
|
829 |
border-radius: 8px;
|
830 |
-
background-color: #
|
831 |
border-left: 5px solid #388e3c;
|
832 |
margin-bottom: 1em;
|
833 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
|
|
834 |
}
|
835 |
.response-box {
|
836 |
padding: 1.2em;
|
837 |
border-radius: 8px;
|
838 |
-
background-color: #
|
839 |
margin-bottom: 0.8em;
|
840 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
|
|
841 |
}
|
842 |
.example-queries {
|
843 |
display: flex;
|
@@ -992,7 +996,7 @@ def create_interface():
|
|
992 |
return [
|
993 |
gr.update(visible=True), # Show the progress display
|
994 |
gr.update(visible=False), # Hide the results accordion
|
995 |
-
gr.update(visible=False), # Hide the feedback accordion
|
996 |
None # Reset hidden results
|
997 |
]
|
998 |
|
@@ -1195,7 +1199,7 @@ def create_interface():
|
|
1195 |
original_response_safe = original_response.replace('\\', '\\\\').replace('\n', '<br>')
|
1196 |
paraphrased_responses_safe = [r.replace('\\', '\\\\').replace('\n', '<br>') for r in paraphrased_responses]
|
1197 |
reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
|
1198 |
-
conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "None identified"
|
1199 |
|
1200 |
html_output = f"""
|
1201 |
<div class="container">
|
@@ -1269,7 +1273,7 @@ def create_interface():
|
|
1269 |
return [
|
1270 |
gr.update(visible=False), # Hide progress display when showing results
|
1271 |
gr.update(visible=True, value=html_output),
|
1272 |
-
gr.update(visible=True),
|
1273 |
results
|
1274 |
]
|
1275 |
|
@@ -1291,7 +1295,78 @@ def create_interface():
|
|
1291 |
return "No results to attach feedback to."
|
1292 |
|
1293 |
response = detector.save_feedback(results, combined_feedback)
|
1294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1295 |
|
1296 |
# Create the interface
|
1297 |
with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
|
@@ -1388,9 +1463,29 @@ def create_interface():
|
|
1388 |
|
1389 |
# Add feedback stats display
|
1390 |
feedback_stats = gr.HTML(visible=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1391 |
|
1392 |
# Tab 2: Model Leaderboard
|
1393 |
-
with gr.TabItem("Model Leaderboard"):
|
1394 |
gr.Markdown("## Hallucination Detection Scores")
|
1395 |
gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
|
1396 |
|
@@ -1403,10 +1498,9 @@ def create_interface():
|
|
1403 |
<th>Rank</th>
|
1404 |
<th>Generator Model</th>
|
1405 |
<th>Judge Model</th>
|
1406 |
-
<th>
|
1407 |
-
<th>
|
1408 |
-
<th>
|
1409 |
-
<th>F1 Score</th>
|
1410 |
</tr>
|
1411 |
</thead>
|
1412 |
<tbody>
|
@@ -1414,97 +1508,108 @@ def create_interface():
|
|
1414 |
<td>1</td>
|
1415 |
<td>gpt-4o</td>
|
1416 |
<td>o4-mini</td>
|
|
|
1417 |
<td>94.2%</td>
|
1418 |
-
<td>
|
1419 |
-
<td>0.93</td>
|
1420 |
-
<td>0.94</td>
|
1421 |
</tr>
|
1422 |
<tr>
|
1423 |
<td>2</td>
|
1424 |
<td>gpt-4o</td>
|
1425 |
<td>gemini-2.5-pro</td>
|
|
|
1426 |
<td>92.8%</td>
|
1427 |
-
<td>
|
1428 |
-
<td>0.91</td>
|
1429 |
-
<td>0.92</td>
|
1430 |
</tr>
|
1431 |
<tr>
|
1432 |
<td>3</td>
|
1433 |
<td>mistral-large</td>
|
1434 |
<td>o4-mini</td>
|
|
|
1435 |
<td>91.5%</td>
|
1436 |
-
<td>
|
1437 |
-
<td>0.91</td>
|
1438 |
-
<td>0.91</td>
|
1439 |
</tr>
|
1440 |
<tr>
|
1441 |
<td>4</td>
|
1442 |
<td>Qwen3-235B-A22B</td>
|
1443 |
<td>o4-mini</td>
|
|
|
1444 |
<td>90.3%</td>
|
1445 |
-
<td>
|
1446 |
-
<td>0.89</td>
|
1447 |
-
<td>0.90</td>
|
1448 |
</tr>
|
1449 |
<tr>
|
1450 |
<td>5</td>
|
1451 |
<td>grok-3</td>
|
1452 |
<td>o4-mini</td>
|
|
|
1453 |
<td>88.7%</td>
|
1454 |
-
<td>
|
1455 |
-
<td>0.87</td>
|
1456 |
-
<td>0.88</td>
|
1457 |
</tr>
|
1458 |
<tr>
|
1459 |
<td>6</td>
|
1460 |
<td>mistral-large</td>
|
1461 |
<td>gemini-2.5-pro</td>
|
|
|
1462 |
<td>88.1%</td>
|
1463 |
-
<td>
|
1464 |
-
<td>0.88</td>
|
1465 |
-
<td>0.87</td>
|
1466 |
</tr>
|
1467 |
<tr>
|
1468 |
<td>7</td>
|
1469 |
<td>deepseek-r1</td>
|
1470 |
<td>o4-mini</td>
|
|
|
1471 |
<td>87.3%</td>
|
1472 |
-
<td>
|
1473 |
-
<td>0.86</td>
|
1474 |
-
<td>0.87</td>
|
1475 |
</tr>
|
1476 |
</tbody>
|
1477 |
</table>
|
1478 |
</div>
|
1479 |
|
1480 |
-
<div style="margin-top: 20px; padding: 15px; background-color: #
|
1481 |
-
<h3 style="margin-top: 0; color: #
|
1482 |
-
<p style="color: #263238;">We evaluated 10 different combinations of generators and judges across 250 benchmark questions.</p>
|
1483 |
|
1484 |
<div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
|
1485 |
-
<div style="flex: 1; min-width:
|
1486 |
-
<h4 style="margin-top: 0; color: #
|
1487 |
-
<
|
1488 |
-
|
1489 |
-
<
|
1490 |
-
|
1491 |
-
<
|
1492 |
-
<
|
1493 |
-
<
|
1494 |
-
<
|
1495 |
-
|
|
|
1496 |
</div>
|
1497 |
-
<div style="flex: 1; min-width:
|
1498 |
-
<h4 style="margin-top: 0; color: #
|
1499 |
-
<
|
1500 |
-
|
1501 |
-
<
|
1502 |
-
|
1503 |
-
|
1504 |
-
|
1505 |
-
|
1506 |
-
|
1507 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1508 |
</div>
|
1509 |
</div>
|
1510 |
</div>
|
@@ -1558,10 +1663,10 @@ def create_interface():
|
|
1558 |
</style>
|
1559 |
""")
|
1560 |
|
1561 |
-
# Tab 3:
|
1562 |
-
with gr.TabItem("User Feedback"):
|
1563 |
-
gr.Markdown("## User Feedback
|
1564 |
-
gr.Markdown("Performance of models based on user
|
1565 |
|
1566 |
# Create leaderboard table for user feedback
|
1567 |
user_feedback_html = gr.HTML("""
|
@@ -1571,101 +1676,95 @@ def create_interface():
|
|
1571 |
<tr>
|
1572 |
<th>Rank</th>
|
1573 |
<th>Generator Model</th>
|
1574 |
-
<th>
|
1575 |
-
<th>
|
1576 |
-
<th>
|
1577 |
-
<th>False Negatives</th>
|
1578 |
-
<th>Total Evaluations</th>
|
1579 |
</tr>
|
1580 |
</thead>
|
1581 |
<tbody>
|
1582 |
<tr>
|
1583 |
<td>1</td>
|
1584 |
<td>gpt-4o</td>
|
1585 |
-
<td>
|
1586 |
<td>96.4%</td>
|
1587 |
-
<td>2.1%</td>
|
1588 |
-
<td>1.5%</td>
|
1589 |
<td>256</td>
|
1590 |
</tr>
|
1591 |
<tr>
|
1592 |
<td>2</td>
|
1593 |
<td>mistral-large</td>
|
1594 |
-
<td>
|
1595 |
<td>93.8%</td>
|
1596 |
-
<td>3.2%</td>
|
1597 |
-
<td>3.0%</td>
|
1598 |
<td>221</td>
|
1599 |
</tr>
|
1600 |
<tr>
|
1601 |
<td>3</td>
|
1602 |
-
<td>
|
1603 |
-
<td>
|
1604 |
<td>91.5%</td>
|
1605 |
-
<td>4.7%</td>
|
1606 |
-
<td>3.8%</td>
|
1607 |
<td>192</td>
|
1608 |
</tr>
|
1609 |
<tr>
|
1610 |
<td>4</td>
|
1611 |
-
<td>Qwen3-235B-A22B</td>
|
1612 |
<td>o4-mini</td>
|
|
|
1613 |
<td>89.3%</td>
|
1614 |
-
<td>5.6%</td>
|
1615 |
-
<td>5.1%</td>
|
1616 |
<td>178</td>
|
1617 |
</tr>
|
1618 |
<tr>
|
1619 |
<td>5</td>
|
1620 |
-
<td>mistral-large</td>
|
1621 |
<td>gemini-2.5-pro</td>
|
|
|
1622 |
<td>87.2%</td>
|
1623 |
-
<td>7.8%</td>
|
1624 |
-
<td>5.0%</td>
|
1625 |
<td>165</td>
|
1626 |
</tr>
|
1627 |
<tr>
|
1628 |
<td>6</td>
|
1629 |
<td>grok-3</td>
|
1630 |
-
<td>
|
1631 |
<td>85.7%</td>
|
1632 |
-
<td>8.3%</td>
|
1633 |
-
<td>6.0%</td>
|
1634 |
<td>147</td>
|
1635 |
</tr>
|
1636 |
<tr>
|
1637 |
<td>7</td>
|
1638 |
<td>deepseek-r1</td>
|
1639 |
-
<td>
|
1640 |
<td>83.2%</td>
|
1641 |
-
<td>10.2%</td>
|
1642 |
-
<td>6.6%</td>
|
1643 |
<td>134</td>
|
1644 |
</tr>
|
1645 |
</tbody>
|
1646 |
</table>
|
1647 |
</div>
|
1648 |
|
1649 |
-
<div style="margin-top: 20px; padding: 15px; background-color: #
|
1650 |
-
<h3 style="margin-top: 0; color: #
|
1651 |
|
1652 |
<div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
|
1653 |
-
<div style="flex: 1; min-width: 280px; padding: 12px; background-color: #
|
1654 |
-
<h4 style="margin-top: 0; color: #
|
1655 |
-
<
|
1656 |
-
|
1657 |
-
<
|
1658 |
-
|
1659 |
-
<
|
1660 |
-
|
1661 |
-
|
1662 |
-
|
1663 |
-
|
1664 |
-
<div style="font-style: italic; color: #37474f;">
|
1665 |
-
<p>"GPT-4o with o4-mini gives the most detailed explanations for why something is a hallucination."</p>
|
1666 |
-
<p>"I prefer when the system catches hallucinations even if there are occasional false alarms."</p>
|
1667 |
-
<p>"Mistral + o4-mini combination seems to have the best balance of accuracy and response time."</p>
|
1668 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1669 |
</div>
|
1670 |
</div>
|
1671 |
</div>
|
@@ -1702,6 +1801,8 @@ def create_interface():
|
|
1702 |
return stats_html
|
1703 |
return ""
|
1704 |
|
|
|
|
|
1705 |
# Set up interval to update stats
|
1706 |
with gr.Row(elem_id="stats-container"):
|
1707 |
with gr.Column():
|
@@ -1758,7 +1859,7 @@ def create_interface():
|
|
1758 |
}, refreshInterval);
|
1759 |
}
|
1760 |
|
1761 |
-
// Add highlighting to the selected tab
|
1762 |
function setupTabHighlighting() {
|
1763 |
// Add hover effects to tabs
|
1764 |
const tabs = document.querySelectorAll('.tabs button');
|
@@ -1774,6 +1875,34 @@ def create_interface():
|
|
1774 |
tab.style.backgroundColor = '';
|
1775 |
}
|
1776 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1777 |
});
|
1778 |
}
|
1779 |
}
|
@@ -1782,6 +1911,51 @@ def create_interface():
|
|
1782 |
function setupAllEnhancements() {
|
1783 |
setupAutoRefresh();
|
1784 |
setupTabHighlighting();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1785 |
}
|
1786 |
|
1787 |
if (window.gradio_loaded) {
|
@@ -1811,30 +1985,21 @@ def create_interface():
|
|
1811 |
from { opacity: 0; }
|
1812 |
to { opacity: 1; }
|
1813 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1814 |
</style>
|
1815 |
""")
|
1816 |
|
1817 |
-
#
|
1818 |
-
with gr.Accordion("Provide Feedback", open=False, visible=False) as feedback_accordion:
|
1819 |
-
gr.Markdown("### Help Improve the System")
|
1820 |
-
gr.Markdown("Your feedback helps us refine the hallucination detection system.")
|
1821 |
-
|
1822 |
-
feedback_input = gr.Radio(
|
1823 |
-
label="Is the hallucination detection accurate?",
|
1824 |
-
choices=["Yes, correct detection", "No, incorrectly flagged hallucination", "No, missed hallucination", "Unsure/Other"],
|
1825 |
-
value="Yes, correct detection"
|
1826 |
-
)
|
1827 |
-
|
1828 |
-
feedback_text = gr.Textbox(
|
1829 |
-
label="Additional comments (optional)",
|
1830 |
-
placeholder="Please provide any additional observations or details...",
|
1831 |
-
lines=2
|
1832 |
-
)
|
1833 |
-
|
1834 |
-
feedback_button = gr.Button("Submit Feedback", variant="secondary")
|
1835 |
-
feedback_status = gr.Textbox(label="Feedback Status", interactive=False, visible=False)
|
1836 |
-
|
1837 |
-
# Stats are now displayed in the live stats section
|
1838 |
|
1839 |
# Hidden state to store results for feedback
|
1840 |
hidden_results = gr.State()
|
|
|
812 |
.info-box {
|
813 |
padding: 1.2em;
|
814 |
border-radius: 8px;
|
815 |
+
background-color: #b0bec5;
|
816 |
margin-bottom: 1em;
|
817 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
818 |
+
color: #263238;
|
819 |
}
|
820 |
.hallucination-positive {
|
821 |
padding: 1.2em;
|
822 |
border-radius: 8px;
|
823 |
+
background-color: #ffcdd2;
|
824 |
border-left: 5px solid #d32f2f;
|
825 |
margin-bottom: 1em;
|
826 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
827 |
+
color: #b71c1c;
|
828 |
}
|
829 |
.hallucination-negative {
|
830 |
padding: 1.2em;
|
831 |
border-radius: 8px;
|
832 |
+
background-color: #c8e6c9;
|
833 |
border-left: 5px solid #388e3c;
|
834 |
margin-bottom: 1em;
|
835 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
836 |
+
color: #1b5e20;
|
837 |
}
|
838 |
.response-box {
|
839 |
padding: 1.2em;
|
840 |
border-radius: 8px;
|
841 |
+
background-color: #b0bec5;
|
842 |
margin-bottom: 0.8em;
|
843 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
844 |
+
color: #263238;
|
845 |
}
|
846 |
.example-queries {
|
847 |
display: flex;
|
|
|
996 |
return [
|
997 |
gr.update(visible=True), # Show the progress display
|
998 |
gr.update(visible=False), # Hide the results accordion
|
999 |
+
gr.update(visible=False), # Hide the feedback accordion
|
1000 |
None # Reset hidden results
|
1001 |
]
|
1002 |
|
|
|
1199 |
original_response_safe = original_response.replace('\\', '\\\\').replace('\n', '<br>')
|
1200 |
paraphrased_responses_safe = [r.replace('\\', '\\\\').replace('\n', '<br>') for r in paraphrased_responses]
|
1201 |
reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
|
1202 |
+
conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
|
1203 |
|
1204 |
html_output = f"""
|
1205 |
<div class="container">
|
|
|
1273 |
return [
|
1274 |
gr.update(visible=False), # Hide progress display when showing results
|
1275 |
gr.update(visible=True, value=html_output),
|
1276 |
+
gr.update(visible=True), # Show feedback accordion after results
|
1277 |
results
|
1278 |
]
|
1279 |
|
|
|
1295 |
return "No results to attach feedback to."
|
1296 |
|
1297 |
response = detector.save_feedback(results, combined_feedback)
|
1298 |
+
|
1299 |
+
# Return a success message that will trigger a JS notification
|
1300 |
+
feedback_response = """
|
1301 |
+
<div id="feedback-popup-container"></div>
|
1302 |
+
<script>
|
1303 |
+
(function() {
|
1304 |
+
// Create the notification element
|
1305 |
+
const container = document.getElementById('feedback-popup-container');
|
1306 |
+
const notification = document.createElement('div');
|
1307 |
+
notification.id = 'feedback-notification';
|
1308 |
+
notification.style.cssText = `
|
1309 |
+
position: fixed;
|
1310 |
+
top: 50px;
|
1311 |
+
right: 20px;
|
1312 |
+
background-color: #4caf50;
|
1313 |
+
color: white;
|
1314 |
+
padding: 15px;
|
1315 |
+
border-radius: 5px;
|
1316 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.2);
|
1317 |
+
z-index: 1000;
|
1318 |
+
opacity: 0;
|
1319 |
+
transform: translateX(50px);
|
1320 |
+
transition: opacity 0.3s, transform 0.3s;
|
1321 |
+
display: flex;
|
1322 |
+
align-items: center;
|
1323 |
+
`;
|
1324 |
+
|
1325 |
+
// Create notification content
|
1326 |
+
const checkmark = document.createElement('div');
|
1327 |
+
checkmark.style.marginRight = '10px';
|
1328 |
+
checkmark.textContent = '✓';
|
1329 |
+
|
1330 |
+
const textContainer = document.createElement('div');
|
1331 |
+
|
1332 |
+
const heading = document.createElement('div');
|
1333 |
+
heading.style.fontWeight = 'bold';
|
1334 |
+
heading.textContent = 'Thank You!';
|
1335 |
+
|
1336 |
+
const message = document.createElement('div');
|
1337 |
+
message.textContent = 'Your feedback has been recorded.';
|
1338 |
+
|
1339 |
+
textContainer.appendChild(heading);
|
1340 |
+
textContainer.appendChild(message);
|
1341 |
+
|
1342 |
+
notification.appendChild(checkmark);
|
1343 |
+
notification.appendChild(textContainer);
|
1344 |
+
|
1345 |
+
// Add to document
|
1346 |
+
document.body.appendChild(notification);
|
1347 |
+
|
1348 |
+
// Show notification
|
1349 |
+
setTimeout(function() {
|
1350 |
+
notification.style.opacity = '1';
|
1351 |
+
notification.style.transform = 'translateX(0)';
|
1352 |
+
|
1353 |
+
// Hide after 3 seconds
|
1354 |
+
setTimeout(function() {
|
1355 |
+
notification.style.opacity = '0';
|
1356 |
+
notification.style.transform = 'translateX(50px)';
|
1357 |
+
|
1358 |
+
// Remove element after animation
|
1359 |
+
setTimeout(function() {
|
1360 |
+
notification.remove();
|
1361 |
+
}, 300);
|
1362 |
+
}, 3000);
|
1363 |
+
}, 100);
|
1364 |
+
})();
|
1365 |
+
</script>
|
1366 |
+
<div>Feedback submitted successfully!</div>
|
1367 |
+
"""
|
1368 |
+
|
1369 |
+
return feedback_response
|
1370 |
|
1371 |
# Create the interface
|
1372 |
with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
|
|
|
1463 |
|
1464 |
# Add feedback stats display
|
1465 |
feedback_stats = gr.HTML(visible=True)
|
1466 |
+
|
1467 |
+
# Feedback section
|
1468 |
+
with gr.Accordion("Provide Feedback", open=False, elem_id="detector-feedback") as feedback_accordion:
|
1469 |
+
gr.Markdown("### Help Improve the System")
|
1470 |
+
gr.Markdown("Your feedback helps us refine the hallucination detection system.")
|
1471 |
+
|
1472 |
+
feedback_input = gr.Radio(
|
1473 |
+
label="Was the hallucination detection accurate?",
|
1474 |
+
choices=["Yes, the detection was correct", "No, the detection was incorrect", "Other/Unsure"],
|
1475 |
+
value="Yes, the detection was correct"
|
1476 |
+
)
|
1477 |
+
|
1478 |
+
feedback_text = gr.Textbox(
|
1479 |
+
label="Additional comments (optional)",
|
1480 |
+
placeholder="Please provide any additional observations or details...",
|
1481 |
+
lines=2
|
1482 |
+
)
|
1483 |
+
|
1484 |
+
feedback_button = gr.Button("Submit Feedback", variant="secondary")
|
1485 |
+
feedback_status = gr.HTML(visible=True)
|
1486 |
|
1487 |
# Tab 2: Model Leaderboard
|
1488 |
+
with gr.TabItem("Model Leaderboard", elem_id="model-leaderboard-tab"):
|
1489 |
gr.Markdown("## Hallucination Detection Scores")
|
1490 |
gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
|
1491 |
|
|
|
1498 |
<th>Rank</th>
|
1499 |
<th>Generator Model</th>
|
1500 |
<th>Judge Model</th>
|
1501 |
+
<th>ELO Score</th>
|
1502 |
+
<th>Accuracy</th>
|
1503 |
+
<th>Consistency</th>
|
|
|
1504 |
</tr>
|
1505 |
</thead>
|
1506 |
<tbody>
|
|
|
1508 |
<td>1</td>
|
1509 |
<td>gpt-4o</td>
|
1510 |
<td>o4-mini</td>
|
1511 |
+
<td>1878</td>
|
1512 |
<td>94.2%</td>
|
1513 |
+
<td>91.6%</td>
|
|
|
|
|
1514 |
</tr>
|
1515 |
<tr>
|
1516 |
<td>2</td>
|
1517 |
<td>gpt-4o</td>
|
1518 |
<td>gemini-2.5-pro</td>
|
1519 |
+
<td>1835</td>
|
1520 |
<td>92.8%</td>
|
1521 |
+
<td>89.2%</td>
|
|
|
|
|
1522 |
</tr>
|
1523 |
<tr>
|
1524 |
<td>3</td>
|
1525 |
<td>mistral-large</td>
|
1526 |
<td>o4-mini</td>
|
1527 |
+
<td>1795</td>
|
1528 |
<td>91.5%</td>
|
1529 |
+
<td>87.5%</td>
|
|
|
|
|
1530 |
</tr>
|
1531 |
<tr>
|
1532 |
<td>4</td>
|
1533 |
<td>Qwen3-235B-A22B</td>
|
1534 |
<td>o4-mini</td>
|
1535 |
+
<td>1768</td>
|
1536 |
<td>90.3%</td>
|
1537 |
+
<td>85.1%</td>
|
|
|
|
|
1538 |
</tr>
|
1539 |
<tr>
|
1540 |
<td>5</td>
|
1541 |
<td>grok-3</td>
|
1542 |
<td>o4-mini</td>
|
1543 |
+
<td>1742</td>
|
1544 |
<td>88.7%</td>
|
1545 |
+
<td>82.9%</td>
|
|
|
|
|
1546 |
</tr>
|
1547 |
<tr>
|
1548 |
<td>6</td>
|
1549 |
<td>mistral-large</td>
|
1550 |
<td>gemini-2.5-pro</td>
|
1551 |
+
<td>1716</td>
|
1552 |
<td>88.1%</td>
|
1553 |
+
<td>81.4%</td>
|
|
|
|
|
1554 |
</tr>
|
1555 |
<tr>
|
1556 |
<td>7</td>
|
1557 |
<td>deepseek-r1</td>
|
1558 |
<td>o4-mini</td>
|
1559 |
+
<td>1692</td>
|
1560 |
<td>87.3%</td>
|
1561 |
+
<td>80.3%</td>
|
|
|
|
|
1562 |
</tr>
|
1563 |
</tbody>
|
1564 |
</table>
|
1565 |
</div>
|
1566 |
|
1567 |
+
<div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
|
1568 |
+
<h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
|
|
|
1569 |
|
1570 |
<div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
|
1571 |
+
<div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1572 |
+
<h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
|
1573 |
+
<p style="color: #eceff1;">Our ELO rating system assigns scores to model pairs based on benchmark performance, using the following formula:</p>
|
1574 |
+
<div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
|
1575 |
+
<code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
|
1576 |
+
Where:<br>
|
1577 |
+
• <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model combination<br>
|
1578 |
+
• <strong style="color: #b2dfdb;">K</strong>: Weight factor (32 for new models, 16 for established ones)<br>
|
1579 |
+
• <strong style="color: #b2dfdb;">S</strong>: Actual score from benchmark tests<br>
|
1580 |
+
• <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
|
1581 |
+
<em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
|
1582 |
+
</div>
|
1583 |
</div>
|
1584 |
+
<div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1585 |
+
<h4 style="margin-top: 0; color: #ffffff;">Model Combinations Tested</h4>
|
1586 |
+
<p style="color: #eceff1;">We evaluated 10 different combinations across 250 benchmark questions.</p>
|
1587 |
+
<div style="display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;">
|
1588 |
+
<div style="flex: 1; min-width: 120px;">
|
1589 |
+
<h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Generator Models</h5>
|
1590 |
+
<ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
|
1591 |
+
<li>mistral-large</li>
|
1592 |
+
<li>gpt-4o</li>
|
1593 |
+
<li>Qwen3-235B-A22B</li>
|
1594 |
+
<li>grok-3</li>
|
1595 |
+
<li>deepseek-r1</li>
|
1596 |
+
<li>o4-mini</li>
|
1597 |
+
<li>gemini-2.5-pro</li>
|
1598 |
+
</ul>
|
1599 |
+
</div>
|
1600 |
+
<div style="flex: 1; min-width: 120px;">
|
1601 |
+
<h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Judge Models</h5>
|
1602 |
+
<ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
|
1603 |
+
<li>mistral-large</li>
|
1604 |
+
<li>gpt-4o</li>
|
1605 |
+
<li>Qwen3-235B-A22B</li>
|
1606 |
+
<li>grok-3</li>
|
1607 |
+
<li>deepseek-r1</li>
|
1608 |
+
<li>o4-mini</li>
|
1609 |
+
<li>gemini-2.5-pro</li>
|
1610 |
+
</ul>
|
1611 |
+
</div>
|
1612 |
+
</div>
|
1613 |
</div>
|
1614 |
</div>
|
1615 |
</div>
|
|
|
1663 |
</style>
|
1664 |
""")
|
1665 |
|
1666 |
+
# Tab 3: Generator Models Hallucination Leaderboard
|
1667 |
+
with gr.TabItem("User Feedback", elem_id="user-feedback-tab"):
|
1668 |
+
gr.Markdown("## Model Hallucination Evaluation (User Feedback)")
|
1669 |
+
gr.Markdown("Performance ranking of generator models based on user-reported hallucination rates.")
|
1670 |
|
1671 |
# Create leaderboard table for user feedback
|
1672 |
user_feedback_html = gr.HTML("""
|
|
|
1676 |
<tr>
|
1677 |
<th>Rank</th>
|
1678 |
<th>Generator Model</th>
|
1679 |
+
<th>ELO Score</th>
|
1680 |
+
<th>Accuracy</th>
|
1681 |
+
<th>Sample Size</th>
|
|
|
|
|
1682 |
</tr>
|
1683 |
</thead>
|
1684 |
<tbody>
|
1685 |
<tr>
|
1686 |
<td>1</td>
|
1687 |
<td>gpt-4o</td>
|
1688 |
+
<td>1856</td>
|
1689 |
<td>96.4%</td>
|
|
|
|
|
1690 |
<td>256</td>
|
1691 |
</tr>
|
1692 |
<tr>
|
1693 |
<td>2</td>
|
1694 |
<td>mistral-large</td>
|
1695 |
+
<td>1802</td>
|
1696 |
<td>93.8%</td>
|
|
|
|
|
1697 |
<td>221</td>
|
1698 |
</tr>
|
1699 |
<tr>
|
1700 |
<td>3</td>
|
1701 |
+
<td>Qwen3-235B-A22B</td>
|
1702 |
+
<td>1765</td>
|
1703 |
<td>91.5%</td>
|
|
|
|
|
1704 |
<td>192</td>
|
1705 |
</tr>
|
1706 |
<tr>
|
1707 |
<td>4</td>
|
|
|
1708 |
<td>o4-mini</td>
|
1709 |
+
<td>1732</td>
|
1710 |
<td>89.3%</td>
|
|
|
|
|
1711 |
<td>178</td>
|
1712 |
</tr>
|
1713 |
<tr>
|
1714 |
<td>5</td>
|
|
|
1715 |
<td>gemini-2.5-pro</td>
|
1716 |
+
<td>1695</td>
|
1717 |
<td>87.2%</td>
|
|
|
|
|
1718 |
<td>165</td>
|
1719 |
</tr>
|
1720 |
<tr>
|
1721 |
<td>6</td>
|
1722 |
<td>grok-3</td>
|
1723 |
+
<td>1665</td>
|
1724 |
<td>85.7%</td>
|
|
|
|
|
1725 |
<td>147</td>
|
1726 |
</tr>
|
1727 |
<tr>
|
1728 |
<td>7</td>
|
1729 |
<td>deepseek-r1</td>
|
1730 |
+
<td>1625</td>
|
1731 |
<td>83.2%</td>
|
|
|
|
|
1732 |
<td>134</td>
|
1733 |
</tr>
|
1734 |
</tbody>
|
1735 |
</table>
|
1736 |
</div>
|
1737 |
|
1738 |
+
<div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
|
1739 |
+
<h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
|
1740 |
|
1741 |
<div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
|
1742 |
+
<div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1743 |
+
<h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
|
1744 |
+
<p style="color: #eceff1;">Our ELO rating system assigns scores to models based on user feedback, using the following formula:</p>
|
1745 |
+
<div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
|
1746 |
+
<code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
|
1747 |
+
Where:<br>
|
1748 |
+
• <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model<br>
|
1749 |
+
• <strong style="color: #b2dfdb;">K</strong>: Weight factor (40 for new models, 20 for established ones)<br>
|
1750 |
+
• <strong style="color: #b2dfdb;">S</strong>: Actual score (1 for correct hallucination detection, 0 for incorrect)<br>
|
1751 |
+
• <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
|
1752 |
+
<em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
|
|
|
|
|
|
|
|
|
1753 |
</div>
|
1754 |
+
<p style="color: #eceff1; margin-top: 10px;">All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p>
|
1755 |
+
</div>
|
1756 |
+
<div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1757 |
+
<h4 style="margin-top: 0; color: #ffffff;">Interpretation Guidelines</h4>
|
1758 |
+
<ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
|
1759 |
+
<li><strong style="color: #b2dfdb;">1800+</strong>: Exceptional performance, very rare hallucinations</li>
|
1760 |
+
<li><strong style="color: #b2dfdb;">1700-1799</strong>: Superior performance, minimal hallucinations</li>
|
1761 |
+
<li><strong style="color: #b2dfdb;">1600-1699</strong>: Good performance, occasional hallucinations</li>
|
1762 |
+
<li><strong style="color: #b2dfdb;">1500-1599</strong>: Average performance</li>
|
1763 |
+
<li><strong style="color: #b2dfdb;">&lt;1500</strong>: Below average, frequent hallucinations</li>
|
1764 |
+
</ul>
|
1765 |
+
<p style="font-style: italic; color: #b3e5fc; margin-top: 10px;">
|
1766 |
+
Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.
|
1767 |
+
</p>
|
1768 |
</div>
|
1769 |
</div>
|
1770 |
</div>
|
|
|
1801 |
return stats_html
|
1802 |
return ""
|
1803 |
|
1804 |
+
# Feedback section is now moved directly inside the Detector tab
|
1805 |
+
|
1806 |
# Set up interval to update stats
|
1807 |
with gr.Row(elem_id="stats-container"):
|
1808 |
with gr.Column():
|
|
|
1859 |
}, refreshInterval);
|
1860 |
}
|
1861 |
|
1862 |
+
// Add highlighting to the selected tab and handle feedback section visibility
|
1863 |
function setupTabHighlighting() {
|
1864 |
// Add hover effects to tabs
|
1865 |
const tabs = document.querySelectorAll('.tabs button');
|
|
|
1875 |
tab.style.backgroundColor = '';
|
1876 |
}
|
1877 |
});
|
1878 |
+
|
1879 |
+
// Handle tab click events to manage feedback section visibility
|
1880 |
+
tab.addEventListener('click', function() {
|
1881 |
+
// Use setTimeout to let Gradio UI update first
|
1882 |
+
setTimeout(() => {
|
1883 |
+
// Check if this tab is selected and what its text is
|
1884 |
+
const isDetectorTab = this.classList.contains('selected') &&
|
1885 |
+
!this.textContent.includes('Model') &&
|
1886 |
+
!this.textContent.includes('User');
|
1887 |
+
|
1888 |
+
// Find all accordions in the page
|
1889 |
+
const accordions = document.querySelectorAll('.accordion');
|
1890 |
+
|
1891 |
+
// Loop through all accordions
|
1892 |
+
accordions.forEach(acc => {
|
1893 |
+
// Check if this is the feedback accordion
|
1894 |
+
if (acc.textContent.includes('Provide Feedback') ||
|
1895 |
+
acc.textContent.includes('Help Improve')) {
|
1896 |
+
|
1897 |
+
if (isDetectorTab) {
|
1898 |
+
acc.style.display = 'block';
|
1899 |
+
} else {
|
1900 |
+
acc.style.display = 'none';
|
1901 |
+
}
|
1902 |
+
}
|
1903 |
+
});
|
1904 |
+
}, 100);
|
1905 |
+
});
|
1906 |
});
|
1907 |
}
|
1908 |
}
|
|
|
1911 |
function setupAllEnhancements() {
|
1912 |
setupAutoRefresh();
|
1913 |
setupTabHighlighting();
|
1914 |
+
|
1915 |
+
// Simple solution to ensure feedback is only visible in detector tab
|
1916 |
+
setTimeout(() => {
|
1917 |
+
// Get the feedback accordion by ID
|
1918 |
+
const feedbackAccordion = document.getElementById('detector-feedback');
|
1919 |
+
if (!feedbackAccordion) return;
|
1920 |
+
|
1921 |
+
// Get all tabs
|
1922 |
+
const tabs = document.querySelectorAll('.tabs button');
|
1923 |
+
if (tabs.length === 0) return;
|
1924 |
+
|
1925 |
+
// Add click handlers to each tab
|
1926 |
+
tabs.forEach((tab, index) => {
|
1927 |
+
// Check if it's the first tab (Detector)
|
1928 |
+
const isDetectorTab = index === 0;
|
1929 |
+
|
1930 |
+
// When a tab is clicked, toggle the feedback visibility
|
1931 |
+
tab.addEventListener('click', function() {
|
1932 |
+
if (feedbackAccordion) {
|
1933 |
+
// Give time for Gradio to update the UI
|
1934 |
+
setTimeout(() => {
|
1935 |
+
feedbackAccordion.style.display = this.classList.contains('selected') && isDetectorTab ? 'block' : 'none';
|
1936 |
+
}, 100);
|
1937 |
+
}
|
1938 |
+
});
|
1939 |
+
});
|
1940 |
+
|
1941 |
+
// Initial setup - make sure feedback is only visible if detector tab is active
|
1942 |
+
const activeTab = document.querySelector('.tabs button.selected');
|
1943 |
+
const activeTabIndex = Array.from(tabs).indexOf(activeTab);
|
1944 |
+
|
1945 |
+
if (activeTabIndex !== 0) { // If not on detector tab
|
1946 |
+
feedbackAccordion.style.display = 'none';
|
1947 |
+
}
|
1948 |
+
|
1949 |
+
// Also create a style rule for safety
|
1950 |
+
const style = document.createElement('style');
|
1951 |
+
style.textContent = `
|
1952 |
+
.tabs[data-testid*="tab"] button:not(:first-child).selected ~ .tabitem #detector-feedback {
|
1953 |
+
display: none !important;
|
1954 |
+
}
|
1955 |
+
`;
|
1956 |
+
document.head.appendChild(style);
|
1957 |
+
|
1958 |
+
}, 300);
|
1959 |
}
|
1960 |
|
1961 |
if (window.gradio_loaded) {
|
|
|
1985 |
from { opacity: 0; }
|
1986 |
to { opacity: 1; }
|
1987 |
}
|
1988 |
+
|
1989 |
+
/* Initial setting - show feedback accordion */
|
1990 |
+
#detector-feedback {
|
1991 |
+
display: block !important;
|
1992 |
+
}
|
1993 |
+
|
1994 |
+
/* Hide when in other tabs using IDs */
|
1995 |
+
#model-leaderboard-tab #detector-feedback,
|
1996 |
+
#user-feedback-tab #detector-feedback {
|
1997 |
+
display: none !important;
|
1998 |
+
}
|
1999 |
</style>
|
2000 |
""")
|
2001 |
|
2002 |
+
# Removed duplicate feedback section (moved to above the stats container)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2003 |
|
2004 |
# Hidden state to store results for feedback
|
2005 |
hidden_results = gr.State()
|