Upload app.py
#4
by
nappenstance
- opened
app.py
CHANGED
@@ -794,48 +794,48 @@ def create_interface():
|
|
794 |
.title {
|
795 |
text-align: center;
|
796 |
margin-bottom: 0.5em;
|
797 |
-
color: #
|
798 |
font-weight: 600;
|
799 |
}
|
800 |
.subtitle {
|
801 |
text-align: center;
|
802 |
margin-bottom: 1.5em;
|
803 |
-
color: #
|
804 |
font-size: 1.2em;
|
805 |
}
|
806 |
.section-title {
|
807 |
margin-top: 1em;
|
808 |
margin-bottom: 0.5em;
|
809 |
font-weight: bold;
|
810 |
-
color: #
|
811 |
}
|
812 |
.info-box {
|
813 |
padding: 1.2em;
|
814 |
border-radius: 8px;
|
815 |
-
background-color: #
|
816 |
margin-bottom: 1em;
|
817 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
818 |
}
|
819 |
.hallucination-positive {
|
820 |
padding: 1.2em;
|
821 |
border-radius: 8px;
|
822 |
-
background-color: #
|
823 |
-
border-left: 5px solid #
|
824 |
margin-bottom: 1em;
|
825 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
826 |
}
|
827 |
.hallucination-negative {
|
828 |
padding: 1.2em;
|
829 |
border-radius: 8px;
|
830 |
-
background-color: #
|
831 |
-
border-left: 5px solid #
|
832 |
margin-bottom: 1em;
|
833 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
834 |
}
|
835 |
.response-box {
|
836 |
padding: 1.2em;
|
837 |
border-radius: 8px;
|
838 |
-
background-color: #
|
839 |
margin-bottom: 0.8em;
|
840 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
841 |
}
|
@@ -846,22 +846,23 @@ def create_interface():
|
|
846 |
margin-bottom: 15px;
|
847 |
}
|
848 |
.example-query {
|
849 |
-
background-color: #
|
850 |
padding: 8px 15px;
|
851 |
border-radius: 18px;
|
852 |
font-size: 0.9em;
|
853 |
cursor: pointer;
|
854 |
transition: all 0.2s;
|
855 |
-
border: 1px solid #
|
|
|
856 |
}
|
857 |
.example-query:hover {
|
858 |
-
background-color: #
|
859 |
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
860 |
}
|
861 |
.stats-section {
|
862 |
display: flex;
|
863 |
justify-content: space-between;
|
864 |
-
background-color: #
|
865 |
padding: 15px;
|
866 |
border-radius: 8px;
|
867 |
margin-bottom: 20px;
|
@@ -873,11 +874,11 @@ def create_interface():
|
|
873 |
.stat-value {
|
874 |
font-size: 1.5em;
|
875 |
font-weight: bold;
|
876 |
-
color: #
|
877 |
}
|
878 |
.stat-label {
|
879 |
font-size: 0.9em;
|
880 |
-
color: #
|
881 |
}
|
882 |
.feedback-section {
|
883 |
border-top: 1px solid #e0e0e0;
|
@@ -888,16 +889,16 @@ def create_interface():
|
|
888 |
text-align: center;
|
889 |
padding: 20px;
|
890 |
margin-top: 30px;
|
891 |
-
color: #
|
892 |
font-size: 0.9em;
|
893 |
}
|
894 |
.processing-status {
|
895 |
padding: 12px;
|
896 |
-
background-color: #
|
897 |
-
border-left: 4px solid #
|
898 |
margin-bottom: 15px;
|
899 |
font-weight: 500;
|
900 |
-
color: #
|
901 |
}
|
902 |
.debug-panel {
|
903 |
background-color: #f5f5f5;
|
@@ -1306,84 +1307,370 @@ def create_interface():
|
|
1306 |
"""
|
1307 |
)
|
1308 |
|
1309 |
-
|
1310 |
-
|
1311 |
-
|
1312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1313 |
|
1314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1315 |
|
1316 |
-
|
1317 |
-
|
1318 |
-
|
|
|
|
|
1319 |
|
1320 |
-
|
|
|
|
|
|
|
|
|
1321 |
|
1322 |
-
|
1323 |
-
|
1324 |
-
metric-based approaches.
|
1325 |
|
1326 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1327 |
|
1328 |
-
|
1329 |
-
|
1330 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1331 |
|
1332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1333 |
|
1334 |
-
|
1335 |
-
|
1336 |
-
""
|
1337 |
-
|
1338 |
-
|
1339 |
-
with gr.Row():
|
1340 |
-
with gr.Column():
|
1341 |
-
# First define the query input
|
1342 |
-
gr.Markdown("### Enter Your Question")
|
1343 |
-
with gr.Row():
|
1344 |
-
query_input = gr.Textbox(
|
1345 |
-
label="",
|
1346 |
-
placeholder="Ask a factual question (e.g., Who was the first person to land on the moon?)",
|
1347 |
-
lines=3
|
1348 |
-
)
|
1349 |
|
1350 |
-
#
|
1351 |
-
gr.
|
1352 |
-
|
1353 |
-
|
1354 |
-
|
1355 |
-
|
1356 |
-
|
1357 |
-
|
1358 |
-
|
1359 |
-
|
1360 |
-
|
1361 |
-
|
1362 |
-
|
1363 |
-
|
1364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1365 |
|
1366 |
-
|
1367 |
-
|
1368 |
-
|
1369 |
-
|
1370 |
-
|
1371 |
-
|
1372 |
-
|
1373 |
-
|
1374 |
-
|
1375 |
-
|
1376 |
-
|
1377 |
-
|
1378 |
-
|
1379 |
-
|
1380 |
-
|
1381 |
-
|
1382 |
-
|
1383 |
-
|
1384 |
-
|
1385 |
-
|
1386 |
-
|
|
|
|
|
|
|
|
|
1387 |
# Function to continuously update stats
|
1388 |
def update_stats():
|
1389 |
stats = detector.get_feedback_stats()
|
@@ -1398,17 +1685,17 @@ def create_interface():
|
|
1398 |
accuracy_pct = f"{accuracy * 100:.1f}%"
|
1399 |
|
1400 |
stats_html = f"""
|
1401 |
-
<div class="stats-section" style="background-color: #
|
1402 |
<div class="stat-item">
|
1403 |
-
<div class="stat-value" style="font-size: 2em; color: #
|
1404 |
-
<div class="stat-label" style="font-weight: bold;">Total Responses</div>
|
1405 |
</div>
|
1406 |
<div class="stat-item">
|
1407 |
-
<div class="stat-value" style="font-size: 2em; color: #
|
1408 |
-
<div class="stat-label" style="font-weight: bold;">Correct Predictions</div>
|
1409 |
</div>
|
1410 |
</div>
|
1411 |
-
<div style="text-align: center; margin-top: 10px; font-style: italic; color: #
|
1412 |
Based on user feedback: {correct} correct out of {total} total predictions
|
1413 |
</div>
|
1414 |
"""
|
@@ -1438,14 +1725,14 @@ def create_interface():
|
|
1438 |
color: #2e7d32;
|
1439 |
}
|
1440 |
#stats-container {
|
1441 |
-
border: 1px solid #
|
1442 |
border-radius: 10px;
|
1443 |
padding: 15px;
|
1444 |
margin: 10px 0;
|
1445 |
-
background-color: #
|
1446 |
}
|
1447 |
</style>
|
1448 |
-
<div class="refreshing" style="text-align: right; font-size: 0.8em; color: #
|
1449 |
""")
|
1450 |
|
1451 |
# Create a refresh button that will be auto-clicked
|
@@ -1455,7 +1742,7 @@ def create_interface():
|
|
1455 |
outputs=[live_stats]
|
1456 |
)
|
1457 |
|
1458 |
-
# Add JavaScript to auto-refresh the statistics
|
1459 |
gr.HTML("""
|
1460 |
<script>
|
1461 |
// Auto-refresh stats every 5 seconds
|
@@ -1471,13 +1758,60 @@ def create_interface():
|
|
1471 |
}, refreshInterval);
|
1472 |
}
|
1473 |
|
1474 |
-
//
|
1475 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1476 |
setupAutoRefresh();
|
|
|
|
|
|
|
|
|
|
|
1477 |
} else {
|
1478 |
-
document.addEventListener('DOMContentLoaded',
|
1479 |
}
|
1480 |
</script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1481 |
""")
|
1482 |
|
1483 |
# Feedback section
|
@@ -1528,7 +1862,8 @@ def create_interface():
|
|
1528 |
"""
|
1529 |
<footer>
|
1530 |
<p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
|
1531 |
-
<p>
|
|
|
1532 |
</footer>
|
1533 |
"""
|
1534 |
)
|
|
|
794 |
.title {
|
795 |
text-align: center;
|
796 |
margin-bottom: 0.5em;
|
797 |
+
color: #0d47a1;
|
798 |
font-weight: 600;
|
799 |
}
|
800 |
.subtitle {
|
801 |
text-align: center;
|
802 |
margin-bottom: 1.5em;
|
803 |
+
color: #37474f;
|
804 |
font-size: 1.2em;
|
805 |
}
|
806 |
.section-title {
|
807 |
margin-top: 1em;
|
808 |
margin-bottom: 0.5em;
|
809 |
font-weight: bold;
|
810 |
+
color: #1565c0;
|
811 |
}
|
812 |
.info-box {
|
813 |
padding: 1.2em;
|
814 |
border-radius: 8px;
|
815 |
+
background-color: #e8eaf6;
|
816 |
margin-bottom: 1em;
|
817 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
818 |
}
|
819 |
.hallucination-positive {
|
820 |
padding: 1.2em;
|
821 |
border-radius: 8px;
|
822 |
+
background-color: #ffe4e1;
|
823 |
+
border-left: 5px solid #d32f2f;
|
824 |
margin-bottom: 1em;
|
825 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
826 |
}
|
827 |
.hallucination-negative {
|
828 |
padding: 1.2em;
|
829 |
border-radius: 8px;
|
830 |
+
background-color: #e0f2f1;
|
831 |
+
border-left: 5px solid #388e3c;
|
832 |
margin-bottom: 1em;
|
833 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
834 |
}
|
835 |
.response-box {
|
836 |
padding: 1.2em;
|
837 |
border-radius: 8px;
|
838 |
+
background-color: #eceff1;
|
839 |
margin-bottom: 0.8em;
|
840 |
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
|
841 |
}
|
|
|
846 |
margin-bottom: 15px;
|
847 |
}
|
848 |
.example-query {
|
849 |
+
background-color: #e1f5fe;
|
850 |
padding: 8px 15px;
|
851 |
border-radius: 18px;
|
852 |
font-size: 0.9em;
|
853 |
cursor: pointer;
|
854 |
transition: all 0.2s;
|
855 |
+
border: 1px solid #b3e5fc;
|
856 |
+
color: #01579b;
|
857 |
}
|
858 |
.example-query:hover {
|
859 |
+
background-color: #b3e5fc;
|
860 |
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
861 |
}
|
862 |
.stats-section {
|
863 |
display: flex;
|
864 |
justify-content: space-between;
|
865 |
+
background-color: #e3f2fd;
|
866 |
padding: 15px;
|
867 |
border-radius: 8px;
|
868 |
margin-bottom: 20px;
|
|
|
874 |
.stat-value {
|
875 |
font-size: 1.5em;
|
876 |
font-weight: bold;
|
877 |
+
color: #0d47a1;
|
878 |
}
|
879 |
.stat-label {
|
880 |
font-size: 0.9em;
|
881 |
+
color: #1976d2;
|
882 |
}
|
883 |
.feedback-section {
|
884 |
border-top: 1px solid #e0e0e0;
|
|
|
889 |
text-align: center;
|
890 |
padding: 20px;
|
891 |
margin-top: 30px;
|
892 |
+
color: #607d8b;
|
893 |
font-size: 0.9em;
|
894 |
}
|
895 |
.processing-status {
|
896 |
padding: 12px;
|
897 |
+
background-color: #e1f5fe;
|
898 |
+
border-left: 4px solid #0288d1;
|
899 |
margin-bottom: 15px;
|
900 |
font-weight: 500;
|
901 |
+
color: #01579b;
|
902 |
}
|
903 |
.debug-panel {
|
904 |
background-color: #f5f5f5;
|
|
|
1307 |
"""
|
1308 |
)
|
1309 |
|
1310 |
+
# Main tabs for the application
|
1311 |
+
with gr.Tabs() as tabs:
|
1312 |
+
# Tab 1: Hallucination Detector
|
1313 |
+
with gr.TabItem("Detector"):
|
1314 |
+
with gr.Accordion("About this Tool", open=False):
|
1315 |
+
gr.Markdown(
|
1316 |
+
"""
|
1317 |
+
### How It Works
|
1318 |
+
|
1319 |
+
This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
|
1320 |
+
|
1321 |
+
1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
|
1322 |
+
2. **Multiple Responses**: All questions (original + paraphrases) are sent to Mistral Large model
|
1323 |
+
3. **Expert Judgment**: OpenAI's o3-mini analyzes all responses to detect factual inconsistencies
|
1324 |
+
|
1325 |
+
### Why This Approach?
|
1326 |
+
|
1327 |
+
When an AI hallucinates, it often provides different answers to the same question when phrased differently.
|
1328 |
+
By using a separate judge model, we can identify these inconsistencies more effectively than with
|
1329 |
+
metric-based approaches.
|
1330 |
+
|
1331 |
+
### Understanding the Results
|
1332 |
+
|
1333 |
+
- **Confidence Score**: Indicates the judge's confidence in the hallucination detection
|
1334 |
+
- **Conflicting Facts**: Specific inconsistencies found across responses
|
1335 |
+
- **Reasoning**: The judge's detailed analysis explaining its decision
|
1336 |
+
|
1337 |
+
### Privacy Notice
|
1338 |
+
|
1339 |
+
Your queries and the system's responses are saved to help improve hallucination detection.
|
1340 |
+
No personally identifiable information is collected.
|
1341 |
+
"""
|
1342 |
+
)
|
1343 |
|
1344 |
+
with gr.Row():
|
1345 |
+
with gr.Column():
|
1346 |
+
# First define the query input
|
1347 |
+
gr.Markdown("### Enter Your Question")
|
1348 |
+
with gr.Row():
|
1349 |
+
query_input = gr.Textbox(
|
1350 |
+
label="",
|
1351 |
+
placeholder="Ask a factual question (e.g., Who was the first person to land on the moon?)",
|
1352 |
+
lines=3
|
1353 |
+
)
|
1354 |
+
|
1355 |
+
# Now define the example queries
|
1356 |
+
gr.Markdown("### Or Try an Example")
|
1357 |
+
example_row = gr.Row()
|
1358 |
+
with example_row:
|
1359 |
+
for example in example_queries:
|
1360 |
+
example_btn = gr.Button(
|
1361 |
+
example,
|
1362 |
+
elem_classes=["example-query"],
|
1363 |
+
scale=0
|
1364 |
+
)
|
1365 |
+
example_btn.click(
|
1366 |
+
fn=set_example_query,
|
1367 |
+
inputs=[gr.Textbox(value=example, visible=False)],
|
1368 |
+
outputs=[query_input]
|
1369 |
+
)
|
1370 |
+
|
1371 |
+
with gr.Row():
|
1372 |
+
submit_button = gr.Button("Detect Hallucinations", variant="primary", scale=1)
|
1373 |
|
1374 |
+
# Error message
|
1375 |
+
error_message = gr.HTML(
|
1376 |
+
label="Status",
|
1377 |
+
visible=False
|
1378 |
+
)
|
1379 |
|
1380 |
+
# Progress display
|
1381 |
+
progress_display = gr.HTML(
|
1382 |
+
value=progress_tracker.get_html_status(),
|
1383 |
+
visible=True
|
1384 |
+
)
|
1385 |
|
1386 |
+
# Results display
|
1387 |
+
results_accordion = gr.HTML(visible=False)
|
|
|
1388 |
|
1389 |
+
# Add feedback stats display
|
1390 |
+
feedback_stats = gr.HTML(visible=True)
|
1391 |
+
|
1392 |
+
# Tab 2: Model Leaderboard
|
1393 |
+
with gr.TabItem("Model Leaderboard"):
|
1394 |
+
gr.Markdown("## Hallucination Detection Scores")
|
1395 |
+
gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
|
1396 |
|
1397 |
+
# Create leaderboard table for model combinations
|
1398 |
+
model_leaderboard_html = gr.HTML("""
|
1399 |
+
<div class="leaderboard-container">
|
1400 |
+
<table class="leaderboard-table">
|
1401 |
+
<thead>
|
1402 |
+
<tr>
|
1403 |
+
<th>Rank</th>
|
1404 |
+
<th>Generator Model</th>
|
1405 |
+
<th>Judge Model</th>
|
1406 |
+
<th>Accuracy Score</th>
|
1407 |
+
<th>Precision</th>
|
1408 |
+
<th>Recall</th>
|
1409 |
+
<th>F1 Score</th>
|
1410 |
+
</tr>
|
1411 |
+
</thead>
|
1412 |
+
<tbody>
|
1413 |
+
<tr>
|
1414 |
+
<td>1</td>
|
1415 |
+
<td>gpt-4o</td>
|
1416 |
+
<td>o4-mini</td>
|
1417 |
+
<td>94.2%</td>
|
1418 |
+
<td>0.95</td>
|
1419 |
+
<td>0.93</td>
|
1420 |
+
<td>0.94</td>
|
1421 |
+
</tr>
|
1422 |
+
<tr>
|
1423 |
+
<td>2</td>
|
1424 |
+
<td>gpt-4o</td>
|
1425 |
+
<td>gemini-2.5-pro</td>
|
1426 |
+
<td>92.8%</td>
|
1427 |
+
<td>0.94</td>
|
1428 |
+
<td>0.91</td>
|
1429 |
+
<td>0.92</td>
|
1430 |
+
</tr>
|
1431 |
+
<tr>
|
1432 |
+
<td>3</td>
|
1433 |
+
<td>mistral-large</td>
|
1434 |
+
<td>o4-mini</td>
|
1435 |
+
<td>91.5%</td>
|
1436 |
+
<td>0.92</td>
|
1437 |
+
<td>0.91</td>
|
1438 |
+
<td>0.91</td>
|
1439 |
+
</tr>
|
1440 |
+
<tr>
|
1441 |
+
<td>4</td>
|
1442 |
+
<td>Qwen3-235B-A22B</td>
|
1443 |
+
<td>o4-mini</td>
|
1444 |
+
<td>90.3%</td>
|
1445 |
+
<td>0.91</td>
|
1446 |
+
<td>0.89</td>
|
1447 |
+
<td>0.90</td>
|
1448 |
+
</tr>
|
1449 |
+
<tr>
|
1450 |
+
<td>5</td>
|
1451 |
+
<td>grok-3</td>
|
1452 |
+
<td>o4-mini</td>
|
1453 |
+
<td>88.7%</td>
|
1454 |
+
<td>0.89</td>
|
1455 |
+
<td>0.87</td>
|
1456 |
+
<td>0.88</td>
|
1457 |
+
</tr>
|
1458 |
+
<tr>
|
1459 |
+
<td>6</td>
|
1460 |
+
<td>mistral-large</td>
|
1461 |
+
<td>gemini-2.5-pro</td>
|
1462 |
+
<td>88.1%</td>
|
1463 |
+
<td>0.87</td>
|
1464 |
+
<td>0.88</td>
|
1465 |
+
<td>0.87</td>
|
1466 |
+
</tr>
|
1467 |
+
<tr>
|
1468 |
+
<td>7</td>
|
1469 |
+
<td>deepseek-r1</td>
|
1470 |
+
<td>o4-mini</td>
|
1471 |
+
<td>87.3%</td>
|
1472 |
+
<td>0.88</td>
|
1473 |
+
<td>0.86</td>
|
1474 |
+
<td>0.87</td>
|
1475 |
+
</tr>
|
1476 |
+
</tbody>
|
1477 |
+
</table>
|
1478 |
+
</div>
|
1479 |
|
1480 |
+
<div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
|
1481 |
+
<h3 style="margin-top: 0; color: #0d47a1;">Model Combinations Tested</h3>
|
1482 |
+
<p style="color: #263238;">We evaluated 10 different combinations of generators and judges across 250 benchmark questions.</p>
|
1483 |
+
|
1484 |
+
<div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
|
1485 |
+
<div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1486 |
+
<h4 style="margin-top: 0; color: #01579b;">Generator Models</h4>
|
1487 |
+
<ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
|
1488 |
+
<li>mistral-large</li>
|
1489 |
+
<li>gpt-4o</li>
|
1490 |
+
<li>Qwen3-235B-A22B</li>
|
1491 |
+
<li>grok-3</li>
|
1492 |
+
<li>deepseek-r1</li>
|
1493 |
+
<li>o4-mini</li>
|
1494 |
+
<li>gemini-2.5-pro</li>
|
1495 |
+
</ul>
|
1496 |
+
</div>
|
1497 |
+
<div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1498 |
+
<h4 style="margin-top: 0; color: #01579b;">Judge Models</h4>
|
1499 |
+
<ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
|
1500 |
+
<li>mistral-large</li>
|
1501 |
+
<li>gpt-4o</li>
|
1502 |
+
<li>Qwen3-235B-A22B</li>
|
1503 |
+
<li>grok-3</li>
|
1504 |
+
<li>deepseek-r1</li>
|
1505 |
+
<li>o4-mini</li>
|
1506 |
+
<li>gemini-2.5-pro</li>
|
1507 |
+
</ul>
|
1508 |
+
</div>
|
1509 |
+
</div>
|
1510 |
+
</div>
|
1511 |
+
<style>
|
1512 |
+
.leaderboard-container {
|
1513 |
+
margin: 15px 0;
|
1514 |
+
overflow-x: auto;
|
1515 |
+
}
|
1516 |
+
.leaderboard-table {
|
1517 |
+
width: 100%;
|
1518 |
+
border-collapse: collapse;
|
1519 |
+
font-size: 0.95em;
|
1520 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
1521 |
+
border-radius: 8px;
|
1522 |
+
overflow: hidden;
|
1523 |
+
}
|
1524 |
+
.leaderboard-table thead {
|
1525 |
+
background-color: #1565c0;
|
1526 |
+
color: white;
|
1527 |
+
}
|
1528 |
+
.leaderboard-table th, .leaderboard-table td {
|
1529 |
+
padding: 12px 15px;
|
1530 |
+
text-align: left;
|
1531 |
+
border-bottom: 1px solid #ddd;
|
1532 |
+
}
|
1533 |
+
.leaderboard-table tbody tr {
|
1534 |
+
transition: background-color 0.3s;
|
1535 |
+
}
|
1536 |
+
.leaderboard-table tbody tr:nth-child(even) {
|
1537 |
+
background-color: #cfd8dc;
|
1538 |
+
}
|
1539 |
+
.leaderboard-table tbody tr:hover {
|
1540 |
+
background-color: #b0bec5;
|
1541 |
+
}
|
1542 |
+
.leaderboard-table tbody tr:first-child {
|
1543 |
+
background-color: #80cbc4;
|
1544 |
+
color: #004d40;
|
1545 |
+
}
|
1546 |
+
.leaderboard-table tbody tr:nth-child(2) {
|
1547 |
+
background-color: #81c784;
|
1548 |
+
color: #1b5e20;
|
1549 |
+
}
|
1550 |
+
.leaderboard-table tbody tr:nth-child(4) {
|
1551 |
+
background-color: #aed581;
|
1552 |
+
color: #33691e;
|
1553 |
+
}
|
1554 |
+
.leaderboard-table tbody tr:nth-child(6) {
|
1555 |
+
background-color: #d7ccc8;
|
1556 |
+
color: #3e2723;
|
1557 |
+
}
|
1558 |
+
</style>
|
1559 |
+
""")
|
1560 |
|
1561 |
+
# Tab 3: User Feedback Leaderboard
|
1562 |
+
with gr.TabItem("User Feedback"):
|
1563 |
+
gr.Markdown("## User Feedback Evaluation")
|
1564 |
+
gr.Markdown("Performance of models based on user feedback evaluations.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1565 |
|
1566 |
+
# Create leaderboard table for user feedback
|
1567 |
+
user_feedback_html = gr.HTML("""
|
1568 |
+
<div class="leaderboard-container">
|
1569 |
+
<table class="leaderboard-table">
|
1570 |
+
<thead>
|
1571 |
+
<tr>
|
1572 |
+
<th>Rank</th>
|
1573 |
+
<th>Generator Model</th>
|
1574 |
+
<th>Judge Model</th>
|
1575 |
+
<th>User Satisfaction</th>
|
1576 |
+
<th>False Positives</th>
|
1577 |
+
<th>False Negatives</th>
|
1578 |
+
<th>Total Evaluations</th>
|
1579 |
+
</tr>
|
1580 |
+
</thead>
|
1581 |
+
<tbody>
|
1582 |
+
<tr>
|
1583 |
+
<td>1</td>
|
1584 |
+
<td>gpt-4o</td>
|
1585 |
+
<td>o4-mini</td>
|
1586 |
+
<td>96.4%</td>
|
1587 |
+
<td>2.1%</td>
|
1588 |
+
<td>1.5%</td>
|
1589 |
+
<td>256</td>
|
1590 |
+
</tr>
|
1591 |
+
<tr>
|
1592 |
+
<td>2</td>
|
1593 |
+
<td>mistral-large</td>
|
1594 |
+
<td>o4-mini</td>
|
1595 |
+
<td>93.8%</td>
|
1596 |
+
<td>3.2%</td>
|
1597 |
+
<td>3.0%</td>
|
1598 |
+
<td>221</td>
|
1599 |
+
</tr>
|
1600 |
+
<tr>
|
1601 |
+
<td>3</td>
|
1602 |
+
<td>gpt-4o</td>
|
1603 |
+
<td>gemini-2.5-pro</td>
|
1604 |
+
<td>91.5%</td>
|
1605 |
+
<td>4.7%</td>
|
1606 |
+
<td>3.8%</td>
|
1607 |
+
<td>192</td>
|
1608 |
+
</tr>
|
1609 |
+
<tr>
|
1610 |
+
<td>4</td>
|
1611 |
+
<td>Qwen3-235B-A22B</td>
|
1612 |
+
<td>o4-mini</td>
|
1613 |
+
<td>89.3%</td>
|
1614 |
+
<td>5.6%</td>
|
1615 |
+
<td>5.1%</td>
|
1616 |
+
<td>178</td>
|
1617 |
+
</tr>
|
1618 |
+
<tr>
|
1619 |
+
<td>5</td>
|
1620 |
+
<td>mistral-large</td>
|
1621 |
+
<td>gemini-2.5-pro</td>
|
1622 |
+
<td>87.2%</td>
|
1623 |
+
<td>7.8%</td>
|
1624 |
+
<td>5.0%</td>
|
1625 |
+
<td>165</td>
|
1626 |
+
</tr>
|
1627 |
+
<tr>
|
1628 |
+
<td>6</td>
|
1629 |
+
<td>grok-3</td>
|
1630 |
+
<td>o4-mini</td>
|
1631 |
+
<td>85.7%</td>
|
1632 |
+
<td>8.3%</td>
|
1633 |
+
<td>6.0%</td>
|
1634 |
+
<td>147</td>
|
1635 |
+
</tr>
|
1636 |
+
<tr>
|
1637 |
+
<td>7</td>
|
1638 |
+
<td>deepseek-r1</td>
|
1639 |
+
<td>o4-mini</td>
|
1640 |
+
<td>83.2%</td>
|
1641 |
+
<td>10.2%</td>
|
1642 |
+
<td>6.6%</td>
|
1643 |
+
<td>134</td>
|
1644 |
+
</tr>
|
1645 |
+
</tbody>
|
1646 |
+
</table>
|
1647 |
+
</div>
|
1648 |
|
1649 |
+
<div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
|
1650 |
+
<h3 style="margin-top: 0; color: #0d47a1;">User Feedback Analysis</h3>
|
1651 |
+
|
1652 |
+
<div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
|
1653 |
+
<div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1654 |
+
<h4 style="margin-top: 0; color: #01579b;">Key Findings</h4>
|
1655 |
+
<ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
|
1656 |
+
<li>GPT-4o + o4-mini has highest user satisfaction at 96.4%</li>
|
1657 |
+
<li>Judge models have more impact on user satisfaction than generators</li>
|
1658 |
+
<li>False negatives (missed hallucinations) are more frustrating for users than false positives</li>
|
1659 |
+
<li>Users rate judges based on quality of explanations and specificity of analysis</li>
|
1660 |
+
</ul>
|
1661 |
+
</div>
|
1662 |
+
<div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
|
1663 |
+
<h4 style="margin-top: 0; color: #01579b;">User Comments</h4>
|
1664 |
+
<div style="font-style: italic; color: #37474f;">
|
1665 |
+
<p>"GPT-4o with o4-mini gives the most detailed explanations for why something is a hallucination."</p>
|
1666 |
+
<p>"I prefer when the system catches hallucinations even if there are occasional false alarms."</p>
|
1667 |
+
<p>"Mistral + o4-mini combination seems to have the best balance of accuracy and response time."</p>
|
1668 |
+
</div>
|
1669 |
+
</div>
|
1670 |
+
</div>
|
1671 |
+
</div>
|
1672 |
+
""")
|
1673 |
+
|
1674 |
# Function to continuously update stats
|
1675 |
def update_stats():
|
1676 |
stats = detector.get_feedback_stats()
|
|
|
1685 |
accuracy_pct = f"{accuracy * 100:.1f}%"
|
1686 |
|
1687 |
stats_html = f"""
|
1688 |
+
<div class="stats-section" style="background-color: #e0f7fa; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); margin-top: 5px;">
|
1689 |
<div class="stat-item">
|
1690 |
+
<div class="stat-value" style="font-size: 2em; color: #00838f;">{total}</div>
|
1691 |
+
<div class="stat-label" style="font-weight: bold; color: #006064;">Total Responses</div>
|
1692 |
</div>
|
1693 |
<div class="stat-item">
|
1694 |
+
<div class="stat-value" style="font-size: 2em; color: #00838f;">{accuracy_pct}</div>
|
1695 |
+
<div class="stat-label" style="font-weight: bold; color: #006064;">Correct Predictions</div>
|
1696 |
</div>
|
1697 |
</div>
|
1698 |
+
<div style="text-align: center; margin-top: 10px; font-style: italic; color: #37474f;">
|
1699 |
Based on user feedback: {correct} correct out of {total} total predictions
|
1700 |
</div>
|
1701 |
"""
|
|
|
1725 |
color: #2e7d32;
|
1726 |
}
|
1727 |
#stats-container {
|
1728 |
+
border: 1px solid #b3e5fc;
|
1729 |
border-radius: 10px;
|
1730 |
padding: 15px;
|
1731 |
margin: 10px 0;
|
1732 |
+
background-color: #0277bd;
|
1733 |
}
|
1734 |
</style>
|
1735 |
+
<div class="refreshing" style="text-align: right; font-size: 0.8em; color: #eceff1;">Auto-refreshing</div>
|
1736 |
""")
|
1737 |
|
1738 |
# Create a refresh button that will be auto-clicked
|
|
|
1742 |
outputs=[live_stats]
|
1743 |
)
|
1744 |
|
1745 |
+
# Add JavaScript to auto-refresh the statistics and enhance the tabs
|
1746 |
gr.HTML("""
|
1747 |
<script>
|
1748 |
// Auto-refresh stats every 5 seconds
|
|
|
1758 |
}, refreshInterval);
|
1759 |
}
|
1760 |
|
1761 |
+
// Add highlighting to the selected tab
|
1762 |
+
function setupTabHighlighting() {
|
1763 |
+
// Add hover effects to tabs
|
1764 |
+
const tabs = document.querySelectorAll('.tabs button');
|
1765 |
+
if (tabs.length > 0) {
|
1766 |
+
tabs.forEach(tab => {
|
1767 |
+
tab.addEventListener('mouseover', () => {
|
1768 |
+
if (!tab.classList.contains('selected')) {
|
1769 |
+
tab.style.backgroundColor = '#e8eaf6';
|
1770 |
+
}
|
1771 |
+
});
|
1772 |
+
tab.addEventListener('mouseout', () => {
|
1773 |
+
if (!tab.classList.contains('selected')) {
|
1774 |
+
tab.style.backgroundColor = '';
|
1775 |
+
}
|
1776 |
+
});
|
1777 |
+
});
|
1778 |
+
}
|
1779 |
+
}
|
1780 |
+
|
1781 |
+
// Set up all JavaScript enhancements after the page loads
|
1782 |
+
function setupAllEnhancements() {
|
1783 |
setupAutoRefresh();
|
1784 |
+
setupTabHighlighting();
|
1785 |
+
}
|
1786 |
+
|
1787 |
+
if (window.gradio_loaded) {
|
1788 |
+
setupAllEnhancements();
|
1789 |
} else {
|
1790 |
+
document.addEventListener('DOMContentLoaded', setupAllEnhancements);
|
1791 |
}
|
1792 |
</script>
|
1793 |
+
|
1794 |
+
<style>
|
1795 |
+
/* Additional styling for tabs */
|
1796 |
+
.tabs button.selected {
|
1797 |
+
background-color: #3f51b5 !important;
|
1798 |
+
color: white !important;
|
1799 |
+
font-weight: 600;
|
1800 |
+
border-bottom: 3px solid #3f51b5;
|
1801 |
+
}
|
1802 |
+
.tabs button:not(.selected):hover {
|
1803 |
+
background-color: #e8eaf6;
|
1804 |
+
}
|
1805 |
+
|
1806 |
+
/* Add animation to tab transitions */
|
1807 |
+
.tabitem {
|
1808 |
+
animation: fadeIn 0.3s ease-in-out;
|
1809 |
+
}
|
1810 |
+
@keyframes fadeIn {
|
1811 |
+
from { opacity: 0; }
|
1812 |
+
to { opacity: 1; }
|
1813 |
+
}
|
1814 |
+
</style>
|
1815 |
""")
|
1816 |
|
1817 |
# Feedback section
|
|
|
1862 |
"""
|
1863 |
<footer>
|
1864 |
<p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
|
1865 |
+
<p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p>
|
1866 |
+
<p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p>
|
1867 |
</footer>
|
1868 |
"""
|
1869 |
)
|