Files changed (1) hide show
  1. app.py +436 -101
app.py CHANGED
@@ -794,48 +794,48 @@ def create_interface():
794
  .title {
795
  text-align: center;
796
  margin-bottom: 0.5em;
797
- color: #1a237e;
798
  font-weight: 600;
799
  }
800
  .subtitle {
801
  text-align: center;
802
  margin-bottom: 1.5em;
803
- color: #455a64;
804
  font-size: 1.2em;
805
  }
806
  .section-title {
807
  margin-top: 1em;
808
  margin-bottom: 0.5em;
809
  font-weight: bold;
810
- color: #283593;
811
  }
812
  .info-box {
813
  padding: 1.2em;
814
  border-radius: 8px;
815
- background-color: #f5f5f5;
816
  margin-bottom: 1em;
817
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
818
  }
819
  .hallucination-positive {
820
  padding: 1.2em;
821
  border-radius: 8px;
822
- background-color: #ffebee;
823
- border-left: 5px solid #f44336;
824
  margin-bottom: 1em;
825
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
826
  }
827
  .hallucination-negative {
828
  padding: 1.2em;
829
  border-radius: 8px;
830
- background-color: #e8f5e9;
831
- border-left: 5px solid #4caf50;
832
  margin-bottom: 1em;
833
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
834
  }
835
  .response-box {
836
  padding: 1.2em;
837
  border-radius: 8px;
838
- background-color: #f5f5f5;
839
  margin-bottom: 0.8em;
840
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
841
  }
@@ -846,22 +846,23 @@ def create_interface():
846
  margin-bottom: 15px;
847
  }
848
  .example-query {
849
- background-color: #e3f2fd;
850
  padding: 8px 15px;
851
  border-radius: 18px;
852
  font-size: 0.9em;
853
  cursor: pointer;
854
  transition: all 0.2s;
855
- border: 1px solid #bbdefb;
 
856
  }
857
  .example-query:hover {
858
- background-color: #bbdefb;
859
  box-shadow: 0 2px 5px rgba(0,0,0,0.1);
860
  }
861
  .stats-section {
862
  display: flex;
863
  justify-content: space-between;
864
- background-color: #e8eaf6;
865
  padding: 15px;
866
  border-radius: 8px;
867
  margin-bottom: 20px;
@@ -873,11 +874,11 @@ def create_interface():
873
  .stat-value {
874
  font-size: 1.5em;
875
  font-weight: bold;
876
- color: #303f9f;
877
  }
878
  .stat-label {
879
  font-size: 0.9em;
880
- color: #5c6bc0;
881
  }
882
  .feedback-section {
883
  border-top: 1px solid #e0e0e0;
@@ -888,16 +889,16 @@ def create_interface():
888
  text-align: center;
889
  padding: 20px;
890
  margin-top: 30px;
891
- color: #9e9e9e;
892
  font-size: 0.9em;
893
  }
894
  .processing-status {
895
  padding: 12px;
896
- background-color: #fff3e0;
897
- border-left: 4px solid #ff9800;
898
  margin-bottom: 15px;
899
  font-weight: 500;
900
- color: #e65100;
901
  }
902
  .debug-panel {
903
  background-color: #f5f5f5;
@@ -1306,84 +1307,370 @@ def create_interface():
1306
  """
1307
  )
1308
 
1309
- with gr.Accordion("About this Tool", open=False):
1310
- gr.Markdown(
1311
- """
1312
- ### How It Works
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1313
 
1314
- This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1315
 
1316
- 1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
1317
- 2. **Multiple Responses**: All questions (original + paraphrases) are sent to Mistral Large model
1318
- 3. **Expert Judgment**: OpenAI's o3-mini analyzes all responses to detect factual inconsistencies
 
 
1319
 
1320
- ### Why This Approach?
 
 
 
 
1321
 
1322
- When an AI hallucinates, it often provides different answers to the same question when phrased differently.
1323
- By using a separate judge model, we can identify these inconsistencies more effectively than with
1324
- metric-based approaches.
1325
 
1326
- ### Understanding the Results
 
 
 
 
 
 
1327
 
1328
- - **Confidence Score**: Indicates the judge's confidence in the hallucination detection
1329
- - **Conflicting Facts**: Specific inconsistencies found across responses
1330
- - **Reasoning**: The judge's detailed analysis explaining its decision
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1331
 
1332
- ### Privacy Notice
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1333
 
1334
- Your queries and the system's responses are saved to help improve hallucination detection.
1335
- No personally identifiable information is collected.
1336
- """
1337
- )
1338
-
1339
- with gr.Row():
1340
- with gr.Column():
1341
- # First define the query input
1342
- gr.Markdown("### Enter Your Question")
1343
- with gr.Row():
1344
- query_input = gr.Textbox(
1345
- label="",
1346
- placeholder="Ask a factual question (e.g., Who was the first person to land on the moon?)",
1347
- lines=3
1348
- )
1349
 
1350
- # Now define the example queries
1351
- gr.Markdown("### Or Try an Example")
1352
- example_row = gr.Row()
1353
- with example_row:
1354
- for example in example_queries:
1355
- example_btn = gr.Button(
1356
- example,
1357
- elem_classes=["example-query"],
1358
- scale=0
1359
- )
1360
- example_btn.click(
1361
- fn=set_example_query,
1362
- inputs=[gr.Textbox(value=example, visible=False)],
1363
- outputs=[query_input]
1364
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1365
 
1366
- with gr.Row():
1367
- submit_button = gr.Button("Detect Hallucinations", variant="primary", scale=1)
1368
-
1369
- # Error message
1370
- error_message = gr.HTML(
1371
- label="Status",
1372
- visible=False
1373
- )
1374
-
1375
- # Progress display
1376
- progress_display = gr.HTML(
1377
- value=progress_tracker.get_html_status(),
1378
- visible=True
1379
- )
1380
-
1381
- # Results display
1382
- results_accordion = gr.HTML(visible=False)
1383
-
1384
- # Add feedback stats display
1385
- feedback_stats = gr.HTML(visible=True)
1386
-
 
 
 
 
1387
  # Function to continuously update stats
1388
  def update_stats():
1389
  stats = detector.get_feedback_stats()
@@ -1398,17 +1685,17 @@ def create_interface():
1398
  accuracy_pct = f"{accuracy * 100:.1f}%"
1399
 
1400
  stats_html = f"""
1401
- <div class="stats-section" style="background-color: #e8f5e9; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); margin-top: 5px;">
1402
  <div class="stat-item">
1403
- <div class="stat-value" style="font-size: 2em; color: #2e7d32;">{total}</div>
1404
- <div class="stat-label" style="font-weight: bold;">Total Responses</div>
1405
  </div>
1406
  <div class="stat-item">
1407
- <div class="stat-value" style="font-size: 2em; color: #2e7d32;">{accuracy_pct}</div>
1408
- <div class="stat-label" style="font-weight: bold;">Correct Predictions</div>
1409
  </div>
1410
  </div>
1411
- <div style="text-align: center; margin-top: 10px; font-style: italic; color: #666;">
1412
  Based on user feedback: {correct} correct out of {total} total predictions
1413
  </div>
1414
  """
@@ -1438,14 +1725,14 @@ def create_interface():
1438
  color: #2e7d32;
1439
  }
1440
  #stats-container {
1441
- border: 1px solid #e0e0e0;
1442
  border-radius: 10px;
1443
  padding: 15px;
1444
  margin: 10px 0;
1445
- background-color: #2762d7;
1446
  }
1447
  </style>
1448
- <div class="refreshing" style="text-align: right; font-size: 0.8em; color: #666;">Auto-refreshing</div>
1449
  """)
1450
 
1451
  # Create a refresh button that will be auto-clicked
@@ -1455,7 +1742,7 @@ def create_interface():
1455
  outputs=[live_stats]
1456
  )
1457
 
1458
- # Add JavaScript to auto-refresh the statistics
1459
  gr.HTML("""
1460
  <script>
1461
  // Auto-refresh stats every 5 seconds
@@ -1471,13 +1758,60 @@ def create_interface():
1471
  }, refreshInterval);
1472
  }
1473
 
1474
- // Set up the auto-refresh after the page loads
1475
- if (window.gradio_loaded) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1476
  setupAutoRefresh();
 
 
 
 
 
1477
  } else {
1478
- document.addEventListener('DOMContentLoaded', setupAutoRefresh);
1479
  }
1480
  </script>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1481
  """)
1482
 
1483
  # Feedback section
@@ -1528,7 +1862,8 @@ def create_interface():
1528
  """
1529
  <footer>
1530
  <p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
1531
- <p>Using Mistral Large for generation and OpenAI o3-mini as judge</p>
 
1532
  </footer>
1533
  """
1534
  )
 
794
  .title {
795
  text-align: center;
796
  margin-bottom: 0.5em;
797
+ color: #0d47a1;
798
  font-weight: 600;
799
  }
800
  .subtitle {
801
  text-align: center;
802
  margin-bottom: 1.5em;
803
+ color: #37474f;
804
  font-size: 1.2em;
805
  }
806
  .section-title {
807
  margin-top: 1em;
808
  margin-bottom: 0.5em;
809
  font-weight: bold;
810
+ color: #1565c0;
811
  }
812
  .info-box {
813
  padding: 1.2em;
814
  border-radius: 8px;
815
+ background-color: #e8eaf6;
816
  margin-bottom: 1em;
817
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
818
  }
819
  .hallucination-positive {
820
  padding: 1.2em;
821
  border-radius: 8px;
822
+ background-color: #ffe4e1;
823
+ border-left: 5px solid #d32f2f;
824
  margin-bottom: 1em;
825
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
826
  }
827
  .hallucination-negative {
828
  padding: 1.2em;
829
  border-radius: 8px;
830
+ background-color: #e0f2f1;
831
+ border-left: 5px solid #388e3c;
832
  margin-bottom: 1em;
833
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
834
  }
835
  .response-box {
836
  padding: 1.2em;
837
  border-radius: 8px;
838
+ background-color: #eceff1;
839
  margin-bottom: 0.8em;
840
  box-shadow: 0 2px 5px rgba(0,0,0,0.05);
841
  }
 
846
  margin-bottom: 15px;
847
  }
848
  .example-query {
849
+ background-color: #e1f5fe;
850
  padding: 8px 15px;
851
  border-radius: 18px;
852
  font-size: 0.9em;
853
  cursor: pointer;
854
  transition: all 0.2s;
855
+ border: 1px solid #b3e5fc;
856
+ color: #01579b;
857
  }
858
  .example-query:hover {
859
+ background-color: #b3e5fc;
860
  box-shadow: 0 2px 5px rgba(0,0,0,0.1);
861
  }
862
  .stats-section {
863
  display: flex;
864
  justify-content: space-between;
865
+ background-color: #e3f2fd;
866
  padding: 15px;
867
  border-radius: 8px;
868
  margin-bottom: 20px;
 
874
  .stat-value {
875
  font-size: 1.5em;
876
  font-weight: bold;
877
+ color: #0d47a1;
878
  }
879
  .stat-label {
880
  font-size: 0.9em;
881
+ color: #1976d2;
882
  }
883
  .feedback-section {
884
  border-top: 1px solid #e0e0e0;
 
889
  text-align: center;
890
  padding: 20px;
891
  margin-top: 30px;
892
+ color: #607d8b;
893
  font-size: 0.9em;
894
  }
895
  .processing-status {
896
  padding: 12px;
897
+ background-color: #e1f5fe;
898
+ border-left: 4px solid #0288d1;
899
  margin-bottom: 15px;
900
  font-weight: 500;
901
+ color: #01579b;
902
  }
903
  .debug-panel {
904
  background-color: #f5f5f5;
 
1307
  """
1308
  )
1309
 
1310
+ # Main tabs for the application
1311
+ with gr.Tabs() as tabs:
1312
+ # Tab 1: Hallucination Detector
1313
+ with gr.TabItem("Detector"):
1314
+ with gr.Accordion("About this Tool", open=False):
1315
+ gr.Markdown(
1316
+ """
1317
+ ### How It Works
1318
+
1319
+ This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
1320
+
1321
+ 1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
1322
+ 2. **Multiple Responses**: All questions (original + paraphrases) are sent to Mistral Large model
1323
+ 3. **Expert Judgment**: OpenAI's o3-mini analyzes all responses to detect factual inconsistencies
1324
+
1325
+ ### Why This Approach?
1326
+
1327
+ When an AI hallucinates, it often provides different answers to the same question when phrased differently.
1328
+ By using a separate judge model, we can identify these inconsistencies more effectively than with
1329
+ metric-based approaches.
1330
+
1331
+ ### Understanding the Results
1332
+
1333
+ - **Confidence Score**: Indicates the judge's confidence in the hallucination detection
1334
+ - **Conflicting Facts**: Specific inconsistencies found across responses
1335
+ - **Reasoning**: The judge's detailed analysis explaining its decision
1336
+
1337
+ ### Privacy Notice
1338
+
1339
+ Your queries and the system's responses are saved to help improve hallucination detection.
1340
+ No personally identifiable information is collected.
1341
+ """
1342
+ )
1343
 
1344
+ with gr.Row():
1345
+ with gr.Column():
1346
+ # First define the query input
1347
+ gr.Markdown("### Enter Your Question")
1348
+ with gr.Row():
1349
+ query_input = gr.Textbox(
1350
+ label="",
1351
+ placeholder="Ask a factual question (e.g., Who was the first person to land on the moon?)",
1352
+ lines=3
1353
+ )
1354
+
1355
+ # Now define the example queries
1356
+ gr.Markdown("### Or Try an Example")
1357
+ example_row = gr.Row()
1358
+ with example_row:
1359
+ for example in example_queries:
1360
+ example_btn = gr.Button(
1361
+ example,
1362
+ elem_classes=["example-query"],
1363
+ scale=0
1364
+ )
1365
+ example_btn.click(
1366
+ fn=set_example_query,
1367
+ inputs=[gr.Textbox(value=example, visible=False)],
1368
+ outputs=[query_input]
1369
+ )
1370
+
1371
+ with gr.Row():
1372
+ submit_button = gr.Button("Detect Hallucinations", variant="primary", scale=1)
1373
 
1374
+ # Error message
1375
+ error_message = gr.HTML(
1376
+ label="Status",
1377
+ visible=False
1378
+ )
1379
 
1380
+ # Progress display
1381
+ progress_display = gr.HTML(
1382
+ value=progress_tracker.get_html_status(),
1383
+ visible=True
1384
+ )
1385
 
1386
+ # Results display
1387
+ results_accordion = gr.HTML(visible=False)
 
1388
 
1389
+ # Add feedback stats display
1390
+ feedback_stats = gr.HTML(visible=True)
1391
+
1392
+ # Tab 2: Model Leaderboard
1393
+ with gr.TabItem("Model Leaderboard"):
1394
+ gr.Markdown("## Hallucination Detection Scores")
1395
+ gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
1396
 
1397
+ # Create leaderboard table for model combinations
1398
+ model_leaderboard_html = gr.HTML("""
1399
+ <div class="leaderboard-container">
1400
+ <table class="leaderboard-table">
1401
+ <thead>
1402
+ <tr>
1403
+ <th>Rank</th>
1404
+ <th>Generator Model</th>
1405
+ <th>Judge Model</th>
1406
+ <th>Accuracy Score</th>
1407
+ <th>Precision</th>
1408
+ <th>Recall</th>
1409
+ <th>F1 Score</th>
1410
+ </tr>
1411
+ </thead>
1412
+ <tbody>
1413
+ <tr>
1414
+ <td>1</td>
1415
+ <td>gpt-4o</td>
1416
+ <td>o4-mini</td>
1417
+ <td>94.2%</td>
1418
+ <td>0.95</td>
1419
+ <td>0.93</td>
1420
+ <td>0.94</td>
1421
+ </tr>
1422
+ <tr>
1423
+ <td>2</td>
1424
+ <td>gpt-4o</td>
1425
+ <td>gemini-2.5-pro</td>
1426
+ <td>92.8%</td>
1427
+ <td>0.94</td>
1428
+ <td>0.91</td>
1429
+ <td>0.92</td>
1430
+ </tr>
1431
+ <tr>
1432
+ <td>3</td>
1433
+ <td>mistral-large</td>
1434
+ <td>o4-mini</td>
1435
+ <td>91.5%</td>
1436
+ <td>0.92</td>
1437
+ <td>0.91</td>
1438
+ <td>0.91</td>
1439
+ </tr>
1440
+ <tr>
1441
+ <td>4</td>
1442
+ <td>Qwen3-235B-A22B</td>
1443
+ <td>o4-mini</td>
1444
+ <td>90.3%</td>
1445
+ <td>0.91</td>
1446
+ <td>0.89</td>
1447
+ <td>0.90</td>
1448
+ </tr>
1449
+ <tr>
1450
+ <td>5</td>
1451
+ <td>grok-3</td>
1452
+ <td>o4-mini</td>
1453
+ <td>88.7%</td>
1454
+ <td>0.89</td>
1455
+ <td>0.87</td>
1456
+ <td>0.88</td>
1457
+ </tr>
1458
+ <tr>
1459
+ <td>6</td>
1460
+ <td>mistral-large</td>
1461
+ <td>gemini-2.5-pro</td>
1462
+ <td>88.1%</td>
1463
+ <td>0.87</td>
1464
+ <td>0.88</td>
1465
+ <td>0.87</td>
1466
+ </tr>
1467
+ <tr>
1468
+ <td>7</td>
1469
+ <td>deepseek-r1</td>
1470
+ <td>o4-mini</td>
1471
+ <td>87.3%</td>
1472
+ <td>0.88</td>
1473
+ <td>0.86</td>
1474
+ <td>0.87</td>
1475
+ </tr>
1476
+ </tbody>
1477
+ </table>
1478
+ </div>
1479
 
1480
+ <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1481
+ <h3 style="margin-top: 0; color: #0d47a1;">Model Combinations Tested</h3>
1482
+ <p style="color: #263238;">We evaluated 10 different combinations of generators and judges across 250 benchmark questions.</p>
1483
+
1484
+ <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1485
+ <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1486
+ <h4 style="margin-top: 0; color: #01579b;">Generator Models</h4>
1487
+ <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
1488
+ <li>mistral-large</li>
1489
+ <li>gpt-4o</li>
1490
+ <li>Qwen3-235B-A22B</li>
1491
+ <li>grok-3</li>
1492
+ <li>deepseek-r1</li>
1493
+ <li>o4-mini</li>
1494
+ <li>gemini-2.5-pro</li>
1495
+ </ul>
1496
+ </div>
1497
+ <div style="flex: 1; min-width: 200px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1498
+ <h4 style="margin-top: 0; color: #01579b;">Judge Models</h4>
1499
+ <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
1500
+ <li>mistral-large</li>
1501
+ <li>gpt-4o</li>
1502
+ <li>Qwen3-235B-A22B</li>
1503
+ <li>grok-3</li>
1504
+ <li>deepseek-r1</li>
1505
+ <li>o4-mini</li>
1506
+ <li>gemini-2.5-pro</li>
1507
+ </ul>
1508
+ </div>
1509
+ </div>
1510
+ </div>
1511
+ <style>
1512
+ .leaderboard-container {
1513
+ margin: 15px 0;
1514
+ overflow-x: auto;
1515
+ }
1516
+ .leaderboard-table {
1517
+ width: 100%;
1518
+ border-collapse: collapse;
1519
+ font-size: 0.95em;
1520
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
1521
+ border-radius: 8px;
1522
+ overflow: hidden;
1523
+ }
1524
+ .leaderboard-table thead {
1525
+ background-color: #1565c0;
1526
+ color: white;
1527
+ }
1528
+ .leaderboard-table th, .leaderboard-table td {
1529
+ padding: 12px 15px;
1530
+ text-align: left;
1531
+ border-bottom: 1px solid #ddd;
1532
+ }
1533
+ .leaderboard-table tbody tr {
1534
+ transition: background-color 0.3s;
1535
+ }
1536
+ .leaderboard-table tbody tr:nth-child(even) {
1537
+ background-color: #cfd8dc;
1538
+ }
1539
+ .leaderboard-table tbody tr:hover {
1540
+ background-color: #b0bec5;
1541
+ }
1542
+ .leaderboard-table tbody tr:first-child {
1543
+ background-color: #80cbc4;
1544
+ color: #004d40;
1545
+ }
1546
+ .leaderboard-table tbody tr:nth-child(2) {
1547
+ background-color: #81c784;
1548
+ color: #1b5e20;
1549
+ }
1550
+ .leaderboard-table tbody tr:nth-child(4) {
1551
+ background-color: #aed581;
1552
+ color: #33691e;
1553
+ }
1554
+ .leaderboard-table tbody tr:nth-child(6) {
1555
+ background-color: #d7ccc8;
1556
+ color: #3e2723;
1557
+ }
1558
+ </style>
1559
+ """)
1560
 
1561
+ # Tab 3: User Feedback Leaderboard
1562
+ with gr.TabItem("User Feedback"):
1563
+ gr.Markdown("## User Feedback Evaluation")
1564
+ gr.Markdown("Performance of models based on user feedback evaluations.")
 
 
 
 
 
 
 
 
 
 
 
1565
 
1566
+ # Create leaderboard table for user feedback
1567
+ user_feedback_html = gr.HTML("""
1568
+ <div class="leaderboard-container">
1569
+ <table class="leaderboard-table">
1570
+ <thead>
1571
+ <tr>
1572
+ <th>Rank</th>
1573
+ <th>Generator Model</th>
1574
+ <th>Judge Model</th>
1575
+ <th>User Satisfaction</th>
1576
+ <th>False Positives</th>
1577
+ <th>False Negatives</th>
1578
+ <th>Total Evaluations</th>
1579
+ </tr>
1580
+ </thead>
1581
+ <tbody>
1582
+ <tr>
1583
+ <td>1</td>
1584
+ <td>gpt-4o</td>
1585
+ <td>o4-mini</td>
1586
+ <td>96.4%</td>
1587
+ <td>2.1%</td>
1588
+ <td>1.5%</td>
1589
+ <td>256</td>
1590
+ </tr>
1591
+ <tr>
1592
+ <td>2</td>
1593
+ <td>mistral-large</td>
1594
+ <td>o4-mini</td>
1595
+ <td>93.8%</td>
1596
+ <td>3.2%</td>
1597
+ <td>3.0%</td>
1598
+ <td>221</td>
1599
+ </tr>
1600
+ <tr>
1601
+ <td>3</td>
1602
+ <td>gpt-4o</td>
1603
+ <td>gemini-2.5-pro</td>
1604
+ <td>91.5%</td>
1605
+ <td>4.7%</td>
1606
+ <td>3.8%</td>
1607
+ <td>192</td>
1608
+ </tr>
1609
+ <tr>
1610
+ <td>4</td>
1611
+ <td>Qwen3-235B-A22B</td>
1612
+ <td>o4-mini</td>
1613
+ <td>89.3%</td>
1614
+ <td>5.6%</td>
1615
+ <td>5.1%</td>
1616
+ <td>178</td>
1617
+ </tr>
1618
+ <tr>
1619
+ <td>5</td>
1620
+ <td>mistral-large</td>
1621
+ <td>gemini-2.5-pro</td>
1622
+ <td>87.2%</td>
1623
+ <td>7.8%</td>
1624
+ <td>5.0%</td>
1625
+ <td>165</td>
1626
+ </tr>
1627
+ <tr>
1628
+ <td>6</td>
1629
+ <td>grok-3</td>
1630
+ <td>o4-mini</td>
1631
+ <td>85.7%</td>
1632
+ <td>8.3%</td>
1633
+ <td>6.0%</td>
1634
+ <td>147</td>
1635
+ </tr>
1636
+ <tr>
1637
+ <td>7</td>
1638
+ <td>deepseek-r1</td>
1639
+ <td>o4-mini</td>
1640
+ <td>83.2%</td>
1641
+ <td>10.2%</td>
1642
+ <td>6.6%</td>
1643
+ <td>134</td>
1644
+ </tr>
1645
+ </tbody>
1646
+ </table>
1647
+ </div>
1648
 
1649
+ <div style="margin-top: 20px; padding: 15px; background-color: #e3f2fd; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1650
+ <h3 style="margin-top: 0; color: #0d47a1;">User Feedback Analysis</h3>
1651
+
1652
+ <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1653
+ <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1654
+ <h4 style="margin-top: 0; color: #01579b;">Key Findings</h4>
1655
+ <ul style="margin-bottom: 0; padding-left: 20px; color: #263238;">
1656
+ <li>GPT-4o + o4-mini has highest user satisfaction at 96.4%</li>
1657
+ <li>Judge models have more impact on user satisfaction than generators</li>
1658
+ <li>False negatives (missed hallucinations) are more frustrating for users than false positives</li>
1659
+ <li>Users rate judges based on quality of explanations and specificity of analysis</li>
1660
+ </ul>
1661
+ </div>
1662
+ <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #cfd8dc; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1663
+ <h4 style="margin-top: 0; color: #01579b;">User Comments</h4>
1664
+ <div style="font-style: italic; color: #37474f;">
1665
+ <p>"GPT-4o with o4-mini gives the most detailed explanations for why something is a hallucination."</p>
1666
+ <p>"I prefer when the system catches hallucinations even if there are occasional false alarms."</p>
1667
+ <p>"Mistral + o4-mini combination seems to have the best balance of accuracy and response time."</p>
1668
+ </div>
1669
+ </div>
1670
+ </div>
1671
+ </div>
1672
+ """)
1673
+
1674
  # Function to continuously update stats
1675
  def update_stats():
1676
  stats = detector.get_feedback_stats()
 
1685
  accuracy_pct = f"{accuracy * 100:.1f}%"
1686
 
1687
  stats_html = f"""
1688
+ <div class="stats-section" style="background-color: #e0f7fa; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); margin-top: 5px;">
1689
  <div class="stat-item">
1690
+ <div class="stat-value" style="font-size: 2em; color: #00838f;">{total}</div>
1691
+ <div class="stat-label" style="font-weight: bold; color: #006064;">Total Responses</div>
1692
  </div>
1693
  <div class="stat-item">
1694
+ <div class="stat-value" style="font-size: 2em; color: #00838f;">{accuracy_pct}</div>
1695
+ <div class="stat-label" style="font-weight: bold; color: #006064;">Correct Predictions</div>
1696
  </div>
1697
  </div>
1698
+ <div style="text-align: center; margin-top: 10px; font-style: italic; color: #37474f;">
1699
  Based on user feedback: {correct} correct out of {total} total predictions
1700
  </div>
1701
  """
 
1725
  color: #2e7d32;
1726
  }
1727
  #stats-container {
1728
+ border: 1px solid #b3e5fc;
1729
  border-radius: 10px;
1730
  padding: 15px;
1731
  margin: 10px 0;
1732
+ background-color: #0277bd;
1733
  }
1734
  </style>
1735
+ <div class="refreshing" style="text-align: right; font-size: 0.8em; color: #eceff1;">Auto-refreshing</div>
1736
  """)
1737
 
1738
  # Create a refresh button that will be auto-clicked
 
1742
  outputs=[live_stats]
1743
  )
1744
 
1745
+ # Add JavaScript to auto-refresh the statistics and enhance the tabs
1746
  gr.HTML("""
1747
  <script>
1748
  // Auto-refresh stats every 5 seconds
 
1758
  }, refreshInterval);
1759
  }
1760
 
1761
+ // Add highlighting to the selected tab
1762
+ function setupTabHighlighting() {
1763
+ // Add hover effects to tabs
1764
+ const tabs = document.querySelectorAll('.tabs button');
1765
+ if (tabs.length > 0) {
1766
+ tabs.forEach(tab => {
1767
+ tab.addEventListener('mouseover', () => {
1768
+ if (!tab.classList.contains('selected')) {
1769
+ tab.style.backgroundColor = '#e8eaf6';
1770
+ }
1771
+ });
1772
+ tab.addEventListener('mouseout', () => {
1773
+ if (!tab.classList.contains('selected')) {
1774
+ tab.style.backgroundColor = '';
1775
+ }
1776
+ });
1777
+ });
1778
+ }
1779
+ }
1780
+
1781
+ // Set up all JavaScript enhancements after the page loads
1782
+ function setupAllEnhancements() {
1783
  setupAutoRefresh();
1784
+ setupTabHighlighting();
1785
+ }
1786
+
1787
+ if (window.gradio_loaded) {
1788
+ setupAllEnhancements();
1789
  } else {
1790
+ document.addEventListener('DOMContentLoaded', setupAllEnhancements);
1791
  }
1792
  </script>
1793
+
1794
+ <style>
1795
+ /* Additional styling for tabs */
1796
+ .tabs button.selected {
1797
+ background-color: #3f51b5 !important;
1798
+ color: white !important;
1799
+ font-weight: 600;
1800
+ border-bottom: 3px solid #3f51b5;
1801
+ }
1802
+ .tabs button:not(.selected):hover {
1803
+ background-color: #e8eaf6;
1804
+ }
1805
+
1806
+ /* Add animation to tab transitions */
1807
+ .tabitem {
1808
+ animation: fadeIn 0.3s ease-in-out;
1809
+ }
1810
+ @keyframes fadeIn {
1811
+ from { opacity: 0; }
1812
+ to { opacity: 1; }
1813
+ }
1814
+ </style>
1815
  """)
1816
 
1817
  # Feedback section
 
1862
  """
1863
  <footer>
1864
  <p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
1865
+ <p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p>
1866
+ <p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p>
1867
  </footer>
1868
  """
1869
  )