nappenstance committed
Commit 99b2a32 · verified · 1 Parent(s): 31c5c21

Upload app.py

Files changed (1): app.py (+864 −365)
app.py CHANGED
@@ -43,12 +43,17 @@ class HallucinationJudgment(BaseModel):
43
  class PAS2:
44
  """Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
45
 
46
- def __init__(self, mistral_api_key=None, openai_api_key=None, progress_callback=None):
47
  """Initialize the PAS2 with API keys"""
48
  # For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
49
  # which are set from the Secrets tab in the Space settings
50
  self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
51
  self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
52
  self.progress_callback = progress_callback
53
 
54
  if not self.mistral_api_key:
@@ -59,12 +64,64 @@ class PAS2:
59
 
60
  self.mistral_client = Mistral(api_key=self.mistral_api_key)
61
  self.openai_client = OpenAI(api_key=self.openai_api_key)
62
-
63
  self.mistral_model = "mistral-large-latest"
64
- self.openai_model = "o3-mini"
65
 
66
- logger.info("PAS2 initialized with Mistral model: %s and OpenAI model: %s",
67
- self.mistral_model, self.openai_model)
68
 
69
  def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
70
  """Generate paraphrases of the input query using Mistral API"""
@@ -141,13 +198,38 @@ class PAS2:
141
 
142
  return fallback_paraphrases
143
 
144
  def _get_single_response(self, query: str, index: int = None) -> str:
145
- """Get a single response from Mistral API for a query"""
146
  try:
147
  query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
148
- logger.info("Getting response for %s", query_description)
149
  start_time = time.time()
150
 
151
  messages = [
152
  {
153
  "role": "system",
@@ -159,23 +241,32 @@ class PAS2:
159
  }
160
  ]
161
 
162
- response = self.mistral_client.chat.complete(
163
- model=self.mistral_model,
164
- messages=messages
165
- )
166
 
167
- result = response.choices[0].message.content
168
  elapsed_time = time.time() - start_time
169
 
170
- logger.info("Received response for %s (%.2f seconds)", query_description, elapsed_time)
 
171
  logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
172
 
173
  return result
174
 
175
  except Exception as e:
176
- error_msg = f"Error getting response for query '{query}': {e}"
177
  logger.error(error_msg, exc_info=True)
178
- return f"Error: Failed to get response for this query."
179
 
180
  def get_responses(self, queries: List[str]) -> List[str]:
181
  """Get responses from Mistral API for each query in parallel"""
@@ -235,6 +326,10 @@ class PAS2:
235
  logger.info("Starting hallucination detection for query: %s", query)
236
  start_time = time.time()
237
 
238
  # Report progress
239
  if self.progress_callback:
240
  self.progress_callback("starting", query=query)
@@ -250,9 +345,9 @@ class PAS2:
250
  self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
251
 
252
  # Get responses to all queries
253
- logger.info("Step 2: Getting responses to all %d queries", len(all_queries))
254
  if self.progress_callback:
255
- self.progress_callback("getting_responses", query=query, total=len(all_queries))
256
 
257
  all_responses = []
258
  for i, q in enumerate(all_queries):
@@ -267,9 +362,9 @@ class PAS2:
267
  self.progress_callback("responses_complete", query=query)
268
 
269
  # Judge the responses for hallucinations
270
- logger.info("Step 3: Judging for hallucinations")
271
  if self.progress_callback:
272
- self.progress_callback("judging", query=query)
273
 
274
  # The first query is the original, rest are paraphrases
275
  original_query = all_queries[0]
@@ -295,14 +390,17 @@ class PAS2:
295
  "confidence_score": judgment.confidence_score,
296
  "conflicting_facts": judgment.conflicting_facts,
297
  "reasoning": judgment.reasoning,
298
- "summary": judgment.summary
299
  }
300
 
301
  # Report completion
302
  if self.progress_callback:
303
- self.progress_callback("complete", query=query)
304
 
305
- logger.info("Hallucination detection completed in %.2f seconds", time.time() - start_time)
 
306
  return results
307
 
308
  def judge_hallucination(self,
@@ -311,11 +409,17 @@ class PAS2:
311
  paraphrased_queries: List[str],
312
  paraphrased_responses: List[str]) -> HallucinationJudgment:
313
  """
314
- Use OpenAI's o3-mini as a judge to detect hallucinations in the responses
315
  """
316
- logger.info("Judging hallucinations with OpenAI's %s model", self.openai_model)
317
  start_time = time.time()
318
 
319
  # Prepare the context for the judge
320
  context = f"""
321
  Original Question: {original_query}
@@ -344,18 +448,31 @@ Your response should be a JSON with the following fields:
344
  """
345
 
346
  try:
347
- logger.info("Sending judgment request to OpenAI API...")
348
- response = self.openai_client.chat.completions.create(
349
- model=self.openai_model,
350
- messages=[
351
- {"role": "system", "content": system_prompt},
352
- {"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
353
- ],
354
- response_format={"type": "json_object"}
355
- )
356
 
357
- result_json = json.loads(response.choices[0].message.content)
358
- logger.debug("Received judgment response: %s", result_json)
359
 
360
  # Create the HallucinationJudgment object from the JSON response
361
  judgment = HallucinationJudgment(
@@ -367,18 +484,18 @@ Your response should be a JSON with the following fields:
367
  )
368
 
369
  elapsed_time = time.time() - start_time
370
- logger.info("Judgment completed in %.2f seconds", elapsed_time)
371
 
372
  return judgment
373
 
374
  except Exception as e:
375
- logger.error("Error in hallucination judgment: %s", str(e), exc_info=True)
376
  # Return a fallback judgment
377
  return HallucinationJudgment(
378
  hallucination_detected=False,
379
  confidence_score=0.0,
380
  conflicting_facts=[],
381
- reasoning="Failed to obtain judgment from the model.",
382
  summary="Analysis failed due to API error."
383
  )
384
 
@@ -495,11 +612,21 @@ class HallucinationDetectorApp:
495
  "conflicting_facts": results.get('conflicting_facts', []),
496
  "reasoning": results.get('reasoning', ''),
497
  "summary": results.get('summary', ''),
498
  "user_feedback": feedback
499
  }
500
 
501
  # Insert document into collection
502
- self.feedback_collection.insert_one(document)
503
 
504
  logger.info("Feedback saved successfully to MongoDB")
505
  return "Feedback saved successfully!"
@@ -507,6 +634,266 @@ class HallucinationDetectorApp:
507
  logger.error("Error saving feedback: %s", str(e), exc_info=True)
508
  return f"Error saving feedback: {str(e)}"
509
 
510
  def get_feedback_stats(self):
511
  """Get statistics about collected feedback from MongoDB"""
512
  try:
@@ -541,6 +928,62 @@ class HallucinationDetectorApp:
541
  except Exception as e:
542
  logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
543
  return None
544
 
545
  def export_data_to_csv(self, filepath=None):
546
  """Export all feedback data to a CSV file for analysis"""
@@ -657,11 +1100,11 @@ class ProgressTracker:
657
  "starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
658
  "generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
659
  "paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
660
- "getting_responses": {"status": "Getting responses (0/0)...", "progress": 35, "color": "#2196F3"},
661
  "responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
662
  "responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
663
- "judging": {"status": "Analyzing responses for hallucinations...", "progress": 70, "color": "#2196F3"},
664
- "complete": {"status": "Analysis complete!", "progress": 100, "color": "#4CAF50"},
665
  "error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
666
  }
667
 
@@ -672,6 +1115,9 @@ class ProgressTracker:
672
  self.completed_responses = 0
673
  self.total_responses = 0
674
  self.error_message = ""
675
  self._lock = threading.Lock()
676
  self._status_callback = None
677
  self._stop_event = threading.Event()
@@ -698,6 +1144,12 @@ class ProgressTracker:
698
  self.total_responses = value
699
  elif key == 'error_message':
700
  self.error_message = value
701
 
702
  # Format status message
703
  if stage == 'responses_progress':
@@ -705,6 +1157,19 @@ class ProgressTracker:
705
  completed=self.completed_responses,
706
  total=self.total_responses
707
  )
708
  elif stage == 'error':
709
  self.stage_data['status'] = self.stage_data['status'].format(
710
  error_message=self.error_message
@@ -724,6 +1189,16 @@ class ProgressTracker:
724
  # Only show status text if not in idle state
725
  status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
726
 
727
  html = f"""
728
  <div class="progress-container">
729
  {query_info}
@@ -731,6 +1206,7 @@ class ProgressTracker:
731
  <div class="progress-bar-container">
732
  <div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
733
  </div>
 
734
  </div>
735
  """
736
  return html
@@ -1099,13 +1575,18 @@ def create_interface():
1099
  combined_progress_callback("starting", query=query)
1100
  time.sleep(0.3) # Ensure starting status is visible
1101
 
1102
  # Step 2: Generate paraphrases (15-30%)
1103
  combined_progress_callback("generating_paraphrases", query=query)
1104
  all_queries = detector.pas2.generate_paraphrases(query)
1105
  combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))
1106
 
1107
  # Step 3: Get responses (35-65%)
1108
- combined_progress_callback("getting_responses", query=query, total=len(all_queries))
1109
  all_responses = []
1110
  for i, q in enumerate(all_queries):
1111
  # Show incremental progress for each response
@@ -1115,7 +1596,7 @@ def create_interface():
1115
  combined_progress_callback("responses_complete", query=query)
1116
 
1117
  # Step 4: Judge hallucinations (70-100%)
1118
- combined_progress_callback("judging", query=query)
1119
 
1120
  # The first query is the original, rest are paraphrases
1121
  original_query = all_queries[0]
@@ -1141,11 +1622,13 @@ def create_interface():
1141
  "confidence_score": judgment.confidence_score,
1142
  "conflicting_facts": judgment.conflicting_facts,
1143
  "reasoning": judgment.reasoning,
1144
- "summary": judgment.summary
1145
  }
1146
 
1147
  # Show completion
1148
- combined_progress_callback("complete", query=query)
1149
  time.sleep(0.3) # Ensure complete status is visible
1150
 
1151
  return results
@@ -1201,10 +1684,25 @@ def create_interface():
1201
  reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
1202
  conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
1203
 
1204
  html_output = f"""
1205
  <div class="container">
1206
  <h2 class="title">Hallucination Detection Results</h2>
1207
 
1208
  <div class="stats-section">
1209
  <div class="stat-item">
1210
  <div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
@@ -1234,7 +1732,7 @@ def create_interface():
1234
  {original_query}
1235
  </div>
1236
 
1237
- <div class="section-title">Original Response</div>
1238
  <div class="response-box">
1239
  {original_response_safe}
1240
  </div>
@@ -1249,14 +1747,14 @@ def create_interface():
1249
  {q}
1250
  </div>
1251
 
1252
- <div class="section-title">Response {i}</div>
1253
  <div class="response-box">
1254
  {r}
1255
  </div>
1256
  """
1257
 
1258
  html_output += f"""
1259
- <div class="section-title">Detailed Analysis</div>
1260
  <div class="info-box">
1261
  <p><strong>Reasoning:</strong></p>
1262
  <p>{reasoning_safe}</p>
@@ -1264,6 +1762,10 @@ def create_interface():
1264
  <p><strong>Conflicting Facts:</strong></p>
1265
  <p>{conflicting_facts_text_safe}</p>
1266
  </div>
1267
  </div>
1268
  """
1269
 
@@ -1289,8 +1791,11 @@ def create_interface():
1289
  ]
1290
 
1291
  # Helper function to submit feedback
1292
- def combine_feedback(fb_input, fb_text, results):
1293
- combined_feedback = f"{fb_input}: {fb_text}" if fb_text else fb_input
1294
  if not results:
1295
  return "No results to attach feedback to."
1296
 
@@ -1394,8 +1899,8 @@ def create_interface():
1394
  This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
1395
 
1396
  1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
1397
- 2. **Multiple Responses**: All questions (original + paraphrases) are sent to Mistral Large model
1398
- 3. **Expert Judgment**: OpenAI's o3-mini analyzes all responses to detect factual inconsistencies
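
The three steps above map directly onto the `PAS2` methods changed in this commit. A condensed sketch of the flow, assuming `app.py` is importable as a module; the keyword names for the original query/response pair are inferred from the `judge_hallucination` hunk:

```python
from app import PAS2  # assumption: this file is importable as a module

pas2 = PAS2()  # API keys are read from the HF_*/plain environment variables
# generate_paraphrases returns the original query first, then the paraphrases
queries = pas2.generate_paraphrases("Who wrote Hamlet?", n_paraphrases=3)
responses = pas2.get_responses(queries)
judgment = pas2.judge_hallucination(
    original_query=queries[0],
    original_response=responses[0],
    paraphrased_queries=queries[1:],
    paraphrased_responses=responses[1:],
)
print(judgment.hallucination_detected, judgment.confidence_score)
```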
1399
 
1400
  ### Why This Approach?
1401
 
@@ -1469,10 +1974,16 @@ def create_interface():
1469
  gr.Markdown("### Help Improve the System")
1470
  gr.Markdown("Your feedback helps us refine the hallucination detection system.")
1471
 
1472
- feedback_input = gr.Radio(
1473
- label="Was the hallucination detection accurate?",
1474
- choices=["Yes, the detection was correct", "No, the detection was incorrect", "Other/Unsure"],
1475
- value="Yes, the detection was correct"
1476
  )
1477
 
1478
  feedback_text = gr.Textbox(
@@ -1489,286 +2000,280 @@ def create_interface():
1489
  gr.Markdown("## Hallucination Detection Scores")
1490
  gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
1491
 
1492
  # Create leaderboard table for model combinations
1493
- model_leaderboard_html = gr.HTML("""
1494
- <div class="leaderboard-container">
1495
- <table class="leaderboard-table">
1496
- <thead>
1497
- <tr>
1498
- <th>Rank</th>
1499
- <th>Generator Model</th>
1500
- <th>Judge Model</th>
1501
- <th>ELO Score</th>
1502
- <th>Accuracy</th>
1503
- <th>Consistency</th>
1504
- </tr>
1505
- </thead>
1506
- <tbody>
1507
- <tr>
1508
- <td>1</td>
1509
- <td>gpt-4o</td>
1510
- <td>o4-mini</td>
1511
- <td>1878</td>
1512
- <td>94.2%</td>
1513
- <td>91.6%</td>
1514
- </tr>
1515
- <tr>
1516
- <td>2</td>
1517
- <td>gpt-4o</td>
1518
- <td>gemini-2.5-pro</td>
1519
- <td>1835</td>
1520
- <td>92.8%</td>
1521
- <td>89.2%</td>
1522
- </tr>
1523
- <tr>
1524
- <td>3</td>
1525
- <td>mistral-large</td>
1526
- <td>o4-mini</td>
1527
- <td>1795</td>
1528
- <td>91.5%</td>
1529
- <td>87.5%</td>
1530
- </tr>
1531
- <tr>
1532
- <td>4</td>
1533
- <td>Qwen3-235B-A22B</td>
1534
- <td>o4-mini</td>
1535
- <td>1768</td>
1536
- <td>90.3%</td>
1537
- <td>85.1%</td>
1538
- </tr>
1539
- <tr>
1540
- <td>5</td>
1541
- <td>grok-3</td>
1542
- <td>o4-mini</td>
1543
- <td>1742</td>
1544
- <td>88.7%</td>
1545
- <td>82.9%</td>
1546
- </tr>
1547
- <tr>
1548
- <td>6</td>
1549
- <td>mistral-large</td>
1550
- <td>gemini-2.5-pro</td>
1551
- <td>1716</td>
1552
- <td>88.1%</td>
1553
- <td>81.4%</td>
1554
- </tr>
1555
- <tr>
1556
- <td>7</td>
1557
- <td>deepseek-r1</td>
1558
- <td>o4-mini</td>
1559
- <td>1692</td>
1560
- <td>87.3%</td>
1561
- <td>80.3%</td>
1562
- </tr>
1563
- </tbody>
1564
- </table>
1565
- </div>
1566
 
1567
- <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1568
- <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
1569
-
1570
- <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1571
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1572
- <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
1573
- <p style="color: #eceff1;">Our ELO rating system assigns scores to model pairs based on benchmark performance, using the following formula:</p>
1574
- <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
1575
- <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
1576
- Where:<br>
1577
- <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model combination<br>
1578
- <strong style="color: #b2dfdb;">K</strong>: Weight factor (32 for new models, 16 for established ones)<br>
1579
- <strong style="color: #b2dfdb;">S</strong>: Actual score from benchmark tests<br>
1580
- <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
1581
- <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
1582
- </div>
1583
- </div>
1584
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1585
- <h4 style="margin-top: 0; color: #ffffff;">Model Combinations Tested</h4>
1586
- <p style="color: #eceff1;">We evaluated 10 different combinations across 250 benchmark questions.</p>
1587
- <div style="display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;">
1588
- <div style="flex: 1; min-width: 120px;">
1589
- <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Generator Models</h5>
1590
- <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1591
- <li>mistral-large</li>
1592
- <li>gpt-4o</li>
1593
- <li>Qwen3-235B-A22B</li>
1594
- <li>grok-3</li>
1595
- <li>deepseek-r1</li>
1596
- <li>o4-mini</li>
1597
- <li>gemini-2.5-pro</li>
1598
- </ul>
1599
- </div>
1600
- <div style="flex: 1; min-width: 120px;">
1601
- <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Judge Models</h5>
1602
- <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1603
- <li>mistral-large</li>
1604
- <li>gpt-4o</li>
1605
- <li>Qwen3-235B-A22B</li>
1606
- <li>grok-3</li>
1607
- <li>deepseek-r1</li>
1608
- <li>o4-mini</li>
1609
- <li>gemini-2.5-pro</li>
1610
- </ul>
1611
- </div>
1612
- </div>
1613
- </div>
1614
- </div>
1615
- </div>
1616
- <style>
1617
- .leaderboard-container {
1618
- margin: 15px 0;
1619
- overflow-x: auto;
1620
- }
1621
- .leaderboard-table {
1622
- width: 100%;
1623
- border-collapse: collapse;
1624
- font-size: 0.95em;
1625
- box-shadow: 0 2px 10px rgba(0,0,0,0.1);
1626
- border-radius: 8px;
1627
- overflow: hidden;
1628
- }
1629
- .leaderboard-table thead {
1630
- background-color: #1565c0;
1631
- color: white;
1632
- }
1633
- .leaderboard-table th, .leaderboard-table td {
1634
- padding: 12px 15px;
1635
- text-align: left;
1636
- border-bottom: 1px solid #ddd;
1637
- }
1638
- .leaderboard-table tbody tr {
1639
- transition: background-color 0.3s;
1640
- }
1641
- .leaderboard-table tbody tr:nth-child(even) {
1642
- background-color: #cfd8dc;
1643
- }
1644
- .leaderboard-table tbody tr:hover {
1645
- background-color: #b0bec5;
1646
- }
1647
- .leaderboard-table tbody tr:first-child {
1648
- background-color: #80cbc4;
1649
- color: #004d40;
1650
- }
1651
- .leaderboard-table tbody tr:nth-child(2) {
1652
- background-color: #81c784;
1653
- color: #1b5e20;
1654
- }
1655
- .leaderboard-table tbody tr:nth-child(4) {
1656
- background-color: #aed581;
1657
- color: #33691e;
1658
- }
1659
- .leaderboard-table tbody tr:nth-child(6) {
1660
- background-color: #d7ccc8;
1661
- color: #3e2723;
1662
- }
1663
- </style>
1664
- """)
1665
 
1666
- # Tab 3: Generator Models Hallucination Leaderboard
1667
- with gr.TabItem("User Feedback", elem_id="user-feedback-tab"):
1668
- gr.Markdown("## Model Hallucination Evaluation (User Feedback)")
1669
- gr.Markdown("Performance ranking of generator models based on user-reported hallucination rates.")
1670
 
1671
- # Create leaderboard table for user feedback
1672
- user_feedback_html = gr.HTML("""
1673
- <div class="leaderboard-container">
1674
- <table class="leaderboard-table">
1675
- <thead>
1676
- <tr>
1677
- <th>Rank</th>
1678
- <th>Generator Model</th>
1679
- <th>ELO Score</th>
1680
- <th>Accuracy</th>
1681
- <th>Sample Size</th>
1682
- </tr>
1683
- </thead>
1684
- <tbody>
1685
- <tr>
1686
- <td>1</td>
1687
- <td>gpt-4o</td>
1688
- <td>1856</td>
1689
- <td>96.4%</td>
1690
- <td>256</td>
1691
- </tr>
1692
- <tr>
1693
- <td>2</td>
1694
- <td>mistral-large</td>
1695
- <td>1802</td>
1696
- <td>93.8%</td>
1697
- <td>221</td>
1698
- </tr>
1699
- <tr>
1700
- <td>3</td>
1701
- <td>Qwen3-235B-A22B</td>
1702
- <td>1765</td>
1703
- <td>91.5%</td>
1704
- <td>192</td>
1705
- </tr>
1706
- <tr>
1707
- <td>4</td>
1708
- <td>o4-mini</td>
1709
- <td>1732</td>
1710
- <td>89.3%</td>
1711
- <td>178</td>
1712
- </tr>
1713
- <tr>
1714
- <td>5</td>
1715
- <td>gemini-2.5-pro</td>
1716
- <td>1695</td>
1717
- <td>87.2%</td>
1718
- <td>165</td>
1719
- </tr>
1720
- <tr>
1721
- <td>6</td>
1722
- <td>grok-3</td>
1723
- <td>1665</td>
1724
- <td>85.7%</td>
1725
- <td>147</td>
1726
- </tr>
1727
- <tr>
1728
- <td>7</td>
1729
- <td>deepseek-r1</td>
1730
- <td>1625</td>
1731
- <td>83.2%</td>
1732
- <td>134</td>
1733
- </tr>
1734
- </tbody>
1735
- </table>
1736
- </div>
1737
 
1738
- <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1739
- <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
1740
-
1741
- <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1742
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1743
- <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
1744
- <p style="color: #eceff1;">Our ELO rating system assigns scores to models based on user feedback, using the following formula:</p>
1745
- <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
1746
- <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
1747
- Where:<br>
1748
- • <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model<br>
1749
- <strong style="color: #b2dfdb;">K</strong>: Weight factor (40 for new models, 20 for established ones)<br>
1750
- <strong style="color: #b2dfdb;">S</strong>: Actual score (1 for correct hallucination detection, 0 for incorrect)<br>
1751
- <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
1752
- <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
1753
- </div>
1754
- <p style="color: #eceff1; margin-top: 10px;">All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p>
1755
- </div>
1756
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1757
- <h4 style="margin-top: 0; color: #ffffff;">Interpretation Guidelines</h4>
1758
- <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1759
- <li><strong style="color: #b2dfdb;">1800+</strong>: Exceptional performance, very rare hallucinations</li>
1760
- <li><strong style="color: #b2dfdb;">1700-1799</strong>: Superior performance, minimal hallucinations</li>
1761
- <li><strong style="color: #b2dfdb;">1600-1699</strong>: Good performance, occasional hallucinations</li>
1762
- <li><strong style="color: #b2dfdb;">1500-1599</strong>: Average performance</li>
1763
- <li><strong style="color: #b2dfdb;">&lt;1500</strong>: Below average, frequent hallucinations</li>
1764
- </ul>
1765
- <p style="font-style: italic; color: #b3e5fc; margin-top: 10px;">
1766
- Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.
1767
- </p>
1768
- </div>
1769
- </div>
1770
- </div>
1771
- """)
1772
 
1773
  # Function to continuously update stats
1774
  def update_stats():
@@ -1811,30 +2316,30 @@ def create_interface():
1811
  live_stats = gr.HTML(update_stats())
1812
 
1813
  # Add loading animation style
1814
- gr.HTML("""
1815
- <style>
1816
- @keyframes pulse {
1817
- 0% { opacity: 0.6; }
1818
- 50% { opacity: 1; }
1819
- 100% { opacity: 0.6; }
1820
- }
1821
- .refreshing::after {
1822
- content: "";
1823
- display: inline-block;
1824
- margin-left: 8px;
1825
- animation: pulse 1.5s infinite ease-in-out;
1826
- color: #2e7d32;
1827
- }
1828
- #stats-container {
1829
- border: 1px solid #b3e5fc;
1830
- border-radius: 10px;
1831
- padding: 15px;
1832
- margin: 10px 0;
1833
- background-color: #0277bd;
1834
- }
1835
- </style>
1836
- <div class="refreshing" style="text-align: right; font-size: 0.8em; color: #eceff1;">Auto-refreshing</div>
1837
- """)
1838
 
1839
  # Create a refresh button that will be auto-clicked
1840
  refresh_btn = gr.Button("Refresh Stats", visible=False)
@@ -2018,19 +2523,13 @@ def create_interface():
2018
 
2019
  feedback_button.click(
2020
  fn=combine_feedback,
2021
- inputs=[feedback_input, feedback_text, hidden_results],
2022
  outputs=[feedback_status]
2023
  )
2024
 
2025
  # Footer
2026
  gr.HTML(
2027
- """
2028
- <footer>
2029
- <p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
2030
- <p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p>
2031
- <p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p>
2032
- </footer>
2033
- """
2034
  )
2035
 
2036
  return interface
@@ -2096,4 +2595,4 @@ if __name__ == "__main__":
2096
 
2097
  # Uncomment this line to run the test function instead of the main interface
2098
  # if __name__ == "__main__":
2099
- # test_progress()
 
43
  class PAS2:
44
  """Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
45
 
46
+ def __init__(self, mistral_api_key=None, openai_api_key=None, xai_api_key=None, qwen_api_key=None, deepseek_api_key=None, gemini_api_key=None, progress_callback=None):
47
  """Initialize the PAS2 with API keys"""
48
  # For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
49
  # which are set from the Secrets tab in the Space settings
50
  self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
51
  self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
52
+ self.xai_api_key = xai_api_key or os.environ.get("HF_XAI_API_KEY") or os.environ.get("XAI_API_KEY")
53
+ self.qwen_api_key = qwen_api_key or os.environ.get("HF_QWEN_API_KEY") or os.environ.get("QWEN_API_KEY")
54
+ self.deepseek_api_key = deepseek_api_key or os.environ.get("HF_DEEPSEEK_API_KEY") or os.environ.get("DEEPSEEK_API_KEY")
55
+ self.gemini_api_key = gemini_api_key or os.environ.get("HF_GEMINI_API_KEY") or os.environ.get("GEMINI_API_KEY")
56
+
57
  self.progress_callback = progress_callback
58
 
59
  if not self.mistral_api_key:
 
64
 
65
  self.mistral_client = Mistral(api_key=self.mistral_api_key)
66
  self.openai_client = OpenAI(api_key=self.openai_api_key)
67
+ self.xai_client = OpenAI(api_key=self.xai_api_key, base_url="https://api.x.ai/v1")
68
+ self.qwen_client = OpenAI(api_key=self.qwen_api_key, base_url="https://router.huggingface.co/nebius/v1")
69
+ self.deepseek_client = OpenAI(api_key=self.deepseek_api_key, base_url="https://api.deepseek.com")
70
+ self.gemini_client = OpenAI(api_key=self.gemini_api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
71
+
72
+ # Define model names
73
  self.mistral_model = "mistral-large-latest"
74
+ self.openai_o4mini = "o4-mini"
75
+ self.openai_4o = "gpt-4o"
76
+ self.deepseek_model = "deepseek-reasoner"
77
+ self.grok_model = "grok-3-beta"
78
+ self.qwen_model = "Qwen/Qwen3-235B-A22B"
79
+ self.gemini_model = "gemini-2.5-pro-preview-05-06"
80
 
81
+ # Create a dictionary mapping model names to their clients and model identifiers
82
+ self.model_configs = {
83
+ "mistral-large": {
84
+ "client": self.mistral_client,
85
+ "model_id": self.mistral_model,
86
+ "type": "mistral"
87
+ },
88
+ "o4-mini": {
89
+ "client": self.openai_client,
90
+ "model_id": self.openai_o4mini,
91
+ "type": "openai"
92
+ },
93
+ "gpt-4o": {
94
+ "client": self.openai_client,
95
+ "model_id": self.openai_4o,
96
+ "type": "openai"
97
+ },
98
+ "deepseek-reasoner": {
99
+ "client": self.deepseek_client,
100
+ "model_id": self.deepseek_model,
101
+ "type": "openai"
102
+ },
103
+ "grok-3": {
104
+ "client": self.xai_client,
105
+ "model_id": self.grok_model,
106
+ "type": "openai"
107
+ },
108
+ "qwen-235b": {
109
+ "client": self.qwen_client,
110
+ "model_id": self.qwen_model,
111
+ "type": "openai"
112
+ },
113
+ "gemini-2.5-pro": {
114
+ "client": self.gemini_client,
115
+ "model_id": self.gemini_model,
116
+ "type": "openai"
117
+ }
118
+ }
119
+
120
+ # Set default models (will be randomized later)
121
+ self.generator_model = "mistral-large"
122
+ self.judge_model = "o4-mini"
123
+
124
+ logger.info("PAS2 initialized with available models: %s", ", ".join(self.model_configs.keys()))
125
 
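
Most of the new providers are wired through the OpenAI SDK with a provider-specific `base_url`, so `model_configs` can treat them uniformly. A minimal sketch of that pattern (the `make_client` helper is hypothetical; the secret names and base URLs are the ones in the hunk above):

```python
import os
from typing import Optional
from openai import OpenAI

def make_client(env_key: str, base_url: Optional[str] = None) -> OpenAI:
    # Mirror the HF-Spaces convention above: prefer HF_-prefixed secrets
    api_key = os.environ.get(f"HF_{env_key}") or os.environ.get(env_key)
    return OpenAI(api_key=api_key, base_url=base_url)

clients = {
    "grok-3": make_client("XAI_API_KEY", "https://api.x.ai/v1"),
    "deepseek-reasoner": make_client("DEEPSEEK_API_KEY", "https://api.deepseek.com"),
    "gemini-2.5-pro": make_client(
        "GEMINI_API_KEY",
        "https://generativelanguage.googleapis.com/v1beta/openai/"),
}
```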
126
  def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
127
  """Generate paraphrases of the input query using Mistral API"""
 
198
 
199
  return fallback_paraphrases
200
 
201
+ def set_random_model_pair(self):
202
+ """Randomly select a pair of generator and judge models"""
203
+ import random
204
+
205
+ # Get list of available models
206
+ available_models = list(self.model_configs.keys())
207
+
208
+ # Randomly select generator and judge models
209
+ self.generator_model = random.choice(available_models)
210
+
211
+ # Make sure judge is different from generator
212
+ judge_options = [m for m in available_models if m != self.generator_model]
213
+ self.judge_model = random.choice(judge_options)
214
+
215
+ logger.info("Randomly selected model pair - Generator: %s, Judge: %s",
216
+ self.generator_model, self.judge_model)
217
+
218
+ return self.generator_model, self.judge_model
219
+
220
  def _get_single_response(self, query: str, index: int = None) -> str:
221
+ """Get a single response from the selected generator model for a query"""
222
  try:
223
  query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
224
+ logger.info("Getting response for %s using %s", query_description, self.generator_model)
225
  start_time = time.time()
226
 
227
+ # Get the model configuration
228
+ model_config = self.model_configs[self.generator_model]
229
+ client = model_config["client"]
230
+ model_id = model_config["model_id"]
231
+ model_type = model_config["type"]
232
+
233
  messages = [
234
  {
235
  "role": "system",
 
241
  }
242
  ]
243
 
244
+ # Use the appropriate client and model based on the type
245
+ if model_type == "mistral":
246
+ response = client.chat.complete(
247
+ model=model_id,
248
+ messages=messages
249
+ )
250
+ result = response.choices[0].message.content
251
+ else: # openai-compatible API
252
+ response = client.chat.completions.create(
253
+ model=model_id,
254
+ messages=messages
255
+ )
256
+ result = response.choices[0].message.content
257
 
 
258
  elapsed_time = time.time() - start_time
259
 
260
+ logger.info("Received response from %s for %s (%.2f seconds)",
261
+ self.generator_model, query_description, elapsed_time)
262
  logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
263
 
264
  return result
265
 
266
  except Exception as e:
267
+ error_msg = f"Error getting response for query '{query}' with model {self.generator_model}: {e}"
268
  logger.error(error_msg, exc_info=True)
269
+ return f"Error: Failed to get response for this query with model {self.generator_model}."
270
 
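
The one provider that does not speak the OpenAI chat interface is Mistral, whose SDK uses `chat.complete` rather than `chat.completions.create`; the `type` field in `model_configs` exists only to pick between those two call shapes. A condensed view of the branch in `_get_single_response` above:

```python
def call_model(config: dict, messages: list) -> str:
    """Condensed view of the dispatch in _get_single_response above."""
    if config["type"] == "mistral":
        response = config["client"].chat.complete(
            model=config["model_id"], messages=messages)
    else:  # any OpenAI-compatible endpoint (OpenAI, xAI, DeepSeek, Gemini, ...)
        response = config["client"].chat.completions.create(
            model=config["model_id"], messages=messages)
    return response.choices[0].message.content
```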
271
  def get_responses(self, queries: List[str]) -> List[str]:
272
  """Get responses from Mistral API for each query in parallel"""
 
326
  logger.info("Starting hallucination detection for query: %s", query)
327
  start_time = time.time()
328
 
329
+ # Randomly select a model pair for this detection
330
+ generator_model, judge_model = self.set_random_model_pair()
331
+ logger.info("Using %s as generator and %s as judge for this detection", generator_model, judge_model)
332
+
333
  # Report progress
334
  if self.progress_callback:
335
  self.progress_callback("starting", query=query)
 
345
  self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
346
 
347
  # Get responses to all queries
348
+ logger.info("Step 2: Getting responses to all %d queries using %s", len(all_queries), generator_model)
349
  if self.progress_callback:
350
+ self.progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
351
 
352
  all_responses = []
353
  for i, q in enumerate(all_queries):
 
362
  self.progress_callback("responses_complete", query=query)
363
 
364
  # Judge the responses for hallucinations
365
+ logger.info("Step 3: Judging for hallucinations using %s", judge_model)
366
  if self.progress_callback:
367
+ self.progress_callback("judging", query=query, model=judge_model)
368
 
369
  # The first query is the original, rest are paraphrases
370
  original_query = all_queries[0]
 
390
  "confidence_score": judgment.confidence_score,
391
  "conflicting_facts": judgment.conflicting_facts,
392
  "reasoning": judgment.reasoning,
393
+ "summary": judgment.summary,
394
+ "generator_model": generator_model,
395
+ "judge_model": judge_model
396
  }
397
 
398
  # Report completion
399
  if self.progress_callback:
400
+ self.progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
401
 
402
+ logger.info("Hallucination detection completed in %.2f seconds using %s (generator) and %s (judge)",
403
+ time.time() - start_time, generator_model, judge_model)
404
  return results
405
 
406
  def judge_hallucination(self,
 
409
  paraphrased_queries: List[str],
410
  paraphrased_responses: List[str]) -> HallucinationJudgment:
411
  """
412
+ Use the selected judge model to detect hallucinations in the responses
413
  """
414
+ logger.info("Judging hallucinations with %s model", self.judge_model)
415
  start_time = time.time()
416
 
417
+ # Get the model configuration for the judge
418
+ model_config = self.model_configs[self.judge_model]
419
+ client = model_config["client"]
420
+ model_id = model_config["model_id"]
421
+ model_type = model_config["type"]
422
+
423
  # Prepare the context for the judge
424
  context = f"""
425
  Original Question: {original_query}
 
448
  """
449
 
450
  try:
451
+ logger.info("Sending judgment request to %s...", self.judge_model)
452
+
453
+ # Use the appropriate client and model based on the type
454
+ if model_type == "mistral":
455
+ response = client.chat.complete(
456
+ model=model_id,
457
+ messages=[
458
+ {"role": "system", "content": system_prompt},
459
+ {"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
460
+ ],
461
+ response_format={"type": "json_object"}
462
+ )
463
+ result_json = json.loads(response.choices[0].message.content)
464
+ else: # openai-compatible API
465
+ response = client.chat.completions.create(
466
+ model=model_id,
467
+ messages=[
468
+ {"role": "system", "content": system_prompt},
469
+ {"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
470
+ ],
471
+ response_format={"type": "json_object"}
472
+ )
473
+ result_json = json.loads(response.choices[0].message.content)
474
 
475
+ logger.debug("Received judgment response from %s: %s", self.judge_model, result_json)
 
476
 
477
  # Create the HallucinationJudgment object from the JSON response
478
  judgment = HallucinationJudgment(
 
484
  )
485
 
486
  elapsed_time = time.time() - start_time
487
+ logger.info("Judgment completed by %s in %.2f seconds", self.judge_model, elapsed_time)
488
 
489
  return judgment
490
 
491
  except Exception as e:
492
+ logger.error("Error in hallucination judgment with %s: %s", self.judge_model, str(e), exc_info=True)
493
  # Return a fallback judgment
494
  return HallucinationJudgment(
495
  hallucination_detected=False,
496
  confidence_score=0.0,
497
  conflicting_facts=[],
498
+ reasoning=f"Failed to obtain judgment from the {self.judge_model} model: {str(e)}",
499
  summary="Analysis failed due to API error."
500
  )
501
 
 
612
  "conflicting_facts": results.get('conflicting_facts', []),
613
  "reasoning": results.get('reasoning', ''),
614
  "summary": results.get('summary', ''),
615
+ "generator_model": results.get('generator_model', 'unknown'),
616
+ "judge_model": results.get('judge_model', 'unknown'),
617
  "user_feedback": feedback
618
  }
619
 
620
  # Insert document into collection
621
+ result = self.feedback_collection.insert_one(document)
622
+
623
+ # Update model leaderboard scores
624
+ self._update_model_scores(
625
+ generator=results.get('generator_model', 'unknown'),
626
+ judge=results.get('judge_model', 'unknown'),
627
+ feedback=feedback,
628
+ hallucination_detected=results.get('hallucination_detected', False)
629
+ )
630
 
631
  logger.info("Feedback saved successfully to MongoDB")
632
  return "Feedback saved successfully!"
 
634
  logger.error("Error saving feedback: %s", str(e), exc_info=True)
635
  return f"Error saving feedback: {str(e)}"
636
 
637
+ def _update_model_scores(self, generator, judge, feedback, hallucination_detected):
638
+ """Update the ELO scores for the generator and judge models based on feedback"""
639
+ try:
640
+ if self.db is None:
641
+ logger.error("MongoDB connection not available. Cannot update model scores.")
642
+ return
643
+
644
+ # Access or create the models collection
645
+ models_collection = self.db.get_collection("model_scores")
646
+
647
+ # Create indexes if they don't exist
648
+ models_collection.create_index("model_name", unique=True)
649
+
650
+ # Parse the feedback to determine scenario
651
+ actual_hallucination = "Yes, there was a hallucination" in feedback
652
+ no_hallucination = "No, there was no hallucination" in feedback
653
+ judge_correct = "Yes, the judge was correct" in feedback
654
+ judge_incorrect = "No, the judge was incorrect" in feedback
655
+
656
+ # Determine scores based on different scenarios:
657
+ # 1. Actual hallucination + Judge correct = positive for judge, negative for generator
658
+ # 2. No hallucination + Judge correct = positive for both
659
+ # 3. No hallucination + Judge incorrect = negative for judge, positive for generator
660
+ # 4. Actual hallucination + Judge incorrect = negative for both
661
+
662
+ if judge_correct:
663
+ if actual_hallucination:
664
+ # Scenario 1: Judge correctly detected hallucination
665
+ judge_score = 1 # Positive for judge
666
+ generator_score = 0 # Negative for generator (hallucinated)
667
+ logger.info("Judge %s correctly detected hallucination from generator %s", judge, generator)
668
+ elif no_hallucination:
669
+ # Scenario 2: Judge correctly determined no hallucination
670
+ judge_score = 1 # Positive for judge
671
+ generator_score = 1 # Positive for generator (didn't hallucinate)
672
+ logger.info("Judge %s correctly determined no hallucination from generator %s", judge, generator)
673
+ else:
674
+ # User unsure about hallucination, but confirmed judge was correct
675
+ judge_score = 1 # Positive for judge
676
+ generator_score = 0.5 # Neutral for generator (unclear)
677
+ logger.info("User confirmed judge %s was correct, but unclear about hallucination from %s", judge, generator)
678
+ elif judge_incorrect:
679
+ if no_hallucination:
680
+ # Scenario 3: Judge incorrectly claimed hallucination (false positive)
681
+ judge_score = 0 # Negative for judge
682
+ generator_score = 1 # Positive for generator (unfairly accused)
683
+ logger.info("Judge %s incorrectly claimed hallucination from generator %s", judge, generator)
684
+ elif actual_hallucination:
685
+ # Scenario 4: Judge missed actual hallucination (false negative)
686
+ judge_score = 0 # Negative for judge
687
+ generator_score = 0 # Negative for generator (hallucination went undetected)
688
+ logger.info("Judge %s missed actual hallucination from generator %s", judge, generator)
689
+ else:
690
+ # User unsure about hallucination, but confirmed judge was incorrect
691
+ judge_score = 0 # Negative for judge
692
+ generator_score = 0.5 # Neutral for generator (unclear)
693
+ logger.info("User confirmed judge %s was incorrect, but unclear about hallucination from %s", judge, generator)
694
+ else:
695
+ # User unsure about judge correctness, don't update scores
696
+ judge_score = 0.5 # Neutral for judge (unclear)
697
+ generator_score = 0.5 # Neutral for generator (unclear)
698
+ logger.info("User unsure about judge %s correctness and generator %s hallucination", judge, generator)
699
+
700
+ # Update generator model stats with specific score
701
+ self._update_model_stats(models_collection, generator, generator_score, "generator")
702
+
703
+ # Update judge model stats with specific score
704
+ self._update_model_stats(models_collection, judge, judge_score, "judge")
705
+
706
+ # Determine if the detection was correct based on judge correctness
707
+ detection_correct = judge_correct
708
+
709
+ # Determine if there was actually hallucination based on user feedback
710
+ actual_hallucination_present = actual_hallucination
711
+
712
+ # Update model pair stats
713
+ self._update_model_pair_stats(generator, judge, detection_correct, actual_hallucination_present,
714
+ generator_score, judge_score)
715
+
716
+ logger.info("Updated model scores based on feedback: generator(%s)=%s, judge(%s)=%s",
717
+ generator, generator_score, judge, judge_score)
718
+
719
+ except Exception as e:
720
+ logger.error("Error updating model scores: %s", str(e), exc_info=True)
721
+
722
+ def _update_model_stats(self, collection, model_name, score, role):
723
+ """Update statistics for a single model"""
724
+ # Simplified ELO calculation
725
+ K_FACTOR = 32 # Standard K-factor for ELO
726
+
727
+ # Get current model data or create if not exists
728
+ model_data = collection.find_one({"model_name": model_name})
729
+
730
+ if model_data is None:
731
+ # Initialize new model with default values
732
+ model_data = {
733
+ "model_name": model_name,
734
+ "elo_score": 1500, # Starting ELO
735
+ "total_samples": 0,
736
+ "correct_predictions": 0,
737
+ "accuracy": 0.0,
738
+ "as_generator": 0,
739
+ "as_judge": 0,
740
+ "as_generator_correct": 0,
741
+ "as_judge_correct": 0,
742
+ "neutral_samples": 0 # Add a counter for neutral samples
743
+ }
744
+
745
+ # Skip counting for neutral feedback (0.5)
746
+ if score == 0.5:
747
+ # Increment neutral samples counter instead
748
+ if "neutral_samples" not in model_data:
749
+ model_data["neutral_samples"] = 0
750
+ model_data["neutral_samples"] += 1
751
+
752
+ # Expected score based on current rating (vs average rating)
753
+ expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))
754
+
755
+ # For neutral score, use a much smaller K factor to slightly adjust the ELO
756
+ # This handles the "unsure" case with minimal impact
757
+ model_data["elo_score"] = model_data["elo_score"] + (K_FACTOR/4) * (0.5 - expected_score)
758
+
759
+ # Update or insert the model data
760
+ collection.replace_one(
761
+ {"model_name": model_name},
762
+ model_data,
763
+ upsert=True
764
+ )
765
+ return
766
+
767
+ # Update sample counts for non-neutral cases
768
+ model_data["total_samples"] += 1
769
+ if role == "generator":
770
+ model_data["as_generator"] += 1
771
+ if score == 1: # Only count as correct if score is 1 (not 0)
772
+ model_data["as_generator_correct"] += 1
773
+ else: # role == "judge"
774
+ model_data["as_judge"] += 1
775
+ if score == 1: # Only count as correct if score is 1 (not 0)
776
+ model_data["as_judge_correct"] += 1
777
+
778
+ # Update correct predictions based on score
779
+ if score == 1:
780
+ model_data["correct_predictions"] += 1
781
+
782
+ # Calculate new accuracy
783
+ model_data["accuracy"] = model_data["correct_predictions"] / model_data["total_samples"]
784
+
785
+ # Update ELO score based on the specific score value (0 or 1)
786
+ # Expected score based on current rating (vs average rating)
787
+ expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))
788
+
789
+ # Use the provided score (0 or 1)
790
+ actual_score = score
791
+
792
+ # New ELO calculation
793
+ model_data["elo_score"] = model_data["elo_score"] + K_FACTOR * (actual_score - expected_score)
794
+
795
+ # Update or insert the model data
796
+ collection.replace_one(
797
+ {"model_name": model_name},
798
+ model_data,
799
+ upsert=True
800
+ )
801
+
802
+ def _update_model_pair_stats(self, generator, judge, detection_correct, hallucination_detected,
803
+ generator_score, judge_score):
804
+ """Update statistics for a model pair combination"""
805
+ try:
806
+ # Access or create the model pairs collection
807
+ pairs_collection = self.db.get_collection("model_pairs")
808
+
809
+ # Create compound index if it doesn't exist
810
+ pairs_collection.create_index([("generator", 1), ("judge", 1)], unique=True)
811
+
812
+ # Get current pair data or create if not exists
813
+ pair_data = pairs_collection.find_one({
814
+ "generator": generator,
815
+ "judge": judge
816
+ })
817
+
818
+ if pair_data is None:
819
+ # Initialize new pair with default values
820
+ pair_data = {
821
+ "generator": generator,
822
+ "judge": judge,
823
+ "elo_score": 1500, # Starting ELO
824
+ "total_samples": 0,
825
+ "correct_predictions": 0,
826
+ "accuracy": 0.0,
827
+ "hallucinations_detected": 0,
828
+ "generator_performance": 0.0,
829
+ "judge_performance": 0.0,
830
+ "consistency_score": 0.0
831
+ }
832
+
833
+ # Update sample counts
834
+ pair_data["total_samples"] += 1
835
+ if detection_correct:
836
+ pair_data["correct_predictions"] += 1
837
+
838
+ if hallucination_detected:
839
+ pair_data["hallucinations_detected"] += 1
840
+
841
+ # Track model-specific performances within the pair
842
+ if "generator_correct_count" not in pair_data:
843
+ pair_data["generator_correct_count"] = 0
844
+ if "judge_correct_count" not in pair_data:
845
+ pair_data["judge_correct_count"] = 0
846
+
847
+ # Update individual performance counters based on scores
848
+ if generator_score == 1:
849
+ pair_data["generator_correct_count"] += 1
850
+ if judge_score == 1:
851
+ pair_data["judge_correct_count"] += 1
852
+
853
+ # Calculate individual performance rates within the pair
854
+ pair_data["generator_performance"] = pair_data["generator_correct_count"] / pair_data["total_samples"]
855
+ pair_data["judge_performance"] = pair_data["judge_correct_count"] / pair_data["total_samples"]
856
+
857
+ # Calculate new accuracy for the pair (detection accuracy)
858
+ pair_data["accuracy"] = pair_data["correct_predictions"] / pair_data["total_samples"]
859
+
860
+ # Calculate consistency score - weighted average of individual performances
861
+ # Gives more weight to the judge when hallucinations are detected
862
+ if hallucination_detected:
863
+ # When hallucination is detected, judge's role is more critical
864
+ pair_data["consistency_score"] = (0.4 * pair_data["generator_performance"] +
865
+ 0.6 * pair_data["judge_performance"])
866
+ else:
867
+ # When no hallucination is detected, both roles are equally important
868
+ pair_data["consistency_score"] = (0.5 * pair_data["generator_performance"] +
869
+ 0.5 * pair_data["judge_performance"])
870
+
871
+ # Update ELO score (simplified version)
872
+ K_FACTOR = 24 # Slightly lower K-factor for pairs
873
+
874
+ # Expected score based on current rating
875
+ expected_score = 1 / (1 + 10**((1500 - pair_data["elo_score"]) / 400))
876
+
877
+ # Actual score - use the average of both model scores (0-1 range)
878
+ # This represents the pair's overall performance
879
+ actual_score = (generator_score + judge_score) / 2
880
+
881
+ # New ELO calculation
882
+ pair_data["elo_score"] = pair_data["elo_score"] + K_FACTOR * (actual_score - expected_score)
883
+
884
+ # Update or insert the pair data
885
+ pairs_collection.replace_one(
886
+ {"generator": generator, "judge": judge},
887
+ pair_data,
888
+ upsert=True
889
+ )
890
+
891
+ logger.info("Updated model pair stats for %s (generator) and %s (judge)", generator, judge)
892
+
893
+ except Exception as e:
894
+ logger.error("Error updating model pair stats: %s", str(e), exc_info=True)
895
+ return None
896
+
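
The update in `_update_model_stats` is a standard Elo step, except that the opponent is always the fixed 1500 baseline rather than another rated model. A worked example of the numbers it produces:

```python
def elo_update(elo: float, score: float, k: float = 32) -> float:
    # Expected score against the fixed 1500 baseline, as in the code above
    expected = 1 / (1 + 10 ** ((1500 - elo) / 400))
    return elo + k * (score - expected)

elo = 1500.0
elo = elo_update(elo, 1.0)       # correct:   expected 0.5    -> 1516.0
elo = elo_update(elo, 0.0)       # incorrect: expected ~0.523 -> ~1499.3
elo = elo_update(elo, 0.5, k=8)  # neutral feedback uses K_FACTOR/4, barely moving the score
```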
897
  def get_feedback_stats(self):
898
  """Get statistics about collected feedback from MongoDB"""
899
  try:
 
928
  except Exception as e:
929
  logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
930
  return None
931
+
932
+ def get_model_leaderboard(self):
933
+ """Get the current model leaderboard data"""
934
+ try:
935
+ if self.db is None:
936
+ logger.error("MongoDB connection not available. Cannot get model leaderboard.")
937
+ return None
938
+
939
+ # Access models collection
940
+ models_collection = self.db.get_collection("model_scores")
941
+
942
+ # Get all models and sort by ELO score
943
+ models = list(models_collection.find().sort("elo_score", pymongo.DESCENDING))
944
+
945
+ # Format percentages and convert ObjectId
946
+ for model in models:
947
+ model["_id"] = str(model["_id"])
948
+ model["accuracy"] = round(model["accuracy"] * 100, 1)
949
+ if "as_generator" in model and model["as_generator"] > 0:
950
+ model["generator_accuracy"] = round((model["as_generator_correct"] / model["as_generator"]) * 100, 1)
951
+ else:
952
+ model["generator_accuracy"] = 0.0
953
+
954
+ if "as_judge" in model and model["as_judge"] > 0:
955
+ model["judge_accuracy"] = round((model["as_judge_correct"] / model["as_judge"]) * 100, 1)
956
+ else:
957
+ model["judge_accuracy"] = 0.0
958
+
959
+ return models
960
+ except Exception as e:
961
+ logger.error("Error getting model leaderboard: %s", str(e), exc_info=True)
962
+ return []
963
+
964
+ def get_pair_leaderboard(self):
965
+ """Get the current model pair leaderboard data"""
966
+ try:
967
+ if self.db is None:
968
+ logger.error("MongoDB connection not available. Cannot get pair leaderboard.")
969
+ return None
970
+
971
+ # Access model pairs collection
972
+ pairs_collection = self.db.get_collection("model_pairs")
973
+
974
+ # Get all pairs and sort by ELO score
975
+ pairs = list(pairs_collection.find().sort("elo_score", pymongo.DESCENDING))
976
+
977
+ # Format percentages and convert ObjectId
978
+ for pair in pairs:
979
+ pair["_id"] = str(pair["_id"])
980
+ pair["accuracy"] = round(pair["accuracy"] * 100, 1)
981
+ pair["consistency_score"] = round(pair["consistency_score"] * 100, 1)
982
+
983
+ return pairs
984
+ except Exception as e:
985
+ logger.error("Error getting pair leaderboard: %s", str(e), exc_info=True)
986
+ return []
987
 
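
For reference, a hypothetical way to dump the new leaderboard from a Python shell; the `HallucinationDetectorApp` constructor is not shown in this diff, so treat the setup line as an assumption. The field names match `get_model_leaderboard` above:

```python
app = HallucinationDetectorApp()  # assumed: constructor wires up MongoDB as elsewhere in app.py
for m in app.get_model_leaderboard() or []:
    print(f"{m['model_name']:<20} elo={m['elo_score']:7.1f} "
          f"acc={m['accuracy']}% (gen {m['generator_accuracy']}% / judge {m['judge_accuracy']}%)")
```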
988
  def export_data_to_csv(self, filepath=None):
989
  """Export all feedback data to a CSV file for analysis"""
 
1100
  "starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
1101
  "generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
1102
  "paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
1103
+ "getting_responses": {"status": "Getting responses using {model}...", "progress": 35, "color": "#2196F3"},
1104
  "responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
1105
  "responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
1106
+ "judging": {"status": "Analyzing responses for hallucinations using {model}...", "progress": 70, "color": "#2196F3"},
1107
+ "complete": {"status": "Analysis complete! Using {generator} (generator) and {judge} (judge)", "progress": 100, "color": "#4CAF50"},
1108
  "error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
1109
  }
1110
 
 
1115
  self.completed_responses = 0
1116
  self.total_responses = 0
1117
  self.error_message = ""
1118
+ self.generator_model = ""
1119
+ self.judge_model = ""
1120
+ self.model = "" # For general model reference in status messages
1121
  self._lock = threading.Lock()
1122
  self._status_callback = None
1123
  self._stop_event = threading.Event()
 
1144
  self.total_responses = value
1145
  elif key == 'error_message':
1146
  self.error_message = value
1147
+ elif key == 'model':
1148
+ self.model = value
1149
+ elif key == 'generator':
1150
+ self.generator_model = value
1151
+ elif key == 'judge':
1152
+ self.judge_model = value
1153
 
1154
  # Format status message
1155
  if stage == 'responses_progress':
 
1157
  completed=self.completed_responses,
1158
  total=self.total_responses
1159
  )
1160
+ elif stage == 'getting_responses' and 'model' in kwargs:
1161
+ self.stage_data['status'] = self.stage_data['status'].format(
1162
+ model=kwargs.get('model', 'selected model')
1163
+ )
1164
+ elif stage == 'judging' and 'model' in kwargs:
1165
+ self.stage_data['status'] = self.stage_data['status'].format(
1166
+ model=kwargs.get('model', 'selected model')
1167
+ )
1168
+ elif stage == 'complete' and 'generator' in kwargs and 'judge' in kwargs:
1169
+ self.stage_data['status'] = self.stage_data['status'].format(
1170
+ generator=self.generator_model,
1171
+ judge=self.judge_model
1172
+ )
1173
  elif stage == 'error':
1174
  self.stage_data['status'] = self.stage_data['status'].format(
1175
  error_message=self.error_message
 
1189
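
The new template fields rely on `PAS2` passing model names through the callback kwargs. A minimal consumer showing the protocol (the stage names and kwargs are the ones used in the hunks above):

```python
def print_progress(stage: str, **kwargs) -> None:
    # PAS2 invokes progress_callback(stage, **kwargs); ProgressTracker pulls
    # query/total/model/generator/judge out of kwargs to fill its templates.
    print(f"[{stage}] {kwargs}")

pas2 = PAS2(progress_callback=print_progress)
# A run would emit lines such as:
#   [getting_responses] {'query': '...', 'total': 4, 'model': 'grok-3'}
#   [complete] {'query': '...', 'generator': 'grok-3', 'judge': 'o4-mini'}
```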
  # Only show status text if not in idle state
1190
  status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
1191
 
1192
+ # Add model information if available and we're not in idle or error state
1193
+ model_info = ''
1194
+ if self.stage not in ["idle", "error", "starting"] and (self.generator_model or self.judge_model):
1195
+ model_info = f'<div class="model-info" style="display: flex; justify-content: space-between; margin-top: 8px; font-size: 0.85em; color: #37474f; background-color: #e1f5fe; padding: 5px 10px; border-radius: 4px;">'
1196
+ if self.generator_model:
1197
+ model_info += f'<div><span style="font-weight: bold;">Generator:</span> {self.generator_model}</div>'
1198
+ if self.judge_model:
1199
+ model_info += f'<div><span style="font-weight: bold;">Judge:</span> {self.judge_model}</div>'
1200
+ model_info += '</div>'
1201
+
1202
  html = f"""
1203
  <div class="progress-container">
1204
  {query_info}
 
1206
  <div class="progress-bar-container">
1207
  <div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
1208
  </div>
1209
+ {model_info}
1210
  </div>
1211
  """
1212
  return html
 
1575
  combined_progress_callback("starting", query=query)
1576
  time.sleep(0.3) # Ensure starting status is visible
1577
 
1578
+ # Step 1.5: Randomly select model pair
1579
+ generator_model, judge_model = detector.pas2.set_random_model_pair()
1580
+ combined_progress_callback("starting", query=query, generator=generator_model, judge=judge_model)
1581
+ time.sleep(0.3) # Ensure model info is visible
1582
+
  # Step 2: Generate paraphrases (15-30%)
  combined_progress_callback("generating_paraphrases", query=query)
  all_queries = detector.pas2.generate_paraphrases(query)
  combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))

  # Step 3: Get responses (35-65%)
+ combined_progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
  all_responses = []
  for i, q in enumerate(all_queries):
  # Show incremental progress for each response

  combined_progress_callback("responses_complete", query=query)

  # Step 4: Judge hallucinations (70-100%)
+ combined_progress_callback("judging", query=query, model=judge_model)

  # The first query is the original, rest are paraphrases
  original_query = all_queries[0]
 
  "confidence_score": judgment.confidence_score,
  "conflicting_facts": judgment.conflicting_facts,
  "reasoning": judgment.reasoning,
+ "summary": judgment.summary,
+ "generator_model": generator_model,
+ "judge_model": judge_model
  }

  # Show completion
+ combined_progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
  time.sleep(0.3)  # Ensure complete status is visible

  return results
 
  reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
  conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"

+ # Get model info from the results
+ generator_model = results.get("generator_model", "unknown model")
+ judge_model = results.get("judge_model", "unknown model")
+
  html_output = f"""
  <div class="container">
  <h2 class="title">Hallucination Detection Results</h2>

+ <div class="model-info-bar" style="background-color: #e1f5fe; padding: 10px 15px; border-radius: 8px; margin-bottom: 15px; display: flex; justify-content: space-between;">
+ <div style="flex: 1; text-align: center; border-right: 1px solid #b3e5fc; padding-right: 10px;">
+ <div style="font-weight: bold; color: #0277bd;">Generator Model</div>
+ <div style="font-size: 1.2em; color: #01579b;">{generator_model}</div>
+ </div>
+ <div style="flex: 1; text-align: center; padding-left: 10px;">
+ <div style="font-weight: bold; color: #0277bd;">Judge Model</div>
+ <div style="font-size: 1.2em; color: #01579b;">{judge_model}</div>
+ </div>
+ </div>
+
  <div class="stats-section">
  <div class="stat-item">
  <div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
 
  {original_query}
  </div>

+ <div class="section-title">Original Response <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
  <div class="response-box">
  {original_response_safe}
  </div>

  {q}
  </div>

+ <div class="section-title">Response {i} <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
  <div class="response-box">
  {r}
  </div>
  """

  html_output += f"""
+ <div class="section-title">Detailed Analysis <span style="font-size: 0.8em; color: #607d8b;">(judged by {judge_model})</span></div>
  <div class="info-box">
  <p><strong>Reasoning:</strong></p>
  <p>{reasoning_safe}</p>

  <p><strong>Conflicting Facts:</strong></p>
  <p>{conflicting_facts_text_safe}</p>
  </div>
+
+ <div style="margin-top: 20px; border-top: 1px dashed #ccc; padding-top: 15px; font-size: 0.9em; color: #607d8b; text-align: center;">
+ Models randomly selected for this analysis: <strong>{generator_model}</strong> (Generator) and <strong>{judge_model}</strong> (Judge)
+ </div>
  </div>
  """
 
 
  ]

  # Helper function to submit feedback
+ def combine_feedback(hallucination_present, judge_correct, fb_text, results):
+ combined_feedback = f"Hallucination: {hallucination_present}, Judge Correct: {judge_correct}"
+ if fb_text:
+ combined_feedback += f", Comments: {fb_text}"
+
  if not results:
  return "No results to attach feedback to."
 
 
  This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:

  1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
+ 2. **Multiple Responses**: All questions (original + paraphrases) are sent to a randomly selected generator model
+ 3. **Expert Judgment**: A randomly selected judge model analyzes all responses to detect factual inconsistencies

  ### Why This Approach?
 
 
  gr.Markdown("### Help Improve the System")
  gr.Markdown("Your feedback helps us refine the hallucination detection system.")

+ hallucination_present = gr.Radio(
+ label="Was there actually a hallucination in the responses?",
+ choices=["Yes, there was a hallucination", "No, there was no hallucination", "Not sure"],
+ value="Not sure"
+ )
+
+ judge_correct = gr.Radio(
+ label="Did the judge model correctly identify the situation?",
+ choices=["Yes, the judge was correct", "No, the judge was incorrect", "Not sure"],
+ value="Not sure"
  )

  feedback_text = gr.Textbox(
 
  gr.Markdown("## Hallucination Detection Scores")
  gr.Markdown("Performance comparison of different Generator + Judge model combinations.")

+ # Function to generate the HTML for the model pair leaderboard
+ def generate_pair_leaderboard_html():
+ try:
+ # Get leaderboard data
+ pairs = detector.get_pair_leaderboard() or []
+
+ if not pairs:
+ return (
+ "<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
+ "border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ "<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
+ "<p>Try the detector with more queries to populate the leaderboard!</p>"
+ "</div>"
+ )
+
+ # Generate table rows
+ rows = ""
+ for rank, pair in enumerate(pairs, 1):
+ # Add special styling for top 3
+ row_class = ""
+ if rank == 1:
+ row_class = "class='top-rank-1'"
+ elif rank == 2:
+ row_class = "class='top-rank-2'"
+ elif rank == 3:
+ row_class = "class='top-rank-3'"
+
+ # Format percentages for display
+ generator_perf = f"{pair.get('generator_performance', 0) * 100:.1f}%" if 'generator_performance' in pair else "N/A"
+ judge_perf = f"{pair.get('judge_performance', 0) * 100:.1f}%" if 'judge_performance' in pair else "N/A"
+ consistency = f"{pair.get('consistency_score', 0)}%" if 'consistency_score' in pair else "N/A"
+
+ rows += (
+ f"<tr {row_class}>"
+ f"<td>{rank}</td>"
+ f"<td>{pair.get('generator', 'unknown')}</td>"
+ f"<td>{pair.get('judge', 'unknown')}</td>"
+ f"<td>{round(pair.get('elo_score', 0))}</td>"
+ f"<td>{pair.get('accuracy', 0)}%</td>"
+ f"<td style='color: #80cbc4; font-weight: 500;'>{generator_perf}</td>"
+ f"<td style='color: #90caf9; font-weight: 500;'>{judge_perf}</td>"
+ f"<td style='color: #ce93d8; font-weight: 500;'>{consistency}</td>"
+ f"<td>{pair.get('total_samples', 0)}</td>"
+ f"</tr>"
+ )
+
+ # Build the full table
+ html = (
+ f"<div class=\"leaderboard-container\">"
+ f"<table class=\"leaderboard-table\">"
+ f"<thead>"
+ f"<tr>"
+ f"<th>Rank</th>"
+ f"<th>Generator Model</th>"
+ f"<th>Judge Model</th>"
+ f"<th>ELO Score</th>"
+ f"<th>Accuracy</th>"
+ f"<th>Generator Perf.</th>"
+ f"<th>Judge Perf.</th>"
+ f"<th>Consistency</th>"
+ f"<th>Sample Size</th>"
+ f"</tr>"
+ f"</thead>"
+ f"<tbody>"
+ f"{rows}"
+ f"</tbody>"
+ f"</table>"
+ f"</div>"
+ f"<div style='margin-top: 15px; padding: 12px; background-color: #263238; border-radius: 8px; font-size: 0.95em; color: #e0f7fa; box-shadow: 0 2px 5px rgba(0,0,0,0.2);'>"
+ f"<p style='margin-bottom: 8px; color: #80deea;'><strong>Model Pair Performance Metrics:</strong></p>"
+ f"<ul style='margin-top: 5px; padding-left: 20px; line-height: 1.4;'>"
+ f"<li><strong style='color: #b2dfdb;'>Accuracy</strong>: Percentage of correct hallucination judgments based on user feedback</li>"
+ f"<li><strong style='color: #b2dfdb;'>Generator Performance</strong>: How well the generator model avoids hallucinations</li>"
+ f"<li><strong style='color: #b2dfdb;'>Judge Performance</strong>: How accurately the judge model identifies hallucinations</li>"
+ f"<li><strong style='color: #b2dfdb;'>Consistency</strong>: Weighted measure of how well the pair works together</li>"
+ f"</ul>"
+ f"</div>"
+ )
+
+ return html
+ except Exception as e:
+ logger.error("Error generating leaderboard HTML: %s", str(e), exc_info=True)
+ return (
+ f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
+ f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Leaderboard</h3>"
+ f"<p>{str(e)}</p>"
+ f"</div>"
+ )
+
  # Create leaderboard table for model combinations
+ model_leaderboard_html = gr.HTML(generate_pair_leaderboard_html())
+ refresh_leaderboard_btn = gr.Button("Refresh Leaderboard", variant="primary")
+ refresh_leaderboard_btn.click(
+ fn=generate_pair_leaderboard_html,
+ outputs=[model_leaderboard_html]
+ )
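# For reference, each entry returned by detector.get_pair_leaderboard() is
# read with the lookups above; a hypothetical document (field names from the
# code, values invented for illustration) would look roughly like:
#
# {
#     "generator": "gpt-4o",
#     "judge": "mistral-large",
#     "elo_score": 1524.3,
#     "accuracy": 87.5,                # already a percentage
#     "generator_performance": 0.82,   # fraction, rendered as "82.0%"
#     "judge_performance": 0.91,       # fraction, rendered as "91.0%"
#     "consistency_score": 78,         # already a percentage
#     "total_samples": 24,
# }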
 
+ # ELO rating explanation
+ with gr.Accordion("ELO Rating System Explanation", open=False):
+ gr.HTML(
+ "<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
+ "<h3 style='margin-top: 0; color: #ffffff;'>ELO Rating System Explanation</h3>" +
+ "<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>How ELO Scores Are Calculated</h4>" +
+ "<p style='color: #eceff1;'>Our ELO rating system assigns scores to model pairs based on user feedback, using the following formula:</p>" +
+ "<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
+ "<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
+ "Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model combination<br>" +
+ "* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (24 for model pairs)<br>" +
+ "* <strong style='color: #b2dfdb;'>S</strong>: Actual score from user feedback (1 for correct, 0 for incorrect)<br>" +
+ "* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
+ "<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div></div>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>Available Models</h4>" +
+ "<p style='color: #eceff1;'>The system randomly selects from these models for each hallucination detection:</p>" +
+ "<div style='display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;'>" +
+ "<div style='flex: 1; min-width: 120px;'>" +
+ "<h5 style='margin-top: 0; margin-bottom: 5px; color: #b2dfdb;'>All Models (Used as both Generator & Judge)</h5>" +
+ "<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
+ "<li>mistral-large</li><li>gpt-4o</li><li>qwen-235b</li><li>grok-3</li>" +
+ "<li>deepseek-reasoner</li><li>o4-mini</li><li>gemini-2.5-pro</li>" +
+ "</ul></div></div></div></div></div>"
+ )
+ gr.HTML(
+ "<style>" +
+ ".leaderboard-container {margin: 15px 0; overflow-x: auto;}" +
+ ".leaderboard-table {width: 100%; border-collapse: collapse; font-size: 0.95em; " +
+ "box-shadow: 0 2px 10px rgba(0,0,0,0.2); border-radius: 8px; overflow: hidden;}" +
+ ".leaderboard-table thead {background-color: #0d47a1; color: white;}" +
+ ".leaderboard-table th, .leaderboard-table td {padding: 12px 15px; text-align: left; border-bottom: 1px solid #37474f; color: #eceff1;}" +
+ ".leaderboard-table tbody tr {transition: background-color 0.3s;}" +
+ ".leaderboard-table tbody tr:nth-child(even) {background-color: #37474f;}" +
+ ".leaderboard-table tbody tr:nth-child(odd) {background-color: #455a64;}" +
+ ".leaderboard-table tbody tr:hover {background-color: #263238;}" +
+ ".leaderboard-table tbody tr.top-rank-1 {background-color: #004d40; color: #e0f2f1; font-weight: bold;}" +
+ ".leaderboard-table tbody tr.top-rank-2 {background-color: #1b5e20; color: #e8f5e9; font-weight: 500;}" +
+ ".leaderboard-table tbody tr.top-rank-3 {background-color: #33691e; color: #f1f8e9; font-weight: 500;}" +
+ ".leaderboard-table td {position: relative;}" +
+ ".leaderboard-table td::after {content: ''; position: absolute; top: 0; left: 0; width: 100%; height: 100%; background: transparent; pointer-events: none;}" +
+ "</style>"
+ )
 
+ # Tab 3: Individual Models Leaderboard
+ with gr.TabItem("Individual Models", elem_id="user-feedback-tab"):
+ gr.Markdown("## Individual Model Performance")
+ gr.Markdown("Performance ranking of models based on user feedback, showing statistics for both generator and judge roles.")

+ # Function to generate individual model leaderboard HTML
+ def generate_model_leaderboard_html():
+ try:
+ # Get model scores from MongoDB
+ models = detector.get_model_leaderboard() or []
+
+ if not models:
+ return (
+ "<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
+ "border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ "<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
+ "<p>Try the detector with more queries to populate the model scores!</p>"
+ "</div>"
+ )
+
+ # Generate table rows
+ rows = ""
+ for rank, model in enumerate(models, 1):
+ # Add special styling for top 3
+ row_class = ""
+ if rank == 1:
+ row_class = "class='top-rank-1'"
+ elif rank == 2:
+ row_class = "class='top-rank-2'"
+ elif rank == 3:
+ row_class = "class='top-rank-3'"
+
+ # Calculate role distribution
+ as_generator = model.get('as_generator', 0)
+ as_judge = model.get('as_judge', 0)
+ if as_generator + as_judge > 0:
+ generator_pct = round((as_generator / (as_generator + as_judge)) * 100)
+ judge_pct = 100 - generator_pct
+ role_distribution = f"{generator_pct}% / {judge_pct}%"
+ else:
+ role_distribution = "N/A"
+
+ # Format percentages with better contrast against dark background
+ generator_acc = f"{model.get('generator_accuracy', 0.0)}%"
+ judge_acc = f"{model.get('judge_accuracy', 0.0)}%"
+
+ rows += (
+ f"<tr {row_class}>"
+ f"<td>{rank}</td>"
+ f"<td>{model.get('model_name', 'unknown')}</td>"
+ f"<td>{round(model.get('elo_score', 0))}</td>"
+ f"<td>{model.get('accuracy', 0)}%</td>"
+ f"<td style='color: #80cbc4; font-weight: 500;'>{generator_acc}</td>"
+ f"<td style='color: #90caf9; font-weight: 500;'>{judge_acc}</td>"
+ f"<td>{model.get('total_samples', 0)}</td>"
+ f"<td style='color: #ffcc80; font-weight: 500;'>{role_distribution}</td>"
+ f"</tr>"
+ )
+
+ # Build the full table
+ html = (
+ f"<div class=\"leaderboard-container\">"
+ f"<table class=\"leaderboard-table\">"
+ f"<thead>"
+ f"<tr>"
+ f"<th>Rank</th>"
+ f"<th>Model</th>"
+ f"<th>ELO Score</th>"
+ f"<th>Overall Accuracy</th>"
+ f"<th>Generator Accuracy</th>"
+ f"<th>Judge Accuracy</th>"
+ f"<th>Sample Size</th>"
+ f"<th>Generator/Judge Ratio</th>"
+ f"</tr>"
+ f"</thead>"
+ f"<tbody>"
+ f"{rows}"
+ f"</tbody>"
+ f"</table>"
+ f"</div>"
+ )
+
+ return html
+ except Exception as e:
+ logger.error("Error generating model leaderboard HTML: %s", str(e), exc_info=True)
+ return (
+ f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
+ f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Model Leaderboard</h3>"
+ f"<p>{str(e)}</p>"
+ f"</div>"
+ )

+ # Create leaderboard table for individual models
+ model_scores_html = gr.HTML(generate_model_leaderboard_html())
+ refresh_models_btn = gr.Button("Refresh Model Scores", variant="primary")
+ refresh_models_btn.click(
+ fn=generate_model_leaderboard_html,
+ outputs=[model_scores_html]
+ )
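# Likewise, generate_model_leaderboard_html() expects entries from
# detector.get_model_leaderboard() shaped roughly like this (hypothetical
# example; field names taken from the lookups above, values invented):
#
# {
#     "model_name": "gemini-2.5-pro",
#     "elo_score": 1538.7,
#     "accuracy": 85.0,            # already a percentage
#     "generator_accuracy": 83.3,  # already a percentage
#     "judge_accuracy": 88.9,      # already a percentage
#     "total_samples": 31,
#     "as_generator": 17,          # times used in the generator role
#     "as_judge": 14,              # times used in the judge role
# }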
+
+ # ELO rating explanation for individual models
+ with gr.Accordion("ELO Rating Explanation for Individual Models", open=False):
+ gr.HTML(
+ "<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
+ "<h3 style='margin-top: 0; color: #ffffff;'>Individual Model ELO Rating System</h3>" +
+ "<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>How Individual ELO Scores Are Calculated</h4>" +
+ "<p style='color: #eceff1;'>Our ELO rating system assigns scores to individual models based on user feedback, using the following formula:</p>" +
+ "<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
+ "<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
+ "Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model<br>" +
+ "* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (32 for individual models)<br>" +
+ "* <strong style='color: #b2dfdb;'>S</strong>: Actual score (1 for correct judgment, 0 for incorrect)<br>" +
+ "* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
+ "<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div>" +
+ "<p style='color: #eceff1; margin-top: 10px;'>All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p></div>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>Interpretation Guidelines</h4>" +
+ "<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
+ "<li><strong style='color: #b2dfdb;'>1800+</strong>: Exceptional performance, very rare hallucinations</li>" +
+ "<li><strong style='color: #b2dfdb;'>1700-1799</strong>: Superior performance, minimal hallucinations</li>" +
+ "<li><strong style='color: #b2dfdb;'>1600-1699</strong>: Good performance, occasional hallucinations</li>" +
+ "<li><strong style='color: #b2dfdb;'>1500-1599</strong>: Average performance</li>" +
+ "<li><strong style='color: #b2dfdb;'>&lt;1500</strong>: Below average, frequent hallucinations</li>" +
+ "</ul><p style='font-style: italic; color: #b3e5fc; margin-top: 10px;'>" +
+ "Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.</p>" +
+ "</div></div></div>"
+ )
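# Same update rule with K=32: a model starting at the 1500 base that receives
# three consecutive correct judgments moves roughly 1500 -> 1516.0 -> 1531.3
# -> 1545.8, since E grows as the rating climbs and each successive gain shrinks.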
 
  # Function to continuously update stats
  def update_stats():

  live_stats = gr.HTML(update_stats())

  # Add loading animation style
+ gr.HTML(
+ "<style>" +
+ "@keyframes pulse {" +
+ "0% { opacity: 0.6; }" +
+ "50% { opacity: 1; }" +
+ "100% { opacity: 0.6; }" +
+ "}" +
+ ".refreshing::after {" +
+ "content: \"⟳\";" +
+ "display: inline-block;" +
+ "margin-left: 8px;" +
+ "animation: pulse 1.5s infinite ease-in-out;" +
+ "color: #2e7d32;" +
+ "}" +
+ "#stats-container {" +
+ "border: 1px solid #b3e5fc;" +
+ "border-radius: 10px;" +
+ "padding: 15px;" +
+ "margin: 10px 0;" +
+ "background-color: #0277bd;" +
+ "}" +
+ "</style>" +
+ "<div class=\"refreshing\" style=\"text-align: right; font-size: 0.8em; color: #eceff1;\">Auto-refreshing</div>"
+ )

  # Create a refresh button that will be auto-clicked
  refresh_btn = gr.Button("Refresh Stats", visible=False)
 
  feedback_button.click(
  fn=combine_feedback,
+ inputs=[hallucination_present, judge_correct, feedback_text, hidden_results],
  outputs=[feedback_status]
  )

  # Footer
  gr.HTML(
+ """<footer><p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p><p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p><p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p></footer>"""
  )

  return interface

  # Uncomment these lines to run the test function instead of the main interface
  # if __name__ == "__main__":
+ #     test_progress()