Upload app.py
#6
by
nappenstance
- opened
app.py
CHANGED
@@ -43,12 +43,17 @@ class HallucinationJudgment(BaseModel):
|
|
43 |
class PAS2:
|
44 |
"""Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
|
45 |
|
46 |
-
def __init__(self, mistral_api_key=None, openai_api_key=None, progress_callback=None):
|
47 |
"""Initialize the PAS2 with API keys"""
|
48 |
# For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
|
49 |
# which are set from the Secrets tab in the Space settings
|
50 |
self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
|
51 |
self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
52 |
self.progress_callback = progress_callback
|
53 |
|
54 |
if not self.mistral_api_key:
|
@@ -59,12 +64,64 @@ class PAS2:
|
|
59 |
|
60 |
self.mistral_client = Mistral(api_key=self.mistral_api_key)
|
61 |
self.openai_client = OpenAI(api_key=self.openai_api_key)
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
63 |
self.mistral_model = "mistral-large-latest"
|
64 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
|
70 |
"""Generate paraphrases of the input query using Mistral API"""
|
@@ -141,13 +198,38 @@ class PAS2:
|
|
141 |
|
142 |
return fallback_paraphrases
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
def _get_single_response(self, query: str, index: int = None) -> str:
|
145 |
-
"""Get a single response from
|
146 |
try:
|
147 |
query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
|
148 |
-
logger.info("Getting response for %s", query_description)
|
149 |
start_time = time.time()
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
messages = [
|
152 |
{
|
153 |
"role": "system",
|
@@ -159,23 +241,32 @@ class PAS2:
|
|
159 |
}
|
160 |
]
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
-
result = response.choices[0].message.content
|
168 |
elapsed_time = time.time() - start_time
|
169 |
|
170 |
-
logger.info("Received response for %s (%.2f seconds)",
|
|
|
171 |
logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
|
172 |
|
173 |
return result
|
174 |
|
175 |
except Exception as e:
|
176 |
-
error_msg = f"Error getting response for query '{query}': {e}"
|
177 |
logger.error(error_msg, exc_info=True)
|
178 |
-
return f"Error: Failed to get response for this query."
|
179 |
|
180 |
def get_responses(self, queries: List[str]) -> List[str]:
|
181 |
"""Get responses from Mistral API for each query in parallel"""
|
@@ -235,6 +326,10 @@ class PAS2:
|
|
235 |
logger.info("Starting hallucination detection for query: %s", query)
|
236 |
start_time = time.time()
|
237 |
|
|
|
|
|
|
|
|
|
238 |
# Report progress
|
239 |
if self.progress_callback:
|
240 |
self.progress_callback("starting", query=query)
|
@@ -250,9 +345,9 @@ class PAS2:
|
|
250 |
self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
251 |
|
252 |
# Get responses to all queries
|
253 |
-
logger.info("Step 2: Getting responses to all %d queries", len(all_queries))
|
254 |
if self.progress_callback:
|
255 |
-
self.progress_callback("getting_responses", query=query, total=len(all_queries))
|
256 |
|
257 |
all_responses = []
|
258 |
for i, q in enumerate(all_queries):
|
@@ -267,9 +362,9 @@ class PAS2:
|
|
267 |
self.progress_callback("responses_complete", query=query)
|
268 |
|
269 |
# Judge the responses for hallucinations
|
270 |
-
logger.info("Step 3: Judging for hallucinations")
|
271 |
if self.progress_callback:
|
272 |
-
self.progress_callback("judging", query=query)
|
273 |
|
274 |
# The first query is the original, rest are paraphrases
|
275 |
original_query = all_queries[0]
|
@@ -295,14 +390,17 @@ class PAS2:
|
|
295 |
"confidence_score": judgment.confidence_score,
|
296 |
"conflicting_facts": judgment.conflicting_facts,
|
297 |
"reasoning": judgment.reasoning,
|
298 |
-
"summary": judgment.summary
|
|
|
|
|
299 |
}
|
300 |
|
301 |
# Report completion
|
302 |
if self.progress_callback:
|
303 |
-
self.progress_callback("complete", query=query)
|
304 |
|
305 |
-
logger.info("Hallucination detection completed in %.2f seconds
|
|
|
306 |
return results
|
307 |
|
308 |
def judge_hallucination(self,
|
@@ -311,11 +409,17 @@ class PAS2:
|
|
311 |
paraphrased_queries: List[str],
|
312 |
paraphrased_responses: List[str]) -> HallucinationJudgment:
|
313 |
"""
|
314 |
-
Use
|
315 |
"""
|
316 |
-
logger.info("Judging hallucinations with
|
317 |
start_time = time.time()
|
318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
# Prepare the context for the judge
|
320 |
context = f"""
|
321 |
Original Question: {original_query}
|
@@ -344,18 +448,31 @@ Your response should be a JSON with the following fields:
|
|
344 |
"""
|
345 |
|
346 |
try:
|
347 |
-
logger.info("Sending judgment request to
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
|
357 |
-
|
358 |
-
logger.debug("Received judgment response: %s", result_json)
|
359 |
|
360 |
# Create the HallucinationJudgment object from the JSON response
|
361 |
judgment = HallucinationJudgment(
|
@@ -367,18 +484,18 @@ Your response should be a JSON with the following fields:
|
|
367 |
)
|
368 |
|
369 |
elapsed_time = time.time() - start_time
|
370 |
-
logger.info("Judgment completed in %.2f seconds", elapsed_time)
|
371 |
|
372 |
return judgment
|
373 |
|
374 |
except Exception as e:
|
375 |
-
logger.error("Error in hallucination judgment: %s", str(e), exc_info=True)
|
376 |
# Return a fallback judgment
|
377 |
return HallucinationJudgment(
|
378 |
hallucination_detected=False,
|
379 |
confidence_score=0.0,
|
380 |
conflicting_facts=[],
|
381 |
-
reasoning="Failed to obtain judgment from the model
|
382 |
summary="Analysis failed due to API error."
|
383 |
)
|
384 |
|
@@ -495,11 +612,21 @@ class HallucinationDetectorApp:
|
|
495 |
"conflicting_facts": results.get('conflicting_facts', []),
|
496 |
"reasoning": results.get('reasoning', ''),
|
497 |
"summary": results.get('summary', ''),
|
|
|
|
|
498 |
"user_feedback": feedback
|
499 |
}
|
500 |
|
501 |
# Insert document into collection
|
502 |
-
self.feedback_collection.insert_one(document)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
503 |
|
504 |
logger.info("Feedback saved successfully to MongoDB")
|
505 |
return "Feedback saved successfully!"
|
@@ -507,6 +634,266 @@ class HallucinationDetectorApp:
|
|
507 |
logger.error("Error saving feedback: %s", str(e), exc_info=True)
|
508 |
return f"Error saving feedback: {str(e)}"
|
509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
510 |
def get_feedback_stats(self):
|
511 |
"""Get statistics about collected feedback from MongoDB"""
|
512 |
try:
|
@@ -541,6 +928,62 @@ class HallucinationDetectorApp:
|
|
541 |
except Exception as e:
|
542 |
logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
|
543 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
544 |
|
545 |
def export_data_to_csv(self, filepath=None):
|
546 |
"""Export all feedback data to a CSV file for analysis"""
|
@@ -657,11 +1100,11 @@ class ProgressTracker:
|
|
657 |
"starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
|
658 |
"generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
|
659 |
"paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
|
660 |
-
"getting_responses": {"status": "Getting responses
|
661 |
"responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
|
662 |
"responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
|
663 |
-
"judging": {"status": "Analyzing responses for hallucinations...", "progress": 70, "color": "#2196F3"},
|
664 |
-
"complete": {"status": "Analysis complete!", "progress": 100, "color": "#4CAF50"},
|
665 |
"error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
|
666 |
}
|
667 |
|
@@ -672,6 +1115,9 @@ class ProgressTracker:
|
|
672 |
self.completed_responses = 0
|
673 |
self.total_responses = 0
|
674 |
self.error_message = ""
|
|
|
|
|
|
|
675 |
self._lock = threading.Lock()
|
676 |
self._status_callback = None
|
677 |
self._stop_event = threading.Event()
|
@@ -698,6 +1144,12 @@ class ProgressTracker:
|
|
698 |
self.total_responses = value
|
699 |
elif key == 'error_message':
|
700 |
self.error_message = value
|
|
|
|
|
|
|
|
|
|
|
|
|
701 |
|
702 |
# Format status message
|
703 |
if stage == 'responses_progress':
|
@@ -705,6 +1157,19 @@ class ProgressTracker:
|
|
705 |
completed=self.completed_responses,
|
706 |
total=self.total_responses
|
707 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
708 |
elif stage == 'error':
|
709 |
self.stage_data['status'] = self.stage_data['status'].format(
|
710 |
error_message=self.error_message
|
@@ -724,6 +1189,16 @@ class ProgressTracker:
|
|
724 |
# Only show status text if not in idle state
|
725 |
status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
|
726 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
727 |
html = f"""
|
728 |
<div class="progress-container">
|
729 |
{query_info}
|
@@ -731,6 +1206,7 @@ class ProgressTracker:
|
|
731 |
<div class="progress-bar-container">
|
732 |
<div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
|
733 |
</div>
|
|
|
734 |
</div>
|
735 |
"""
|
736 |
return html
|
@@ -1099,13 +1575,18 @@ def create_interface():
|
|
1099 |
combined_progress_callback("starting", query=query)
|
1100 |
time.sleep(0.3) # Ensure starting status is visible
|
1101 |
|
|
|
|
|
|
|
|
|
|
|
1102 |
# Step 2: Generate paraphrases (15-30%)
|
1103 |
combined_progress_callback("generating_paraphrases", query=query)
|
1104 |
all_queries = detector.pas2.generate_paraphrases(query)
|
1105 |
combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
1106 |
|
1107 |
# Step 3: Get responses (35-65%)
|
1108 |
-
combined_progress_callback("getting_responses", query=query, total=len(all_queries))
|
1109 |
all_responses = []
|
1110 |
for i, q in enumerate(all_queries):
|
1111 |
# Show incremental progress for each response
|
@@ -1115,7 +1596,7 @@ def create_interface():
|
|
1115 |
combined_progress_callback("responses_complete", query=query)
|
1116 |
|
1117 |
# Step 4: Judge hallucinations (70-100%)
|
1118 |
-
combined_progress_callback("judging", query=query)
|
1119 |
|
1120 |
# The first query is the original, rest are paraphrases
|
1121 |
original_query = all_queries[0]
|
@@ -1141,11 +1622,13 @@ def create_interface():
|
|
1141 |
"confidence_score": judgment.confidence_score,
|
1142 |
"conflicting_facts": judgment.conflicting_facts,
|
1143 |
"reasoning": judgment.reasoning,
|
1144 |
-
"summary": judgment.summary
|
|
|
|
|
1145 |
}
|
1146 |
|
1147 |
# Show completion
|
1148 |
-
combined_progress_callback("complete", query=query)
|
1149 |
time.sleep(0.3) # Ensure complete status is visible
|
1150 |
|
1151 |
return results
|
@@ -1201,10 +1684,25 @@ def create_interface():
|
|
1201 |
reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
|
1202 |
conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
|
1203 |
|
|
|
|
|
|
|
|
|
1204 |
html_output = f"""
|
1205 |
<div class="container">
|
1206 |
<h2 class="title">Hallucination Detection Results</h2>
|
1207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1208 |
<div class="stats-section">
|
1209 |
<div class="stat-item">
|
1210 |
<div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
|
@@ -1234,7 +1732,7 @@ def create_interface():
|
|
1234 |
{original_query}
|
1235 |
</div>
|
1236 |
|
1237 |
-
<div class="section-title">Original Response</div>
|
1238 |
<div class="response-box">
|
1239 |
{original_response_safe}
|
1240 |
</div>
|
@@ -1249,14 +1747,14 @@ def create_interface():
|
|
1249 |
{q}
|
1250 |
</div>
|
1251 |
|
1252 |
-
<div class="section-title">Response {i}</div>
|
1253 |
<div class="response-box">
|
1254 |
{r}
|
1255 |
</div>
|
1256 |
"""
|
1257 |
|
1258 |
html_output += f"""
|
1259 |
-
<div class="section-title">Detailed Analysis</div>
|
1260 |
<div class="info-box">
|
1261 |
<p><strong>Reasoning:</strong></p>
|
1262 |
<p>{reasoning_safe}</p>
|
@@ -1264,6 +1762,10 @@ def create_interface():
|
|
1264 |
<p><strong>Conflicting Facts:</strong></p>
|
1265 |
<p>{conflicting_facts_text_safe}</p>
|
1266 |
</div>
|
|
|
|
|
|
|
|
|
1267 |
</div>
|
1268 |
"""
|
1269 |
|
@@ -1289,8 +1791,11 @@ def create_interface():
|
|
1289 |
]
|
1290 |
|
1291 |
# Helper function to submit feedback
|
1292 |
-
def combine_feedback(
|
1293 |
-
combined_feedback = f"{
|
|
|
|
|
|
|
1294 |
if not results:
|
1295 |
return "No results to attach feedback to."
|
1296 |
|
@@ -1394,8 +1899,8 @@ def create_interface():
|
|
1394 |
This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
|
1395 |
|
1396 |
1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
|
1397 |
-
2. **Multiple Responses**: All questions (original + paraphrases) are sent to
|
1398 |
-
3. **Expert Judgment**:
|
1399 |
|
1400 |
### Why This Approach?
|
1401 |
|
@@ -1469,10 +1974,16 @@ def create_interface():
|
|
1469 |
gr.Markdown("### Help Improve the System")
|
1470 |
gr.Markdown("Your feedback helps us refine the hallucination detection system.")
|
1471 |
|
1472 |
-
|
1473 |
-
label="Was
|
1474 |
-
choices=["Yes,
|
1475 |
-
value="
|
|
|
|
|
|
|
|
|
|
|
|
|
1476 |
)
|
1477 |
|
1478 |
feedback_text = gr.Textbox(
|
@@ -1489,286 +2000,280 @@ def create_interface():
|
|
1489 |
gr.Markdown("## Hallucination Detection Scores")
|
1490 |
gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
|
1491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1492 |
# Create leaderboard table for model combinations
|
1493 |
-
model_leaderboard_html = gr.HTML(
|
1494 |
-
|
1495 |
-
|
1496 |
-
|
1497 |
-
|
1498 |
-
|
1499 |
-
<th>Generator Model</th>
|
1500 |
-
<th>Judge Model</th>
|
1501 |
-
<th>ELO Score</th>
|
1502 |
-
<th>Accuracy</th>
|
1503 |
-
<th>Consistency</th>
|
1504 |
-
</tr>
|
1505 |
-
</thead>
|
1506 |
-
<tbody>
|
1507 |
-
<tr>
|
1508 |
-
<td>1</td>
|
1509 |
-
<td>gpt-4o</td>
|
1510 |
-
<td>o4-mini</td>
|
1511 |
-
<td>1878</td>
|
1512 |
-
<td>94.2%</td>
|
1513 |
-
<td>91.6%</td>
|
1514 |
-
</tr>
|
1515 |
-
<tr>
|
1516 |
-
<td>2</td>
|
1517 |
-
<td>gpt-4o</td>
|
1518 |
-
<td>gemini-2.5-pro</td>
|
1519 |
-
<td>1835</td>
|
1520 |
-
<td>92.8%</td>
|
1521 |
-
<td>89.2%</td>
|
1522 |
-
</tr>
|
1523 |
-
<tr>
|
1524 |
-
<td>3</td>
|
1525 |
-
<td>mistral-large</td>
|
1526 |
-
<td>o4-mini</td>
|
1527 |
-
<td>1795</td>
|
1528 |
-
<td>91.5%</td>
|
1529 |
-
<td>87.5%</td>
|
1530 |
-
</tr>
|
1531 |
-
<tr>
|
1532 |
-
<td>4</td>
|
1533 |
-
<td>Qwen3-235B-A22B</td>
|
1534 |
-
<td>o4-mini</td>
|
1535 |
-
<td>1768</td>
|
1536 |
-
<td>90.3%</td>
|
1537 |
-
<td>85.1%</td>
|
1538 |
-
</tr>
|
1539 |
-
<tr>
|
1540 |
-
<td>5</td>
|
1541 |
-
<td>grok-3</td>
|
1542 |
-
<td>o4-mini</td>
|
1543 |
-
<td>1742</td>
|
1544 |
-
<td>88.7%</td>
|
1545 |
-
<td>82.9%</td>
|
1546 |
-
</tr>
|
1547 |
-
<tr>
|
1548 |
-
<td>6</td>
|
1549 |
-
<td>mistral-large</td>
|
1550 |
-
<td>gemini-2.5-pro</td>
|
1551 |
-
<td>1716</td>
|
1552 |
-
<td>88.1%</td>
|
1553 |
-
<td>81.4%</td>
|
1554 |
-
</tr>
|
1555 |
-
<tr>
|
1556 |
-
<td>7</td>
|
1557 |
-
<td>deepseek-r1</td>
|
1558 |
-
<td>o4-mini</td>
|
1559 |
-
<td>1692</td>
|
1560 |
-
<td>87.3%</td>
|
1561 |
-
<td>80.3%</td>
|
1562 |
-
</tr>
|
1563 |
-
</tbody>
|
1564 |
-
</table>
|
1565 |
-
</div>
|
1566 |
|
1567 |
-
|
1568 |
-
|
1569 |
-
|
1570 |
-
|
1571 |
-
<
|
1572 |
-
|
1573 |
-
|
1574 |
-
|
1575 |
-
|
1576 |
-
|
1577 |
-
|
1578 |
-
|
1579 |
-
|
1580 |
-
|
1581 |
-
|
1582 |
-
|
1583 |
-
|
1584 |
-
<
|
1585 |
-
|
1586 |
-
|
1587 |
-
|
1588 |
-
|
1589 |
-
|
1590 |
-
|
1591 |
-
|
1592 |
-
|
1593 |
-
|
1594 |
-
|
1595 |
-
|
1596 |
-
|
1597 |
-
|
1598 |
-
|
1599 |
-
|
1600 |
-
|
1601 |
-
|
1602 |
-
|
1603 |
-
|
1604 |
-
|
1605 |
-
|
1606 |
-
|
1607 |
-
|
1608 |
-
|
1609 |
-
|
1610 |
-
|
1611 |
-
|
1612 |
-
</div>
|
1613 |
-
</div>
|
1614 |
-
</div>
|
1615 |
-
</div>
|
1616 |
-
<style>
|
1617 |
-
.leaderboard-container {
|
1618 |
-
margin: 15px 0;
|
1619 |
-
overflow-x: auto;
|
1620 |
-
}
|
1621 |
-
.leaderboard-table {
|
1622 |
-
width: 100%;
|
1623 |
-
border-collapse: collapse;
|
1624 |
-
font-size: 0.95em;
|
1625 |
-
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
1626 |
-
border-radius: 8px;
|
1627 |
-
overflow: hidden;
|
1628 |
-
}
|
1629 |
-
.leaderboard-table thead {
|
1630 |
-
background-color: #1565c0;
|
1631 |
-
color: white;
|
1632 |
-
}
|
1633 |
-
.leaderboard-table th, .leaderboard-table td {
|
1634 |
-
padding: 12px 15px;
|
1635 |
-
text-align: left;
|
1636 |
-
border-bottom: 1px solid #ddd;
|
1637 |
-
}
|
1638 |
-
.leaderboard-table tbody tr {
|
1639 |
-
transition: background-color 0.3s;
|
1640 |
-
}
|
1641 |
-
.leaderboard-table tbody tr:nth-child(even) {
|
1642 |
-
background-color: #cfd8dc;
|
1643 |
-
}
|
1644 |
-
.leaderboard-table tbody tr:hover {
|
1645 |
-
background-color: #b0bec5;
|
1646 |
-
}
|
1647 |
-
.leaderboard-table tbody tr:first-child {
|
1648 |
-
background-color: #80cbc4;
|
1649 |
-
color: #004d40;
|
1650 |
-
}
|
1651 |
-
.leaderboard-table tbody tr:nth-child(2) {
|
1652 |
-
background-color: #81c784;
|
1653 |
-
color: #1b5e20;
|
1654 |
-
}
|
1655 |
-
.leaderboard-table tbody tr:nth-child(4) {
|
1656 |
-
background-color: #aed581;
|
1657 |
-
color: #33691e;
|
1658 |
-
}
|
1659 |
-
.leaderboard-table tbody tr:nth-child(6) {
|
1660 |
-
background-color: #d7ccc8;
|
1661 |
-
color: #3e2723;
|
1662 |
-
}
|
1663 |
-
</style>
|
1664 |
-
""")
|
1665 |
|
1666 |
-
# Tab 3:
|
1667 |
-
with gr.TabItem("
|
1668 |
-
gr.Markdown("## Model
|
1669 |
-
gr.Markdown("Performance ranking of
|
1670 |
|
1671 |
-
#
|
1672 |
-
|
1673 |
-
|
1674 |
-
|
1675 |
-
|
1676 |
-
|
1677 |
-
|
1678 |
-
|
1679 |
-
<
|
1680 |
-
|
1681 |
-
<
|
1682 |
-
|
1683 |
-
|
1684 |
-
|
1685 |
-
|
1686 |
-
|
1687 |
-
|
1688 |
-
|
1689 |
-
|
1690 |
-
|
1691 |
-
|
1692 |
-
|
1693 |
-
|
1694 |
-
|
1695 |
-
|
1696 |
-
|
1697 |
-
|
1698 |
-
|
1699 |
-
|
1700 |
-
|
1701 |
-
|
1702 |
-
|
1703 |
-
|
1704 |
-
|
1705 |
-
|
1706 |
-
|
1707 |
-
|
1708 |
-
|
1709 |
-
|
1710 |
-
|
1711 |
-
|
1712 |
-
|
1713 |
-
|
1714 |
-
<td>
|
1715 |
-
<td>
|
1716 |
-
<td>
|
1717 |
-
<td>
|
1718 |
-
<td>
|
1719 |
-
|
1720 |
-
|
1721 |
-
<td>
|
1722 |
-
|
1723 |
-
|
1724 |
-
|
1725 |
-
|
1726 |
-
|
1727 |
-
<
|
1728 |
-
|
1729 |
-
|
1730 |
-
|
1731 |
-
|
1732 |
-
|
1733 |
-
</
|
1734 |
-
|
1735 |
-
|
1736 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1737 |
|
1738 |
-
|
1739 |
-
|
1740 |
-
|
1741 |
-
|
1742 |
-
|
1743 |
-
|
1744 |
-
|
1745 |
-
|
1746 |
-
|
1747 |
-
|
1748 |
-
|
1749 |
-
|
1750 |
-
|
1751 |
-
|
1752 |
-
|
1753 |
-
|
1754 |
-
|
1755 |
-
|
1756 |
-
<
|
1757 |
-
|
1758 |
-
|
1759 |
-
|
1760 |
-
|
1761 |
-
|
1762 |
-
|
1763 |
-
|
1764 |
-
|
1765 |
-
|
1766 |
-
|
1767 |
-
|
1768 |
-
</
|
1769 |
-
|
1770 |
-
|
1771 |
-
|
|
|
|
|
|
|
1772 |
|
1773 |
# Function to continuously update stats
|
1774 |
def update_stats():
|
@@ -1811,30 +2316,30 @@ def create_interface():
|
|
1811 |
live_stats = gr.HTML(update_stats())
|
1812 |
|
1813 |
# Add loading animation style
|
1814 |
-
gr.HTML(
|
1815 |
-
|
1816 |
-
|
1817 |
-
0% { opacity: 0.6; }
|
1818 |
-
50% { opacity: 1; }
|
1819 |
-
100% { opacity: 0.6; }
|
1820 |
-
|
1821 |
-
|
1822 |
-
content: "
|
1823 |
-
display: inline-block;
|
1824 |
-
margin-left: 8px;
|
1825 |
-
animation: pulse 1.5s infinite ease-in-out;
|
1826 |
-
color: #2e7d32;
|
1827 |
-
|
1828 |
-
|
1829 |
-
border: 1px solid #b3e5fc;
|
1830 |
-
border-radius: 10px;
|
1831 |
-
padding: 15px;
|
1832 |
-
margin: 10px 0;
|
1833 |
-
background-color: #0277bd;
|
1834 |
-
|
1835 |
-
|
1836 |
-
|
1837 |
-
|
1838 |
|
1839 |
# Create a refresh button that will be auto-clicked
|
1840 |
refresh_btn = gr.Button("Refresh Stats", visible=False)
|
@@ -2018,19 +2523,13 @@ def create_interface():
|
|
2018 |
|
2019 |
feedback_button.click(
|
2020 |
fn=combine_feedback,
|
2021 |
-
inputs=[
|
2022 |
outputs=[feedback_status]
|
2023 |
)
|
2024 |
|
2025 |
# Footer
|
2026 |
gr.HTML(
|
2027 |
-
"""
|
2028 |
-
<footer>
|
2029 |
-
<p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
|
2030 |
-
<p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p>
|
2031 |
-
<p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p>
|
2032 |
-
</footer>
|
2033 |
-
"""
|
2034 |
)
|
2035 |
|
2036 |
return interface
|
@@ -2096,4 +2595,4 @@ if __name__ == "__main__":
|
|
2096 |
|
2097 |
# Uncomment this line to run the test function instead of the main interface
|
2098 |
# if __name__ == "__main__":
|
2099 |
-
# test_progress()
|
|
|
43 |
class PAS2:
|
44 |
"""Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
|
45 |
|
46 |
+
def __init__(self, mistral_api_key=None, openai_api_key=None, xai_api_key=None, qwen_api_key=None, deepseek_api_key=None, gemini_api_key=None, progress_callback=None):
|
47 |
"""Initialize the PAS2 with API keys"""
|
48 |
# For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
|
49 |
# which are set from the Secrets tab in the Space settings
|
50 |
self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
|
51 |
self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
|
52 |
+
self.xai_api_key = xai_api_key or os.environ.get("HF_XAI_API_KEY") or os.environ.get("XAI_API_KEY")
|
53 |
+
self.qwen_api_key = qwen_api_key or os.environ.get("HF_QWEN_API_KEY") or os.environ.get("QWEN_API_KEY")
|
54 |
+
self.deepseek_api_key = deepseek_api_key or os.environ.get("HF_DEEPSEEK_API_KEY") or os.environ.get("DEEPSEEK_API_KEY")
|
55 |
+
self.gemini_api_key = gemini_api_key or os.environ.get("HF_GEMINI_API_KEY") or os.environ.get("GEMINI_API_KEY")
|
56 |
+
|
57 |
self.progress_callback = progress_callback
|
58 |
|
59 |
if not self.mistral_api_key:
|
|
|
64 |
|
65 |
self.mistral_client = Mistral(api_key=self.mistral_api_key)
|
66 |
self.openai_client = OpenAI(api_key=self.openai_api_key)
|
67 |
+
self.xai_client = OpenAI(api_key=self.xai_api_key, base_url="https://api.x.ai/v1")
|
68 |
+
self.qwen_client = OpenAI(api_key=self.qwen_api_key, base_url="https://router.huggingface.co/nebius/v1")
|
69 |
+
self.deepseek_client = OpenAI(api_key=self.deepseek_api_key, base_url="https://api.deepseek.com")
|
70 |
+
self.gemini_client = OpenAI(api_key=self.gemini_api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
|
71 |
+
|
72 |
+
# Define model names
|
73 |
self.mistral_model = "mistral-large-latest"
|
74 |
+
self.openai_o4mini = "o4-mini"
|
75 |
+
self.openai_4o = "gpt-4o"
|
76 |
+
self.deepseek_model = "deepseek-reasoner"
|
77 |
+
self.grok_model = "grok-3-beta"
|
78 |
+
self.qwen_model = "Qwen/Qwen3-235B-A22B"
|
79 |
+
self.gemini_model = "gemini-2.5-pro-preview-05-06"
|
80 |
|
81 |
+
# Create a dictionary mapping model names to their clients and model identifiers
|
82 |
+
self.model_configs = {
|
83 |
+
"mistral-large": {
|
84 |
+
"client": self.mistral_client,
|
85 |
+
"model_id": self.mistral_model,
|
86 |
+
"type": "mistral"
|
87 |
+
},
|
88 |
+
"o4-mini": {
|
89 |
+
"client": self.openai_client,
|
90 |
+
"model_id": self.openai_o4mini,
|
91 |
+
"type": "openai"
|
92 |
+
},
|
93 |
+
"gpt-4o": {
|
94 |
+
"client": self.openai_client,
|
95 |
+
"model_id": self.openai_4o,
|
96 |
+
"type": "openai"
|
97 |
+
},
|
98 |
+
"deepseek-reasoner": {
|
99 |
+
"client": self.deepseek_client,
|
100 |
+
"model_id": self.deepseek_model,
|
101 |
+
"type": "openai"
|
102 |
+
},
|
103 |
+
"grok-3": {
|
104 |
+
"client": self.xai_client,
|
105 |
+
"model_id": self.grok_model,
|
106 |
+
"type": "openai"
|
107 |
+
},
|
108 |
+
"qwen-235b": {
|
109 |
+
"client": self.qwen_client,
|
110 |
+
"model_id": self.qwen_model,
|
111 |
+
"type": "openai"
|
112 |
+
},
|
113 |
+
"gemini-2.5-pro": {
|
114 |
+
"client": self.gemini_client,
|
115 |
+
"model_id": self.gemini_model,
|
116 |
+
"type": "openai"
|
117 |
+
}
|
118 |
+
}
|
119 |
+
|
120 |
+
# Set default models (will be randomized later)
|
121 |
+
self.generator_model = "mistral-large"
|
122 |
+
self.judge_model = "o4-mini"
|
123 |
+
|
124 |
+
logger.info("PAS2 initialized with available models: %s", ", ".join(self.model_configs.keys()))
|
125 |
|
126 |
def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
|
127 |
"""Generate paraphrases of the input query using Mistral API"""
|
|
|
198 |
|
199 |
return fallback_paraphrases
|
200 |
|
201 |
+
def set_random_model_pair(self):
|
202 |
+
"""Randomly select a pair of generator and judge models"""
|
203 |
+
import random
|
204 |
+
|
205 |
+
# Get list of available models
|
206 |
+
available_models = list(self.model_configs.keys())
|
207 |
+
|
208 |
+
# Randomly select generator and judge models
|
209 |
+
self.generator_model = random.choice(available_models)
|
210 |
+
|
211 |
+
# Make sure judge is different from generator
|
212 |
+
judge_options = [m for m in available_models if m != self.generator_model]
|
213 |
+
self.judge_model = random.choice(judge_options)
|
214 |
+
|
215 |
+
logger.info("Randomly selected model pair - Generator: %s, Judge: %s",
|
216 |
+
self.generator_model, self.judge_model)
|
217 |
+
|
218 |
+
return self.generator_model, self.judge_model
|
219 |
+
|
220 |
def _get_single_response(self, query: str, index: int = None) -> str:
|
221 |
+
"""Get a single response from the selected generator model for a query"""
|
222 |
try:
|
223 |
query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
|
224 |
+
logger.info("Getting response for %s using %s", query_description, self.generator_model)
|
225 |
start_time = time.time()
|
226 |
|
227 |
+
# Get the model configuration
|
228 |
+
model_config = self.model_configs[self.generator_model]
|
229 |
+
client = model_config["client"]
|
230 |
+
model_id = model_config["model_id"]
|
231 |
+
model_type = model_config["type"]
|
232 |
+
|
233 |
messages = [
|
234 |
{
|
235 |
"role": "system",
|
|
|
241 |
}
|
242 |
]
|
243 |
|
244 |
+
# Use the appropriate client and model based on the type
|
245 |
+
if model_type == "mistral":
|
246 |
+
response = client.chat.complete(
|
247 |
+
model=model_id,
|
248 |
+
messages=messages
|
249 |
+
)
|
250 |
+
result = response.choices[0].message.content
|
251 |
+
else: # openai-compatible API
|
252 |
+
response = client.chat.completions.create(
|
253 |
+
model=model_id,
|
254 |
+
messages=messages
|
255 |
+
)
|
256 |
+
result = response.choices[0].message.content
|
257 |
|
|
|
258 |
elapsed_time = time.time() - start_time
|
259 |
|
260 |
+
logger.info("Received response from %s for %s (%.2f seconds)",
|
261 |
+
self.generator_model, query_description, elapsed_time)
|
262 |
logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
|
263 |
|
264 |
return result
|
265 |
|
266 |
except Exception as e:
|
267 |
+
error_msg = f"Error getting response for query '{query}' with model {self.generator_model}: {e}"
|
268 |
logger.error(error_msg, exc_info=True)
|
269 |
+
return f"Error: Failed to get response for this query with model {self.generator_model}."
|
270 |
|
271 |
def get_responses(self, queries: List[str]) -> List[str]:
|
272 |
"""Get responses from Mistral API for each query in parallel"""
|
|
|
326 |
logger.info("Starting hallucination detection for query: %s", query)
|
327 |
start_time = time.time()
|
328 |
|
329 |
+
# Randomly select a model pair for this detection
|
330 |
+
generator_model, judge_model = self.set_random_model_pair()
|
331 |
+
logger.info("Using %s as generator and %s as judge for this detection", generator_model, judge_model)
|
332 |
+
|
333 |
# Report progress
|
334 |
if self.progress_callback:
|
335 |
self.progress_callback("starting", query=query)
|
|
|
345 |
self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
346 |
|
347 |
# Get responses to all queries
|
348 |
+
logger.info("Step 2: Getting responses to all %d queries using %s", len(all_queries), generator_model)
|
349 |
if self.progress_callback:
|
350 |
+
self.progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
|
351 |
|
352 |
all_responses = []
|
353 |
for i, q in enumerate(all_queries):
|
|
|
362 |
self.progress_callback("responses_complete", query=query)
|
363 |
|
364 |
# Judge the responses for hallucinations
|
365 |
+
logger.info("Step 3: Judging for hallucinations using %s", judge_model)
|
366 |
if self.progress_callback:
|
367 |
+
self.progress_callback("judging", query=query, model=judge_model)
|
368 |
|
369 |
# The first query is the original, rest are paraphrases
|
370 |
original_query = all_queries[0]
|
|
|
390 |
"confidence_score": judgment.confidence_score,
|
391 |
"conflicting_facts": judgment.conflicting_facts,
|
392 |
"reasoning": judgment.reasoning,
|
393 |
+
"summary": judgment.summary,
|
394 |
+
"generator_model": generator_model,
|
395 |
+
"judge_model": judge_model
|
396 |
}
|
397 |
|
398 |
# Report completion
|
399 |
if self.progress_callback:
|
400 |
+
self.progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
|
401 |
|
402 |
+
logger.info("Hallucination detection completed in %.2f seconds using %s (generator) and %s (judge)",
|
403 |
+
time.time() - start_time, generator_model, judge_model)
|
404 |
return results
|
405 |
|
406 |
def judge_hallucination(self,
|
|
|
409 |
paraphrased_queries: List[str],
|
410 |
paraphrased_responses: List[str]) -> HallucinationJudgment:
|
411 |
"""
|
412 |
+
Use the selected judge model to detect hallucinations in the responses
|
413 |
"""
|
414 |
+
logger.info("Judging hallucinations with %s model", self.judge_model)
|
415 |
start_time = time.time()
|
416 |
|
417 |
+
# Get the model configuration for the judge
|
418 |
+
model_config = self.model_configs[self.judge_model]
|
419 |
+
client = model_config["client"]
|
420 |
+
model_id = model_config["model_id"]
|
421 |
+
model_type = model_config["type"]
|
422 |
+
|
423 |
# Prepare the context for the judge
|
424 |
context = f"""
|
425 |
Original Question: {original_query}
|
|
|
448 |
"""
|
449 |
|
450 |
try:
|
451 |
+
logger.info("Sending judgment request to %s...", self.judge_model)
|
452 |
+
|
453 |
+
# Use the appropriate client and model based on the type
|
454 |
+
if model_type == "mistral":
|
455 |
+
response = client.chat.complete(
|
456 |
+
model=model_id,
|
457 |
+
messages=[
|
458 |
+
{"role": "system", "content": system_prompt},
|
459 |
+
{"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
|
460 |
+
],
|
461 |
+
response_format={"type": "json_object"}
|
462 |
+
)
|
463 |
+
result_json = json.loads(response.choices[0].message.content)
|
464 |
+
else: # openai-compatible API
|
465 |
+
response = client.chat.completions.create(
|
466 |
+
model=model_id,
|
467 |
+
messages=[
|
468 |
+
{"role": "system", "content": system_prompt},
|
469 |
+
{"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
|
470 |
+
],
|
471 |
+
response_format={"type": "json_object"}
|
472 |
+
)
|
473 |
+
result_json = json.loads(response.choices[0].message.content)
|
474 |
|
475 |
+
logger.debug("Received judgment response from %s: %s", self.judge_model, result_json)
|
|
|
476 |
|
477 |
# Create the HallucinationJudgment object from the JSON response
|
478 |
judgment = HallucinationJudgment(
|
|
|
484 |
)
|
485 |
|
486 |
elapsed_time = time.time() - start_time
|
487 |
+
logger.info("Judgment completed by %s in %.2f seconds", self.judge_model, elapsed_time)
|
488 |
|
489 |
return judgment
|
490 |
|
491 |
except Exception as e:
|
492 |
+
logger.error("Error in hallucination judgment with %s: %s", self.judge_model, str(e), exc_info=True)
|
493 |
# Return a fallback judgment
|
494 |
return HallucinationJudgment(
|
495 |
hallucination_detected=False,
|
496 |
confidence_score=0.0,
|
497 |
conflicting_facts=[],
|
498 |
+
reasoning=f"Failed to obtain judgment from the {self.judge_model} model: {str(e)}",
|
499 |
summary="Analysis failed due to API error."
|
500 |
)
|
501 |
|
|
|
612 |
"conflicting_facts": results.get('conflicting_facts', []),
|
613 |
"reasoning": results.get('reasoning', ''),
|
614 |
"summary": results.get('summary', ''),
|
615 |
+
"generator_model": results.get('generator_model', 'unknown'),
|
616 |
+
"judge_model": results.get('judge_model', 'unknown'),
|
617 |
"user_feedback": feedback
|
618 |
}
|
619 |
|
620 |
# Insert document into collection
|
621 |
+
result = self.feedback_collection.insert_one(document)
|
622 |
+
|
623 |
+
# Update model leaderboard scores
|
624 |
+
self._update_model_scores(
|
625 |
+
generator=results.get('generator_model', 'unknown'),
|
626 |
+
judge=results.get('judge_model', 'unknown'),
|
627 |
+
feedback=feedback,
|
628 |
+
hallucination_detected=results.get('hallucination_detected', False)
|
629 |
+
)
|
630 |
|
631 |
logger.info("Feedback saved successfully to MongoDB")
|
632 |
return "Feedback saved successfully!"
|
|
|
634 |
logger.error("Error saving feedback: %s", str(e), exc_info=True)
|
635 |
return f"Error saving feedback: {str(e)}"
|
636 |
|
637 |
+
def _update_model_scores(self, generator, judge, feedback, hallucination_detected):
    """Update the ELO scores for the generator and judge models based on feedback.

    Parses the free-text `feedback` string (substring-matching the literal radio-button
    labels from the UI, e.g. "Yes, there was a hallucination") to classify the outcome,
    assigns a per-role score (1 = good, 0 = bad, 0.5 = unclear/neutral), and persists
    the updated per-model and per-pair statistics to MongoDB.

    Parameters:
        generator: name of the model that produced the responses.
        judge: name of the model that judged for hallucinations.
        feedback: combined user-feedback string; must contain the literal option
            labels for the parsing below to work.
        hallucination_detected: the judge's verdict. NOTE(review): this parameter is
            currently unused — the "actual hallucination" signal is re-derived from
            `feedback` instead; confirm whether that is intentional.

    Returns:
        None. Errors are logged and swallowed so feedback saving never fails hard.
    """
    try:
        if self.db is None:
            logger.error("MongoDB connection not available. Cannot update model scores.")
            return

        # Access or create the models collection
        models_collection = self.db.get_collection("model_scores")

        # Create indexes if they don't exist (idempotent; unique per model name)
        models_collection.create_index("model_name", unique=True)

        # Parse the feedback to determine scenario.
        # These are substring checks against the exact UI radio labels.
        actual_hallucination = "Yes, there was a hallucination" in feedback
        no_hallucination = "No, there was no hallucination" in feedback
        judge_correct = "Yes, the judge was correct" in feedback
        judge_incorrect = "No, the judge was incorrect" in feedback

        # Determine scores based on different scenarios:
        # 1. Actual hallucination + Judge correct = positive for judge, negative for generator
        # 2. No hallucination + Judge correct = positive for both
        # 3. No hallucination + Judge incorrect = negative for judge, positive for generator
        # 4. Actual hallucination + Judge incorrect = negative for both
        # "Not sure" answers map to the 0.5 (neutral) branches.

        if judge_correct:
            if actual_hallucination:
                # Scenario 1: Judge correctly detected hallucination
                judge_score = 1  # Positive for judge
                generator_score = 0  # Negative for generator (hallucinated)
                logger.info("Judge %s correctly detected hallucination from generator %s", judge, generator)
            elif no_hallucination:
                # Scenario 2: Judge correctly determined no hallucination
                judge_score = 1  # Positive for judge
                generator_score = 1  # Positive for generator (didn't hallucinate)
                logger.info("Judge %s correctly determined no hallucination from generator %s", judge, generator)
            else:
                # User unsure about hallucination, but confirmed judge was correct
                judge_score = 1  # Positive for judge
                generator_score = 0.5  # Neutral for generator (unclear)
                logger.info("User confirmed judge %s was correct, but unclear about hallucination from %s", judge, generator)
        elif judge_incorrect:
            if no_hallucination:
                # Scenario 3: Judge incorrectly claimed hallucination (false positive)
                judge_score = 0  # Negative for judge
                generator_score = 1  # Positive for generator (unfairly accused)
                logger.info("Judge %s incorrectly claimed hallucination from generator %s", judge, generator)
            elif actual_hallucination:
                # Scenario 4: Judge missed actual hallucination (false negative)
                judge_score = 0  # Negative for judge
                generator_score = 0  # Negative for generator (hallucination went undetected)
                logger.info("Judge %s missed actual hallucination from generator %s", judge, generator)
            else:
                # User unsure about hallucination, but confirmed judge was incorrect
                judge_score = 0  # Negative for judge
                generator_score = 0.5  # Neutral for generator (unclear)
                logger.info("User confirmed judge %s was incorrect, but unclear about hallucination from %s", judge, generator)
        else:
            # User unsure about judge correctness; apply neutral scores to both
            judge_score = 0.5  # Neutral for judge (unclear)
            generator_score = 0.5  # Neutral for generator (unclear)
            logger.info("User unsure about judge %s correctness and generator %s hallucination", judge, generator)

        # Update generator model stats with specific score
        self._update_model_stats(models_collection, generator, generator_score, "generator")

        # Update judge model stats with specific score
        self._update_model_stats(models_collection, judge, judge_score, "judge")

        # Determine if the detection was correct based on judge correctness
        detection_correct = judge_correct

        # Determine if there was actually hallucination based on user feedback
        actual_hallucination_present = actual_hallucination

        # Update model pair stats
        self._update_model_pair_stats(generator, judge, detection_correct, actual_hallucination_present,
                                     generator_score, judge_score)

        logger.info("Updated model scores based on feedback: generator(%s)=%s, judge(%s)=%s",
                   generator, generator_score, judge, judge_score)

    except Exception as e:
        # Best-effort: score updates must never break the feedback-saving flow
        logger.error("Error updating model scores: %s", str(e), exc_info=True)
|
721 |
+
|
722 |
+
def _update_model_stats(self, collection, model_name, score, role):
    """Update statistics for a single model (ELO, accuracy, role counters).

    Parameters:
        collection: the MongoDB "model_scores" collection.
        model_name: model identifier, unique key in the collection.
        score: 1 (correct), 0 (incorrect), or 0.5 (neutral / user unsure).
        role: "generator" or "judge" — selects which role counters to bump.

    Behavior:
        - A neutral score (0.5) increments only `neutral_samples` and nudges the
          ELO with a quarter-strength K factor, then upserts and returns early;
          it does NOT count toward `total_samples` or accuracy.
        - A 0/1 score updates sample/correct counters, accuracy, and ELO with
          the full K factor, then upserts.
    """
    # Simplified ELO calculation
    K_FACTOR = 32  # Standard K-factor for ELO

    # Get current model data or create if not exists
    model_data = collection.find_one({"model_name": model_name})

    if model_data is None:
        # Initialize new model with default values
        model_data = {
            "model_name": model_name,
            "elo_score": 1500,  # Starting ELO
            "total_samples": 0,
            "correct_predictions": 0,
            "accuracy": 0.0,
            "as_generator": 0,
            "as_judge": 0,
            "as_generator_correct": 0,
            "as_judge_correct": 0,
            "neutral_samples": 0  # Add a counter for neutral samples
        }

    # Skip counting for neutral feedback (0.5)
    if score == 0.5:
        # Increment neutral samples counter instead
        # (guard handles legacy documents created before this field existed)
        if "neutral_samples" not in model_data:
            model_data["neutral_samples"] = 0
        model_data["neutral_samples"] += 1

        # Expected score based on current rating (vs average rating of 1500)
        expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))

        # For neutral score, use a much smaller K factor to slightly adjust the ELO
        # This handles the "unsure" case with minimal impact
        model_data["elo_score"] = model_data["elo_score"] + (K_FACTOR/4) * (0.5 - expected_score)

        # Update or insert the model data
        collection.replace_one(
            {"model_name": model_name},
            model_data,
            upsert=True
        )
        return

    # Update sample counts for non-neutral cases
    model_data["total_samples"] += 1
    if role == "generator":
        model_data["as_generator"] += 1
        if score == 1:  # Only count as correct if score is 1 (not 0)
            model_data["as_generator_correct"] += 1
    else:  # role == "judge"
        model_data["as_judge"] += 1
        if score == 1:  # Only count as correct if score is 1 (not 0)
            model_data["as_judge_correct"] += 1

    # Update correct predictions based on score
    if score == 1:
        model_data["correct_predictions"] += 1

    # Calculate new accuracy (total_samples was just incremented, so >= 1)
    model_data["accuracy"] = model_data["correct_predictions"] / model_data["total_samples"]

    # Update ELO score based on the specific score value (0 or 1)
    # Expected score based on current rating (vs average rating of 1500)
    expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))

    # Use the provided score (0 or 1)
    actual_score = score

    # New ELO calculation
    model_data["elo_score"] = model_data["elo_score"] + K_FACTOR * (actual_score - expected_score)

    # Update or insert the model data
    collection.replace_one(
        {"model_name": model_name},
        model_data,
        upsert=True
    )
|
801 |
+
|
802 |
+
def _update_model_pair_stats(self, generator, judge, detection_correct, hallucination_detected,
                            generator_score, judge_score):
    """Update statistics for a (generator, judge) model pair combination.

    Parameters:
        generator: generator model name (part of the pair's compound key).
        judge: judge model name (part of the pair's compound key).
        detection_correct: whether the user confirmed the judge was correct.
        hallucination_detected: whether the user reported an actual hallucination.
        generator_score: 1/0/0.5 score assigned to the generator for this sample.
        judge_score: 1/0/0.5 score assigned to the judge for this sample.

    Persists to the MongoDB "model_pairs" collection: sample/correct counters,
    per-role performance rates, a weighted consistency score, and a pair ELO
    updated from the mean of the two role scores. Errors are logged and
    swallowed (returns None on failure).
    """
    try:
        # Access or create the model pairs collection
        pairs_collection = self.db.get_collection("model_pairs")

        # Create compound index if it doesn't exist (idempotent)
        pairs_collection.create_index([("generator", 1), ("judge", 1)], unique=True)

        # Get current pair data or create if not exists
        pair_data = pairs_collection.find_one({
            "generator": generator,
            "judge": judge
        })

        if pair_data is None:
            # Initialize new pair with default values
            pair_data = {
                "generator": generator,
                "judge": judge,
                "elo_score": 1500,  # Starting ELO
                "total_samples": 0,
                "correct_predictions": 0,
                "accuracy": 0.0,
                "hallucinations_detected": 0,
                "generator_performance": 0.0,
                "judge_performance": 0.0,
                "consistency_score": 0.0
            }

        # Update sample counts
        pair_data["total_samples"] += 1
        if detection_correct:
            pair_data["correct_predictions"] += 1

        if hallucination_detected:
            pair_data["hallucinations_detected"] += 1

        # Track model-specific performances within the pair
        # (guards handle legacy documents created before these fields existed)
        if "generator_correct_count" not in pair_data:
            pair_data["generator_correct_count"] = 0
        if "judge_correct_count" not in pair_data:
            pair_data["judge_correct_count"] = 0

        # Update individual performance counters based on scores
        # (neutral 0.5 scores intentionally count as not-correct here)
        if generator_score == 1:
            pair_data["generator_correct_count"] += 1
        if judge_score == 1:
            pair_data["judge_correct_count"] += 1

        # Calculate individual performance rates within the pair
        # (total_samples was just incremented, so >= 1)
        pair_data["generator_performance"] = pair_data["generator_correct_count"] / pair_data["total_samples"]
        pair_data["judge_performance"] = pair_data["judge_correct_count"] / pair_data["total_samples"]

        # Calculate new accuracy for the pair (detection accuracy)
        pair_data["accuracy"] = pair_data["correct_predictions"] / pair_data["total_samples"]

        # Calculate consistency score - weighted average of individual performances
        # Gives more weight to the judge when hallucinations are detected
        if hallucination_detected:
            # When hallucination is detected, judge's role is more critical
            pair_data["consistency_score"] = (0.4 * pair_data["generator_performance"] +
                                             0.6 * pair_data["judge_performance"])
        else:
            # When no hallucination is detected, both roles are equally important
            pair_data["consistency_score"] = (0.5 * pair_data["generator_performance"] +
                                             0.5 * pair_data["judge_performance"])

        # Update ELO score (simplified version)
        K_FACTOR = 24  # Slightly lower K-factor for pairs

        # Expected score based on current rating
        expected_score = 1 / (1 + 10**((1500 - pair_data["elo_score"]) / 400))

        # Actual score - use the average of both model scores (0-1 range)
        # This represents the pair's overall performance
        actual_score = (generator_score + judge_score) / 2

        # New ELO calculation
        pair_data["elo_score"] = pair_data["elo_score"] + K_FACTOR * (actual_score - expected_score)

        # Update or insert the pair data
        pairs_collection.replace_one(
            {"generator": generator, "judge": judge},
            pair_data,
            upsert=True
        )

        logger.info("Updated model pair stats for %s (generator) and %s (judge)", generator, judge)

    except Exception as e:
        # Best-effort: pair stats must never break the feedback-saving flow
        logger.error("Error updating model pair stats: %s", str(e), exc_info=True)
        return None
|
896 |
+
|
897 |
def get_feedback_stats(self):
|
898 |
"""Get statistics about collected feedback from MongoDB"""
|
899 |
try:
|
|
|
928 |
except Exception as e:
|
929 |
logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
|
930 |
return None
|
931 |
+
|
932 |
+
def get_model_leaderboard(self):
    """Get the current model leaderboard data, sorted by ELO score (descending).

    Returns:
        list[dict]: one document per model from the "model_scores" collection,
        with `_id` stringified and `accuracy`, `generator_accuracy`, and
        `judge_accuracy` converted to percentages rounded to one decimal.
        Returns an empty list when the DB is unavailable or on error, so
        callers can always iterate the result.
    """
    try:
        if self.db is None:
            logger.error("MongoDB connection not available. Cannot get model leaderboard.")
            # Fix: return [] (not None) so the return type is consistent with
            # the exception path below and callers can iterate unconditionally.
            return []

        # Access models collection
        models_collection = self.db.get_collection("model_scores")

        # Get all models and sort by ELO score
        models = list(models_collection.find().sort("elo_score", pymongo.DESCENDING))

        # Format percentages and convert ObjectId for JSON-friendliness.
        # Use .get() defaults throughout: legacy documents written before a
        # field was introduced would otherwise raise KeyError here.
        for model in models:
            model["_id"] = str(model["_id"])
            model["accuracy"] = round(model.get("accuracy", 0.0) * 100, 1)

            gen_samples = model.get("as_generator", 0)
            if gen_samples > 0:
                model["generator_accuracy"] = round(
                    (model.get("as_generator_correct", 0) / gen_samples) * 100, 1
                )
            else:
                model["generator_accuracy"] = 0.0

            judge_samples = model.get("as_judge", 0)
            if judge_samples > 0:
                model["judge_accuracy"] = round(
                    (model.get("as_judge_correct", 0) / judge_samples) * 100, 1
                )
            else:
                model["judge_accuracy"] = 0.0

        return models
    except Exception as e:
        logger.error("Error getting model leaderboard: %s", str(e), exc_info=True)
        return []
|
963 |
+
|
964 |
+
def get_pair_leaderboard(self):
    """Get the current model pair leaderboard data, sorted by ELO score (descending).

    Returns:
        list[dict]: one document per (generator, judge) pair from the
        "model_pairs" collection, with `_id` stringified and `accuracy` /
        `consistency_score` converted to percentages rounded to one decimal.
        Returns an empty list when the DB is unavailable or on error, so
        callers can always iterate the result.
    """
    try:
        if self.db is None:
            logger.error("MongoDB connection not available. Cannot get pair leaderboard.")
            # Fix: return [] (not None) so the return type is consistent with
            # the exception path below; the UI already treats the result as a
            # possibly-empty list ("... or []").
            return []

        # Access model pairs collection
        pairs_collection = self.db.get_collection("model_pairs")

        # Get all pairs and sort by ELO score
        pairs = list(pairs_collection.find().sort("elo_score", pymongo.DESCENDING))

        # Format percentages and convert ObjectId for JSON-friendliness.
        # Use .get() defaults: documents missing these fields (e.g. written by
        # an older schema) would otherwise raise KeyError here.
        for pair in pairs:
            pair["_id"] = str(pair["_id"])
            pair["accuracy"] = round(pair.get("accuracy", 0.0) * 100, 1)
            pair["consistency_score"] = round(pair.get("consistency_score", 0.0) * 100, 1)

        return pairs
    except Exception as e:
        logger.error("Error getting pair leaderboard: %s", str(e), exc_info=True)
        return []
|
987 |
|
988 |
def export_data_to_csv(self, filepath=None):
|
989 |
"""Export all feedback data to a CSV file for analysis"""
|
|
|
1100 |
"starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
|
1101 |
"generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
|
1102 |
"paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
|
1103 |
+
"getting_responses": {"status": "Getting responses using {model}...", "progress": 35, "color": "#2196F3"},
|
1104 |
"responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
|
1105 |
"responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
|
1106 |
+
"judging": {"status": "Analyzing responses for hallucinations using {model}...", "progress": 70, "color": "#2196F3"},
|
1107 |
+
"complete": {"status": "Analysis complete! Using {generator} (generator) and {judge} (judge)", "progress": 100, "color": "#4CAF50"},
|
1108 |
"error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
|
1109 |
}
|
1110 |
|
|
|
1115 |
self.completed_responses = 0
|
1116 |
self.total_responses = 0
|
1117 |
self.error_message = ""
|
1118 |
+
self.generator_model = ""
|
1119 |
+
self.judge_model = ""
|
1120 |
+
self.model = "" # For general model reference in status messages
|
1121 |
self._lock = threading.Lock()
|
1122 |
self._status_callback = None
|
1123 |
self._stop_event = threading.Event()
|
|
|
1144 |
self.total_responses = value
|
1145 |
elif key == 'error_message':
|
1146 |
self.error_message = value
|
1147 |
+
elif key == 'model':
|
1148 |
+
self.model = value
|
1149 |
+
elif key == 'generator':
|
1150 |
+
self.generator_model = value
|
1151 |
+
elif key == 'judge':
|
1152 |
+
self.judge_model = value
|
1153 |
|
1154 |
# Format status message
|
1155 |
if stage == 'responses_progress':
|
|
|
1157 |
completed=self.completed_responses,
|
1158 |
total=self.total_responses
|
1159 |
)
|
1160 |
+
elif stage == 'getting_responses' and 'model' in kwargs:
|
1161 |
+
self.stage_data['status'] = self.stage_data['status'].format(
|
1162 |
+
model=kwargs.get('model', 'selected model')
|
1163 |
+
)
|
1164 |
+
elif stage == 'judging' and 'model' in kwargs:
|
1165 |
+
self.stage_data['status'] = self.stage_data['status'].format(
|
1166 |
+
model=kwargs.get('model', 'selected model')
|
1167 |
+
)
|
1168 |
+
elif stage == 'complete' and 'generator' in kwargs and 'judge' in kwargs:
|
1169 |
+
self.stage_data['status'] = self.stage_data['status'].format(
|
1170 |
+
generator=self.generator_model,
|
1171 |
+
judge=self.judge_model
|
1172 |
+
)
|
1173 |
elif stage == 'error':
|
1174 |
self.stage_data['status'] = self.stage_data['status'].format(
|
1175 |
error_message=self.error_message
|
|
|
1189 |
# Only show status text if not in idle state
|
1190 |
status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
|
1191 |
|
1192 |
+
# Add model information if available and we're not in idle or error state
|
1193 |
+
model_info = ''
|
1194 |
+
if self.stage not in ["idle", "error", "starting"] and (self.generator_model or self.judge_model):
|
1195 |
+
model_info = f'<div class="model-info" style="display: flex; justify-content: space-between; margin-top: 8px; font-size: 0.85em; color: #37474f; background-color: #e1f5fe; padding: 5px 10px; border-radius: 4px;">'
|
1196 |
+
if self.generator_model:
|
1197 |
+
model_info += f'<div><span style="font-weight: bold;">Generator:</span> {self.generator_model}</div>'
|
1198 |
+
if self.judge_model:
|
1199 |
+
model_info += f'<div><span style="font-weight: bold;">Judge:</span> {self.judge_model}</div>'
|
1200 |
+
model_info += '</div>'
|
1201 |
+
|
1202 |
html = f"""
|
1203 |
<div class="progress-container">
|
1204 |
{query_info}
|
|
|
1206 |
<div class="progress-bar-container">
|
1207 |
<div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
|
1208 |
</div>
|
1209 |
+
{model_info}
|
1210 |
</div>
|
1211 |
"""
|
1212 |
return html
|
|
|
1575 |
combined_progress_callback("starting", query=query)
|
1576 |
time.sleep(0.3) # Ensure starting status is visible
|
1577 |
|
1578 |
+
# Step 1.5: Randomly select model pair
|
1579 |
+
generator_model, judge_model = detector.pas2.set_random_model_pair()
|
1580 |
+
combined_progress_callback("starting", query=query, generator=generator_model, judge=judge_model)
|
1581 |
+
time.sleep(0.3) # Ensure model info is visible
|
1582 |
+
|
1583 |
# Step 2: Generate paraphrases (15-30%)
|
1584 |
combined_progress_callback("generating_paraphrases", query=query)
|
1585 |
all_queries = detector.pas2.generate_paraphrases(query)
|
1586 |
combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))
|
1587 |
|
1588 |
# Step 3: Get responses (35-65%)
|
1589 |
+
combined_progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
|
1590 |
all_responses = []
|
1591 |
for i, q in enumerate(all_queries):
|
1592 |
# Show incremental progress for each response
|
|
|
1596 |
combined_progress_callback("responses_complete", query=query)
|
1597 |
|
1598 |
# Step 4: Judge hallucinations (70-100%)
|
1599 |
+
combined_progress_callback("judging", query=query, model=judge_model)
|
1600 |
|
1601 |
# The first query is the original, rest are paraphrases
|
1602 |
original_query = all_queries[0]
|
|
|
1622 |
"confidence_score": judgment.confidence_score,
|
1623 |
"conflicting_facts": judgment.conflicting_facts,
|
1624 |
"reasoning": judgment.reasoning,
|
1625 |
+
"summary": judgment.summary,
|
1626 |
+
"generator_model": generator_model,
|
1627 |
+
"judge_model": judge_model
|
1628 |
}
|
1629 |
|
1630 |
# Show completion
|
1631 |
+
combined_progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
|
1632 |
time.sleep(0.3) # Ensure complete status is visible
|
1633 |
|
1634 |
return results
|
|
|
1684 |
reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
|
1685 |
conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
|
1686 |
|
1687 |
+
# Get model info from the results
|
1688 |
+
generator_model = results.get("generator_model", "unknown model")
|
1689 |
+
judge_model = results.get("judge_model", "unknown model")
|
1690 |
+
|
1691 |
html_output = f"""
|
1692 |
<div class="container">
|
1693 |
<h2 class="title">Hallucination Detection Results</h2>
|
1694 |
|
1695 |
+
<div class="model-info-bar" style="background-color: #e1f5fe; padding: 10px 15px; border-radius: 8px; margin-bottom: 15px; display: flex; justify-content: space-between;">
|
1696 |
+
<div style="flex: 1; text-align: center; border-right: 1px solid #b3e5fc; padding-right: 10px;">
|
1697 |
+
<div style="font-weight: bold; color: #0277bd;">Generator Model</div>
|
1698 |
+
<div style="font-size: 1.2em; color: #01579b;">{generator_model}</div>
|
1699 |
+
</div>
|
1700 |
+
<div style="flex: 1; text-align: center; padding-left: 10px;">
|
1701 |
+
<div style="font-weight: bold; color: #0277bd;">Judge Model</div>
|
1702 |
+
<div style="font-size: 1.2em; color: #01579b;">{judge_model}</div>
|
1703 |
+
</div>
|
1704 |
+
</div>
|
1705 |
+
|
1706 |
<div class="stats-section">
|
1707 |
<div class="stat-item">
|
1708 |
<div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
|
|
|
1732 |
{original_query}
|
1733 |
</div>
|
1734 |
|
1735 |
+
<div class="section-title">Original Response <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
|
1736 |
<div class="response-box">
|
1737 |
{original_response_safe}
|
1738 |
</div>
|
|
|
1747 |
{q}
|
1748 |
</div>
|
1749 |
|
1750 |
+
<div class="section-title">Response {i} <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
|
1751 |
<div class="response-box">
|
1752 |
{r}
|
1753 |
</div>
|
1754 |
"""
|
1755 |
|
1756 |
html_output += f"""
|
1757 |
+
<div class="section-title">Detailed Analysis <span style="font-size: 0.8em; color: #607d8b;">(judged by {judge_model})</span></div>
|
1758 |
<div class="info-box">
|
1759 |
<p><strong>Reasoning:</strong></p>
|
1760 |
<p>{reasoning_safe}</p>
|
|
|
1762 |
<p><strong>Conflicting Facts:</strong></p>
|
1763 |
<p>{conflicting_facts_text_safe}</p>
|
1764 |
</div>
|
1765 |
+
|
1766 |
+
<div style="margin-top: 20px; border-top: 1px dashed #ccc; padding-top: 15px; font-size: 0.9em; color: #607d8b; text-align: center;">
|
1767 |
+
Models randomly selected for this analysis: <strong>{generator_model}</strong> (Generator) and <strong>{judge_model}</strong> (Judge)
|
1768 |
+
</div>
|
1769 |
</div>
|
1770 |
"""
|
1771 |
|
|
|
1791 |
]
|
1792 |
|
1793 |
# Helper function to submit feedback
|
1794 |
+
def combine_feedback(hallucination_present, judge_correct, fb_text, results):
|
1795 |
+
combined_feedback = f"Hallucination: {hallucination_present}, Judge Correct: {judge_correct}"
|
1796 |
+
if fb_text:
|
1797 |
+
combined_feedback += f", Comments: {fb_text}"
|
1798 |
+
|
1799 |
if not results:
|
1800 |
return "No results to attach feedback to."
|
1801 |
|
|
|
1899 |
This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
|
1900 |
|
1901 |
1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
|
1902 |
+
2. **Multiple Responses**: All questions (original + paraphrases) are sent to a randomly selected generator model
|
1903 |
+
3. **Expert Judgment**: A randomly selected judge model analyzes all responses to detect factual inconsistencies
|
1904 |
|
1905 |
### Why This Approach?
|
1906 |
|
|
|
1974 |
gr.Markdown("### Help Improve the System")
|
1975 |
gr.Markdown("Your feedback helps us refine the hallucination detection system.")
|
1976 |
|
1977 |
+
hallucination_present = gr.Radio(
|
1978 |
+
label="Was there actually a hallucination in the responses?",
|
1979 |
+
choices=["Yes, there was a hallucination", "No, there was no hallucination", "Not sure"],
|
1980 |
+
value="Not sure"
|
1981 |
+
)
|
1982 |
+
|
1983 |
+
judge_correct = gr.Radio(
|
1984 |
+
label="Did the judge model correctly identify the situation?",
|
1985 |
+
choices=["Yes, the judge was correct", "No, the judge was incorrect", "Not sure"],
|
1986 |
+
value="Not sure"
|
1987 |
)
|
1988 |
|
1989 |
feedback_text = gr.Textbox(
|
|
|
2000 |
gr.Markdown("## Hallucination Detection Scores")
|
2001 |
gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
|
2002 |
|
2003 |
+
# Function to generate the HTML for the model pair leaderboard
|
2004 |
+
def generate_pair_leaderboard_html():
|
2005 |
+
try:
|
2006 |
+
# Get leaderboard data
|
2007 |
+
pairs = detector.get_pair_leaderboard() or []
|
2008 |
+
|
2009 |
+
if not pairs:
|
2010 |
+
return (
|
2011 |
+
"<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
|
2012 |
+
"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
2013 |
+
"<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
|
2014 |
+
"<p>Try the detector with more queries to populate the leaderboard!</p>"
|
2015 |
+
"</div>"
|
2016 |
+
)
|
2017 |
+
|
2018 |
+
# Generate table rows
|
2019 |
+
rows = ""
|
2020 |
+
for rank, pair in enumerate(pairs, 1):
|
2021 |
+
# Add special styling for top 3
|
2022 |
+
row_class = ""
|
2023 |
+
if rank == 1:
|
2024 |
+
row_class = "class='top-rank-1'"
|
2025 |
+
elif rank == 2:
|
2026 |
+
row_class = "class='top-rank-2'"
|
2027 |
+
elif rank == 3:
|
2028 |
+
row_class = "class='top-rank-3'"
|
2029 |
+
|
2030 |
+
# Format percentages for display
|
2031 |
+
generator_perf = f"{pair.get('generator_performance', 0) * 100:.1f}%" if 'generator_performance' in pair else "N/A"
|
2032 |
+
judge_perf = f"{pair.get('judge_performance', 0) * 100:.1f}%" if 'judge_performance' in pair else "N/A"
|
2033 |
+
consistency = f"{pair.get('consistency_score', 0)}%" if 'consistency_score' in pair else "N/A"
|
2034 |
+
|
2035 |
+
rows += (
|
2036 |
+
f"<tr {row_class}>"
|
2037 |
+
f"<td>{rank}</td>"
|
2038 |
+
f"<td>{pair.get('generator', 'unknown')}</td>"
|
2039 |
+
f"<td>{pair.get('judge', 'unknown')}</td>"
|
2040 |
+
f"<td>{round(pair.get('elo_score', 0))}</td>"
|
2041 |
+
f"<td>{pair.get('accuracy')}%</td>"
|
2042 |
+
f"<td style='color: #80cbc4; font-weight: 500;'>{generator_perf}</td>"
|
2043 |
+
f"<td style='color: #90caf9; font-weight: 500;'>{judge_perf}</td>"
|
2044 |
+
f"<td style='color: #ce93d8; font-weight: 500;'>{consistency}</td>"
|
2045 |
+
f"<td>{pair.get('total_samples', 0)}</td>"
|
2046 |
+
f"</tr>"
|
2047 |
+
)
|
2048 |
+
|
2049 |
+
# Build the full table
|
2050 |
+
html = (
|
2051 |
+
f"<div class=\"leaderboard-container\">"
|
2052 |
+
f"<table class=\"leaderboard-table\">"
|
2053 |
+
f"<thead>"
|
2054 |
+
f"<tr>"
|
2055 |
+
f"<th>Rank</th>"
|
2056 |
+
f"<th>Generator Model</th>"
|
2057 |
+
f"<th>Judge Model</th>"
|
2058 |
+
f"<th>ELO Score</th>"
|
2059 |
+
f"<th>Accuracy</th>"
|
2060 |
+
f"<th>Generator Perf.</th>"
|
2061 |
+
f"<th>Judge Perf.</th>"
|
2062 |
+
f"<th>Consistency</th>"
|
2063 |
+
f"<th>Sample Size</th>"
|
2064 |
+
f"</tr>"
|
2065 |
+
f"</thead>"
|
2066 |
+
f"<tbody>"
|
2067 |
+
f"{rows}"
|
2068 |
+
f"</tbody>"
|
2069 |
+
f"</table>"
|
2070 |
+
f"</div>"
|
2071 |
+
f"<div style='margin-top: 15px; padding: 12px; background-color: #263238; border-radius: 8px; font-size: 0.95em; color: #e0f7fa; box-shadow: 0 2px 5px rgba(0,0,0,0.2);'>"
|
2072 |
+
f"<p style='margin-bottom: 8px; color: #80deea;'><strong>Model Pair Performance Metrics:</strong></p>"
|
2073 |
+
f"<ul style='margin-top: 5px; padding-left: 20px; line-height: 1.4;'>"
|
2074 |
+
f"<li><strong style='color: #b2dfdb;'>Accuracy</strong>: Percentage of correct hallucination judgments based on user feedback</li>"
|
2075 |
+
f"<li><strong style='color: #b2dfdb;'>Generator Performance</strong>: How well the generator model avoids hallucinations</li>"
|
2076 |
+
f"<li><strong style='color: #b2dfdb;'>Judge Performance</strong>: How accurately the judge model identifies hallucinations</li>"
|
2077 |
+
f"<li><strong style='color: #b2dfdb;'>Consistency</strong>: Weighted measure of how well the pair works together</li>"
|
2078 |
+
f"</ul>"
|
2079 |
+
f"</div>"
|
2080 |
+
)
|
2081 |
+
|
2082 |
+
return html
|
2083 |
+
except Exception as e:
|
2084 |
+
logger.error("Error generating leaderboard HTML: %s", str(e), exc_info=True)
|
2085 |
+
return (
|
2086 |
+
f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
|
2087 |
+
f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
2088 |
+
f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Leaderboard</h3>"
|
2089 |
+
f"<p>{str(e)}</p>"
|
2090 |
+
f"</div>"
|
2091 |
+
)
|
2092 |
+
|
2093 |
# Create leaderboard table for model combinations
|
2094 |
+
model_leaderboard_html = gr.HTML(generate_pair_leaderboard_html())
|
2095 |
+
refresh_leaderboard_btn = gr.Button("Refresh Leaderboard", variant="primary")
|
2096 |
+
refresh_leaderboard_btn.click(
|
2097 |
+
fn=lambda: generate_pair_leaderboard_html(),
|
2098 |
+
outputs=[model_leaderboard_html]
|
2099 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2100 |
|
2101 |
+
# ELO rating explanation
|
2102 |
+
with gr.Accordion("ELO Rating System Explanation", open=False):
|
2103 |
+
gr.HTML(
|
2104 |
+
"<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
|
2105 |
+
"<h3 style='margin-top: 0; color: #ffffff;'>ELO Rating System Explanation</h3>" +
|
2106 |
+
"<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
|
2107 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
2108 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>How ELO Scores Are Calculated</h4>" +
|
2109 |
+
"<p style='color: #eceff1;'>Our ELO rating system assigns scores to model pairs based on user feedback, using the following formula:</p>" +
|
2110 |
+
"<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
|
2111 |
+
"<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
|
2112 |
+
"Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model combination<br>" +
|
2113 |
+
"* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (24 for model pairs)<br>" +
|
2114 |
+
"* <strong style='color: #b2dfdb;'>S</strong>: Actual score from user feedback (1 for correct, 0 for incorrect)<br>" +
|
2115 |
+
"* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
|
2116 |
+
"<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div></div>" +
|
2117 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
2118 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>Available Models</h4>" +
|
2119 |
+
"<p style='color: #eceff1;'>The system randomly selects from these models for each hallucination detection:</p>" +
|
2120 |
+
"<div style='display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;'>" +
|
2121 |
+
"<div style='flex: 1; min-width: 120px;'>" +
|
2122 |
+
"<h5 style='margin-top: 0; margin-bottom: 5px; color: #b2dfdb;'>All Models (Used as both Generator & Judge)</h5>" +
|
2123 |
+
"<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
|
2124 |
+
"<li>mistral-large</li><li>gpt-4o</li><li>qwen-235b</li><li>grok-3</li>" +
|
2125 |
+
"<li>deepseek-reasoner</li><li>o4-mini</li><li>gemini-2.5-pro</li>" +
|
2126 |
+
"</ul></div></div></div></div></div>"
|
2127 |
+
)
|
2128 |
+
gr.HTML(
|
2129 |
+
"<style>" +
|
2130 |
+
".leaderboard-container {margin: 15px 0; overflow-x: auto;}" +
|
2131 |
+
".leaderboard-table {width: 100%; border-collapse: collapse; font-size: 0.95em; " +
|
2132 |
+
"box-shadow: 0 2px 10px rgba(0,0,0,0.2); border-radius: 8px; overflow: hidden;}" +
|
2133 |
+
".leaderboard-table thead {background-color: #0d47a1; color: white;}" +
|
2134 |
+
".leaderboard-table th, .leaderboard-table td {padding: 12px 15px; text-align: left; border-bottom: 1px solid #37474f; color: #eceff1;}" +
|
2135 |
+
".leaderboard-table tbody tr {transition: background-color 0.3s;}" +
|
2136 |
+
".leaderboard-table tbody tr:nth-child(even) {background-color: #37474f;}" +
|
2137 |
+
".leaderboard-table tbody tr:nth-child(odd) {background-color: #455a64;}" +
|
2138 |
+
".leaderboard-table tbody tr:hover {background-color: #263238;}" +
|
2139 |
+
".leaderboard-table tbody tr.top-rank-1 {background-color: #004d40; color: #e0f2f1; font-weight: bold;}" +
|
2140 |
+
".leaderboard-table tbody tr.top-rank-2 {background-color: #1b5e20; color: #e8f5e9; font-weight: 500;}" +
|
2141 |
+
".leaderboard-table tbody tr.top-rank-3 {background-color: #33691e; color: #f1f8e9; font-weight: 500;}" +
|
2142 |
+
".leaderboard-table td {position: relative;}" +
|
2143 |
+
".leaderboard-table td::after {content: ''; position: absolute; top: 0; left: 0; width: 100%; height: 100%; background: transparent; pointer-events: none;}" +
|
2144 |
+
"</style>"
|
2145 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2146 |
|
2147 |
+
# Tab 3: Individual Models Leaderboard
|
2148 |
+
with gr.TabItem("Individual Models", elem_id="user-feedback-tab"):
|
2149 |
+
gr.Markdown("## Individual Model Performance")
|
2150 |
+
gr.Markdown("Performance ranking of models based on user feedback, showing statistics for both generator and judge roles.")
|
2151 |
|
2152 |
+
# Function to generate individual model leaderboard HTML
|
2153 |
+
def generate_model_leaderboard_html():
|
2154 |
+
try:
|
2155 |
+
# Get model scores from MongoDB
|
2156 |
+
models = detector.get_model_leaderboard() or []
|
2157 |
+
|
2158 |
+
if not models:
|
2159 |
+
return (
|
2160 |
+
"<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
|
2161 |
+
"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
2162 |
+
"<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
|
2163 |
+
"<p>Try the detector with more queries to populate the model scores!</p>"
|
2164 |
+
"</div>"
|
2165 |
+
)
|
2166 |
+
|
2167 |
+
# Generate table rows
|
2168 |
+
rows = ""
|
2169 |
+
for rank, model in enumerate(models, 1):
|
2170 |
+
# Add special styling for top 3
|
2171 |
+
row_class = ""
|
2172 |
+
if rank == 1:
|
2173 |
+
row_class = "class='top-rank-1'"
|
2174 |
+
elif rank == 2:
|
2175 |
+
row_class = "class='top-rank-2'"
|
2176 |
+
elif rank == 3:
|
2177 |
+
row_class = "class='top-rank-3'"
|
2178 |
+
|
2179 |
+
# Calculate role distribution
|
2180 |
+
as_generator = model.get('as_generator', 0)
|
2181 |
+
as_judge = model.get('as_judge', 0)
|
2182 |
+
if as_generator + as_judge > 0:
|
2183 |
+
generator_pct = round((as_generator / (as_generator + as_judge)) * 100)
|
2184 |
+
judge_pct = 100 - generator_pct
|
2185 |
+
role_distribution = f"{generator_pct}% / {judge_pct}%"
|
2186 |
+
else:
|
2187 |
+
role_distribution = "N/A"
|
2188 |
+
|
2189 |
+
# Format percentages with better contrast against dark background
|
2190 |
+
generator_acc = f"{model.get('generator_accuracy', 0.0)}%"
|
2191 |
+
judge_acc = f"{model.get('judge_accuracy', 0.0)}%"
|
2192 |
+
|
2193 |
+
rows += (
|
2194 |
+
f"<tr {row_class}>"
|
2195 |
+
f"<td>{rank}</td>"
|
2196 |
+
f"<td>{model.get('model_name', 'unknown')}</td>"
|
2197 |
+
f"<td>{round(model.get('elo_score', 0))}</td>"
|
2198 |
+
f"<td>{model.get('accuracy')}%</td>"
|
2199 |
+
f"<td style='color: #80cbc4; font-weight: 500;'>{generator_acc}</td>"
|
2200 |
+
f"<td style='color: #90caf9; font-weight: 500;'>{judge_acc}</td>"
|
2201 |
+
f"<td>{model.get('total_samples', 0)}</td>"
|
2202 |
+
f"<td style='color: #ffcc80; font-weight: 500;'>{role_distribution}</td>"
|
2203 |
+
f"</tr>"
|
2204 |
+
)
|
2205 |
+
|
2206 |
+
# Build the full table
|
2207 |
+
html = (
|
2208 |
+
f"<div class=\"leaderboard-container\">"
|
2209 |
+
f"<table class=\"leaderboard-table\">"
|
2210 |
+
f"<thead>"
|
2211 |
+
f"<tr>"
|
2212 |
+
f"<th>Rank</th>"
|
2213 |
+
f"<th>Model</th>"
|
2214 |
+
f"<th>ELO Score</th>"
|
2215 |
+
f"<th>Overall Accuracy</th>"
|
2216 |
+
f"<th>Generator Accuracy</th>"
|
2217 |
+
f"<th>Judge Accuracy</th>"
|
2218 |
+
f"<th>Sample Size</th>"
|
2219 |
+
f"<th>Generator/Judge Ratio</th>"
|
2220 |
+
f"</tr>"
|
2221 |
+
f"</thead>"
|
2222 |
+
f"<tbody>"
|
2223 |
+
f"{rows}"
|
2224 |
+
f"</tbody>"
|
2225 |
+
f"</table>"
|
2226 |
+
f"</div>"
|
2227 |
+
)
|
2228 |
+
|
2229 |
+
return html
|
2230 |
+
except Exception as e:
|
2231 |
+
logger.error("Error generating model leaderboard HTML: %s", str(e), exc_info=True)
|
2232 |
+
return (
|
2233 |
+
f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
|
2234 |
+
f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
|
2235 |
+
f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Model Leaderboard</h3>"
|
2236 |
+
f"<p>{str(e)}</p>"
|
2237 |
+
f"</div>"
|
2238 |
+
)
|
2239 |
|
2240 |
+
# Create leaderboard table for individual models
|
2241 |
+
model_scores_html = gr.HTML(generate_model_leaderboard_html())
|
2242 |
+
refresh_models_btn = gr.Button("Refresh Model Scores", variant="primary")
|
2243 |
+
refresh_models_btn.click(
|
2244 |
+
fn=lambda: generate_model_leaderboard_html(),
|
2245 |
+
outputs=[model_scores_html]
|
2246 |
+
)
|
2247 |
+
|
2248 |
+
# ELO rating explanation for individual models
|
2249 |
+
with gr.Accordion("ELO Rating Explanation for Individual Models", open=False):
|
2250 |
+
gr.HTML(
|
2251 |
+
"<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
|
2252 |
+
"<h3 style='margin-top: 0; color: #ffffff;'>Individual Model ELO Rating System</h3>" +
|
2253 |
+
"<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
|
2254 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
2255 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>How Individual ELO Scores Are Calculated</h4>" +
|
2256 |
+
"<p style='color: #eceff1;'>Our ELO rating system assigns scores to individual models based on user feedback, using the following formula:</p>" +
|
2257 |
+
"<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
|
2258 |
+
"<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
|
2259 |
+
"Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model<br>" +
|
2260 |
+
"* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (32 for individual models)<br>" +
|
2261 |
+
"* <strong style='color: #b2dfdb;'>S</strong>: Actual score (1 for correct judgment, 0 for incorrect)<br>" +
|
2262 |
+
"* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
|
2263 |
+
"<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div>" +
|
2264 |
+
"<p style='color: #eceff1; margin-top: 10px;'>All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p></div>" +
|
2265 |
+
"<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
|
2266 |
+
"<h4 style='margin-top: 0; color: #ffffff;'>Interpretation Guidelines</h4>" +
|
2267 |
+
"<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
|
2268 |
+
"<li><strong style='color: #b2dfdb;'>1800+</strong>: Exceptional performance, very rare hallucinations</li>" +
|
2269 |
+
"<li><strong style='color: #b2dfdb;'>1700-1799</strong>: Superior performance, minimal hallucinations</li>" +
|
2270 |
+
"<li><strong style='color: #b2dfdb;'>1600-1699</strong>: Good performance, occasional hallucinations</li>" +
|
2271 |
+
"<li><strong style='color: #b2dfdb;'>1500-1599</strong>: Average performance</li>" +
|
2272 |
+
"<li><strong style='color: #b2dfdb;'><1500</strong>: Below average, frequent hallucinations</li>" +
|
2273 |
+
"</ul><p style='font-style: italic; color: #b3e5fc; margin-top: 10px;'>" +
|
2274 |
+
"Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.</p>" +
|
2275 |
+
"</div></div></div>"
|
2276 |
+
)
|
2277 |
|
2278 |
# Function to continuously update stats
|
2279 |
def update_stats():
|
|
|
2316 |
live_stats = gr.HTML(update_stats())
|
2317 |
|
2318 |
# Add loading animation style
|
2319 |
+
gr.HTML(
|
2320 |
+
"<style>" +
|
2321 |
+
"@keyframes pulse {" +
|
2322 |
+
"0% { opacity: 0.6; }" +
|
2323 |
+
"50% { opacity: 1; }" +
|
2324 |
+
"100% { opacity: 0.6; }" +
|
2325 |
+
"}" +
|
2326 |
+
".refreshing::after {" +
|
2327 |
+
"content: \"⟳\";" +
|
2328 |
+
"display: inline-block;" +
|
2329 |
+
"margin-left: 8px;" +
|
2330 |
+
"animation: pulse 1.5s infinite ease-in-out;" +
|
2331 |
+
"color: #2e7d32;" +
|
2332 |
+
"}" +
|
2333 |
+
"#stats-container {" +
|
2334 |
+
"border: 1px solid #b3e5fc;" +
|
2335 |
+
"border-radius: 10px;" +
|
2336 |
+
"padding: 15px;" +
|
2337 |
+
"margin: 10px 0;" +
|
2338 |
+
"background-color: #0277bd;" +
|
2339 |
+
"}" +
|
2340 |
+
"</style>" +
|
2341 |
+
"<div class=\"refreshing\" style=\"text-align: right; font-size: 0.8em; color: #eceff1;\">Auto-refreshing</div>"
|
2342 |
+
)
|
2343 |
|
2344 |
# Create a refresh button that will be auto-clicked
|
2345 |
refresh_btn = gr.Button("Refresh Stats", visible=False)
|
|
|
2523 |
|
2524 |
feedback_button.click(
|
2525 |
fn=combine_feedback,
|
2526 |
+
inputs=[hallucination_present, judge_correct, feedback_text, hidden_results],
|
2527 |
outputs=[feedback_status]
|
2528 |
)
|
2529 |
|
2530 |
# Footer
|
2531 |
gr.HTML(
|
2532 |
+
"""<footer><p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p><p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p><p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p></footer>"""
|
|
|
|
|
|
|
|
|
|
|
|
|
2533 |
)
|
2534 |
|
2535 |
return interface
|
|
|
2595 |
|
2596 |
# Uncomment this line to run the test function instead of the main interface
|
2597 |
# if __name__ == "__main__":
|
2598 |
+
# test_progress()
|