nappenstance committed
Commit 99b2a32 · verified · 1 Parent(s): 31c5c21

Upload app.py

Files changed (1): app.py (+864 −365)
app.py CHANGED
@@ -43,12 +43,17 @@ class HallucinationJudgment(BaseModel):
43
  class PAS2:
44
  """Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
45
 
46
- def __init__(self, mistral_api_key=None, openai_api_key=None, progress_callback=None):
47
  """Initialize the PAS2 with API keys"""
48
  # For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
49
  # which are set from the Secrets tab in the Space settings
50
  self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
51
  self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
52
  self.progress_callback = progress_callback
53
 
54
  if not self.mistral_api_key:
@@ -59,12 +64,64 @@ class PAS2:
59
 
60
  self.mistral_client = Mistral(api_key=self.mistral_api_key)
61
  self.openai_client = OpenAI(api_key=self.openai_api_key)
62
-
63
  self.mistral_model = "mistral-large-latest"
64
- self.openai_model = "o3-mini"
65
 
66
- logger.info("PAS2 initialized with Mistral model: %s and OpenAI model: %s",
67
- self.mistral_model, self.openai_model)
68
 
69
  def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
70
  """Generate paraphrases of the input query using Mistral API"""
@@ -141,13 +198,38 @@ class PAS2:
141
 
142
  return fallback_paraphrases
143
 
144
  def _get_single_response(self, query: str, index: int = None) -> str:
145
- """Get a single response from Mistral API for a query"""
146
  try:
147
  query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
148
- logger.info("Getting response for %s", query_description)
149
  start_time = time.time()
150
 
151
  messages = [
152
  {
153
  "role": "system",
@@ -159,23 +241,32 @@ class PAS2:
159
  }
160
  ]
161
 
162
- response = self.mistral_client.chat.complete(
163
- model=self.mistral_model,
164
- messages=messages
165
- )
166
 
167
- result = response.choices[0].message.content
168
  elapsed_time = time.time() - start_time
169
 
170
- logger.info("Received response for %s (%.2f seconds)", query_description, elapsed_time)
 
171
  logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
172
 
173
  return result
174
 
175
  except Exception as e:
176
- error_msg = f"Error getting response for query '{query}': {e}"
177
  logger.error(error_msg, exc_info=True)
178
- return f"Error: Failed to get response for this query."
179
 
180
  def get_responses(self, queries: List[str]) -> List[str]:
181
  """Get responses from Mistral API for each query in parallel"""
@@ -235,6 +326,10 @@ class PAS2:
235
  logger.info("Starting hallucination detection for query: %s", query)
236
  start_time = time.time()
237
 
238
  # Report progress
239
  if self.progress_callback:
240
  self.progress_callback("starting", query=query)
@@ -250,9 +345,9 @@ class PAS2:
250
  self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
251
 
252
  # Get responses to all queries
253
- logger.info("Step 2: Getting responses to all %d queries", len(all_queries))
254
  if self.progress_callback:
255
- self.progress_callback("getting_responses", query=query, total=len(all_queries))
256
 
257
  all_responses = []
258
  for i, q in enumerate(all_queries):
@@ -267,9 +362,9 @@ class PAS2:
267
  self.progress_callback("responses_complete", query=query)
268
 
269
  # Judge the responses for hallucinations
270
- logger.info("Step 3: Judging for hallucinations")
271
  if self.progress_callback:
272
- self.progress_callback("judging", query=query)
273
 
274
  # The first query is the original, rest are paraphrases
275
  original_query = all_queries[0]
@@ -295,14 +390,17 @@ class PAS2:
295
  "confidence_score": judgment.confidence_score,
296
  "conflicting_facts": judgment.conflicting_facts,
297
  "reasoning": judgment.reasoning,
298
- "summary": judgment.summary
299
  }
300
 
301
  # Report completion
302
  if self.progress_callback:
303
- self.progress_callback("complete", query=query)
304
 
305
- logger.info("Hallucination detection completed in %.2f seconds", time.time() - start_time)
 
306
  return results
307
 
308
  def judge_hallucination(self,
@@ -311,11 +409,17 @@ class PAS2:
311
  paraphrased_queries: List[str],
312
  paraphrased_responses: List[str]) -> HallucinationJudgment:
313
  """
314
- Use OpenAI's o3-mini as a judge to detect hallucinations in the responses
315
  """
316
- logger.info("Judging hallucinations with OpenAI's %s model", self.openai_model)
317
  start_time = time.time()
318
 
319
  # Prepare the context for the judge
320
  context = f"""
321
  Original Question: {original_query}
@@ -344,18 +448,31 @@ Your response should be a JSON with the following fields:
344
  """
345
 
346
  try:
347
- logger.info("Sending judgment request to OpenAI API...")
348
- response = self.openai_client.chat.completions.create(
349
- model=self.openai_model,
350
- messages=[
351
- {"role": "system", "content": system_prompt},
352
- {"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
353
- ],
354
- response_format={"type": "json_object"}
355
- )
356
 
357
- result_json = json.loads(response.choices[0].message.content)
358
- logger.debug("Received judgment response: %s", result_json)
359
 
360
  # Create the HallucinationJudgment object from the JSON response
361
  judgment = HallucinationJudgment(
@@ -367,18 +484,18 @@ Your response should be a JSON with the following fields:
367
  )
368
 
369
  elapsed_time = time.time() - start_time
370
- logger.info("Judgment completed in %.2f seconds", elapsed_time)
371
 
372
  return judgment
373
 
374
  except Exception as e:
375
- logger.error("Error in hallucination judgment: %s", str(e), exc_info=True)
376
  # Return a fallback judgment
377
  return HallucinationJudgment(
378
  hallucination_detected=False,
379
  confidence_score=0.0,
380
  conflicting_facts=[],
381
- reasoning="Failed to obtain judgment from the model.",
382
  summary="Analysis failed due to API error."
383
  )
384
 
@@ -495,11 +612,21 @@ class HallucinationDetectorApp:
495
  "conflicting_facts": results.get('conflicting_facts', []),
496
  "reasoning": results.get('reasoning', ''),
497
  "summary": results.get('summary', ''),
498
  "user_feedback": feedback
499
  }
500
 
501
  # Insert document into collection
502
- self.feedback_collection.insert_one(document)
503
 
504
  logger.info("Feedback saved successfully to MongoDB")
505
  return "Feedback saved successfully!"
@@ -507,6 +634,266 @@ class HallucinationDetectorApp:
507
  logger.error("Error saving feedback: %s", str(e), exc_info=True)
508
  return f"Error saving feedback: {str(e)}"
509
 
510
  def get_feedback_stats(self):
511
  """Get statistics about collected feedback from MongoDB"""
512
  try:
@@ -541,6 +928,62 @@ class HallucinationDetectorApp:
541
  except Exception as e:
542
  logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
543
  return None
544
 
545
  def export_data_to_csv(self, filepath=None):
546
  """Export all feedback data to a CSV file for analysis"""
@@ -657,11 +1100,11 @@ class ProgressTracker:
657
  "starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
658
  "generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
659
  "paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
660
- "getting_responses": {"status": "Getting responses (0/0)...", "progress": 35, "color": "#2196F3"},
661
  "responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
662
  "responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
663
- "judging": {"status": "Analyzing responses for hallucinations...", "progress": 70, "color": "#2196F3"},
664
- "complete": {"status": "Analysis complete!", "progress": 100, "color": "#4CAF50"},
665
  "error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
666
  }
667
 
@@ -672,6 +1115,9 @@ class ProgressTracker:
672
  self.completed_responses = 0
673
  self.total_responses = 0
674
  self.error_message = ""
675
  self._lock = threading.Lock()
676
  self._status_callback = None
677
  self._stop_event = threading.Event()
@@ -698,6 +1144,12 @@ class ProgressTracker:
698
  self.total_responses = value
699
  elif key == 'error_message':
700
  self.error_message = value
701
 
702
  # Format status message
703
  if stage == 'responses_progress':
@@ -705,6 +1157,19 @@ class ProgressTracker:
705
  completed=self.completed_responses,
706
  total=self.total_responses
707
  )
708
  elif stage == 'error':
709
  self.stage_data['status'] = self.stage_data['status'].format(
710
  error_message=self.error_message
@@ -724,6 +1189,16 @@ class ProgressTracker:
724
  # Only show status text if not in idle state
725
  status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
726
 
727
  html = f"""
728
  <div class="progress-container">
729
  {query_info}
@@ -731,6 +1206,7 @@ class ProgressTracker:
731
  <div class="progress-bar-container">
732
  <div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
733
  </div>
 
734
  </div>
735
  """
736
  return html
@@ -1099,13 +1575,18 @@ def create_interface():
1099
  combined_progress_callback("starting", query=query)
1100
  time.sleep(0.3) # Ensure starting status is visible
1101
 
1102
  # Step 2: Generate paraphrases (15-30%)
1103
  combined_progress_callback("generating_paraphrases", query=query)
1104
  all_queries = detector.pas2.generate_paraphrases(query)
1105
  combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))
1106
 
1107
  # Step 3: Get responses (35-65%)
1108
- combined_progress_callback("getting_responses", query=query, total=len(all_queries))
1109
  all_responses = []
1110
  for i, q in enumerate(all_queries):
1111
  # Show incremental progress for each response
@@ -1115,7 +1596,7 @@ def create_interface():
1115
  combined_progress_callback("responses_complete", query=query)
1116
 
1117
  # Step 4: Judge hallucinations (70-100%)
1118
- combined_progress_callback("judging", query=query)
1119
 
1120
  # The first query is the original, rest are paraphrases
1121
  original_query = all_queries[0]
@@ -1141,11 +1622,13 @@ def create_interface():
1141
  "confidence_score": judgment.confidence_score,
1142
  "conflicting_facts": judgment.conflicting_facts,
1143
  "reasoning": judgment.reasoning,
1144
- "summary": judgment.summary
1145
  }
1146
 
1147
  # Show completion
1148
- combined_progress_callback("complete", query=query)
1149
  time.sleep(0.3) # Ensure complete status is visible
1150
 
1151
  return results
@@ -1201,10 +1684,25 @@ def create_interface():
1201
  reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
1202
  conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"
1203
 
1204
  html_output = f"""
1205
  <div class="container">
1206
  <h2 class="title">Hallucination Detection Results</h2>
1207
 
1208
  <div class="stats-section">
1209
  <div class="stat-item">
1210
  <div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
@@ -1234,7 +1732,7 @@ def create_interface():
1234
  {original_query}
1235
  </div>
1236
 
1237
- <div class="section-title">Original Response</div>
1238
  <div class="response-box">
1239
  {original_response_safe}
1240
  </div>
@@ -1249,14 +1747,14 @@ def create_interface():
1249
  {q}
1250
  </div>
1251
 
1252
- <div class="section-title">Response {i}</div>
1253
  <div class="response-box">
1254
  {r}
1255
  </div>
1256
  """
1257
 
1258
  html_output += f"""
1259
- <div class="section-title">Detailed Analysis</div>
1260
  <div class="info-box">
1261
  <p><strong>Reasoning:</strong></p>
1262
  <p>{reasoning_safe}</p>
@@ -1264,6 +1762,10 @@ def create_interface():
1264
  <p><strong>Conflicting Facts:</strong></p>
1265
  <p>{conflicting_facts_text_safe}</p>
1266
  </div>
1267
  </div>
1268
  """
1269
 
@@ -1289,8 +1791,11 @@ def create_interface():
1289
  ]
1290
 
1291
  # Helper function to submit feedback
1292
- def combine_feedback(fb_input, fb_text, results):
1293
- combined_feedback = f"{fb_input}: {fb_text}" if fb_text else fb_input
1294
  if not results:
1295
  return "No results to attach feedback to."
1296
 
@@ -1394,8 +1899,8 @@ def create_interface():
1394
  This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:
1395
 
1396
  1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
1397
- 2. **Multiple Responses**: All questions (original + paraphrases) are sent to Mistral Large model
1398
- 3. **Expert Judgment**: OpenAI's o3-mini analyzes all responses to detect factual inconsistencies
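
The three steps above map directly onto the `PAS2` methods changed in this commit. A condensed sketch of the flow, assuming `app.py` is importable as a module; the keyword names for the original query/response pair are inferred from the `judge_hallucination` hunk:

```python
from app import PAS2  # assumption: this file is importable as a module

pas2 = PAS2()  # API keys are read from the HF_*/plain environment variables
# generate_paraphrases returns the original query first, then the paraphrases
queries = pas2.generate_paraphrases("Who wrote Hamlet?", n_paraphrases=3)
responses = pas2.get_responses(queries)
judgment = pas2.judge_hallucination(
    original_query=queries[0],
    original_response=responses[0],
    paraphrased_queries=queries[1:],
    paraphrased_responses=responses[1:],
)
print(judgment.hallucination_detected, judgment.confidence_score)
```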
1399
 
1400
  ### Why This Approach?
1401
 
@@ -1469,10 +1974,16 @@ def create_interface():
1469
  gr.Markdown("### Help Improve the System")
1470
  gr.Markdown("Your feedback helps us refine the hallucination detection system.")
1471
 
1472
- feedback_input = gr.Radio(
1473
- label="Was the hallucination detection accurate?",
1474
- choices=["Yes, the detection was correct", "No, the detection was incorrect", "Other/Unsure"],
1475
- value="Yes, the detection was correct"
1476
  )
1477
 
1478
  feedback_text = gr.Textbox(
@@ -1489,286 +2000,280 @@ def create_interface():
1489
  gr.Markdown("## Hallucination Detection Scores")
1490
  gr.Markdown("Performance comparison of different Generator + Judge model combinations.")
1491
 
1492
  # Create leaderboard table for model combinations
1493
- model_leaderboard_html = gr.HTML("""
1494
- <div class="leaderboard-container">
1495
- <table class="leaderboard-table">
1496
- <thead>
1497
- <tr>
1498
- <th>Rank</th>
1499
- <th>Generator Model</th>
1500
- <th>Judge Model</th>
1501
- <th>ELO Score</th>
1502
- <th>Accuracy</th>
1503
- <th>Consistency</th>
1504
- </tr>
1505
- </thead>
1506
- <tbody>
1507
- <tr>
1508
- <td>1</td>
1509
- <td>gpt-4o</td>
1510
- <td>o4-mini</td>
1511
- <td>1878</td>
1512
- <td>94.2%</td>
1513
- <td>91.6%</td>
1514
- </tr>
1515
- <tr>
1516
- <td>2</td>
1517
- <td>gpt-4o</td>
1518
- <td>gemini-2.5-pro</td>
1519
- <td>1835</td>
1520
- <td>92.8%</td>
1521
- <td>89.2%</td>
1522
- </tr>
1523
- <tr>
1524
- <td>3</td>
1525
- <td>mistral-large</td>
1526
- <td>o4-mini</td>
1527
- <td>1795</td>
1528
- <td>91.5%</td>
1529
- <td>87.5%</td>
1530
- </tr>
1531
- <tr>
1532
- <td>4</td>
1533
- <td>Qwen3-235B-A22B</td>
1534
- <td>o4-mini</td>
1535
- <td>1768</td>
1536
- <td>90.3%</td>
1537
- <td>85.1%</td>
1538
- </tr>
1539
- <tr>
1540
- <td>5</td>
1541
- <td>grok-3</td>
1542
- <td>o4-mini</td>
1543
- <td>1742</td>
1544
- <td>88.7%</td>
1545
- <td>82.9%</td>
1546
- </tr>
1547
- <tr>
1548
- <td>6</td>
1549
- <td>mistral-large</td>
1550
- <td>gemini-2.5-pro</td>
1551
- <td>1716</td>
1552
- <td>88.1%</td>
1553
- <td>81.4%</td>
1554
- </tr>
1555
- <tr>
1556
- <td>7</td>
1557
- <td>deepseek-r1</td>
1558
- <td>o4-mini</td>
1559
- <td>1692</td>
1560
- <td>87.3%</td>
1561
- <td>80.3%</td>
1562
- </tr>
1563
- </tbody>
1564
- </table>
1565
- </div>
1566
 
1567
- <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1568
- <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
1569
-
1570
- <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1571
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1572
- <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
1573
- <p style="color: #eceff1;">Our ELO rating system assigns scores to model pairs based on benchmark performance, using the following formula:</p>
1574
- <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
1575
- <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
1576
- Where:<br>
1577
- <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model combination<br>
1578
- <strong style="color: #b2dfdb;">K</strong>: Weight factor (32 for new models, 16 for established ones)<br>
1579
- <strong style="color: #b2dfdb;">S</strong>: Actual score from benchmark tests<br>
1580
- <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
1581
- <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
1582
- </div>
1583
- </div>
1584
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1585
- <h4 style="margin-top: 0; color: #ffffff;">Model Combinations Tested</h4>
1586
- <p style="color: #eceff1;">We evaluated 10 different combinations across 250 benchmark questions.</p>
1587
- <div style="display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;">
1588
- <div style="flex: 1; min-width: 120px;">
1589
- <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Generator Models</h5>
1590
- <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1591
- <li>mistral-large</li>
1592
- <li>gpt-4o</li>
1593
- <li>Qwen3-235B-A22B</li>
1594
- <li>grok-3</li>
1595
- <li>deepseek-r1</li>
1596
- <li>o4-mini</li>
1597
- <li>gemini-2.5-pro</li>
1598
- </ul>
1599
- </div>
1600
- <div style="flex: 1; min-width: 120px;">
1601
- <h5 style="margin-top: 0; margin-bottom: 5px; color: #b2dfdb;">Judge Models</h5>
1602
- <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1603
- <li>mistral-large</li>
1604
- <li>gpt-4o</li>
1605
- <li>Qwen3-235B-A22B</li>
1606
- <li>grok-3</li>
1607
- <li>deepseek-r1</li>
1608
- <li>o4-mini</li>
1609
- <li>gemini-2.5-pro</li>
1610
- </ul>
1611
- </div>
1612
- </div>
1613
- </div>
1614
- </div>
1615
- </div>
1616
- <style>
1617
- .leaderboard-container {
1618
- margin: 15px 0;
1619
- overflow-x: auto;
1620
- }
1621
- .leaderboard-table {
1622
- width: 100%;
1623
- border-collapse: collapse;
1624
- font-size: 0.95em;
1625
- box-shadow: 0 2px 10px rgba(0,0,0,0.1);
1626
- border-radius: 8px;
1627
- overflow: hidden;
1628
- }
1629
- .leaderboard-table thead {
1630
- background-color: #1565c0;
1631
- color: white;
1632
- }
1633
- .leaderboard-table th, .leaderboard-table td {
1634
- padding: 12px 15px;
1635
- text-align: left;
1636
- border-bottom: 1px solid #ddd;
1637
- }
1638
- .leaderboard-table tbody tr {
1639
- transition: background-color 0.3s;
1640
- }
1641
- .leaderboard-table tbody tr:nth-child(even) {
1642
- background-color: #cfd8dc;
1643
- }
1644
- .leaderboard-table tbody tr:hover {
1645
- background-color: #b0bec5;
1646
- }
1647
- .leaderboard-table tbody tr:first-child {
1648
- background-color: #80cbc4;
1649
- color: #004d40;
1650
- }
1651
- .leaderboard-table tbody tr:nth-child(2) {
1652
- background-color: #81c784;
1653
- color: #1b5e20;
1654
- }
1655
- .leaderboard-table tbody tr:nth-child(4) {
1656
- background-color: #aed581;
1657
- color: #33691e;
1658
- }
1659
- .leaderboard-table tbody tr:nth-child(6) {
1660
- background-color: #d7ccc8;
1661
- color: #3e2723;
1662
- }
1663
- </style>
1664
- """)
1665
 
1666
- # Tab 3: Generator Models Hallucination Leaderboard
1667
- with gr.TabItem("User Feedback", elem_id="user-feedback-tab"):
1668
- gr.Markdown("## Model Hallucination Evaluation (User Feedback)")
1669
- gr.Markdown("Performance ranking of generator models based on user-reported hallucination rates.")
1670
 
1671
- # Create leaderboard table for user feedback
1672
- user_feedback_html = gr.HTML("""
1673
- <div class="leaderboard-container">
1674
- <table class="leaderboard-table">
1675
- <thead>
1676
- <tr>
1677
- <th>Rank</th>
1678
- <th>Generator Model</th>
1679
- <th>ELO Score</th>
1680
- <th>Accuracy</th>
1681
- <th>Sample Size</th>
1682
- </tr>
1683
- </thead>
1684
- <tbody>
1685
- <tr>
1686
- <td>1</td>
1687
- <td>gpt-4o</td>
1688
- <td>1856</td>
1689
- <td>96.4%</td>
1690
- <td>256</td>
1691
- </tr>
1692
- <tr>
1693
- <td>2</td>
1694
- <td>mistral-large</td>
1695
- <td>1802</td>
1696
- <td>93.8%</td>
1697
- <td>221</td>
1698
- </tr>
1699
- <tr>
1700
- <td>3</td>
1701
- <td>Qwen3-235B-A22B</td>
1702
- <td>1765</td>
1703
- <td>91.5%</td>
1704
- <td>192</td>
1705
- </tr>
1706
- <tr>
1707
- <td>4</td>
1708
- <td>o4-mini</td>
1709
- <td>1732</td>
1710
- <td>89.3%</td>
1711
- <td>178</td>
1712
- </tr>
1713
- <tr>
1714
- <td>5</td>
1715
- <td>gemini-2.5-pro</td>
1716
- <td>1695</td>
1717
- <td>87.2%</td>
1718
- <td>165</td>
1719
- </tr>
1720
- <tr>
1721
- <td>6</td>
1722
- <td>grok-3</td>
1723
- <td>1665</td>
1724
- <td>85.7%</td>
1725
- <td>147</td>
1726
- </tr>
1727
- <tr>
1728
- <td>7</td>
1729
- <td>deepseek-r1</td>
1730
- <td>1625</td>
1731
- <td>83.2%</td>
1732
- <td>134</td>
1733
- </tr>
1734
- </tbody>
1735
- </table>
1736
- </div>
1737
 
1738
- <div style="margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
1739
- <h3 style="margin-top: 0; color: #ffffff;">ELO Rating System Explanation</h3>
1740
-
1741
- <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;">
1742
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1743
- <h4 style="margin-top: 0; color: #ffffff;">How ELO Scores Are Calculated</h4>
1744
- <p style="color: #eceff1;">Our ELO rating system assigns scores to models based on user feedback, using the following formula:</p>
1745
- <div style="background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;">
1746
- <code style="color: #80deea;">ELO_new = ELO_old + K × (S - E)</code><br><br>
1747
- Where:<br>
1748
- • <strong style="color: #b2dfdb;">ELO_old</strong>: Previous rating of the model<br>
1749
- <strong style="color: #b2dfdb;">K</strong>: Weight factor (40 for new models, 20 for established ones)<br>
1750
- <strong style="color: #b2dfdb;">S</strong>: Actual score (1 for correct hallucination detection, 0 for incorrect)<br>
1751
- <strong style="color: #b2dfdb;">E</strong>: Expected score based on current rating<br><br>
1752
- <em style="color: #80deea;">E = 1 / (1 + 10<sup>(ELO_opponent - ELO_model)/400</sup>)</em>
1753
- </div>
1754
- <p style="color: #eceff1; margin-top: 10px;">All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p>
1755
- </div>
1756
- <div style="flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);">
1757
- <h4 style="margin-top: 0; color: #ffffff;">Interpretation Guidelines</h4>
1758
- <ul style="margin-bottom: 0; padding-left: 20px; color: #eceff1;">
1759
- <li><strong style="color: #b2dfdb;">1800+</strong>: Exceptional performance, very rare hallucinations</li>
1760
- <li><strong style="color: #b2dfdb;">1700-1799</strong>: Superior performance, minimal hallucinations</li>
1761
- <li><strong style="color: #b2dfdb;">1600-1699</strong>: Good performance, occasional hallucinations</li>
1762
- <li><strong style="color: #b2dfdb;">1500-1599</strong>: Average performance</li>
1763
- <li><strong style="color: #b2dfdb;">&lt;1500</strong>: Below average, frequent hallucinations</li>
1764
- </ul>
1765
- <p style="font-style: italic; color: #b3e5fc; margin-top: 10px;">
1766
- Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.
1767
- </p>
1768
- </div>
1769
- </div>
1770
- </div>
1771
- """)
1772
 
1773
  # Function to continuously update stats
1774
  def update_stats():
@@ -1811,30 +2316,30 @@ def create_interface():
1811
  live_stats = gr.HTML(update_stats())
1812
 
1813
  # Add loading animation style
1814
- gr.HTML("""
1815
- <style>
1816
- @keyframes pulse {
1817
- 0% { opacity: 0.6; }
1818
- 50% { opacity: 1; }
1819
- 100% { opacity: 0.6; }
1820
- }
1821
- .refreshing::after {
1822
- content: "";
1823
- display: inline-block;
1824
- margin-left: 8px;
1825
- animation: pulse 1.5s infinite ease-in-out;
1826
- color: #2e7d32;
1827
- }
1828
- #stats-container {
1829
- border: 1px solid #b3e5fc;
1830
- border-radius: 10px;
1831
- padding: 15px;
1832
- margin: 10px 0;
1833
- background-color: #0277bd;
1834
- }
1835
- </style>
1836
- <div class="refreshing" style="text-align: right; font-size: 0.8em; color: #eceff1;">Auto-refreshing</div>
1837
- """)
1838
 
1839
  # Create a refresh button that will be auto-clicked
1840
  refresh_btn = gr.Button("Refresh Stats", visible=False)
@@ -2018,19 +2523,13 @@ def create_interface():
2018
 
2019
  feedback_button.click(
2020
  fn=combine_feedback,
2021
- inputs=[feedback_input, feedback_text, hidden_results],
2022
  outputs=[feedback_status]
2023
  )
2024
 
2025
  # Footer
2026
  gr.HTML(
2027
- """
2028
- <footer>
2029
- <p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p>
2030
- <p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p>
2031
- <p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p>
2032
- </footer>
2033
- """
2034
  )
2035
 
2036
  return interface
@@ -2096,4 +2595,4 @@ if __name__ == "__main__":
2096
 
2097
  # Uncomment this line to run the test function instead of the main interface
2098
  # if __name__ == "__main__":
2099
- # test_progress()
 
43
  class PAS2:
44
  """Paraphrase-based Approach for LLM Systems - Using llm-as-judge methods"""
45
 
46
+ def __init__(self, mistral_api_key=None, openai_api_key=None, xai_api_key=None, qwen_api_key=None, deepseek_api_key=None, gemini_api_key=None, progress_callback=None):
47
  """Initialize the PAS2 with API keys"""
48
  # For Hugging Face Spaces, we prioritize getting API keys from HF_* environment variables
49
  # which are set from the Secrets tab in the Space settings
50
  self.mistral_api_key = mistral_api_key or os.environ.get("HF_MISTRAL_API_KEY") or os.environ.get("MISTRAL_API_KEY")
51
  self.openai_api_key = openai_api_key or os.environ.get("HF_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
52
+ self.xai_api_key = xai_api_key or os.environ.get("HF_XAI_API_KEY") or os.environ.get("XAI_API_KEY")
53
+ self.qwen_api_key = qwen_api_key or os.environ.get("HF_QWEN_API_KEY") or os.environ.get("QWEN_API_KEY")
54
+ self.deepseek_api_key = deepseek_api_key or os.environ.get("HF_DEEPSEEK_API_KEY") or os.environ.get("DEEPSEEK_API_KEY")
55
+ self.gemini_api_key = gemini_api_key or os.environ.get("HF_GEMINI_API_KEY") or os.environ.get("GEMINI_API_KEY")
56
+
57
  self.progress_callback = progress_callback
58
 
59
  if not self.mistral_api_key:
 
64
 
65
  self.mistral_client = Mistral(api_key=self.mistral_api_key)
66
  self.openai_client = OpenAI(api_key=self.openai_api_key)
67
+ self.xai_client = OpenAI(api_key=self.xai_api_key, base_url="https://api.x.ai/v1")
68
+ self.qwen_client = OpenAI(api_key=self.qwen_api_key, base_url="https://router.huggingface.co/nebius/v1")
69
+ self.deepseek_client = OpenAI(api_key=self.deepseek_api_key, base_url="https://api.deepseek.com")
70
+ self.gemini_client = OpenAI(api_key=self.gemini_api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
71
+
72
+ # Define model names
73
  self.mistral_model = "mistral-large-latest"
74
+ self.openai_o4mini = "o4-mini"
75
+ self.openai_4o = "gpt-4o"
76
+ self.deepseek_model = "deepseek-reasoner"
77
+ self.grok_model = "grok-3-beta"
78
+ self.qwen_model = "Qwen/Qwen3-235B-A22B"
79
+ self.gemini_model = "gemini-2.5-pro-preview-05-06"
80
 
81
+ # Create a dictionary mapping model names to their clients and model identifiers
82
+ self.model_configs = {
83
+ "mistral-large": {
84
+ "client": self.mistral_client,
85
+ "model_id": self.mistral_model,
86
+ "type": "mistral"
87
+ },
88
+ "o4-mini": {
89
+ "client": self.openai_client,
90
+ "model_id": self.openai_o4mini,
91
+ "type": "openai"
92
+ },
93
+ "gpt-4o": {
94
+ "client": self.openai_client,
95
+ "model_id": self.openai_4o,
96
+ "type": "openai"
97
+ },
98
+ "deepseek-reasoner": {
99
+ "client": self.deepseek_client,
100
+ "model_id": self.deepseek_model,
101
+ "type": "openai"
102
+ },
103
+ "grok-3": {
104
+ "client": self.xai_client,
105
+ "model_id": self.grok_model,
106
+ "type": "openai"
107
+ },
108
+ "qwen-235b": {
109
+ "client": self.qwen_client,
110
+ "model_id": self.qwen_model,
111
+ "type": "openai"
112
+ },
113
+ "gemini-2.5-pro": {
114
+ "client": self.gemini_client,
115
+ "model_id": self.gemini_model,
116
+ "type": "openai"
117
+ }
118
+ }
119
+
120
+ # Set default models (will be randomized later)
121
+ self.generator_model = "mistral-large"
122
+ self.judge_model = "o4-mini"
123
+
124
+ logger.info("PAS2 initialized with available models: %s", ", ".join(self.model_configs.keys()))
125
 
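
Most of the new providers are wired through the OpenAI SDK with a provider-specific `base_url`, so `model_configs` can treat them uniformly. A minimal sketch of that pattern (the `make_client` helper is hypothetical; the secret names and base URLs are the ones in the hunk above):

```python
import os
from typing import Optional
from openai import OpenAI

def make_client(env_key: str, base_url: Optional[str] = None) -> OpenAI:
    # Mirror the HF-Spaces convention above: prefer HF_-prefixed secrets
    api_key = os.environ.get(f"HF_{env_key}") or os.environ.get(env_key)
    return OpenAI(api_key=api_key, base_url=base_url)

clients = {
    "grok-3": make_client("XAI_API_KEY", "https://api.x.ai/v1"),
    "deepseek-reasoner": make_client("DEEPSEEK_API_KEY", "https://api.deepseek.com"),
    "gemini-2.5-pro": make_client(
        "GEMINI_API_KEY",
        "https://generativelanguage.googleapis.com/v1beta/openai/"),
}
```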
126
  def generate_paraphrases(self, query: str, n_paraphrases: int = 3) -> List[str]:
127
  """Generate paraphrases of the input query using Mistral API"""
 
198
 
199
  return fallback_paraphrases
200
 
201
+ def set_random_model_pair(self):
202
+ """Randomly select a pair of generator and judge models"""
203
+ import random
204
+
205
+ # Get list of available models
206
+ available_models = list(self.model_configs.keys())
207
+
208
+ # Randomly select generator and judge models
209
+ self.generator_model = random.choice(available_models)
210
+
211
+ # Make sure judge is different from generator
212
+ judge_options = [m for m in available_models if m != self.generator_model]
213
+ self.judge_model = random.choice(judge_options)
214
+
215
+ logger.info("Randomly selected model pair - Generator: %s, Judge: %s",
216
+ self.generator_model, self.judge_model)
217
+
218
+ return self.generator_model, self.judge_model
219
+
220
  def _get_single_response(self, query: str, index: int = None) -> str:
221
+ """Get a single response from the selected generator model for a query"""
222
  try:
223
  query_description = f"Query {index}: {query}" if index is not None else f"Query: {query}"
224
+ logger.info("Getting response for %s using %s", query_description, self.generator_model)
225
  start_time = time.time()
226
 
227
+ # Get the model configuration
228
+ model_config = self.model_configs[self.generator_model]
229
+ client = model_config["client"]
230
+ model_id = model_config["model_id"]
231
+ model_type = model_config["type"]
232
+
233
  messages = [
234
  {
235
  "role": "system",
 
241
  }
242
  ]
243
 
244
+ # Use the appropriate client and model based on the type
245
+ if model_type == "mistral":
246
+ response = client.chat.complete(
247
+ model=model_id,
248
+ messages=messages
249
+ )
250
+ result = response.choices[0].message.content
251
+ else: # openai-compatible API
252
+ response = client.chat.completions.create(
253
+ model=model_id,
254
+ messages=messages
255
+ )
256
+ result = response.choices[0].message.content
257
 
 
258
  elapsed_time = time.time() - start_time
259
 
260
+ logger.info("Received response from %s for %s (%.2f seconds)",
261
+ self.generator_model, query_description, elapsed_time)
262
  logger.debug("Response content for %s: %s", query_description, result[:100] + "..." if len(result) > 100 else result)
263
 
264
  return result
265
 
266
  except Exception as e:
267
+ error_msg = f"Error getting response for query '{query}' with model {self.generator_model}: {e}"
268
  logger.error(error_msg, exc_info=True)
269
+ return f"Error: Failed to get response for this query with model {self.generator_model}."
270
 
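
The one provider that does not speak the OpenAI chat interface is Mistral, whose SDK uses `chat.complete` rather than `chat.completions.create`; the `type` field in `model_configs` exists only to pick between those two call shapes. A condensed view of the branch in `_get_single_response` above:

```python
def call_model(config: dict, messages: list) -> str:
    """Condensed view of the dispatch in _get_single_response above."""
    if config["type"] == "mistral":
        response = config["client"].chat.complete(
            model=config["model_id"], messages=messages)
    else:  # any OpenAI-compatible endpoint (OpenAI, xAI, DeepSeek, Gemini, ...)
        response = config["client"].chat.completions.create(
            model=config["model_id"], messages=messages)
    return response.choices[0].message.content
```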
271
  def get_responses(self, queries: List[str]) -> List[str]:
272
  """Get responses from Mistral API for each query in parallel"""
 
326
  logger.info("Starting hallucination detection for query: %s", query)
327
  start_time = time.time()
328
 
329
+ # Randomly select a model pair for this detection
330
+ generator_model, judge_model = self.set_random_model_pair()
331
+ logger.info("Using %s as generator and %s as judge for this detection", generator_model, judge_model)
332
+
333
  # Report progress
334
  if self.progress_callback:
335
  self.progress_callback("starting", query=query)
 
345
  self.progress_callback("paraphrases_complete", query=query, count=len(all_queries))
346
 
347
  # Get responses to all queries
348
+ logger.info("Step 2: Getting responses to all %d queries using %s", len(all_queries), generator_model)
349
  if self.progress_callback:
350
+ self.progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
351
 
352
  all_responses = []
353
  for i, q in enumerate(all_queries):
 
362
  self.progress_callback("responses_complete", query=query)
363
 
364
  # Judge the responses for hallucinations
365
+ logger.info("Step 3: Judging for hallucinations using %s", judge_model)
366
  if self.progress_callback:
367
+ self.progress_callback("judging", query=query, model=judge_model)
368
 
369
  # The first query is the original, rest are paraphrases
370
  original_query = all_queries[0]
 
390
  "confidence_score": judgment.confidence_score,
391
  "conflicting_facts": judgment.conflicting_facts,
392
  "reasoning": judgment.reasoning,
393
+ "summary": judgment.summary,
394
+ "generator_model": generator_model,
395
+ "judge_model": judge_model
396
  }
397
 
398
  # Report completion
399
  if self.progress_callback:
400
+ self.progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
401
 
402
+ logger.info("Hallucination detection completed in %.2f seconds using %s (generator) and %s (judge)",
403
+ time.time() - start_time, generator_model, judge_model)
404
  return results
405
 
406
  def judge_hallucination(self,
 
409
  paraphrased_queries: List[str],
410
  paraphrased_responses: List[str]) -> HallucinationJudgment:
411
  """
412
+ Use the selected judge model to detect hallucinations in the responses
413
  """
414
+ logger.info("Judging hallucinations with %s model", self.judge_model)
415
  start_time = time.time()
416
 
417
+ # Get the model configuration for the judge
418
+ model_config = self.model_configs[self.judge_model]
419
+ client = model_config["client"]
420
+ model_id = model_config["model_id"]
421
+ model_type = model_config["type"]
422
+
423
  # Prepare the context for the judge
424
  context = f"""
425
  Original Question: {original_query}
 
448
  """
449
 
450
  try:
451
+ logger.info("Sending judgment request to %s...", self.judge_model)
452
+
453
+ # Use the appropriate client and model based on the type
454
+ if model_type == "mistral":
455
+ response = client.chat.complete(
456
+ model=model_id,
457
+ messages=[
458
+ {"role": "system", "content": system_prompt},
459
+ {"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
460
+ ],
461
+ response_format={"type": "json_object"}
462
+ )
463
+ result_json = json.loads(response.choices[0].message.content)
464
+ else: # openai-compatible API
465
+ response = client.chat.completions.create(
466
+ model=model_id,
467
+ messages=[
468
+ {"role": "system", "content": system_prompt},
469
+ {"role": "user", "content": f"Evaluate these responses for hallucinations:\n\n{context}"}
470
+ ],
471
+ response_format={"type": "json_object"}
472
+ )
473
+ result_json = json.loads(response.choices[0].message.content)
474
 
475
+ logger.debug("Received judgment response from %s: %s", self.judge_model, result_json)
 
476
 
477
  # Create the HallucinationJudgment object from the JSON response
478
  judgment = HallucinationJudgment(
 
484
  )
485
 
486
  elapsed_time = time.time() - start_time
487
+ logger.info("Judgment completed by %s in %.2f seconds", self.judge_model, elapsed_time)
488
 
489
  return judgment
490
 
491
  except Exception as e:
492
+ logger.error("Error in hallucination judgment with %s: %s", self.judge_model, str(e), exc_info=True)
493
  # Return a fallback judgment
494
  return HallucinationJudgment(
495
  hallucination_detected=False,
496
  confidence_score=0.0,
497
  conflicting_facts=[],
498
+ reasoning=f"Failed to obtain judgment from the {self.judge_model} model: {str(e)}",
499
  summary="Analysis failed due to API error."
500
  )
501
 
 
612
  "conflicting_facts": results.get('conflicting_facts', []),
613
  "reasoning": results.get('reasoning', ''),
614
  "summary": results.get('summary', ''),
615
+ "generator_model": results.get('generator_model', 'unknown'),
616
+ "judge_model": results.get('judge_model', 'unknown'),
617
  "user_feedback": feedback
618
  }
619
 
620
  # Insert document into collection
621
+ result = self.feedback_collection.insert_one(document)
622
+
623
+ # Update model leaderboard scores
624
+ self._update_model_scores(
625
+ generator=results.get('generator_model', 'unknown'),
626
+ judge=results.get('judge_model', 'unknown'),
627
+ feedback=feedback,
628
+ hallucination_detected=results.get('hallucination_detected', False)
629
+ )
630
 
631
  logger.info("Feedback saved successfully to MongoDB")
632
  return "Feedback saved successfully!"
 
634
  logger.error("Error saving feedback: %s", str(e), exc_info=True)
635
  return f"Error saving feedback: {str(e)}"
636
 
637
+ def _update_model_scores(self, generator, judge, feedback, hallucination_detected):
638
+ """Update the ELO scores for the generator and judge models based on feedback"""
639
+ try:
640
+ if self.db is None:
641
+ logger.error("MongoDB connection not available. Cannot update model scores.")
642
+ return
643
+
644
+ # Access or create the models collection
645
+ models_collection = self.db.get_collection("model_scores")
646
+
647
+ # Create indexes if they don't exist
648
+ models_collection.create_index("model_name", unique=True)
649
+
650
+ # Parse the feedback to determine scenario
651
+ actual_hallucination = "Yes, there was a hallucination" in feedback
652
+ no_hallucination = "No, there was no hallucination" in feedback
653
+ judge_correct = "Yes, the judge was correct" in feedback
654
+ judge_incorrect = "No, the judge was incorrect" in feedback
655
+
656
+ # Determine scores based on different scenarios:
657
+ # 1. Actual hallucination + Judge correct = positive for judge, negative for generator
658
+ # 2. No hallucination + Judge correct = positive for both
659
+ # 3. No hallucination + Judge incorrect = negative for judge, positive for generator
660
+ # 4. Actual hallucination + Judge incorrect = negative for both
661
+
662
+ if judge_correct:
663
+ if actual_hallucination:
664
+ # Scenario 1: Judge correctly detected hallucination
665
+ judge_score = 1 # Positive for judge
666
+ generator_score = 0 # Negative for generator (hallucinated)
667
+ logger.info("Judge %s correctly detected hallucination from generator %s", judge, generator)
668
+ elif no_hallucination:
669
+ # Scenario 2: Judge correctly determined no hallucination
670
+ judge_score = 1 # Positive for judge
671
+ generator_score = 1 # Positive for generator (didn't hallucinate)
672
+ logger.info("Judge %s correctly determined no hallucination from generator %s", judge, generator)
673
+ else:
674
+ # User unsure about hallucination, but confirmed judge was correct
675
+ judge_score = 1 # Positive for judge
676
+ generator_score = 0.5 # Neutral for generator (unclear)
677
+ logger.info("User confirmed judge %s was correct, but unclear about hallucination from %s", judge, generator)
678
+ elif judge_incorrect:
679
+ if no_hallucination:
680
+ # Scenario 3: Judge incorrectly claimed hallucination (false positive)
681
+ judge_score = 0 # Negative for judge
682
+ generator_score = 1 # Positive for generator (unfairly accused)
683
+ logger.info("Judge %s incorrectly claimed hallucination from generator %s", judge, generator)
684
+ elif actual_hallucination:
685
+ # Scenario 4: Judge missed actual hallucination (false negative)
686
+ judge_score = 0 # Negative for judge
687
+ generator_score = 0 # Negative for generator (hallucination went undetected)
688
+ logger.info("Judge %s missed actual hallucination from generator %s", judge, generator)
689
+ else:
690
+ # User unsure about hallucination, but confirmed judge was incorrect
691
+ judge_score = 0 # Negative for judge
692
+ generator_score = 0.5 # Neutral for generator (unclear)
693
+ logger.info("User confirmed judge %s was incorrect, but unclear about hallucination from %s", judge, generator)
694
+ else:
695
+ # User unsure about judge correctness, don't update scores
696
+ judge_score = 0.5 # Neutral for judge (unclear)
697
+ generator_score = 0.5 # Neutral for generator (unclear)
698
+ logger.info("User unsure about judge %s correctness and generator %s hallucination", judge, generator)
699
+
700
+ # Update generator model stats with specific score
701
+ self._update_model_stats(models_collection, generator, generator_score, "generator")
702
+
703
+ # Update judge model stats with specific score
704
+ self._update_model_stats(models_collection, judge, judge_score, "judge")
705
+
706
+ # Determine if the detection was correct based on judge correctness
707
+ detection_correct = judge_correct
708
+
709
+ # Determine if there was actually hallucination based on user feedback
710
+ actual_hallucination_present = actual_hallucination
711
+
712
+ # Update model pair stats
713
+ self._update_model_pair_stats(generator, judge, detection_correct, actual_hallucination_present,
714
+ generator_score, judge_score)
715
+
716
+ logger.info("Updated model scores based on feedback: generator(%s)=%s, judge(%s)=%s",
717
+ generator, generator_score, judge, judge_score)
718
+
719
+ except Exception as e:
720
+ logger.error("Error updating model scores: %s", str(e), exc_info=True)
721
+
722
+ def _update_model_stats(self, collection, model_name, score, role):
723
+ """Update statistics for a single model"""
724
+ # Simplified ELO calculation
725
+ K_FACTOR = 32 # Standard K-factor for ELO
726
+
727
+ # Get current model data or create if not exists
728
+ model_data = collection.find_one({"model_name": model_name})
729
+
730
+ if model_data is None:
731
+ # Initialize new model with default values
732
+ model_data = {
733
+ "model_name": model_name,
734
+ "elo_score": 1500, # Starting ELO
735
+ "total_samples": 0,
736
+ "correct_predictions": 0,
737
+ "accuracy": 0.0,
738
+ "as_generator": 0,
739
+ "as_judge": 0,
740
+ "as_generator_correct": 0,
741
+ "as_judge_correct": 0,
742
+ "neutral_samples": 0 # Add a counter for neutral samples
743
+ }
744
+
745
+ # Skip counting for neutral feedback (0.5)
746
+ if score == 0.5:
747
+ # Increment neutral samples counter instead
748
+ if "neutral_samples" not in model_data:
749
+ model_data["neutral_samples"] = 0
750
+ model_data["neutral_samples"] += 1
751
+
752
+ # Expected score based on current rating (vs average rating)
753
+ expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))
754
+
755
+ # For neutral score, use a much smaller K factor to slightly adjust the ELO
756
+ # This handles the "unsure" case with minimal impact
757
+ model_data["elo_score"] = model_data["elo_score"] + (K_FACTOR/4) * (0.5 - expected_score)
758
+
759
+ # Update or insert the model data
760
+ collection.replace_one(
761
+ {"model_name": model_name},
762
+ model_data,
763
+ upsert=True
764
+ )
765
+ return
766
+
767
+ # Update sample counts for non-neutral cases
768
+ model_data["total_samples"] += 1
769
+ if role == "generator":
770
+ model_data["as_generator"] += 1
771
+ if score == 1: # Only count as correct if score is 1 (not 0)
772
+ model_data["as_generator_correct"] += 1
773
+ else: # role == "judge"
774
+ model_data["as_judge"] += 1
775
+ if score == 1: # Only count as correct if score is 1 (not 0)
776
+ model_data["as_judge_correct"] += 1
777
+
778
+ # Update correct predictions based on score
779
+ if score == 1:
780
+ model_data["correct_predictions"] += 1
781
+
782
+ # Calculate new accuracy
783
+ model_data["accuracy"] = model_data["correct_predictions"] / model_data["total_samples"]
784
+
785
+ # Update ELO score based on the specific score value (0 or 1)
786
+ # Expected score based on current rating (vs average rating)
787
+ expected_score = 1 / (1 + 10**((1500 - model_data["elo_score"]) / 400))
788
+
789
+ # Use the provided score (0 or 1)
790
+ actual_score = score
791
+
792
+ # New ELO calculation
793
+ model_data["elo_score"] = model_data["elo_score"] + K_FACTOR * (actual_score - expected_score)
794
+
795
+ # Update or insert the model data
796
+ collection.replace_one(
797
+ {"model_name": model_name},
798
+ model_data,
799
+ upsert=True
800
+ )
801
+
802
+ def _update_model_pair_stats(self, generator, judge, detection_correct, hallucination_detected,
803
+ generator_score, judge_score):
804
+ """Update statistics for a model pair combination"""
805
+ try:
806
+ # Access or create the model pairs collection
807
+ pairs_collection = self.db.get_collection("model_pairs")
808
+
809
+ # Create compound index if it doesn't exist
810
+ pairs_collection.create_index([("generator", 1), ("judge", 1)], unique=True)
811
+
812
+ # Get current pair data or create if not exists
813
+ pair_data = pairs_collection.find_one({
814
+ "generator": generator,
815
+ "judge": judge
816
+ })
817
+
818
+ if pair_data is None:
819
+ # Initialize new pair with default values
820
+ pair_data = {
821
+ "generator": generator,
822
+ "judge": judge,
823
+ "elo_score": 1500, # Starting ELO
824
+ "total_samples": 0,
825
+ "correct_predictions": 0,
826
+ "accuracy": 0.0,
827
+ "hallucinations_detected": 0,
828
+ "generator_performance": 0.0,
829
+ "judge_performance": 0.0,
830
+ "consistency_score": 0.0
831
+ }
832
+
833
+ # Update sample counts
834
+ pair_data["total_samples"] += 1
835
+ if detection_correct:
836
+ pair_data["correct_predictions"] += 1
837
+
838
+ if hallucination_detected:
839
+ pair_data["hallucinations_detected"] += 1
840
+
841
+ # Track model-specific performances within the pair
842
+ if "generator_correct_count" not in pair_data:
843
+ pair_data["generator_correct_count"] = 0
844
+ if "judge_correct_count" not in pair_data:
845
+ pair_data["judge_correct_count"] = 0
846
+
847
+ # Update individual performance counters based on scores
848
+ if generator_score == 1:
849
+ pair_data["generator_correct_count"] += 1
850
+ if judge_score == 1:
851
+ pair_data["judge_correct_count"] += 1
852
+
853
+ # Calculate individual performance rates within the pair
854
+ pair_data["generator_performance"] = pair_data["generator_correct_count"] / pair_data["total_samples"]
855
+ pair_data["judge_performance"] = pair_data["judge_correct_count"] / pair_data["total_samples"]
856
+
857
+ # Calculate new accuracy for the pair (detection accuracy)
858
+ pair_data["accuracy"] = pair_data["correct_predictions"] / pair_data["total_samples"]
859
+
860
+ # Calculate consistency score - weighted average of individual performances
861
+ # Gives more weight to the judge when hallucinations are detected
862
+ if hallucination_detected:
863
+ # When hallucination is detected, judge's role is more critical
864
+ pair_data["consistency_score"] = (0.4 * pair_data["generator_performance"] +
865
+ 0.6 * pair_data["judge_performance"])
866
+ else:
867
+ # When no hallucination is detected, both roles are equally important
868
+ pair_data["consistency_score"] = (0.5 * pair_data["generator_performance"] +
869
+ 0.5 * pair_data["judge_performance"])
870
+
871
+ # Update ELO score (simplified version)
872
+ K_FACTOR = 24 # Slightly lower K-factor for pairs
873
+
874
+ # Expected score based on current rating
875
+ expected_score = 1 / (1 + 10**((1500 - pair_data["elo_score"]) / 400))
876
+
877
+ # Actual score - use the average of both model scores (0-1 range)
878
+ # This represents the pair's overall performance
879
+ actual_score = (generator_score + judge_score) / 2
880
+
881
+ # New ELO calculation
882
+ pair_data["elo_score"] = pair_data["elo_score"] + K_FACTOR * (actual_score - expected_score)
883
+
884
+ # Update or insert the pair data
885
+ pairs_collection.replace_one(
886
+ {"generator": generator, "judge": judge},
887
+ pair_data,
888
+ upsert=True
889
+ )
890
+
891
+ logger.info("Updated model pair stats for %s (generator) and %s (judge)", generator, judge)
892
+
893
+ except Exception as e:
894
+ logger.error("Error updating model pair stats: %s", str(e), exc_info=True)
895
+ return None
896
+
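
The update in `_update_model_stats` is a standard Elo step, except that the opponent is always the fixed 1500 baseline rather than another rated model. A worked example of the numbers it produces:

```python
def elo_update(elo: float, score: float, k: float = 32) -> float:
    # Expected score against the fixed 1500 baseline, as in the code above
    expected = 1 / (1 + 10 ** ((1500 - elo) / 400))
    return elo + k * (score - expected)

elo = 1500.0
elo = elo_update(elo, 1.0)       # correct:   expected 0.5    -> 1516.0
elo = elo_update(elo, 0.0)       # incorrect: expected ~0.523 -> ~1499.3
elo = elo_update(elo, 0.5, k=8)  # neutral feedback uses K_FACTOR/4, barely moving the score
```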
897
  def get_feedback_stats(self):
898
  """Get statistics about collected feedback from MongoDB"""
899
  try:
 
928
  except Exception as e:
929
  logger.error("Error getting feedback stats: %s", str(e), exc_info=True)
930
  return None
931
+
932
+ def get_model_leaderboard(self):
933
+ """Get the current model leaderboard data"""
934
+ try:
935
+ if self.db is None:
936
+ logger.error("MongoDB connection not available. Cannot get model leaderboard.")
937
+ return None
938
+
939
+ # Access models collection
940
+ models_collection = self.db.get_collection("model_scores")
941
+
942
+ # Get all models and sort by ELO score
943
+ models = list(models_collection.find().sort("elo_score", pymongo.DESCENDING))
944
+
945
+ # Format percentages and convert ObjectId
946
+ for model in models:
947
+ model["_id"] = str(model["_id"])
948
+ model["accuracy"] = round(model["accuracy"] * 100, 1)
949
+ if "as_generator" in model and model["as_generator"] > 0:
950
+ model["generator_accuracy"] = round((model["as_generator_correct"] / model["as_generator"]) * 100, 1)
951
+ else:
952
+ model["generator_accuracy"] = 0.0
953
+
954
+ if "as_judge" in model and model["as_judge"] > 0:
955
+ model["judge_accuracy"] = round((model["as_judge_correct"] / model["as_judge"]) * 100, 1)
956
+ else:
957
+ model["judge_accuracy"] = 0.0
958
+
959
+ return models
960
+ except Exception as e:
961
+ logger.error("Error getting model leaderboard: %s", str(e), exc_info=True)
962
+ return []
963
+
964
+ def get_pair_leaderboard(self):
965
+ """Get the current model pair leaderboard data"""
966
+ try:
967
+ if self.db is None:
968
+ logger.error("MongoDB connection not available. Cannot get pair leaderboard.")
969
+ return None
970
+
971
+ # Access model pairs collection
972
+ pairs_collection = self.db.get_collection("model_pairs")
973
+
974
+ # Get all pairs and sort by ELO score
975
+ pairs = list(pairs_collection.find().sort("elo_score", pymongo.DESCENDING))
976
+
977
+ # Format percentages and convert ObjectId
978
+ for pair in pairs:
979
+ pair["_id"] = str(pair["_id"])
980
+ pair["accuracy"] = round(pair["accuracy"] * 100, 1)
981
+ pair["consistency_score"] = round(pair["consistency_score"] * 100, 1)
982
+
983
+ return pairs
984
+ except Exception as e:
985
+ logger.error("Error getting pair leaderboard: %s", str(e), exc_info=True)
986
+ return []
987
 
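
For reference, a hypothetical way to dump the new leaderboard from a Python shell; the `HallucinationDetectorApp` constructor is not shown in this diff, so treat the setup line as an assumption. The field names match `get_model_leaderboard` above:

```python
app = HallucinationDetectorApp()  # assumed: constructor wires up MongoDB as elsewhere in app.py
for m in app.get_model_leaderboard() or []:
    print(f"{m['model_name']:<20} elo={m['elo_score']:7.1f} "
          f"acc={m['accuracy']}% (gen {m['generator_accuracy']}% / judge {m['judge_accuracy']}%)")
```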
988
  def export_data_to_csv(self, filepath=None):
989
  """Export all feedback data to a CSV file for analysis"""
 
1100
  "starting": {"status": "Starting process...", "progress": 5, "color": "#2196F3"},
1101
  "generating_paraphrases": {"status": "Generating paraphrases...", "progress": 15, "color": "#2196F3"},
1102
  "paraphrases_complete": {"status": "Paraphrases generated", "progress": 30, "color": "#2196F3"},
1103
+ "getting_responses": {"status": "Getting responses using {model}...", "progress": 35, "color": "#2196F3"},
1104
  "responses_progress": {"status": "Getting responses ({completed}/{total})...", "progress": 40, "color": "#2196F3"},
1105
  "responses_complete": {"status": "All responses received", "progress": 65, "color": "#2196F3"},
1106
+ "judging": {"status": "Analyzing responses for hallucinations using {model}...", "progress": 70, "color": "#2196F3"},
1107
+ "complete": {"status": "Analysis complete! Using {generator} (generator) and {judge} (judge)", "progress": 100, "color": "#4CAF50"},
1108
  "error": {"status": "Error: {error_message}", "progress": 100, "color": "#F44336"}
1109
  }
1110
 
 
1115
  self.completed_responses = 0
1116
  self.total_responses = 0
1117
  self.error_message = ""
1118
+ self.generator_model = ""
1119
+ self.judge_model = ""
1120
+ self.model = "" # For general model reference in status messages
1121
  self._lock = threading.Lock()
1122
  self._status_callback = None
1123
  self._stop_event = threading.Event()
 
1144
  self.total_responses = value
1145
  elif key == 'error_message':
1146
  self.error_message = value
1147
+ elif key == 'model':
1148
+ self.model = value
1149
+ elif key == 'generator':
1150
+ self.generator_model = value
1151
+ elif key == 'judge':
1152
+ self.judge_model = value
1153
 
1154
  # Format status message
1155
  if stage == 'responses_progress':
 
1157
  completed=self.completed_responses,
1158
  total=self.total_responses
1159
  )
1160
+ elif stage == 'getting_responses' and 'model' in kwargs:
1161
+ self.stage_data['status'] = self.stage_data['status'].format(
1162
+ model=kwargs.get('model', 'selected model')
1163
+ )
1164
+ elif stage == 'judging' and 'model' in kwargs:
1165
+ self.stage_data['status'] = self.stage_data['status'].format(
1166
+ model=kwargs.get('model', 'selected model')
1167
+ )
1168
+ elif stage == 'complete' and 'generator' in kwargs and 'judge' in kwargs:
1169
+ self.stage_data['status'] = self.stage_data['status'].format(
1170
+ generator=self.generator_model,
1171
+ judge=self.judge_model
1172
+ )
1173
  elif stage == 'error':
1174
  self.stage_data['status'] = self.stage_data['status'].format(
1175
  error_message=self.error_message
 
1189
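
The new template fields rely on `PAS2` passing model names through the callback kwargs. A minimal consumer showing the protocol (the stage names and kwargs are the ones used in the hunks above):

```python
def print_progress(stage: str, **kwargs) -> None:
    # PAS2 invokes progress_callback(stage, **kwargs); ProgressTracker pulls
    # query/total/model/generator/judge out of kwargs to fill its templates.
    print(f"[{stage}] {kwargs}")

pas2 = PAS2(progress_callback=print_progress)
# A run would emit lines such as:
#   [getting_responses] {'query': '...', 'total': 4, 'model': 'grok-3'}
#   [complete] {'query': '...', 'generator': 'grok-3', 'judge': 'o4-mini'}
```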
  # Only show status text if not in idle state
1190
  status_display = f'<div class="progress-status" style="color: {color};">{status_text}</div>' if self.stage != "idle" else ''
1191
 
1192
+ # Add model information if available and we're not in idle or error state
1193
+ model_info = ''
1194
+ if self.stage not in ["idle", "error", "starting"] and (self.generator_model or self.judge_model):
1195
+ model_info = f'<div class="model-info" style="display: flex; justify-content: space-between; margin-top: 8px; font-size: 0.85em; color: #37474f; background-color: #e1f5fe; padding: 5px 10px; border-radius: 4px;">'
1196
+ if self.generator_model:
1197
+ model_info += f'<div><span style="font-weight: bold;">Generator:</span> {self.generator_model}</div>'
1198
+ if self.judge_model:
1199
+ model_info += f'<div><span style="font-weight: bold;">Judge:</span> {self.judge_model}</div>'
1200
+ model_info += '</div>'
1201
+
1202
  html = f"""
1203
  <div class="progress-container">
1204
  {query_info}
 
1206
  <div class="progress-bar-container">
1207
  <div class="progress-bar" style="width: {progress_width}; background-color: {color};"></div>
1208
  </div>
1209
+ {model_info}
1210
  </div>
1211
  """
1212
  return html
 
1575
  combined_progress_callback("starting", query=query)
1576
  time.sleep(0.3) # Ensure starting status is visible
1577
 
1578
+ # Step 1.5: Randomly select model pair
1579
+ generator_model, judge_model = detector.pas2.set_random_model_pair()
1580
+ combined_progress_callback("starting", query=query, generator=generator_model, judge=judge_model)
1581
+ time.sleep(0.3) # Ensure model info is visible
1582
+
  # Step 2: Generate paraphrases (15-30%)
  combined_progress_callback("generating_paraphrases", query=query)
  all_queries = detector.pas2.generate_paraphrases(query)
  combined_progress_callback("paraphrases_complete", query=query, count=len(all_queries))

  # Step 3: Get responses (35-65%)
+ combined_progress_callback("getting_responses", query=query, total=len(all_queries), model=generator_model)
  all_responses = []
  for i, q in enumerate(all_queries):
  # Show incremental progress for each response

  combined_progress_callback("responses_complete", query=query)

  # Step 4: Judge hallucinations (70-100%)
+ combined_progress_callback("judging", query=query, model=judge_model)

  # The first query is the original, rest are paraphrases
  original_query = all_queries[0]
 
  "confidence_score": judgment.confidence_score,
  "conflicting_facts": judgment.conflicting_facts,
  "reasoning": judgment.reasoning,
+ "summary": judgment.summary,
+ "generator_model": generator_model,
+ "judge_model": judge_model
  }

  # Show completion
+ combined_progress_callback("complete", query=query, generator=generator_model, judge=judge_model)
  time.sleep(0.3)  # Ensure complete status is visible

  return results
 
  reasoning_safe = reasoning.replace('\\', '\\\\').replace('\n', '<br>')
  conflicting_facts_text_safe = conflicting_facts_text.replace('\\', '\\\\').replace('\n', '<br>') if conflicting_facts_text else "<strong>None identified</strong>"

+ # Get model info from the results
+ generator_model = results.get("generator_model", "unknown model")
+ judge_model = results.get("judge_model", "unknown model")
+
  html_output = f"""
  <div class="container">
  <h2 class="title">Hallucination Detection Results</h2>

+ <div class="model-info-bar" style="background-color: #e1f5fe; padding: 10px 15px; border-radius: 8px; margin-bottom: 15px; display: flex; justify-content: space-between;">
+ <div style="flex: 1; text-align: center; border-right: 1px solid #b3e5fc; padding-right: 10px;">
+ <div style="font-weight: bold; color: #0277bd;">Generator Model</div>
+ <div style="font-size: 1.2em; color: #01579b;">{generator_model}</div>
+ </div>
+ <div style="flex: 1; text-align: center; padding-left: 10px;">
+ <div style="font-weight: bold; color: #0277bd;">Judge Model</div>
+ <div style="font-size: 1.2em; color: #01579b;">{judge_model}</div>
+ </div>
+ </div>
+
  <div class="stats-section">
  <div class="stat-item">
  <div class="stat-value">{'Yes' if hallucination_detected else 'No'}</div>
 
  {original_query}
  </div>

+ <div class="section-title">Original Response <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
  <div class="response-box">
  {original_response_safe}
  </div>

  {q}
  </div>

+ <div class="section-title">Response {i} <span style="font-size: 0.8em; color: #607d8b;">(generated by {generator_model})</span></div>
  <div class="response-box">
  {r}
  </div>
  """

  html_output += f"""
+ <div class="section-title">Detailed Analysis <span style="font-size: 0.8em; color: #607d8b;">(judged by {judge_model})</span></div>
  <div class="info-box">
  <p><strong>Reasoning:</strong></p>
  <p>{reasoning_safe}</p>

  <p><strong>Conflicting Facts:</strong></p>
  <p>{conflicting_facts_text_safe}</p>
  </div>
+
+ <div style="margin-top: 20px; border-top: 1px dashed #ccc; padding-top: 15px; font-size: 0.9em; color: #607d8b; text-align: center;">
+ Models randomly selected for this analysis: <strong>{generator_model}</strong> (Generator) and <strong>{judge_model}</strong> (Judge)
+ </div>
  </div>
  """
 
 
  ]

  # Helper function to submit feedback
+ def combine_feedback(hallucination_present, judge_correct, fb_text, results):
+ combined_feedback = f"Hallucination: {hallucination_present}, Judge Correct: {judge_correct}"
+ if fb_text:
+ combined_feedback += f", Comments: {fb_text}"
+
  if not results:
  return "No results to attach feedback to."
 
 
  This tool implements the Paraphrase-based Approach for Scrutinizing Systems (PAS2) with a model-as-judge enhancement:

  1. **Paraphrase Generation**: Your question is paraphrased multiple ways while preserving its core meaning
+ 2. **Multiple Responses**: All questions (original + paraphrases) are sent to a randomly selected generator model
+ 3. **Expert Judgment**: A randomly selected judge model analyzes all responses to detect factual inconsistencies

  ### Why This Approach?
 
 
  gr.Markdown("### Help Improve the System")
  gr.Markdown("Your feedback helps us refine the hallucination detection system.")

+ hallucination_present = gr.Radio(
+ label="Was there actually a hallucination in the responses?",
+ choices=["Yes, there was a hallucination", "No, there was no hallucination", "Not sure"],
+ value="Not sure"
+ )
+
+ judge_correct = gr.Radio(
+ label="Did the judge model correctly identify the situation?",
+ choices=["Yes, the judge was correct", "No, the judge was incorrect", "Not sure"],
+ value="Not sure"
  )

  feedback_text = gr.Textbox(
 
  gr.Markdown("## Hallucination Detection Scores")
  gr.Markdown("Performance comparison of different Generator + Judge model combinations.")

+ # Function to generate the HTML for the model pair leaderboard
+ def generate_pair_leaderboard_html():
+ try:
+ # Get leaderboard data
+ pairs = detector.get_pair_leaderboard() or []
+
+ if not pairs:
+ return (
+ "<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
+ "border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ "<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
+ "<p>Try the detector with more queries to populate the leaderboard!</p>"
+ "</div>"
+ )
+
+ # Generate table rows
+ rows = ""
+ for rank, pair in enumerate(pairs, 1):
+ # Add special styling for top 3
+ row_class = ""
+ if rank == 1:
+ row_class = "class='top-rank-1'"
+ elif rank == 2:
+ row_class = "class='top-rank-2'"
+ elif rank == 3:
+ row_class = "class='top-rank-3'"
+
+ # Format percentages for display
+ generator_perf = f"{pair.get('generator_performance', 0) * 100:.1f}%" if 'generator_performance' in pair else "N/A"
+ judge_perf = f"{pair.get('judge_performance', 0) * 100:.1f}%" if 'judge_performance' in pair else "N/A"
+ consistency = f"{pair.get('consistency_score', 0)}%" if 'consistency_score' in pair else "N/A"
+
+ rows += (
+ f"<tr {row_class}>"
+ f"<td>{rank}</td>"
+ f"<td>{pair.get('generator', 'unknown')}</td>"
+ f"<td>{pair.get('judge', 'unknown')}</td>"
+ f"<td>{round(pair.get('elo_score', 0))}</td>"
+ f"<td>{pair.get('accuracy', 0)}%</td>"
+ f"<td style='color: #80cbc4; font-weight: 500;'>{generator_perf}</td>"
+ f"<td style='color: #90caf9; font-weight: 500;'>{judge_perf}</td>"
+ f"<td style='color: #ce93d8; font-weight: 500;'>{consistency}</td>"
+ f"<td>{pair.get('total_samples', 0)}</td>"
+ f"</tr>"
+ )
+
+ # Build the full table
+ html = (
+ f"<div class=\"leaderboard-container\">"
+ f"<table class=\"leaderboard-table\">"
+ f"<thead>"
+ f"<tr>"
+ f"<th>Rank</th>"
+ f"<th>Generator Model</th>"
+ f"<th>Judge Model</th>"
+ f"<th>ELO Score</th>"
+ f"<th>Accuracy</th>"
+ f"<th>Generator Perf.</th>"
+ f"<th>Judge Perf.</th>"
+ f"<th>Consistency</th>"
+ f"<th>Sample Size</th>"
+ f"</tr>"
+ f"</thead>"
+ f"<tbody>"
+ f"{rows}"
+ f"</tbody>"
+ f"</table>"
+ f"</div>"
+ f"<div style='margin-top: 15px; padding: 12px; background-color: #263238; border-radius: 8px; font-size: 0.95em; color: #e0f7fa; box-shadow: 0 2px 5px rgba(0,0,0,0.2);'>"
+ f"<p style='margin-bottom: 8px; color: #80deea;'><strong>Model Pair Performance Metrics:</strong></p>"
+ f"<ul style='margin-top: 5px; padding-left: 20px; line-height: 1.4;'>"
+ f"<li><strong style='color: #b2dfdb;'>Accuracy</strong>: Percentage of correct hallucination judgments based on user feedback</li>"
+ f"<li><strong style='color: #b2dfdb;'>Generator Performance</strong>: How well the generator model avoids hallucinations</li>"
+ f"<li><strong style='color: #b2dfdb;'>Judge Performance</strong>: How accurately the judge model identifies hallucinations</li>"
+ f"<li><strong style='color: #b2dfdb;'>Consistency</strong>: Weighted measure of how well the pair works together</li>"
+ f"</ul>"
+ f"</div>"
+ )
+
+ return html
+ except Exception as e:
+ logger.error("Error generating leaderboard HTML: %s", str(e), exc_info=True)
+ return (
+ f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
+ f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Leaderboard</h3>"
+ f"<p>{str(e)}</p>"
+ f"</div>"
+ )
+
  # Create leaderboard table for model combinations
+ model_leaderboard_html = gr.HTML(generate_pair_leaderboard_html())
+ refresh_leaderboard_btn = gr.Button("Refresh Leaderboard", variant="primary")
+ refresh_leaderboard_btn.click(
+ fn=generate_pair_leaderboard_html,
+ outputs=[model_leaderboard_html]
+ )
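# For reference, each entry returned by detector.get_pair_leaderboard() is
# read with the lookups above; a hypothetical document (field names from the
# code, values invented for illustration) would look roughly like:
#
# {
#     "generator": "gpt-4o",
#     "judge": "mistral-large",
#     "elo_score": 1524.3,
#     "accuracy": 87.5,                # already a percentage
#     "generator_performance": 0.82,   # fraction, rendered as "82.0%"
#     "judge_performance": 0.91,       # fraction, rendered as "91.0%"
#     "consistency_score": 78,         # already a percentage
#     "total_samples": 24,
# }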
 
+ # ELO rating explanation
+ with gr.Accordion("ELO Rating System Explanation", open=False):
+ gr.HTML(
+ "<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
+ "<h3 style='margin-top: 0; color: #ffffff;'>ELO Rating System Explanation</h3>" +
+ "<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>How ELO Scores Are Calculated</h4>" +
+ "<p style='color: #eceff1;'>Our ELO rating system assigns scores to model pairs based on user feedback, using the following formula:</p>" +
+ "<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
+ "<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
+ "Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model combination<br>" +
+ "* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (24 for model pairs)<br>" +
+ "* <strong style='color: #b2dfdb;'>S</strong>: Actual score from user feedback (1 for correct, 0 for incorrect)<br>" +
+ "* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
+ "<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div></div>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>Available Models</h4>" +
+ "<p style='color: #eceff1;'>The system randomly selects from these models for each hallucination detection:</p>" +
+ "<div style='display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;'>" +
+ "<div style='flex: 1; min-width: 120px;'>" +
+ "<h5 style='margin-top: 0; margin-bottom: 5px; color: #b2dfdb;'>All Models (Used as both Generator & Judge)</h5>" +
+ "<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
+ "<li>mistral-large</li><li>gpt-4o</li><li>qwen-235b</li><li>grok-3</li>" +
+ "<li>deepseek-reasoner</li><li>o4-mini</li><li>gemini-2.5-pro</li>" +
+ "</ul></div></div></div></div></div>"
+ )
+ gr.HTML(
+ "<style>" +
+ ".leaderboard-container {margin: 15px 0; overflow-x: auto;}" +
+ ".leaderboard-table {width: 100%; border-collapse: collapse; font-size: 0.95em; " +
+ "box-shadow: 0 2px 10px rgba(0,0,0,0.2); border-radius: 8px; overflow: hidden;}" +
+ ".leaderboard-table thead {background-color: #0d47a1; color: white;}" +
+ ".leaderboard-table th, .leaderboard-table td {padding: 12px 15px; text-align: left; border-bottom: 1px solid #37474f; color: #eceff1;}" +
+ ".leaderboard-table tbody tr {transition: background-color 0.3s;}" +
+ ".leaderboard-table tbody tr:nth-child(even) {background-color: #37474f;}" +
+ ".leaderboard-table tbody tr:nth-child(odd) {background-color: #455a64;}" +
+ ".leaderboard-table tbody tr:hover {background-color: #263238;}" +
+ ".leaderboard-table tbody tr.top-rank-1 {background-color: #004d40; color: #e0f2f1; font-weight: bold;}" +
+ ".leaderboard-table tbody tr.top-rank-2 {background-color: #1b5e20; color: #e8f5e9; font-weight: 500;}" +
+ ".leaderboard-table tbody tr.top-rank-3 {background-color: #33691e; color: #f1f8e9; font-weight: 500;}" +
+ ".leaderboard-table td {position: relative;}" +
+ ".leaderboard-table td::after {content: ''; position: absolute; top: 0; left: 0; width: 100%; height: 100%; background: transparent; pointer-events: none;}" +
+ "</style>"
+ )
 
+ # Tab 3: Individual Models Leaderboard
+ with gr.TabItem("Individual Models", elem_id="user-feedback-tab"):
+ gr.Markdown("## Individual Model Performance")
+ gr.Markdown("Performance ranking of models based on user feedback, showing statistics for both generator and judge roles.")

+ # Function to generate individual model leaderboard HTML
+ def generate_model_leaderboard_html():
+ try:
+ # Get model scores from MongoDB
+ models = detector.get_model_leaderboard() or []
+
+ if not models:
+ return (
+ "<div class=\"info-message\" style=\"padding: 20px; background-color: #e1f5fe; "
+ "border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ "<h3 style=\"margin-top: 0; color: #0277bd;\">No Data Available Yet</h3>"
+ "<p>Try the detector with more queries to populate the model scores!</p>"
+ "</div>"
+ )
+
+ # Generate table rows
+ rows = ""
+ for rank, model in enumerate(models, 1):
+ # Add special styling for top 3
+ row_class = ""
+ if rank == 1:
+ row_class = "class='top-rank-1'"
+ elif rank == 2:
+ row_class = "class='top-rank-2'"
+ elif rank == 3:
+ row_class = "class='top-rank-3'"
+
+ # Calculate role distribution
+ as_generator = model.get('as_generator', 0)
+ as_judge = model.get('as_judge', 0)
+ if as_generator + as_judge > 0:
+ generator_pct = round((as_generator / (as_generator + as_judge)) * 100)
+ judge_pct = 100 - generator_pct
+ role_distribution = f"{generator_pct}% / {judge_pct}%"
+ else:
+ role_distribution = "N/A"
+
+ # Format percentages with better contrast against dark background
+ generator_acc = f"{model.get('generator_accuracy', 0.0)}%"
+ judge_acc = f"{model.get('judge_accuracy', 0.0)}%"
+
+ rows += (
+ f"<tr {row_class}>"
+ f"<td>{rank}</td>"
+ f"<td>{model.get('model_name', 'unknown')}</td>"
+ f"<td>{round(model.get('elo_score', 0))}</td>"
+ f"<td>{model.get('accuracy', 0)}%</td>"
+ f"<td style='color: #80cbc4; font-weight: 500;'>{generator_acc}</td>"
+ f"<td style='color: #90caf9; font-weight: 500;'>{judge_acc}</td>"
+ f"<td>{model.get('total_samples', 0)}</td>"
+ f"<td style='color: #ffcc80; font-weight: 500;'>{role_distribution}</td>"
+ f"</tr>"
+ )
+
+ # Build the full table
+ html = (
+ f"<div class=\"leaderboard-container\">"
+ f"<table class=\"leaderboard-table\">"
+ f"<thead>"
+ f"<tr>"
+ f"<th>Rank</th>"
+ f"<th>Model</th>"
+ f"<th>ELO Score</th>"
+ f"<th>Overall Accuracy</th>"
+ f"<th>Generator Accuracy</th>"
+ f"<th>Judge Accuracy</th>"
+ f"<th>Sample Size</th>"
+ f"<th>Generator/Judge Ratio</th>"
+ f"</tr>"
+ f"</thead>"
+ f"<tbody>"
+ f"{rows}"
+ f"</tbody>"
+ f"</table>"
+ f"</div>"
+ )
+
+ return html
+ except Exception as e:
+ logger.error("Error generating model leaderboard HTML: %s", str(e), exc_info=True)
+ return (
+ f"<div class=\"error-message\" style=\"padding: 20px; background-color: #ffebee; "
+ f"border-radius: 8px; text-align: center; margin: 20px 0;\">"
+ f"<h3 style=\"margin-top: 0; color: #c62828;\">Error Loading Model Leaderboard</h3>"
+ f"<p>{str(e)}</p>"
+ f"</div>"
+ )

+ # Create leaderboard table for individual models
+ model_scores_html = gr.HTML(generate_model_leaderboard_html())
+ refresh_models_btn = gr.Button("Refresh Model Scores", variant="primary")
+ refresh_models_btn.click(
+ fn=generate_model_leaderboard_html,
+ outputs=[model_scores_html]
+ )
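# Likewise, generate_model_leaderboard_html() expects entries from
# detector.get_model_leaderboard() shaped roughly like this (hypothetical
# example; field names taken from the lookups above, values invented):
#
# {
#     "model_name": "gemini-2.5-pro",
#     "elo_score": 1538.7,
#     "accuracy": 85.0,            # already a percentage
#     "generator_accuracy": 83.3,  # already a percentage
#     "judge_accuracy": 88.9,      # already a percentage
#     "total_samples": 31,
#     "as_generator": 17,          # times used in the generator role
#     "as_judge": 14,              # times used in the judge role
# }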
+
+ # ELO rating explanation for individual models
+ with gr.Accordion("ELO Rating Explanation for Individual Models", open=False):
+ gr.HTML(
+ "<div style='margin-top: 20px; padding: 15px; background-color: #0d47a1; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);'>" +
+ "<h3 style='margin-top: 0; color: #ffffff;'>Individual Model ELO Rating System</h3>" +
+ "<div style='display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px;'>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>How Individual ELO Scores Are Calculated</h4>" +
+ "<p style='color: #eceff1;'>Our ELO rating system assigns scores to individual models based on user feedback, using the following formula:</p>" +
+ "<div style='background-color: #37474f; padding: 12px; border-radius: 5px; color: #eceff1;'>" +
+ "<code style='color: #80deea;'>ELO_new = ELO_old + K * (S - E)</code><br><br>" +
+ "Where:<br>* <strong style='color: #b2dfdb;'>ELO_old</strong>: Previous rating of the model<br>" +
+ "* <strong style='color: #b2dfdb;'>K</strong>: Weight factor (32 for individual models)<br>" +
+ "* <strong style='color: #b2dfdb;'>S</strong>: Actual score (1 for correct judgment, 0 for incorrect)<br>" +
+ "* <strong style='color: #b2dfdb;'>E</strong>: Expected score based on current rating<br><br>" +
+ "<em style='color: #80deea;'>E = 1 / (1 + 10<sup>(1500 - ELO_model)/400</sup>)</em></div>" +
+ "<p style='color: #eceff1; margin-top: 10px;'>All models start with a base ELO of 1500. Scores are updated after each user evaluation.</p></div>" +
+ "<div style='flex: 1; min-width: 280px; padding: 12px; background-color: #455a64; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12);'>" +
+ "<h4 style='margin-top: 0; color: #ffffff;'>Interpretation Guidelines</h4>" +
+ "<ul style='margin-bottom: 0; padding-left: 20px; color: #eceff1;'>" +
+ "<li><strong style='color: #b2dfdb;'>1800+</strong>: Exceptional performance, very rare hallucinations</li>" +
+ "<li><strong style='color: #b2dfdb;'>1700-1799</strong>: Superior performance, minimal hallucinations</li>" +
+ "<li><strong style='color: #b2dfdb;'>1600-1699</strong>: Good performance, occasional hallucinations</li>" +
+ "<li><strong style='color: #b2dfdb;'>1500-1599</strong>: Average performance</li>" +
+ "<li><strong style='color: #b2dfdb;'>&lt;1500</strong>: Below average, frequent hallucinations</li>" +
+ "</ul><p style='font-style: italic; color: #b3e5fc; margin-top: 10px;'>" +
+ "Note: ELO scores are comparative and reflect relative performance between models in our specific hallucination detection tasks.</p>" +
+ "</div></div></div>"
+ )
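# Same update rule with K=32: a model starting at the 1500 base that receives
# three consecutive correct judgments moves roughly 1500 -> 1516.0 -> 1531.3
# -> 1545.8, since E grows as the rating climbs and each successive gain shrinks.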
 
  # Function to continuously update stats
  def update_stats():

  live_stats = gr.HTML(update_stats())

  # Add loading animation style
+ gr.HTML(
+ "<style>" +
+ "@keyframes pulse {" +
+ "0% { opacity: 0.6; }" +
+ "50% { opacity: 1; }" +
+ "100% { opacity: 0.6; }" +
+ "}" +
+ ".refreshing::after {" +
+ "content: \"⟳\";" +
+ "display: inline-block;" +
+ "margin-left: 8px;" +
+ "animation: pulse 1.5s infinite ease-in-out;" +
+ "color: #2e7d32;" +
+ "}" +
+ "#stats-container {" +
+ "border: 1px solid #b3e5fc;" +
+ "border-radius: 10px;" +
+ "padding: 15px;" +
+ "margin: 10px 0;" +
+ "background-color: #0277bd;" +
+ "}" +
+ "</style>" +
+ "<div class=\"refreshing\" style=\"text-align: right; font-size: 0.8em; color: #eceff1;\">Auto-refreshing</div>"
+ )

  # Create a refresh button that will be auto-clicked
  refresh_btn = gr.Button("Refresh Stats", visible=False)
 
  feedback_button.click(
  fn=combine_feedback,
+ inputs=[hallucination_present, judge_correct, feedback_text, hidden_results],
  outputs=[feedback_status]
  )

  # Footer
  gr.HTML(
+ """<footer><p>Paraphrase-based Approach for Scrutinizing Systems (PAS2) - Advanced Hallucination Detection</p><p>Multiple LLM models tested as generators and judges for optimal hallucination detection</p><p><small>Models in testing: mistral-large, gpt-4o, Qwen3-235B-A22B, grok-3, o4-mini, gemini-2.5-pro, deepseek-r1</small></p></footer>"""
  )

  return interface

  # Uncomment these lines to run the test function instead of the main interface
  # if __name__ == "__main__":
+ #     test_progress()