H2H-eval-comparator

Sleeping

App Files Files Community

rohansampath committed on Feb 11

Commit

e8d7a5b

verified ·

1 Parent(s): 76926dc

Made some changes

Browse files

Files changed (1) hide show

app.py +99 -40

app.py CHANGED Viewed

@@ -10,11 +10,15 @@ import io
 import base64
 import os
 from huggingface_hub import login
-from transformers import AutoTokenizer, AutoModel
-hf_token = os.getenv("HF_TOKEN_READ_WRITE")  # Read the token from Secrets
-login(hf_token)
 if torch.cuda.is_available():
     print("✅ GPU is available")
     print("GPU Name:", torch.cuda.get_device_name(0))
@@ -24,18 +28,21 @@ else:
 # ---------------------------------------------------------------------------
 # 1. Define model name and load model/tokenizer
 # ---------------------------------------------------------------------------
-model_name = "mistralai/Mistral-7B-Instruct-v0.3"  # fictional placeholder
-tokenizer = AutoTokenizer.from_pretrained(model_name)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, torch_dtype=torch.float16, device_map="auto")
 print(f"✅ Model loaded on {device}")
-#model = AutoModelForCausalLM.from_pretrained(model_name)
 # ---------------------------------------------------------------------------
-# 2. Define a tiny "dataset" for demonstration
-#    In reality, you'll load a real dataset from HF or custom code.
 # ---------------------------------------------------------------------------
 test_data = [
     {"question": "What is 2+2?", "answer": "4"},
@@ -44,7 +51,7 @@ test_data = [
 ]
 # ---------------------------------------------------------------------------
-# 3. Load a metric (accuracy) from Hugging Face evaluate library
 # ---------------------------------------------------------------------------
 accuracy_metric = evaluate.load("accuracy")
@@ -53,31 +60,32 @@ accuracy_metric = evaluate.load("accuracy")
 # ---------------------------------------------------------------------------
 def generate_answer(question):
     """
-    Generates an answer to the given question using the loaded model.
     """
-    # Simple prompt
-    prompt = f"Question: {question}\nAnswer:"
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=30,
             temperature=0.0,  # deterministic
         )
     text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return text_output
 def parse_answer(model_output):
     """
-    Heuristic to extract the final numeric answer from model's text.
-    You can customize this regex or logic as needed.
     """
-    # Example: find digits (possibly multiple, but we keep the first match)
-    match = re.search(r"(\d+)", model_output)
     if match:
         return match.group(1)
-    # fallback to entire text if no digits found
     return model_output.strip()
 # ---------------------------------------------------------------------------
@@ -86,6 +94,7 @@ def parse_answer(model_output):
 def run_evaluation():
     predictions = []
     references = []
     for sample in test_data:
         question = sample["question"]
@@ -97,54 +106,104 @@ def run_evaluation():
         predictions.append(predicted_answer)
         references.append(reference_answer)
-    # Normalize answers (simple: just remove spaces/punctuation, lower case)
     def normalize_answer(ans):
-        return ans.lower().strip()
     norm_preds = [normalize_answer(p) for p in predictions]
-    norm_refs  = [normalize_answer(r) for r in references]
     # Compute accuracy
     results = accuracy_metric.compute(predictions=norm_preds, references=norm_refs)
     accuracy = results["accuracy"]
-    # Create a simple bar chart: correct vs. incorrect
     correct_count = sum(p == r for p, r in zip(norm_preds, norm_refs))
     incorrect_count = len(test_data) - correct_count
-    fig, ax = plt.subplots()
-    ax.bar(["Correct", "Incorrect"], [correct_count, incorrect_count], color=["green", "red"])
     ax.set_title("Evaluation Results")
     ax.set_ylabel("Count")
-    ax.set_ylim([0, len(test_data)])
-    # Convert the plot to a base64-encoded PNG for Gradio display
     buf = io.BytesIO()
-    plt.savefig(buf, format="png")
     buf.seek(0)
     plt.close(fig)
     data = base64.b64encode(buf.read()).decode("utf-8")
-    image_url = f"data:image/png;base64,{data}"
-    # Return text and the plot
-    return f"Accuracy: {accuracy:.2f}", image_url
 # ---------------------------------------------------------------------------
-# 6. Gradio App
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Simple Math Evaluation with 'Llama 3.2'")
-    eval_button = gr.Button("Run Evaluation")
     output_text = gr.Textbox(label="Results")
-    output_plot = gr.HTML(label="Plot")
     eval_button.click(
         fn=run_evaluation,
         inputs=None,
         outputs=[output_text, output_plot]
     )
-demo.launch()

 import base64
 import os
 from huggingface_hub import login
+# Read token and login
+hf_token = os.getenv("HF_TOKEN_READ_WRITE")
+if hf_token:
+    login(hf_token)
+else:
+    print("⚠️ No HF_TOKEN_READ_WRITE found in environment")
+# Check GPU availability
 if torch.cuda.is_available():
     print("✅ GPU is available")
     print("GPU Name:", torch.cuda.get_device_name(0))
 # ---------------------------------------------------------------------------
 # 1. Define model name and load model/tokenizer
 # ---------------------------------------------------------------------------
+model_name = "mistralai/Mistral-7B-Instruct-v0.3"
+tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    token=hf_token,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
 print(f"✅ Model loaded on {device}")
 # ---------------------------------------------------------------------------
+# 2. Test dataset
 # ---------------------------------------------------------------------------
 test_data = [
     {"question": "What is 2+2?", "answer": "4"},
 ]
 # ---------------------------------------------------------------------------
+# 3. Load metric
 # ---------------------------------------------------------------------------
 accuracy_metric = evaluate.load("accuracy")
 # ---------------------------------------------------------------------------
 def generate_answer(question):
     """
+    Generates an answer using Mistral's instruction format.
     """
+    # Mistral instruction format
+    prompt = f"""<s>[INST] {question} [/INST]"""
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=50,
             temperature=0.0,  # deterministic
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id
         )
     text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Remove the original question from the output
+    return text_output.replace(question, "").strip()
 def parse_answer(model_output):
     """
+    Extract numeric answer from model's text output.
     """
+    # Look for numbers (including decimals)
+    match = re.search(r"(-?\d*\.?\d+)", model_output)
     if match:
         return match.group(1)
     return model_output.strip()
 # ---------------------------------------------------------------------------
 def run_evaluation():
     predictions = []
     references = []
+    raw_outputs = []  # Store full model outputs for display
     for sample in test_data:
         question = sample["question"]
         predictions.append(predicted_answer)
         references.append(reference_answer)
+        raw_outputs.append({
+            "question": question,
+            "model_output": model_output,
+            "parsed_answer": predicted_answer,
+            "reference": reference_answer
+        })
+    # Normalize answers
     def normalize_answer(ans):
+        return str(ans).lower().strip()
     norm_preds = [normalize_answer(p) for p in predictions]
+    norm_refs = [normalize_answer(r) for r in references]
     # Compute accuracy
     results = accuracy_metric.compute(predictions=norm_preds, references=norm_refs)
     accuracy = results["accuracy"]
+    # Create visualization
     correct_count = sum(p == r for p, r in zip(norm_preds, norm_refs))
     incorrect_count = len(test_data) - correct_count
+    fig, ax = plt.subplots(figsize=(8, 6))
+    bars = ax.bar(["Correct", "Incorrect"],
+                 [correct_count, incorrect_count],
+                 color=["#2ecc71", "#e74c3c"])
+    # Add value labels on bars
+    for bar in bars:
+        height = bar.get_height()
+        ax.text(bar.get_x() + bar.get_width()/2., height,
+                f'{int(height)}',
+                ha='center', va='bottom')
     ax.set_title("Evaluation Results")
     ax.set_ylabel("Count")
+    ax.set_ylim([0, len(test_data) + 0.5])  # Add some padding at top
+    # Convert plot to base64
     buf = io.BytesIO()
+    plt.savefig(buf, format="png", bbox_inches='tight', dpi=300)
     buf.seek(0)
     plt.close(fig)
     data = base64.b64encode(buf.read()).decode("utf-8")
+    # Create detailed results HTML
+    details_html = """
+    <div style="margin-top: 20px;">
+        <h3>Detailed Results:</h3>
+        <table style="width:100%; border-collapse: collapse;">
+            <tr style="background-color: #f5f5f5;">
+                <th style="padding: 8px; border: 1px solid #ddd;">Question</th>
+                <th style="padding: 8px; border: 1px solid #ddd;">Model Output</th>
+                <th style="padding: 8px; border: 1px solid #ddd;">Parsed Answer</th>
+                <th style="padding: 8px; border: 1px solid #ddd;">Reference</th>
+            </tr>
+    """
+    for result in raw_outputs:
+        details_html += f"""
+            <tr>
+                <td style="padding: 8px; border: 1px solid #ddd;">{result['question']}</td>
+                <td style="padding: 8px; border: 1px solid #ddd;">{result['model_output']}</td>
+                <td style="padding: 8px; border: 1px solid #ddd;">{result['parsed_answer']}</td>
+                <td style="padding: 8px; border: 1px solid #ddd;">{result['reference']}</td>
+            </tr>
+        """
+    details_html += "</table></div>"
+    # Combine plot and details
+    full_html = f"""
+    <div>
+        <img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
+        {details_html}
+    </div>
+    """
+    return f"Accuracy: {accuracy:.2f}", full_html
 # ---------------------------------------------------------------------------
+# 6. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# Mistral-7B Math Evaluation Demo")
+    gr.Markdown("""
+    This demo evaluates Mistral-7B on basic math problems.
+    Press the button below to run the evaluation.
+    """)
+    eval_button = gr.Button("Run Evaluation", variant="primary")
     output_text = gr.Textbox(label="Results")
+    output_plot = gr.HTML(label="Visualization and Details")
     eval_button.click(
         fn=run_evaluation,
         inputs=None,
         outputs=[output_text, output_plot]
     )
+demo.launch()