rohansampath committed (verified)
Commit a7f824f · 1 Parent(s): 2ff25b4

Update app.py

Files changed (1)
  1. app.py +52 -177
app.py CHANGED
@@ -1,16 +1,9 @@
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
- import evaluate
- import re
- import matplotlib
- matplotlib.use('Agg')
- import matplotlib.pyplot as plt
- import io
- import base64
import os
from huggingface_hub import login
- import spaces
+ from toy-dataset-eval import evaluate_toy_dataset
from mmlu_eval import evaluate_mmlu

# Read token and login
@@ -21,177 +14,58 @@ else:
    print("⚠️ No HF_TOKEN_READ_WRITE found in environment")

# ---------------------------------------------------------------------------
- # 1. Model and tokenizer setup
+ # 1. Model and tokenizer setup and Loading
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = None
model = None
+ model_loaded = False

@spaces.GPU
def load_model():
-     global tokenizer, model
-     if tokenizer is None:
-         tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-     if model is None:
-         model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             token=hf_token,
-             torch_dtype=torch.float16
-         )
-         model.to('cuda')
-     return model, tokenizer
+     """Loads the Mistral model and tokenizer and updates the load status."""
+     global tokenizer, model, model_loaded
+     try:
+         if tokenizer is None:
+             tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+         if model is None:
+             model = AutoModelForCausalLM.from_pretrained(
+                 model_name,
+                 token=hf_token,
+                 torch_dtype=torch.float16
+             )
+             model.to('cuda')
+         model_loaded = True
+         return "✅ Model Loaded!"
+     except Exception as e:
+         model_loaded = False
+         return f"❌ Model Load Failed: {str(e)}"

# ---------------------------------------------------------------------------
- # 2. Test dataset
+ # 2. Toy Evaluation
# ---------------------------------------------------------------------------
- test_data = [
-     {"question": "What is 2+2?", "answer": "4"},
-     {"question": "What is 3*3?", "answer": "9"},
-     {"question": "What is 10/2?", "answer": "5"},
- ]
+ @spaces.GPU (duration=120)
+ def run_toy_evaluation():
+     """Runs the toy dataset evaluation."""
+     if not model_loaded:
+         load_model()

- # ---------------------------------------------------------------------------
- # 3. Load metric
- # ---------------------------------------------------------------------------
- accuracy_metric = evaluate.load("accuracy")
-
- # ---------------------------------------------------------------------------
- # 4. Inference helper functions
- # ---------------------------------------------------------------------------
- @spaces.GPU
- def generate_answer(question):
-     """
-     Generates an answer using Mistral's instruction format.
-     """
-     model, tokenizer = load_model()
+     if not model_loaded:
+         return "⚠️ Model not loaded. Please load the model first."

-     # Mistral instruction format
-     prompt = f"""<s>[INST] {question}. Provide only the numerical answer. [/INST]"""
+     results = evaluate_toy_dataset(model, tokenizer)
+     return results # Ensure load confirmation is shown before results

-     inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=50,
-             pad_token_id=tokenizer.pad_token_id,
-             eos_token_id=tokenizer.eos_token_id
-         )
-     text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     # Remove the original question from the output
-     return text_output.replace(question, "").strip()
-
- def parse_answer(model_output):
-     """
-     Extract numeric answer from model's text output.
-     """
-     # Look for numbers (including decimals)
-     match = re.search(r"(-?\d*\.?\d+)", model_output)
-     if match:
-         return match.group(1)
-     return model_output.strip()
-
# ---------------------------------------------------------------------------
- # 5. Evaluation routine
+ # 3. MMLU Evaluation call
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
- def run_evaluation():
-     predictions = []
-     references = []
-     raw_outputs = [] # Store full model outputs for display
-
-     for sample in test_data:
-         question = sample["question"]
-         reference_answer = sample["answer"]
-
-         # Model inference
-         model_output = generate_answer(question)
-         predicted_answer = parse_answer(model_output)
-
-         predictions.append(predicted_answer)
-         references.append(reference_answer)
-         raw_outputs.append({
-             "question": question,
-             "model_output": model_output,
-             "parsed_answer": predicted_answer,
-             "reference": reference_answer
-         })
-
-     # Normalize answers
-     def normalize_answer(ans):
-         return str(ans).lower().strip()
-
-     norm_preds = [normalize_answer(p) for p in predictions]
-     norm_refs = [normalize_answer(r) for r in references]
-
-     # Compute accuracy
-     results = accuracy_metric.compute(predictions=norm_preds, references=norm_refs)
-     accuracy = results["accuracy"]
-
-     # Create visualization
-     fig, ax = plt.subplots(figsize=(8, 6))
-     correct_count = sum(p == r for p, r in zip(norm_preds, norm_refs))
-     incorrect_count = len(test_data) - correct_count
-
-     bars = ax.bar(["Correct", "Incorrect"],
-                   [correct_count, incorrect_count],
-                   color=["#2ecc71", "#e74c3c"])
-
-     # Add value labels on bars
-     for bar in bars:
-         height = bar.get_height()
-         ax.text(bar.get_x() + bar.get_width()/2., height,
-                 f'{int(height)}',
-                 ha='center', va='bottom')
-
-     ax.set_title("Evaluation Results")
-     ax.set_ylabel("Count")
-     ax.set_ylim([0, len(test_data) + 0.5])
-
-     # Convert plot to base64
-     buf = io.BytesIO()
-     plt.savefig(buf, format="png", bbox_inches='tight', dpi=300)
-     buf.seek(0)
-     plt.close(fig)
-     data = base64.b64encode(buf.read()).decode("utf-8")
-
-     # Create detailed results HTML
-     details_html = """
-     <div style="margin-top: 20px;">
-         <h3>Detailed Results:</h3>
-         <table style="width:100%; border-collapse: collapse;">
-             <tr style="background-color: #f5f5f5;">
-                 <th style="padding: 8px; border: 1px solid #ddd;">Question</th>
-                 <th style="padding: 8px; border: 1px solid #ddd;">Model Output</th>
-                 <th style="padding: 8px; border: 1px solid #ddd;">Parsed Answer</th>
-                 <th style="padding: 8px; border: 1px solid #ddd;">Reference</th>
-             </tr>
-     """
-
-     for result in raw_outputs:
-         details_html += f"""
-             <tr>
-                 <td style="padding: 8px; border: 1px solid #ddd;">{result['question']}</td>
-                 <td style="padding: 8px; border: 1px solid #ddd;">{result['model_output']}</td>
-                 <td style="padding: 8px; border: 1px solid #ddd;">{result['parsed_answer']}</td>
-                 <td style="padding: 8px; border: 1px solid #ddd;">{result['reference']}</td>
-             </tr>
-         """
-
-     details_html += "</table></div>"
-
-     full_html = f"""
-     <div>
-         <img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
-         {details_html}
-     </div>
-     """
-
-     return f"Accuracy: {accuracy:.2f}", full_html
-
- # ---------------------------------------------------------------------------
- # 5. MMLU Evaluation call
- # ---------------------------------------------------------------------------
def run_mmlu_evaluation(num_questions):
+     if not model_loaded:
+         load_model()
+
+     if not model_loaded:
+         return "⚠️ Model not loaded. Please load the model first."
    """
    Runs the MMLU evaluation with the specified number of questions per task.
    Also displays two correct and two incorrect examples.
@@ -224,32 +98,33 @@ def run_mmlu_evaluation(num_questions):
    return report

# ---------------------------------------------------------------------------
- # 6. Gradio Interface
+ # 4. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Mistral-7B Math Evaluation Demo")
    gr.Markdown("""
-     This demo evaluates Mistral-7B on three very simple math problems to get started.
-     Press the button below to run the evaluation.
+     This demo evaluates Mistral-7B on Various Datasets.
    """)
-
+
+     # Load Model Button
+     load_button = gr.Button("Load Model", variant="primary")
+     load_status = gr.Textbox(label="Model Status", interactive=False)
+     load_button.click(fn=load_model, inputs=None, outputs=load_status)
+
+     # Toy Dataset Evaluation
+     gr.Markdown("### Toy Dataset Evaluation")
    eval_button = gr.Button("Run Evaluation", variant="primary")
    output_text = gr.Textbox(label="Results")
    output_plot = gr.HTML(label="Visualization and Details")
-
-     eval_button.click(
-         fn=run_evaluation,
-         inputs=None,
-         outputs=[output_text, output_plot]
-     )

+     eval_button.click(fn=run_toy_evaluation, inputs=None, outputs=[output_text, output_plot])
+
+     # MMLU Evaluation
    gr.Markdown("### MMLU Evaluation")
-     num_questions_input = gr.Number(label="Questions per Task (there are 57 total Tasks)", value=5, precision=0)
-     eval_mmlu_button = gr.Button("Run MMLU Evaluation")
+     num_questions_input = gr.Number(label="Questions per Task (Max: 57)", value=5, precision=0)
+     eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
    mmlu_output = gr.Textbox(label="MMLU Evaluation Results")
-
-     eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])
-

+     eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])

demo.launch()
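
The new run_toy_evaluation() passes model and tokenizer to evaluate_toy_dataset() and feeds the result to both output_text and output_plot, so the helper presumably returns a (summary, html) pair built from the same toy questions the removed inline run_evaluation() handled. The module itself is not part of this commit, and the hyphenated name in `from toy-dataset-eval import ...` is not importable as written, so the sketch below assumes a file named `toy_dataset_eval.py`. It is only a minimal illustration that repackages the deleted prompt-and-parse logic; the actual module may differ.

# toy_dataset_eval.py -- hypothetical sketch, not part of this commit
import re
import torch

# Same toy problems that the removed inline run_evaluation() used.
TEST_DATA = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "What is 3*3?", "answer": "9"},
    {"question": "What is 10/2?", "answer": "5"},
]

def _generate_answer(model, tokenizer, question):
    """Query the model with Mistral's [INST] format and return the raw completion."""
    prompt = f"<s>[INST] {question}. Provide only the numerical answer. [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Strip the echoed question so only the completion remains.
    return text.replace(question, "").strip()

def _parse_answer(model_output):
    """Pull the first number (including decimals) out of the model's text."""
    match = re.search(r"(-?\d*\.?\d+)", model_output)
    return match.group(1) if match else model_output.strip()

def evaluate_toy_dataset(model, tokenizer):
    """Return (summary_text, details_html), matching the two Gradio outputs."""
    rows, correct = [], 0
    for sample in TEST_DATA:
        raw = _generate_answer(model, tokenizer, sample["question"])
        parsed = _parse_answer(raw)
        if parsed.strip().lower() == sample["answer"].strip().lower():
            correct += 1
        rows.append(f"<tr><td>{sample['question']}</td><td>{raw}</td>"
                    f"<td>{parsed}</td><td>{sample['answer']}</td></tr>")
    accuracy = correct / len(TEST_DATA)
    html = ("<table><tr><th>Question</th><th>Model Output</th>"
            "<th>Parsed Answer</th><th>Reference</th></tr>"
            + "".join(rows) + "</table>")
    return f"Accuracy: {accuracy:.2f}", html

Counting correct answers directly also avoids re-adding the `evaluate` dependency that this commit removes.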
 
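
As wired in the diff, both handlers call load_model() lazily, so the "Model Status" textbox only changes when the separate "Load Model" button is pressed. If the intent of the "Ensure load confirmation is shown before results" comment is to surface the load status on every run, one option (not part of this commit, and relying on the event chaining with .then() available in recent Gradio releases) would be to chain the steps explicitly:

# Hypothetical alternative wiring: update load_status before each evaluation runs.
eval_button.click(fn=load_model, inputs=None, outputs=load_status).then(
    fn=run_toy_evaluation, inputs=None, outputs=[output_text, output_plot]
)
eval_mmlu_button.click(fn=load_model, inputs=None, outputs=load_status).then(
    fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output]
)

This keeps the lazy-load guards inside the handlers as a fallback while making the load confirmation visible before the results appear.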