LamiaYT committed on
Commit
8c139ea
·
1 Parent(s): 1d9a78b
Files changed (2) hide show
  1. app.py +185 -68
  2. requirements.txt +11 -24
app.py CHANGED
@@ -11,11 +11,13 @@ from duckduckgo_search import DDGS
11
  from pdfminer.high_level import extract_text
12
  from bs4 import BeautifulSoup
13
  import html2text
14
- from typing import Dict, Any, List, Tuple, Callable
15
  from dotenv import load_dotenv
16
- from transformers import AutoModelForCausalLM, AutoTokenizer
17
  import torch
18
  import time
 
 
19
  # --- Load Environment Variables ---
20
  load_dotenv()
21
  SERPER_API_KEY = os.getenv("SERPER_API_KEY")
@@ -31,24 +33,32 @@ os.environ["PIP_BREAK_SYSTEM_PACKAGES"] = "1"
31
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
32
  os.environ["BITSANDBYTES_NOWELCOME"] = "1"
33
 
34
-
35
- MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
36
-
37
  print("Loading model (CPU-compatible)...")
38
  start_time = time.time()
39
 
 
40
  model = AutoModelForCausalLM.from_pretrained(
41
  MODEL_NAME,
42
  trust_remote_code=True,
43
- torch_dtype=torch.float32 # Use float32 for CPU compatibility
 
 
 
 
 
 
 
 
 
44
  )
45
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
46
 
 
 
 
47
 
48
  load_time = time.time() - start_time
49
  print(f"Model loaded in {load_time:.2f} seconds")
50
 
51
-
52
  # --- Tools for GAIA Agent ---
53
  def web_search(query: str) -> str:
54
  """Search the web using DuckDuckGo or Serper API"""
@@ -68,7 +78,8 @@ def web_search(query: str) -> str:
68
  response = requests.post(
69
  'https://google.serper.dev/search',
70
  headers=headers,
71
- json=params
 
72
  )
73
  results = response.json()
74
  if 'organic' in results:
@@ -85,23 +96,37 @@ def web_search(query: str) -> str:
85
  def calculator(expression: str) -> str:
86
  """Evaluate mathematical expressions safely"""
87
  try:
88
- return str(numexpr.evaluate(expression))
 
 
 
89
  except Exception as e:
90
  return f"Calculation error: {str(e)}"
91
 
92
  def read_pdf(file_path: str) -> str:
93
  """Extract text from PDF files"""
94
  try:
95
- return extract_text(file_path)[:2000] # Limit to first 2000 characters
 
96
  except Exception as e:
97
  return f"PDF read error: {str(e)}"
98
 
99
  def read_webpage(url: str) -> str:
100
  """Fetch and extract text from web pages"""
101
  try:
102
- response = requests.get(url, timeout=10)
 
 
 
 
103
  soup = BeautifulSoup(response.text, 'html.parser')
104
- return soup.get_text(separator=' ', strip=True)[:2000] # Limit text
 
 
 
 
 
 
105
  except Exception as e:
106
  return f"Webpage read error: {str(e)}"
107
 
@@ -139,24 +164,33 @@ class GAIA_Agent:
139
  print(f"\nProcessing: {question[:80]}...")
140
  self.history = [f"Question: {question}"]
141
 
142
- for step in range(MAX_STEPS):
143
- prompt = self._build_prompt()
144
- response = self._call_model(prompt)
145
-
146
- if "Final Answer" in response:
147
- answer = response.split("Final Answer:")[-1].strip()
148
- print(f"Final Answer: {answer}")
149
- return answer
150
 
151
- tool_call = self._parse_tool_call(response)
152
- if tool_call:
153
- tool_name, args = tool_call
154
- observation = self._use_tool(tool_name, args)
155
- self.history.append(f"Observation: {observation}")
156
- else:
157
- self.history.append(f"Thought: {response}")
158
-
159
- return "Agent couldn't find solution within step limit"
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
  def _build_prompt(self) -> str:
162
  prompt = "<|system|>\n" + self.system_prompt + "<|end|>\n"
@@ -167,28 +201,60 @@ class GAIA_Agent:
167
  def _call_model(self, prompt: str) -> str:
168
  start_time = time.time()
169
 
170
- inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(model.device)
171
- outputs = model.generate(
172
- **inputs,
173
- max_new_tokens=MAX_TOKENS,
174
- temperature=0.01,
175
- do_sample=True,
176
- pad_token_id=tokenizer.eos_token_id
177
- )
178
-
179
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
180
- response = response.split("<|assistant|>")[-1].strip()
181
-
182
- gen_time = time.time() - start_time
183
- print(f"Generated {len(response)} tokens in {gen_time:.2f}s: {response[:60]}...")
184
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
- def _parse_tool_call(self, text: str) -> Tuple[str, Dict] or None:
187
  try:
188
  json_match = re.search(r'```json\s*({.*?})\s*```', text, re.DOTALL)
189
  if json_match:
190
  tool_call = json.loads(json_match.group(1))
191
- return tool_call["tool"], tool_call["args"]
 
192
  except Exception as e:
193
  print(f"Tool parse error: {str(e)}")
194
  return None
@@ -230,12 +296,12 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
230
  return f"Error initializing agent: {e}", None
231
 
232
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
233
- print(agent_code)
234
 
235
  # Fetch Questions
236
  print(f"Fetching questions from: {questions_url}")
237
  try:
238
- response = requests.get(questions_url, timeout=15)
239
  response.raise_for_status()
240
  questions_data = response.json()
241
  if not questions_data:
@@ -253,19 +319,38 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
253
  results_log = []
254
  answers_payload = []
255
  print(f"Running agent on {len(questions_data)} questions...")
256
- for item in questions_data:
 
257
  task_id = item.get("task_id")
258
  question_text = item.get("question")
 
259
  if not task_id or question_text is None:
260
  print(f"Skipping item with missing task_id or question: {item}")
261
  continue
 
262
  try:
 
263
  submitted_answer = agent(question_text)
264
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
265
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
 
 
 
 
 
266
  except Exception as e:
267
  print(f"Error running agent on task {task_id}: {e}")
268
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
 
 
269
 
270
  if not answers_payload:
271
  print("Agent did not produce any answers to submit.")
@@ -283,7 +368,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
283
  # Submit
284
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
285
  try:
286
- response = requests.post(submit_url, json=submission_data, timeout=60)
287
  response.raise_for_status()
288
  result_data = response.json()
289
  final_status = (
@@ -314,42 +399,74 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
314
  return status_message, results_df
315
 
316
  # --- Gradio Interface ---
317
- with gr.Blocks() as demo:
318
  gr.Markdown("# GAIA Agent Evaluation Runner")
319
  gr.Markdown(
320
  """
321
  **Instructions:**
322
- 1. Log in to your Hugging Face account
323
- 2. Click 'Run Evaluation & Submit All Answers'
324
- 3. View results and score
325
 
326
- **Agent Info:**
327
- - Model: Phi-3-mini-4k-instruct (4-bit quantized)
328
  - Tools: Web Search, Calculator, PDF Reader, Webpage Reader
329
- - Max Steps: 6
 
330
  """
331
  )
332
 
333
- gr.LoginButton()
334
- run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
335
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
336
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  run_button.click(
339
  fn=run_and_submit_all,
340
- outputs=[status_output, results_table]
 
341
  )
342
 
343
  if __name__ == "__main__":
344
- print("\n" + "-"*30 + " App Starting " + "-"*30)
 
 
 
345
  space_host = os.getenv("SPACE_HOST")
346
  space_id = os.getenv("SPACE_ID")
347
 
348
  if space_host:
349
  print(f"✅ SPACE_HOST found: {space_host}")
 
 
 
350
  if space_id:
351
  print(f"✅ SPACE_ID found: {space_id}")
 
 
352
 
353
- print("-"*(60 + len(" App Starting ")) + "\n")
354
  print("Launching Gradio Interface...")
355
- demo.launch(debug=True, share=False)
 
 
 
 
 
 
 
11
  from pdfminer.high_level import extract_text
12
  from bs4 import BeautifulSoup
13
  import html2text
14
+ from typing import Dict, Any, List, Tuple, Callable, Optional
15
  from dotenv import load_dotenv
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
17
  import torch
18
  import time
19
+ import gc
20
+
21
  # --- Load Environment Variables ---
22
  load_dotenv()
23
  SERPER_API_KEY = os.getenv("SERPER_API_KEY")
 
33
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
34
  os.environ["BITSANDBYTES_NOWELCOME"] = "1"
35
 
 
 
 
36
  print("Loading model (CPU-compatible)...")
37
  start_time = time.time()
38
 
39
+ # Load model with explicit configuration for better compatibility
40
  model = AutoModelForCausalLM.from_pretrained(
41
  MODEL_NAME,
42
  trust_remote_code=True,
43
+ torch_dtype=torch.float32, # Use float32 for CPU compatibility
44
+ device_map="cpu", # Explicitly set to CPU
45
+ low_cpu_mem_usage=True, # Optimize for low memory usage
46
+ use_cache=False # Disable cache to avoid DynamicCache issues
47
+ )
48
+
49
+ tokenizer = AutoTokenizer.from_pretrained(
50
+ MODEL_NAME,
51
+ use_fast=False,
52
+ trust_remote_code=True
53
  )
 
54
 
55
+ # Ensure pad token is set
56
+ if tokenizer.pad_token is None:
57
+ tokenizer.pad_token = tokenizer.eos_token
58
 
59
  load_time = time.time() - start_time
60
  print(f"Model loaded in {load_time:.2f} seconds")
61
 
 
62
  # --- Tools for GAIA Agent ---
63
  def web_search(query: str) -> str:
64
  """Search the web using DuckDuckGo or Serper API"""
 
78
  response = requests.post(
79
  'https://google.serper.dev/search',
80
  headers=headers,
81
+ json=params,
82
+ timeout=10
83
  )
84
  results = response.json()
85
  if 'organic' in results:
 
96
  def calculator(expression: str) -> str:
97
  """Evaluate mathematical expressions safely"""
98
  try:
99
+ # Clean the expression
100
+ expression = re.sub(r'[^\d+\-*/().\s]', '', expression)
101
+ result = numexpr.evaluate(expression)
102
+ return str(result)
103
  except Exception as e:
104
  return f"Calculation error: {str(e)}"
105
 
106
  def read_pdf(file_path: str) -> str:
107
  """Extract text from PDF files"""
108
  try:
109
+ text = extract_text(file_path)
110
+ return text[:2000] if text else "No text found in PDF"
111
  except Exception as e:
112
  return f"PDF read error: {str(e)}"
113
 
114
  def read_webpage(url: str) -> str:
115
  """Fetch and extract text from web pages"""
116
  try:
117
+ headers = {
118
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
119
+ }
120
+ response = requests.get(url, timeout=10, headers=headers)
121
+ response.raise_for_status()
122
  soup = BeautifulSoup(response.text, 'html.parser')
123
+
124
+ # Remove script and style elements
125
+ for script in soup(["script", "style"]):
126
+ script.decompose()
127
+
128
+ text = soup.get_text(separator=' ', strip=True)
129
+ return text[:2000] if text else "No text found on webpage"
130
  except Exception as e:
131
  return f"Webpage read error: {str(e)}"
132
 
 
164
  print(f"\nProcessing: {question[:80]}...")
165
  self.history = [f"Question: {question}"]
166
 
167
+ try:
168
+ for step in range(MAX_STEPS):
169
+ prompt = self._build_prompt()
170
+ response = self._call_model(prompt)
 
 
 
 
171
 
172
+ if "Final Answer" in response:
173
+ answer = response.split("Final Answer:")[-1].strip()
174
+ print(f"Final Answer: {answer}")
175
+ return answer
176
+
177
+ tool_call = self._parse_tool_call(response)
178
+ if tool_call:
179
+ tool_name, args = tool_call
180
+ observation = self._use_tool(tool_name, args)
181
+ self.history.append(f"Observation: {observation}")
182
+ else:
183
+ self.history.append(f"Thought: {response}")
184
+
185
+ # Clean up memory after each step
186
+ if step % 2 == 0:
187
+ gc.collect()
188
+
189
+ return "Agent couldn't find solution within step limit"
190
+
191
+ except Exception as e:
192
+ print(f"Error in agent execution: {str(e)}")
193
+ return f"Agent error: {str(e)}"
194
 
195
  def _build_prompt(self) -> str:
196
  prompt = "<|system|>\n" + self.system_prompt + "<|end|>\n"
 
201
  def _call_model(self, prompt: str) -> str:
202
  start_time = time.time()
203
 
204
+ try:
205
+ # Tokenize input
206
+ inputs = tokenizer(
207
+ prompt,
208
+ return_tensors="pt",
209
+ return_attention_mask=True,
210
+ truncation=True,
211
+ max_length=3072 # Leave room for generation
212
+ )
213
+
214
+ # Move to same device as model
215
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
216
+
217
+ # Create generation config
218
+ generation_config = GenerationConfig(
219
+ max_new_tokens=MAX_TOKENS,
220
+ temperature=0.01,
221
+ do_sample=True,
222
+ pad_token_id=tokenizer.pad_token_id,
223
+ eos_token_id=tokenizer.eos_token_id,
224
+ use_cache=False # Disable cache to avoid DynamicCache issues
225
+ )
226
+
227
+ # Generate response
228
+ with torch.no_grad():
229
+ outputs = model.generate(
230
+ **inputs,
231
+ generation_config=generation_config
232
+ )
233
+
234
+ # Decode response
235
+ full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
236
+ response = full_response.split("<|assistant|>")[-1].strip()
237
+
238
+ gen_time = time.time() - start_time
239
+ print(f"Generated {len(response)} tokens in {gen_time:.2f}s: {response[:60]}...")
240
+
241
+ # Clean up
242
+ del inputs, outputs
243
+ gc.collect()
244
+
245
+ return response
246
+
247
+ except Exception as e:
248
+ print(f"Model generation error: {str(e)}")
249
+ return f"Generation error: {str(e)}"
250
 
251
+ def _parse_tool_call(self, text: str) -> Optional[Tuple[str, Dict]]:
252
  try:
253
  json_match = re.search(r'```json\s*({.*?})\s*```', text, re.DOTALL)
254
  if json_match:
255
  tool_call = json.loads(json_match.group(1))
256
+ if "tool" in tool_call and "args" in tool_call:
257
+ return tool_call["tool"], tool_call["args"]
258
  except Exception as e:
259
  print(f"Tool parse error: {str(e)}")
260
  return None
 
296
  return f"Error initializing agent: {e}", None
297
 
298
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
299
+ print(f"Agent code URL: {agent_code}")
300
 
301
  # Fetch Questions
302
  print(f"Fetching questions from: {questions_url}")
303
  try:
304
+ response = requests.get(questions_url, timeout=30)
305
  response.raise_for_status()
306
  questions_data = response.json()
307
  if not questions_data:
 
319
  results_log = []
320
  answers_payload = []
321
  print(f"Running agent on {len(questions_data)} questions...")
322
+
323
+ for i, item in enumerate(questions_data):
324
  task_id = item.get("task_id")
325
  question_text = item.get("question")
326
+
327
  if not task_id or question_text is None:
328
  print(f"Skipping item with missing task_id or question: {item}")
329
  continue
330
+
331
  try:
332
+ print(f"Processing question {i+1}/{len(questions_data)}")
333
  submitted_answer = agent(question_text)
334
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
335
+ results_log.append({
336
+ "Task ID": task_id,
337
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
338
+ "Submitted Answer": submitted_answer
339
+ })
340
+
341
+ # Clean up memory periodically
342
+ if i % 5 == 0:
343
+ gc.collect()
344
+
345
  except Exception as e:
346
  print(f"Error running agent on task {task_id}: {e}")
347
+ error_answer = f"AGENT ERROR: {str(e)}"
348
+ answers_payload.append({"task_id": task_id, "submitted_answer": error_answer})
349
+ results_log.append({
350
+ "Task ID": task_id,
351
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
352
+ "Submitted Answer": error_answer
353
+ })
354
 
355
  if not answers_payload:
356
  print("Agent did not produce any answers to submit.")
 
368
  # Submit
369
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
370
  try:
371
+ response = requests.post(submit_url, json=submission_data, timeout=120)
372
  response.raise_for_status()
373
  result_data = response.json()
374
  final_status = (
 
399
  return status_message, results_df
400
 
401
  # --- Gradio Interface ---
402
+ with gr.Blocks(title="GAIA Agent Evaluation") as demo:
403
  gr.Markdown("# GAIA Agent Evaluation Runner")
404
  gr.Markdown(
405
  """
406
  **Instructions:**
407
+ 1. Log in to your Hugging Face account using the button below
408
+ 2. Click 'Run Evaluation & Submit All Answers' to start the evaluation
409
+ 3. View results and score in the output sections
410
 
411
+ **Agent Information:**
412
+ - Model: Phi-3-mini-4k-instruct (CPU optimized)
413
  - Tools: Web Search, Calculator, PDF Reader, Webpage Reader
414
+ - Max Steps: 6 per question
415
+ - Memory: Optimized for 2vCPU/16GB environment
416
  """
417
  )
418
 
419
+ with gr.Row():
420
+ gr.LoginButton()
421
+
422
+ with gr.Row():
423
+ run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary", size="lg")
424
+
425
+ with gr.Row():
426
+ status_output = gr.Textbox(
427
+ label="Evaluation Status & Submission Result",
428
+ lines=5,
429
+ interactive=False,
430
+ placeholder="Click the button above to start evaluation..."
431
+ )
432
+
433
+ with gr.Row():
434
+ results_table = gr.DataFrame(
435
+ label="Questions and Agent Answers",
436
+ wrap=True,
437
+ interactive=False
438
+ )
439
 
440
  run_button.click(
441
  fn=run_and_submit_all,
442
+ outputs=[status_output, results_table],
443
+ show_progress=True
444
  )
445
 
446
  if __name__ == "__main__":
447
+ print("\n" + "="*50)
448
+ print("GAIA Agent Evaluation System Starting")
449
+ print("="*50)
450
+
451
  space_host = os.getenv("SPACE_HOST")
452
  space_id = os.getenv("SPACE_ID")
453
 
454
  if space_host:
455
  print(f"✅ SPACE_HOST found: {space_host}")
456
+ else:
457
+ print("⚠️ SPACE_HOST not found")
458
+
459
  if space_id:
460
  print(f"✅ SPACE_ID found: {space_id}")
461
+ else:
462
+ print("⚠️ SPACE_ID not found")
463
 
464
+ print("="*50)
465
  print("Launching Gradio Interface...")
466
+ demo.launch(
467
+ debug=False, # Disable debug in production
468
+ share=False,
469
+ server_name="0.0.0.0",
470
+ server_port=7860,
471
+ show_error=True
472
+ )
requirements.txt CHANGED
@@ -1,26 +1,13 @@
1
- # Core dependencies
2
  gradio>=4.0.0
3
- requests>=2.31.0
4
- pandas>=2.0.0
5
-
6
- # Local LLM support
7
- ctransformers>=0.2.27
8
-
9
- # Mathematical operations
10
- numpy>=1.24.0
11
-
12
- # Logging and utilities
13
- python-dotenv>=1.0.0
14
-
15
- # Additional utilities for text processing
16
- regex>=2023.10.3
17
- numexpr
18
- torch
19
- pdfminer.six
20
- transformers>=4.0.0
21
- duckduckgo-search>=0.8
22
- beautifulsoup4>=4.12.0
23
  html2text>=2020.1.16
24
- bitsandbytes
25
- accelerate
26
- sentencepiece
 
 
1
  gradio>=4.0.0
2
+ torch>=2.0.0
3
+ transformers>=4.35.0
4
+ requests>=2.25.0
5
+ pandas>=1.3.0
6
+ numpy>=1.21.0
7
+ duckduckgo-search>=3.8.0
8
+ pdfminer.six>=20220524
9
+ beautifulsoup4>=4.9.0
 
 
 
 
 
 
 
 
 
 
 
 
10
  html2text>=2020.1.16
11
+ numexpr>=2.8.0
12
+ python-dotenv>=0.19.0
13
+ accelerate>=0.20.0