LamiaYT committed on
Commit
dfcd4f6
·
1 Parent(s): c913a81
Files changed (2) hide show
  1. app.py +409 -297
  2. requirements.txt +10 -10
app.py CHANGED
@@ -5,268 +5,367 @@ import inspect
5
  import pandas as pd
6
  import json
7
  import re
8
- from typing import Dict, List, Any, Optional
9
- import urllib.parse
10
  from datetime import datetime
11
- import math
 
 
 
12
 
13
- # Transformers and torch imports
14
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
15
- import torch
16
 
17
  # --- Constants ---
18
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
19
 
20
class EnhancedGAIAAgent:
    """GAIA question-answering agent backed by a Mistral-7B-Instruct pipeline.

    The agent classifies each question with keyword heuristics, builds a
    task-specific prompt and asks the text-generation pipeline for an answer.
    When the model could not be loaded (``self.pipe`` is ``None``) or generation
    raises, it answers with the rule-based ``_handle_fallback`` instead.
    """

    def __init__(self):
        """Load the Mistral model and register the helper tools.

        Model loading is best-effort: any failure is reported on stdout and
        leaves ``self.pipe`` set to ``None`` so the agent can still answer
        through its fallback path.
        """
        print("Initializing Enhanced GAIA Agent with Mistral-7B...")

        # Initialize Mistral model
        try:
            print("Loading Mistral-7B-Instruct model...")
            self.tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
            self.model = AutoModelForCausalLM.from_pretrained(
                "mistralai/Mistral-7B-Instruct-v0.3",
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None
            )

            # Create pipeline for easier use
            self.pipe = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            print("✅ Mistral model loaded successfully!")

        except Exception as e:
            print(f"❌ Error loading Mistral model: {e}")
            print("Falling back to basic responses...")
            self.pipe = None

        # Tool functions for GAIA tasks
        self.tools = {
            "calculate": self._calculate,
            "search_web": self._search_web,
            "parse_data": self._parse_data,
            "analyze_text": self._analyze_text,
            "solve_math": self._solve_math
        }

    def _calculate(self, expression: str) -> str:
        """Evaluate an arithmetic expression and return the result as text.

        Every character other than digits, ``+ - * / ( ) .`` and whitespace is
        removed before evaluation, so names and function calls never reach
        ``eval``. Returns ``"Calculation error: ..."`` when evaluation fails.
        """
        try:
            # Clean and validate expression
            expression = re.sub(r'[^0-9+\-*/().\s]', '', expression)
            # NOTE(security): eval() only ever sees digits and operators here, so it
            # cannot execute arbitrary code, but a huge power such as 9**9**9 can
            # still exhaust CPU/memory. Replace with a bounded evaluator if this is
            # exposed to untrusted input.
            result = eval(expression)
            return str(result)
        except Exception as e:
            return f"Calculation error: {e}"

    def _search_web(self, query: str) -> str:
        """Return a placeholder search result; no real search API is called."""
        # This is a placeholder - integrate with actual search API
        return f"Search results for '{query}': [This would contain real search results]"

    def _parse_data(self, data: str) -> str:
        """Summarise ``data``: element count for JSON, line/word count for text."""
        try:
            stripped = data.strip()
            # Try to parse as JSON
            if stripped.startswith('{') or stripped.startswith('['):
                parsed = json.loads(data)
                return f"Parsed data structure with {len(parsed) if isinstance(parsed, (list, dict)) else 1} elements"
            # Basic text analysis
            lines = data.split('\n')
            return f"Text data with {len(lines)} lines, {len(data.split())} words"
        except Exception as e:
            return f"Data parsing error: {e}"

    def _analyze_text(self, text: str) -> str:
        """Return word and sentence counts (sentences are split on ``.``)."""
        words = text.split()
        sentences = text.split('.')
        return f"Text analysis: {len(words)} words, {len(sentences)} sentences"

    def _solve_math(self, problem: str) -> str:
        """Solve simple percent / average / arithmetic word problems.

        Percent problems multiply the first number by the second as a
        percentage; average problems return the mean of every number found.
        Anything else is evaluated as arithmetic.
        """
        try:
            # Extract numbers and operations
            numbers = re.findall(r'-?\d+\.?\d*', problem)
            lowered = problem.lower()

            # Handle common math patterns
            if ("percent" in lowered or "%" in problem) and len(numbers) >= 2:
                base = float(numbers[0])
                percent = float(numbers[1])
                return str(base * (percent / 100))

            if ("average" in lowered or "mean" in lowered) and numbers:
                nums = [float(n) for n in numbers]
                return str(sum(nums) / len(nums))

            # Default calculation. Joining several numbers with spaces ("3 4") is
            # never a valid expression, so with two or more numbers evaluate the
            # arithmetic written in the problem itself (operators are kept by
            # _calculate, words are stripped).
            if len(numbers) <= 1:
                return self._calculate(" ".join(numbers))
            return self._calculate(problem)

        except Exception as e:
            return f"Math solving error: {e}"

    def _generate_response(self, prompt: str) -> str:
        """Run ``prompt`` through the pipeline and return the assistant's text.

        Never raises: when the pipeline is missing or generation fails, a
        descriptive message string is returned instead.
        """
        if not self.pipe:
            return "Model not available - using fallback response."

        try:
            messages = [
                {"role": "user", "content": prompt}
            ]

            response = self.pipe(messages, max_new_tokens=512, temperature=0.7)

            # Extract the generated text
            if response and len(response) > 0:
                generated_text = response[0]['generated_text']
                # Get only the assistant's response (after the user message)
                if isinstance(generated_text, list):
                    for msg in generated_text:
                        if msg.get('role') == 'assistant':
                            return msg.get('content', '')
                elif isinstance(generated_text, str):
                    return generated_text
                else:
                    return str(generated_text)

            return "No response generated."

        except Exception as e:
            print(f"Error generating response: {e}")
            return f"Error in response generation: {e}"

    def _detect_task_type(self, question: str) -> str:
        """Classify ``question`` by keyword so a matching prompt can be built.

        NOTE(review): keywords are matched as substrings, so e.g. any hyphen
        counts as a minus sign and "meaning" contains "mean".
        """
        question_lower = question.lower()

        if any(word in question_lower for word in ["calculate", "compute", "math", "+", "-", "*", "/", "="]):
            return "calculation"
        elif any(word in question_lower for word in ["search", "find", "lookup", "google"]):
            return "search"
        elif any(word in question_lower for word in ["data", "csv", "json", "table", "parse"]):
            return "data_analysis"
        elif any(word in question_lower for word in ["percent", "%", "average", "mean", "sum"]):
            return "math_word_problem"
        else:
            return "general_reasoning"

    def _build_prompt(self, task_type: str, question: str) -> str:
        """Return the task-specific instruction prompt for ``question``."""
        if task_type == "calculation":
            lines = [
                "You are a precise mathematical assistant. Solve this step-by-step:",
                "",
                f"Question: {question}",
                "",
                "Provide a clear, accurate answer. If calculation is needed, show your work.",
            ]
        elif task_type == "math_word_problem":
            lines = [
                "You are solving a math word problem. Break it down step by step:",
                "",
                f"Question: {question}",
                "",
                "Steps:",
                "1. Identify what is being asked",
                "2. Extract the relevant numbers",
                "3. Determine the operation needed",
                "4. Calculate the result",
                "5. Provide the final answer",
                "",
            ]
        elif task_type == "data_analysis":
            lines = [
                "You are analyzing data. Approach this systematically:",
                "",
                f"Question: {question}",
                "",
                "Consider:",
                "- What type of data is involved?",
                "- What analysis is needed?",
                "- What tools or methods should be used?",
                "",
                "Provide a clear, structured answer.",
            ]
        else:
            lines = [
                "You are a helpful assistant that provides accurate, well-reasoned answers.",
                "",
                f"Question: {question}",
                "",
                "Think through this step-by-step and provide a clear, comprehensive answer.",
            ]
        return "\n" + "\n".join(lines) + "\nAnswer:"

    def __call__(self, question: str) -> str:
        """Answer ``question`` and return the answer text.

        Uses the model when it is loaded; otherwise (or when generation
        raises) the rule-based fallback answer is returned.
        """
        print(f"Agent processing question (first 100 chars): {question[:100]}...")

        # Detect task type
        task_type = self._detect_task_type(question)
        print(f"Detected task type: {task_type}")

        # _generate_response() swallows its own errors, so without this guard a
        # missing model would submit "Model not available..." as the answer and
        # the fallback below would never run.
        if not self.pipe:
            return self._handle_fallback(question, task_type)

        enhanced_prompt = self._build_prompt(task_type, question)

        # Generate response using the model
        try:
            response = self._generate_response(enhanced_prompt)
            print(f"Agent returning response (first 100 chars): {response[:100]}...")
            return response.strip()
        except Exception as e:
            print(f"Error in agent processing: {e}")
            return self._handle_fallback(question, task_type)

    def _handle_fallback(self, question: str, task_type: str) -> str:
        """Provide a rule-based answer when the main model is unavailable or fails."""
        if task_type == "calculation":
            # Try to extract and calculate simple expressions
            try:
                numbers = re.findall(r'-?\d+\.?\d*', question)
                if len(numbers) >= 2:
                    if "+" in question:
                        result = sum(float(n) for n in numbers)
                        return f"The sum is {result}"
                    elif "*" in question or "multiply" in question.lower():
                        result = 1
                        for n in numbers:
                            result *= float(n)
                        return f"The product is {result}"
            except (ValueError, OverflowError):
                # Best-effort only: fall through to the generic message below.
                pass

        return f"I understand you're asking about: {question}. This appears to be a {task_type} task. Let me provide my best analysis based on the available information."
261
-
262
 
263
  def run_and_submit_all(profile: gr.OAuthProfile | None):
264
  """
265
- Fetches all questions, runs the EnhancedGAIAAgent on them, submits all answers,
266
  and displays the results.
267
  """
268
  # --- Determine HF Space Runtime URL and Repo URL ---
269
- space_id = os.getenv("SPACE_ID")
270
 
271
  if profile:
272
  username = f"{profile.username}"
@@ -279,17 +378,18 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
279
  questions_url = f"{api_url}/questions"
280
  submit_url = f"{api_url}/submit"
281
 
282
- # 1. Instantiate Enhanced Agent
283
  try:
284
- print("Initializing Enhanced GAIA Agent...")
285
- agent = EnhancedGAIAAgent()
286
- print("โœ… Agent initialized successfully!")
287
  except Exception as e:
288
- print(f"โŒ Error instantiating agent: {e}")
289
  return f"Error initializing agent: {e}", None
290
-
 
291
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
292
- print(f"Agent code URL: {agent_code}")
293
 
294
  # 2. Fetch Questions
295
  print(f"Fetching questions from: {questions_url}")
@@ -300,83 +400,80 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
300
  if not questions_data:
301
  print("Fetched questions list is empty.")
302
  return "Fetched questions list is empty or invalid format.", None
303
- print(f"โœ… Fetched {len(questions_data)} questions.")
304
  except requests.exceptions.RequestException as e:
305
- print(f"โŒ Error fetching questions: {e}")
306
  return f"Error fetching questions: {e}", None
307
  except requests.exceptions.JSONDecodeError as e:
308
- print(f"โŒ Error decoding JSON response from questions endpoint: {e}")
 
309
  return f"Error decoding server response for questions: {e}", None
310
  except Exception as e:
311
- print(f"โŒ An unexpected error occurred fetching questions: {e}")
312
  return f"An unexpected error occurred fetching questions: {e}", None
313
 
314
- # 3. Run Enhanced Agent
315
  results_log = []
316
  answers_payload = []
317
- print(f"๐Ÿš€ Running enhanced agent on {len(questions_data)} questions...")
318
 
319
- for i, item in enumerate(questions_data, 1):
320
  task_id = item.get("task_id")
321
  question_text = item.get("question")
322
-
323
  if not task_id or question_text is None:
324
- print(f"โš ๏ธ Skipping item with missing task_id or question: {item}")
325
  continue
326
-
327
- print(f"๐Ÿ“ Processing question {i}/{len(questions_data)} (ID: {task_id})")
328
 
329
  try:
 
330
  submitted_answer = agent(question_text)
 
 
 
 
331
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
332
  results_log.append({
333
  "Task ID": task_id,
334
- "Question": question_text[:200] + "..." if len(question_text) > 200 else question_text,
335
- "Submitted Answer": submitted_answer[:300] + "..." if len(submitted_answer) > 300 else submitted_answer
 
336
  })
337
- print(f"โœ… Completed question {i}")
338
-
339
  except Exception as e:
340
- print(f"โŒ Error running agent on task {task_id}: {e}")
341
- error_response = f"AGENT ERROR: {e}"
342
- answers_payload.append({"task_id": task_id, "submitted_answer": error_response})
343
  results_log.append({
344
  "Task ID": task_id,
345
- "Question": question_text[:200] + "..." if len(question_text) > 200 else question_text,
346
- "Submitted Answer": error_response
 
347
  })
348
 
349
  if not answers_payload:
350
- print("โŒ Agent did not produce any answers to submit.")
351
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
352
 
353
- # 4. Prepare Submission
354
- submission_data = {
355
- "username": username.strip(),
356
- "agent_code": agent_code,
357
- "answers": answers_payload
358
- }
359
-
360
- print(f"๐Ÿ“ค Submitting {len(answers_payload)} answers for user '{username}'...")
361
 
362
  # 5. Submit
 
363
  try:
364
- response = requests.post(submit_url, json=submission_data, timeout=120) # Increased timeout
365
  response.raise_for_status()
366
  result_data = response.json()
367
-
368
  final_status = (
369
- f"๐ŸŽ‰ Submission Successful!\n"
370
  f"User: {result_data.get('username')}\n"
371
  f"Overall Score: {result_data.get('score', 'N/A')}% "
372
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
373
  f"Message: {result_data.get('message', 'No message received.')}"
374
  )
375
-
376
- print("โœ… Submission successful!")
377
  results_df = pd.DataFrame(results_log)
378
  return final_status, results_df
379
-
380
  except requests.exceptions.HTTPError as e:
381
  error_detail = f"Server responded with status {e.response.status_code}."
382
  try:
@@ -384,56 +481,65 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
384
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
385
  except requests.exceptions.JSONDecodeError:
386
  error_detail += f" Response: {e.response.text[:500]}"
387
- status_message = f"โŒ Submission Failed: {error_detail}"
 
 
 
 
 
 
 
 
 
 
388
  print(status_message)
389
  results_df = pd.DataFrame(results_log)
390
  return status_message, results_df
391
-
392
  except Exception as e:
393
- status_message = f"โŒ An unexpected error occurred during submission: {e}"
394
  print(status_message)
395
  results_df = pd.DataFrame(results_log)
396
  return status_message, results_df
397
 
398
 
399
  # --- Build Gradio Interface using Blocks ---
400
- with gr.Blocks(title="Enhanced GAIA Agent") as demo:
401
- gr.Markdown("# ๐Ÿš€ Enhanced GAIA Agent with Mistral-7B")
402
  gr.Markdown(
403
  """
404
- **Enhanced Features:**
405
- - ๐Ÿง  **Mistral-7B-Instruct** for advanced reasoning
406
- - ๐Ÿ”ง **Tool Integration** for calculations and data processing
407
- - ๐Ÿ“Š **Task Type Detection** for optimized responses
408
- - ๐ŸŽฏ **GAIA-Optimized** prompting strategies
 
409
 
410
  **Instructions:**
411
- 1. Clone this space and ensure you have access to Mistral-7B-Instruct
412
  2. Log in to your Hugging Face account using the button below
413
- 3. Click 'Run Enhanced Evaluation' to process all questions with the enhanced agent
414
 
415
- **Note:** The enhanced agent uses Mistral-7B which requires significant computational resources.
416
- Processing may take several minutes depending on the number of questions.
417
  """
418
  )
419
 
420
- with gr.Row():
421
- gr.LoginButton()
422
-
423
- with gr.Row():
424
- run_button = gr.Button("๐Ÿš€ Run Enhanced Evaluation & Submit All Answers", variant="primary")
425
 
426
- status_output = gr.Textbox(
427
- label="๐Ÿ“Š Run Status / Submission Result",
428
- lines=8,
429
- interactive=False
430
- )
431
-
432
- results_table = gr.DataFrame(
433
- label="๐Ÿ“ Questions and Agent Answers",
434
- wrap=True,
435
- height=400
436
- )
 
 
 
 
437
 
438
  run_button.click(
439
  fn=run_and_submit_all,
@@ -441,33 +547,39 @@ with gr.Blocks(title="Enhanced GAIA Agent") as demo:
441
  )
442
 
443
  if __name__ == "__main__":
444
- print("\n" + "="*50)
445
- print("๐Ÿš€ ENHANCED GAIA AGENT STARTING")
446
- print("="*50)
447
 
448
  # Environment check
449
  space_host = os.getenv("SPACE_HOST")
450
  space_id = os.getenv("SPACE_ID")
451
-
 
452
  if space_host:
453
  print(f"โœ… SPACE_HOST: {space_host}")
454
- print(f"๐ŸŒ Runtime URL: https://{space_host}.hf.space")
455
  else:
456
- print("โ„น๏ธ Running locally - SPACE_HOST not found")
457
 
458
  if space_id:
459
  print(f"โœ… SPACE_ID: {space_id}")
460
- print(f"๐Ÿ“ Repo URL: https://huggingface.co/spaces/{space_id}")
461
  else:
462
  print("โ„น๏ธ SPACE_ID not found")
463
-
464
- # GPU/CPU check
465
- if torch.cuda.is_available():
466
- print(f"๐ŸŽฎ GPU Available: {torch.cuda.get_device_name()}")
467
- print(f"๐Ÿ’พ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
468
  else:
469
- print("๐Ÿ’ป Running on CPU (GPU not available)")
470
-
471
- print("="*50)
472
- print("๐Ÿš€ Launching Enhanced GAIA Agent Interface...")
 
 
 
 
 
 
 
473
  demo.launch(debug=True, share=False)
 
5
  import pandas as pd
6
  import json
7
  import re
8
+ import time
9
+ from typing import List, Dict, Any, Optional
10
  from datetime import datetime
11
+ import threading
12
+ import queue
13
+ from ctransformers import AutoModelForCausalLM
14
+ import logging
15
 
16
+ # Setup logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
 
20
  # --- Constants ---
21
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
22
 
23
class WebSearchTool:
    """Web search tool using the Serper API for real-time information retrieval.

    ``search`` never raises: every outcome is reported through a result dict
    with the keys ``success``, ``results``, ``query`` and ``total_results``
    (plus ``error`` on failure).
    """

    def __init__(self, api_key: str):
        """Store the Serper API key and the search endpoint URL."""
        self.api_key = api_key
        self.base_url = "https://google.serper.dev/search"

    def search(self, query: str, num_results: int = 5) -> Dict[str, Any]:
        """Perform a web search and return structured results.

        Args:
            query: Search text. A blank query is rejected without calling the API.
            num_results: Maximum number of organic results to return; must be
                at least 1.

        Returns:
            On success ``{'success': True, 'results': [...], 'query': query,
            'total_results': n}`` where each result has ``title``, ``snippet``,
            ``link`` and ``position``. On failure ``success`` is ``False``,
            ``results`` is empty and ``error`` describes the problem.
        """
        # Reject requests that cannot produce results before spending an API call.
        if not query or not query.strip():
            return self._failure(query, "Empty search query")
        if num_results < 1:
            return self._failure(query, "num_results must be at least 1")

        try:
            headers = {
                'X-API-KEY': self.api_key,
                'Content-Type': 'application/json'
            }
            payload = {
                'q': query,
                'num': num_results,
                'gl': 'us',
                'hl': 'en'
            }

            response = requests.post(self.base_url, json=payload, headers=headers, timeout=10)
            response.raise_for_status()
            data = response.json()

            # Extract and format results; a missing or null 'organic' section
            # simply yields no results.
            organic = (data.get('organic') or []) if isinstance(data, dict) else []
            results = [
                {
                    'title': item.get('title', ''),
                    'snippet': item.get('snippet', ''),
                    'link': item.get('link', ''),
                    'position': item.get('position', 0)
                }
                for item in organic[:num_results]
            ]

            return {
                'success': True,
                'results': results,
                'query': query,
                'total_results': len(results)
            }

        except Exception as e:
            # Tool boundary: network, HTTP and decoding errors are all reported
            # to the caller instead of being raised.
            logger.error("Web search error: %s", e)
            return self._failure(query, str(e))

    @staticmethod
    def _failure(query: str, error: str) -> Dict[str, Any]:
        """Build the failure result dict returned by ``search``."""
        return {
            'success': False,
            'error': error,
            'results': [],
            'query': query,
            'total_results': 0
        }
77
+
78
class CalculatorTool:
    """Calculator tool that evaluates arithmetic expressions taken from text.

    Expressions may use numbers, ``+ - * / // % **`` (``^`` is accepted as a
    power operator and ``5x3`` as a multiplication), parentheses, the constants
    ``pi`` and ``e`` and a fixed set of math functions. Anything else
    (attribute access, subscripts, strings, unknown names, ...) is rejected
    before evaluation.
    """

    @staticmethod
    def _namespace() -> Dict[str, Any]:
        """Return the functions and constants an expression may reference."""
        import math  # imported locally, as the original implementation did

        return {
            "abs": abs, "round": round, "min": min, "max": max,
            "sum": sum, "pow": pow, "sqrt": math.sqrt,
            "pi": math.pi, "e": math.e,
            "sin": math.sin, "cos": math.cos, "tan": math.tan,
            "log": math.log, "log10": math.log10,
            "exp": math.exp, "floor": math.floor, "ceil": math.ceil,
        }

    @staticmethod
    def _compile_checked(expression: str, allowed_names):
        """Parse ``expression`` and compile it only if it is plain arithmetic.

        Raises:
            SyntaxError: if the text is not a valid Python expression.
            ValueError: if it uses syntax or names outside the whitelist.
        """
        import ast

        permitted = (
            ast.Expression, ast.BinOp, ast.UnaryOp, ast.Constant, ast.Name,
            ast.Load, ast.Call, ast.Tuple, ast.List,
            ast.Add, ast.Sub, ast.Mult, ast.Div, ast.FloorDiv, ast.Mod, ast.Pow,
            ast.UAdd, ast.USub,
        )
        tree = ast.parse(expression, mode="eval")
        for node in ast.walk(tree):
            if not isinstance(node, permitted):
                raise ValueError(f"Unsupported syntax: {type(node).__name__}")
            if isinstance(node, ast.Name) and node.id not in allowed_names:
                raise ValueError(f"Unknown name: {node.id}")
            if isinstance(node, ast.Call) and (not isinstance(node.func, ast.Name) or node.keywords):
                raise ValueError("Only plain calls to whitelisted functions are allowed")
            if isinstance(node, ast.Constant) and not isinstance(node.value, (int, float, complex)):
                raise ValueError("Only numeric literals are allowed")
        return compile(tree, "<calculator>", "eval")

    def calculate(self, expression: str) -> Dict[str, Any]:
        """Safely evaluate a mathematical expression.

        Args:
            expression: Expression text, e.g. ``"2^3 + 1"`` or ``"sqrt(16)"``.

        Returns:
            ``{'success': True, 'result': value, 'expression': normalized}`` on
            success, otherwise ``{'success': False, 'error': message,
            'expression': ..., 'result': None}``. Never raises.
        """
        try:
            # Clean the expression
            expression = expression.strip()

            # Replace common mathematical notations
            expression = expression.replace('^', '**')  # Power operator
            expression = re.sub(r'\b(\d+)x(\d+)\b', r'\1*\2', expression)  # Handle multiplication like 5x3

            # The same namespace is used for every expression, so "2*pi" works even
            # though it contains no function name.
            names = self._namespace()
            code = self._compile_checked(expression, names)

            # NOTE(security): eval() runs only on the whitelisted AST above with
            # builtins removed. Very large powers (9**9**9) are not bounded here.
            result = eval(code, {"__builtins__": {}}, names)

            return {
                'success': True,
                'result': result,
                'expression': expression
            }

        except Exception as e:
            logger.error("Calculator error: %s", e)
            return {
                'success': False,
                'error': str(e),
                'expression': expression,
                'result': None
            }
126
+
127
class LocalLLMManager:
    """Manages a local quantized LLM (loaded through ctransformers) for reasoning.

    The model is loaded lazily on the first ``generate`` call. Phi-3-mini is
    tried first; if that fails a TinyLlama chat model is loaded instead. The
    prompt template and stop tokens follow whichever model was loaded.
    """

    # Chat formats. Phi-3 ends a turn with <|end|>; the TinyLlama chat model is
    # documented with the Zephyr-style format, which ends a turn with </s>.
    _PHI3_TEMPLATE = "<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
    _PHI3_STOP = ("<|end|>", "<|user|>")
    _TINYLLAMA_TEMPLATE = "<|user|>\n{prompt}</s>\n<|assistant|>\n"
    _TINYLLAMA_STOP = ("</s>", "<|user|>")

    # Defaults match the primary (Phi-3) model; load_model() overrides them on
    # the instance when the fallback model is used.
    _prompt_template = _PHI3_TEMPLATE
    _stop_tokens = _PHI3_STOP

    def __init__(self):
        """Create an unloaded manager; the model is loaded on first use."""
        self.model = None
        self.model_loaded = False
        self.load_lock = threading.Lock()

    def load_model(self):
        """Load the quantized model for CPU inference (no-op when already loaded).

        Raises:
            Exception: re-raises the fallback model's load error when both the
                primary and the fallback model fail to load.
        """
        with self.load_lock:
            if self.model_loaded:
                return

            try:
                logger.info("Loading quantized model...")

                # Use Phi-3-mini for better performance on CPU with limited resources
                # NOTE(review): confirm the installed ctransformers build supports
                # model_type="phi3"; if it does not, the fallback below is always used.
                self.model = AutoModelForCausalLM.from_pretrained(
                    "microsoft/Phi-3-mini-4k-instruct-gguf",
                    model_file="Phi-3-mini-4k-instruct-q4.gguf",
                    model_type="phi3",
                    gpu_layers=0,  # CPU only
                    context_length=3072,  # Reduced context to save memory
                    max_new_tokens=512,
                    temperature=0.1,
                    top_p=0.9,
                    repetition_penalty=1.1
                )
                self._prompt_template = self._PHI3_TEMPLATE
                self._stop_tokens = self._PHI3_STOP
                self.model_loaded = True
                logger.info("Model loaded successfully")

            except Exception as e:
                logger.error(f"Error loading model: {e}")
                # Fallback to a smaller model if Phi-3 fails
                try:
                    logger.info("Trying fallback model...")
                    self.model = AutoModelForCausalLM.from_pretrained(
                        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                        model_file="tinyllama-1.1b-chat-v1.0.q4_k_m.gguf",
                        model_type="llama",
                        gpu_layers=0,
                        context_length=2048,
                        max_new_tokens=256
                    )
                    # The fallback model needs its own chat format, not Phi-3's.
                    self._prompt_template = self._TINYLLAMA_TEMPLATE
                    self._stop_tokens = self._TINYLLAMA_STOP
                    self.model_loaded = True
                    logger.info("Fallback model loaded successfully")
                except Exception as e2:
                    logger.error(f"Fallback model also failed: {e2}")
                    raise

    def generate(self, prompt: str, max_tokens: int = 256) -> str:
        """Generate a response to ``prompt`` from the local model.

        Args:
            prompt: User message; it is wrapped in the loaded model's chat format.
            max_tokens: Requested number of new tokens; capped at 256 for speed.

        Returns:
            The generated text with the echoed prompt and anything after a stop
            token removed, or an ``"Error ..."`` message if generation fails.

        Raises:
            Exception: propagated from ``load_model`` when no model can be loaded.
        """
        if not self.model_loaded:
            self.load_model()

        if not self.model:
            return "Error: Model not available"

        try:
            formatted_prompt = self._prompt_template.format(prompt=prompt)

            response = self.model(
                formatted_prompt,
                max_new_tokens=min(max_tokens, 256),  # Limit tokens for speed
                temperature=0.1,
                stop=list(self._stop_tokens)
            )

            # Clean response: drop an echoed prompt and cut at the first stop token.
            response = response.replace(formatted_prompt, "").strip()
            for token in self._stop_tokens:
                if token in response:
                    response = response.split(token)[0].strip()

            return response

        except Exception as e:
            logger.error(f"Generation error: {e}")
            return f"Error generating response: {e}"
208
+
209
class GAIAAgent:
    """GAIA agent combining a local LLM with web-search and calculator tools.

    For each question the agent (1) asks the LLM for an initial reasoning step,
    (2) runs the calculator or web search depending on the detected question
    type, (3) asks for a second reasoning step with the gathered context and
    (4) asks the LLM for the final answer.
    """

    # A digit (or closing parenthesis), an arithmetic operator, then a digit or
    # opening parenthesis: "2+2", "12 * (3", "2 ^ 3".
    _ARITHMETIC_RE = re.compile(r'[\d)]\s*(?:\*\*|[-+*/^=])\s*[\d(]')
    # Runs of characters that can make up an arithmetic expression.
    _EXPRESSION_CHARS_RE = re.compile(r'[\d\+\-\*/\(\)\.\s\^]+')
    # Keyword patterns in priority order. Keywords match as whole words (with a
    # few common endings) so "summary" is not "sum" and "known" is not "now".
    _QUESTION_PATTERNS = (
        ('mathematical', re.compile(r'\b(?:calculate|compute|math|sum|multiply|divide)(?:s|d|ed|ing|ly)?\b')),
        ('current_info', re.compile(r'\b(?:current|latest|recent|today|now|2024|2025)(?:s|d|ed|ing|ly)?\b')),
        ('factual', re.compile(r'\b(?:who|what|where|when|why|how)(?:s|d|ed|ing|ly)?\b')),
        ('analytical', re.compile(r'\b(?:analyze|compare|explain|reason)(?:s|d|ed|ing|ly)?\b')),
    )

    def __init__(self):
        """Create the tools; web search is disabled when SERPER_API_KEY is unset."""
        # Initialize tools
        self.serper_api_key = os.getenv("SERPER_API_KEY")
        if not self.serper_api_key:
            logger.warning("SERPER_API_KEY not found. Web search will be disabled.")
            self.web_search = None
        else:
            self.web_search = WebSearchTool(self.serper_api_key)

        self.calculator = CalculatorTool()
        self.llm = LocalLLMManager()

        # Agent configuration
        # NOTE(review): neither setting is read anywhere in this class.
        self.max_iterations = 5
        self.max_reasoning_length = 1000

        logger.info("GAIA Agent initialized")

    def _identify_question_type(self, question: str) -> str:
        """Classify the question to choose a tool.

        Returns one of ``'mathematical'``, ``'current_info'``, ``'factual'``,
        ``'analytical'`` or ``'general'``. Arithmetic between numbers or a math
        keyword wins over the other categories, which are checked in that order.
        """
        question_lower = question.lower()

        if self._ARITHMETIC_RE.search(question_lower):
            return 'mathematical'
        for category, pattern in self._QUESTION_PATTERNS:
            if pattern.search(question_lower):
                return category
        return 'general'

    def _extract_expressions(self, question: str) -> List[str]:
        """Return the arithmetic expressions found in ``question``.

        Only candidates that contain an operator between operands are kept, so
        bare numbers such as a year are ignored while short expressions such as
        ``2+2`` are kept.
        """
        expressions = []
        for raw in self._EXPRESSION_CHARS_RE.findall(question):
            # Drop surrounding whitespace and a sentence-ending full stop.
            candidate = raw.strip().rstrip('.').strip()
            if self._ARITHMETIC_RE.search(candidate):
                expressions.append(candidate)
        return expressions

    def _use_web_search(self, query: str) -> str:
        """Run a web search and format the top results as numbered text."""
        if not self.web_search:
            return "Web search not available (API key missing)"

        results = self.web_search.search(query, num_results=3)

        if not results['success']:
            return f"Search failed: {results.get('error', 'Unknown error')}"

        if not results['results']:
            return "No search results found"

        formatted_results = f"Search results for '{query}':\n"
        for i, result in enumerate(results['results'], 1):
            formatted_results += f"{i}. {result['title']}\n {result['snippet']}\n\n"

        return formatted_results

    def _use_calculator(self, expression: str) -> str:
        """Evaluate ``expression`` with the calculator tool and format the outcome."""
        result = self.calculator.calculate(expression)

        if result['success']:
            return f"Calculation: {result['expression']} = {result['result']}"
        return f"Calculation error: {result['error']}"

    def _generate_reasoning(self, question: str, context: str = "") -> str:
        """Ask the LLM for one reasoning step; returns a fixed message on error."""
        reasoning_prompt = (
            f"Question: {question}\n"
            "\n"
            f"Context: {context}\n"
            "\n"
            "Think step by step about this question. Consider:\n"
            "1. What information do I need?\n"
            "2. What tools might help?\n"
            "3. How should I approach this problem?\n"
            "\n"
            "Provide a clear reasoning step:"
        )

        try:
            return self.llm.generate(reasoning_prompt, max_tokens=200)
        except Exception as e:
            logger.error(f"Reasoning generation error: {e}")
            return "Unable to generate reasoning step"

    def _generate_final_answer(self, question: str, context: str, reasoning_steps: List[str]) -> str:
        """Ask the LLM for the final answer given the context and reasoning steps."""
        all_reasoning = "\n".join(f"Step {i}: {step}" for i, step in enumerate(reasoning_steps, 1))

        answer_prompt = (
            f"Question: {question}\n"
            "\n"
            "Context and Information:\n"
            f"{context}\n"
            "\n"
            "Reasoning Steps:\n"
            f"{all_reasoning}\n"
            "\n"
            "Based on all the information and reasoning above, provide a clear, "
            "concise, and accurate final answer to the question:"
        )

        try:
            answer = self.llm.generate(answer_prompt, max_tokens=200)
            return answer.strip()
        except Exception as e:
            logger.error(f"Answer generation error: {e}")
            return "Unable to generate final answer"

    def __call__(self, question: str) -> str:
        """Answer ``question``; returns an error message string if processing fails."""
        logger.info(f"Processing question: {question[:100]}...")

        try:
            # Initialize
            context = ""
            reasoning_steps = []
            question_type = self._identify_question_type(question)

            logger.info(f"Question type identified: {question_type}")

            # Step 1: Initial reasoning
            initial_reasoning = self._generate_reasoning(question)
            reasoning_steps.append(initial_reasoning)
            context += f"Initial reasoning: {initial_reasoning}\n\n"

            # Step 2: Apply tools based on question type
            if question_type == 'mathematical':
                for expression in self._extract_expressions(question):
                    # _use_calculator already labels its output "Calculation: ...".
                    context += f"{self._use_calculator(expression)}\n"

            elif question_type in ('current_info', 'factual'):
                # Use web search for factual or current information
                search_result = self._use_web_search(question)
                context += f"Web search results: {search_result}\n"

            # Step 3: Additional reasoning with context
            # NOTE(review): context always holds the initial reasoning, so this
            # step runs for every question, including ones where no tool ran.
            if context:
                additional_reasoning = self._generate_reasoning(question, context)
                reasoning_steps.append(additional_reasoning)
                context += f"Additional reasoning: {additional_reasoning}\n\n"

            # Step 4: Generate final answer
            final_answer = self._generate_final_answer(question, context, reasoning_steps)

            logger.info(f"Generated answer: {final_answer[:100]}...")
            return final_answer

        except Exception as e:
            logger.error(f"Agent execution error: {e}")
            return f"Error processing question: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
  def run_and_submit_all(profile: gr.OAuthProfile | None):
363
  """
364
+ Fetches all questions, runs the GAIA Agent on them, submits all answers,
365
  and displays the results.
366
  """
367
  # --- Determine HF Space Runtime URL and Repo URL ---
368
+ space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
369
 
370
  if profile:
371
  username = f"{profile.username}"
 
378
  questions_url = f"{api_url}/questions"
379
  submit_url = f"{api_url}/submit"
380
 
381
+ # 1. Instantiate Agent
382
  try:
383
+ print("Initializing GAIA Agent...")
384
+ agent = GAIAAgent()
385
+ print("GAIA Agent initialized successfully")
386
  except Exception as e:
387
+ print(f"Error instantiating agent: {e}")
388
  return f"Error initializing agent: {e}", None
389
+
390
+ # Agent code link
391
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
392
+ print(f"Agent code: {agent_code}")
393
 
394
  # 2. Fetch Questions
395
  print(f"Fetching questions from: {questions_url}")
 
400
  if not questions_data:
401
  print("Fetched questions list is empty.")
402
  return "Fetched questions list is empty or invalid format.", None
403
+ print(f"Fetched {len(questions_data)} questions.")
404
  except requests.exceptions.RequestException as e:
405
+ print(f"Error fetching questions: {e}")
406
  return f"Error fetching questions: {e}", None
407
  except requests.exceptions.JSONDecodeError as e:
408
+ print(f"Error decoding JSON response from questions endpoint: {e}")
409
+ print(f"Response text: {response.text[:500]}")
410
  return f"Error decoding server response for questions: {e}", None
411
  except Exception as e:
412
+ print(f"An unexpected error occurred fetching questions: {e}")
413
  return f"An unexpected error occurred fetching questions: {e}", None
414
 
415
+ # 3. Run GAIA Agent
416
  results_log = []
417
  answers_payload = []
418
+ print(f"Running GAIA agent on {len(questions_data)} questions...")
419
 
420
+ for i, item in enumerate(questions_data):
421
  task_id = item.get("task_id")
422
  question_text = item.get("question")
 
423
  if not task_id or question_text is None:
424
+ print(f"Skipping item with missing task_id or question: {item}")
425
  continue
426
+
427
+ print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")
428
 
429
  try:
430
+ start_time = time.time()
431
  submitted_answer = agent(question_text)
432
+ processing_time = time.time() - start_time
433
+
434
+ print(f"Question {task_id} processed in {processing_time:.2f}s")
435
+
436
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
437
  results_log.append({
438
  "Task ID": task_id,
439
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
440
+ "Submitted Answer": submitted_answer[:200] + "..." if len(submitted_answer) > 200 else submitted_answer,
441
+ "Processing Time (s)": f"{processing_time:.2f}"
442
  })
 
 
443
  except Exception as e:
444
+ print(f"Error running agent on task {task_id}: {e}")
 
 
445
  results_log.append({
446
  "Task ID": task_id,
447
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
448
+ "Submitted Answer": f"AGENT ERROR: {e}",
449
+ "Processing Time (s)": "Error"
450
  })
451
 
452
  if not answers_payload:
453
+ print("Agent did not produce any answers to submit.")
454
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
455
 
456
+ # 4. Prepare Submission
457
+ submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
458
+ status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
459
+ print(status_update)
 
 
 
 
460
 
461
  # 5. Submit
462
+ print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
463
  try:
464
+ response = requests.post(submit_url, json=submission_data, timeout=120)
465
  response.raise_for_status()
466
  result_data = response.json()
 
467
  final_status = (
468
+ f"Submission Successful!\n"
469
  f"User: {result_data.get('username')}\n"
470
  f"Overall Score: {result_data.get('score', 'N/A')}% "
471
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
472
  f"Message: {result_data.get('message', 'No message received.')}"
473
  )
474
+ print("Submission successful.")
 
475
  results_df = pd.DataFrame(results_log)
476
  return final_status, results_df
 
477
  except requests.exceptions.HTTPError as e:
478
  error_detail = f"Server responded with status {e.response.status_code}."
479
  try:
 
481
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
482
  except requests.exceptions.JSONDecodeError:
483
  error_detail += f" Response: {e.response.text[:500]}"
484
+ status_message = f"Submission Failed: {error_detail}"
485
+ print(status_message)
486
+ results_df = pd.DataFrame(results_log)
487
+ return status_message, results_df
488
+ except requests.exceptions.Timeout:
489
+ status_message = "Submission Failed: The request timed out."
490
+ print(status_message)
491
+ results_df = pd.DataFrame(results_log)
492
+ return status_message, results_df
493
+ except requests.exceptions.RequestException as e:
494
+ status_message = f"Submission Failed: Network error - {e}"
495
  print(status_message)
496
  results_df = pd.DataFrame(results_log)
497
  return status_message, results_df
 
498
  except Exception as e:
499
+ status_message = f"An unexpected error occurred during submission: {e}"
500
  print(status_message)
501
  results_df = pd.DataFrame(results_log)
502
  return status_message, results_df
503
 
504
 
505
  # --- Build Gradio Interface using Blocks ---
506
+ with gr.Blocks(title="GAIA Agent Evaluation") as demo:
507
+ gr.Markdown("# GAIA Agent Evaluation Runner")
508
  gr.Markdown(
509
  """
510
+ **Advanced GAIA Agent Features:**
511
+ - 🧠 Local quantized LLM for reasoning (Phi-3-mini optimized for CPU)
512
+ - 🔍 Web search capabilities via Serper API
513
+ - 🧮 Mathematical calculation tools
514
+ - 🎯 Multi-step problem solving approach
515
+ - 🚀 Optimized for 16GB RAM / 2 vCPU constraints
516
 
517
  **Instructions:**
518
+ 1. Ensure your SERPER_API_KEY environment variable is set for web search
519
  2. Log in to your Hugging Face account using the button below
520
+ 3. Click 'Run GAIA Evaluation' to start the comprehensive evaluation
521
 
522
+ **Note:** Initial model loading may take 1-2 minutes. Subsequent questions will be processed faster.
 
523
  """
524
  )
525
 
526
+ gr.LoginButton()
 
 
 
 
527
 
528
+ run_button = gr.Button("🚀 Run GAIA Evaluation & Submit All Answers", variant="primary")
529
+
530
+ status_output = gr.Textbox(label="📊 Evaluation Status & Results", lines=8, interactive=False)
531
+ results_table = gr.DataFrame(label="📋 Detailed Question Results", wrap=True)
532
+
533
+ # Add system info
534
+ with gr.Accordion("🔧 System Information", open=False):
535
+ gr.Markdown(f"""
536
+ - **Environment**: Hugging Face Space
537
+ - **Resources**: 16GB RAM, 2 vCPU
538
+ - **Model**: Phi-3-mini-4k-instruct (quantized)
539
+ - **Web Search**: {'✅ Enabled' if os.getenv('SERPER_API_KEY') else '❌ Disabled (no API key)'}
540
+ - **Calculator**: ✅ Enabled
541
+ - **Timestamp**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')}
542
+ """)
543
 
544
  run_button.click(
545
  fn=run_and_submit_all,
 
547
  )
548
 
549
  if __name__ == "__main__":
550
+ print("\n" + "="*70)
551
+ print("🚀 GAIA AGENT EVALUATION SYSTEM STARTING")
552
+ print("="*70)
553
 
554
  # Environment check
555
  space_host = os.getenv("SPACE_HOST")
556
  space_id = os.getenv("SPACE_ID")
557
+ serper_key = os.getenv("SERPER_API_KEY")
558
+
559
  if space_host:
560
  print(f"✅ SPACE_HOST: {space_host}")
561
+ print(f" ๐ŸŒ Runtime URL: https://{space_host}.hf.space")
562
  else:
563
+ print("ℹ️ Running locally (SPACE_HOST not found)")
564
 
565
  if space_id:
566
  print(f"✅ SPACE_ID: {space_id}")
567
+ print(f" ๐Ÿ“ Repo URL: https://huggingface.co/spaces/{space_id}")
568
  else:
569
  print("ℹ️ SPACE_ID not found")
570
+
571
+ if serper_key:
572
+ print("✅ SERPER_API_KEY: Configured")
 
 
573
  else:
574
+ print("⚠️ SERPER_API_KEY: Not found - Web search will be disabled")
575
+
576
+ print("="*70)
577
+ print("📚 GAIA Agent Features:")
578
+ print(" 🧠 Local LLM reasoning")
579
+ print(" 🔍 Web search integration")
580
+ print(" 🧮 Mathematical calculations")
581
+ print(" 🎯 Multi-step problem solving")
582
+ print("="*70 + "\n")
583
+
584
+ print("🎯 Launching GAIA Agent Evaluation Interface...")
585
  demo.launch(debug=True, share=False)
requirements.txt CHANGED
@@ -1,13 +1,13 @@
1
- torch>=2.0.0
2
  transformers>=4.35.0
3
- requests>=2.25.0
4
- pandas>=1.3.0
5
- numpy>=1.21.0
6
- duckduckgo-search>=3.8.0
7
- pdfminer.six>=20220524
8
- beautifulsoup4>=4.9.0
9
- html2text>=2020.1.16
10
- numexpr>=2.8.0
11
- python-dotenv>=0.19.0
12
  accelerate>=0.20.0
13
  sentencepiece>=0.1.99
 
 
 
 
1
+ gradio>=4.0.0
2
  transformers>=4.35.0
3
+ torch>=2.0.0
4
+ pandas>=1.5.0
5
+ requests>=2.28.0
6
+ beautifulsoup4>=4.11.0
7
+ wikipedia>=1.4.0
8
+ smolagents>=0.1.0
 
 
 
9
  accelerate>=0.20.0
10
  sentencepiece>=0.1.99
11
+ openpyxl
12
+ PyPDF2
13
+ pillow