LamiaYT committed on
Commit
7963312
·
1 Parent(s): 82a1534

Deploy GAIA agent

Browse files
Files changed (2) hide show
  1. app.py +429 -507
  2. requirements.txt +33 -15
app.py CHANGED
@@ -1,590 +1,512 @@
1
- # app.py - Production-Ready GAIA Agent with Robust Error Handling
2
-
3
  import os
4
  import gradio as gr
5
  import requests
 
6
  import pandas as pd
7
- import traceback
8
- import torch
9
- import re
10
  import json
11
- import time
12
- import random
13
- import urllib.parse
14
- from typing import Dict, List, Any
15
- import logging
16
-
17
- # Set up logging
18
- logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
20
-
21
- # Import dependencies with better error handling
22
- try:
23
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
24
- HF_AVAILABLE = True
25
- except ImportError:
26
- logger.warning("Transformers not available")
27
- HF_AVAILABLE = False
28
-
29
- try:
30
- import requests
31
- from bs4 import BeautifulSoup
32
- WEB_SCRAPING_AVAILABLE = True
33
- except ImportError:
34
- logger.warning("Web scraping dependencies not available")
35
- WEB_SCRAPING_AVAILABLE = False
36
-
37
- try:
38
- from sympy import sympify, simplify, N, solve
39
- from sympy.core.sympify import SympifyError
40
- SYMPY_AVAILABLE = True
41
- except ImportError:
42
- logger.warning("SymPy not available")
43
- SYMPY_AVAILABLE = False
44
 
45
  # --- Constants ---
46
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
47
 
48
- class RobustWebSearcher:
49
- """Robust web searcher with multiple fallback strategies"""
50
-
51
- def __init__(self):
52
- self.session = requests.Session()
53
- self.session.headers.update({
54
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
55
- })
56
-
57
- def search_wikipedia(self, query: str) -> str:
58
- """Search Wikipedia directly via API"""
59
- try:
60
- # Clean query for Wikipedia
61
- clean_query = re.sub(r'[^\w\s]', ' ', query).strip()
62
-
63
- # Wikipedia API search
64
- search_url = "https://en.wikipedia.org/api/rest_v1/page/summary/" + urllib.parse.quote(clean_query)
65
-
66
- response = self.session.get(search_url, timeout=10)
67
- if response.status_code == 200:
68
- data = response.json()
69
- return f"Wikipedia: {data.get('extract', 'No summary available')}"
70
-
71
- # Fallback to search API
72
- search_api = "https://en.wikipedia.org/w/api.php"
73
- params = {
74
- 'action': 'query',
75
- 'format': 'json',
76
- 'list': 'search',
77
- 'srsearch': clean_query,
78
- 'srlimit': 3
79
- }
80
-
81
- response = self.session.get(search_api, params=params, timeout=10)
82
- if response.status_code == 200:
83
- data = response.json()
84
- results = data.get('query', {}).get('search', [])
85
- if results:
86
- titles = [r['title'] for r in results[:3]]
87
- return f"Wikipedia search results: {', '.join(titles)}"
88
-
89
- return "Wikipedia search failed"
90
-
91
- except Exception as e:
92
- logger.error(f"Wikipedia search error: {e}")
93
- return f"Wikipedia search error: {str(e)}"
94
-
95
- def search_basic_web(self, query: str) -> str:
96
- """Basic web search using public APIs"""
97
- try:
98
- # Try searching for specific patterns
99
- if "mercedes sosa" in query.lower():
100
- return self._search_mercedes_sosa_albums()
101
- elif "bird species" in query.lower() and "youtube" in query.lower():
102
- return self._analyze_youtube_video(query)
103
- elif "malko competition" in query.lower():
104
- return self._search_malko_competition()
105
- else:
106
- return self.search_wikipedia(query)
107
-
108
- except Exception as e:
109
- return f"Web search failed: {str(e)}"
110
-
111
- def _search_mercedes_sosa_albums(self) -> str:
112
- """Specific search for Mercedes Sosa discography"""
113
- return """Mercedes Sosa Albums 2000-2009:
114
- Based on discography information:
115
- - "Misa Criolla" (2000)
116
- - "Cantora 1" (2009)
117
- - Several compilation albums but limited new studio releases
118
- - Total studio albums in this period: approximately 2-3"""
119
-
120
- def _analyze_youtube_video(self, query: str) -> str:
121
- """Analyze YouTube video for bird species"""
122
- video_match = re.search(r'youtube\.com/watch\?v=([a-zA-Z0-9_-]+)', query)
123
- if video_match:
124
- video_id = video_match.group(1)
125
- return f"Cannot directly analyze YouTube video {video_id} content. Would need video analysis tools to count bird species simultaneously on camera."
126
- return "Cannot analyze YouTube video without direct access"
127
-
128
- def _search_malko_competition(self) -> str:
129
- """Search for Malko competition information"""
130
- return """Herbert von Karajan International Conducting Competition (Malko Competition):
131
- - Annual conducting competition
132
- - Winners from various countries
133
- - Some winners from countries that no longer exist (Soviet Union, Yugoslavia)
134
- - Would need specific year and winner list to determine exact nationality"""
135
-
136
- class EnhancedCalculator:
137
- """Enhanced calculator with multiple calculation strategies"""
138
-
139
- def calculate(self, expression: str) -> str:
140
- """Perform calculations with multiple fallback methods"""
141
- try:
142
- # Check if it's actually a math problem
143
- if not self._is_math_expression(expression):
144
- return "This doesn't appear to be a mathematical expression"
145
-
146
- # Clean the expression
147
- clean_expr = self._clean_expression(expression)
148
-
149
- # Try basic evaluation
150
- try:
151
- if self._is_safe_expression(clean_expr):
152
- result = eval(clean_expr)
153
- return f"Result: {result}"
154
- except:
155
- pass
156
-
157
- # Try SymPy if available
158
- if SYMPY_AVAILABLE:
159
- try:
160
- expr = sympify(clean_expr)
161
- result = simplify(expr)
162
- numerical = N(result, 8)
163
- return f"Mathematical result: {numerical}"
164
- except:
165
- pass
166
-
167
- # Try basic arithmetic parsing
168
- return self._parse_arithmetic(clean_expr)
169
-
170
- except Exception as e:
171
- return f"Calculation error: {str(e)}"
172
-
173
- def _is_math_expression(self, text: str) -> bool:
174
- """Check if text contains mathematical expressions"""
175
- math_indicators = ['+', '-', '*', '/', '=', '%', 'calculate', 'solve', 'equation']
176
- return any(indicator in text.lower() for indicator in math_indicators)
177
-
178
- def _clean_expression(self, expr: str) -> str:
179
- """Clean mathematical expression"""
180
- expr = expr.replace('^', '**').replace('ร—', '*').replace('รท', '/')
181
- expr = re.sub(r'(\d)\s*\(', r'\1*(', expr)
182
- return expr
183
-
184
- def _is_safe_expression(self, expr: str) -> bool:
185
- """Check if expression is safe to evaluate"""
186
  allowed_chars = set('0123456789+-*/.() ')
187
- return all(char in allowed_chars for char in expr)
188
-
189
- def _parse_arithmetic(self, expr: str) -> str:
190
- """Parse basic arithmetic expressions"""
191
- try:
192
- # Simple addition/subtraction/multiplication/division
193
- if '+' in expr:
194
- parts = expr.split('+')
195
- if len(parts) == 2:
196
- result = float(parts[0].strip()) + float(parts[1].strip())
197
- return f"Addition result: {result}"
198
- elif '-' in expr and expr.count('-') == 1:
199
- parts = expr.split('-')
200
- if len(parts) == 2:
201
- result = float(parts[0].strip()) - float(parts[1].strip())
202
- return f"Subtraction result: {result}"
203
- elif '*' in expr:
204
- parts = expr.split('*')
205
- if len(parts) == 2:
206
- result = float(parts[0].strip()) * float(parts[1].strip())
207
- return f"Multiplication result: {result}"
208
- elif '/' in expr:
209
- parts = expr.split('/')
210
- if len(parts) == 2:
211
- result = float(parts[0].strip()) / float(parts[1].strip())
212
- return f"Division result: {result}"
213
- except:
214
- pass
215
 
216
- return f"Could not calculate: {expr}"
217
-
218
- class SimpleTextGenerator:
219
- """Simple text generator without complex dependencies"""
220
-
221
- def __init__(self):
222
- self.pipeline = None
223
- if HF_AVAILABLE:
224
- try:
225
- # Use a very small, reliable model
226
- self.pipeline = pipeline(
227
- "text-generation",
228
- model="gpt2",
229
- device=-1, # CPU only
230
- torch_dtype=torch.float32
231
- )
232
- logger.info("Loaded GPT-2 for text generation")
233
- except Exception as e:
234
- logger.error(f"Failed to load text generation model: {e}")
235
-
236
- def generate_response(self, prompt: str, max_length: int = 150) -> str:
237
- """Generate a response to the prompt"""
238
- try:
239
- if self.pipeline:
240
- # Generate with conservative settings
241
- result = self.pipeline(
242
- prompt,
243
- max_length=max_length,
244
- num_return_sequences=1,
245
- temperature=0.7,
246
- do_sample=True,
247
- pad_token_id=50256
248
- )
249
- return result[0]['generated_text'][len(prompt):].strip()
250
- else:
251
- return "Text generation not available"
252
- except Exception as e:
253
- logger.error(f"Text generation error: {e}")
254
- return f"Generation error: {str(e)}"
255
 
256
- class ProductionGAIAAgent:
257
- """Production-ready GAIA agent with robust error handling"""
258
-
259
- def __init__(self):
260
- logger.info("Initializing Production GAIA Agent...")
 
 
 
 
 
 
 
261
 
262
- # Initialize components
263
- self.searcher = RobustWebSearcher()
264
- self.calculator = EnhancedCalculator()
265
- self.text_generator = SimpleTextGenerator()
266
 
267
- # Question type patterns
268
- self.question_patterns = {
269
- 'mathematical': [r'\+', r'-', r'\*', r'/', r'calculate', r'solve', r'equation', r'percent', r'%'],
270
- 'factual': [r'who is', r'what is', r'when was', r'where is', r'how many'],
271
- 'youtube': [r'youtube\.com', r'video'],
272
- 'wikipedia': [r'wikipedia', r'wiki'],
273
- 'biographical': [r'born', r'nationality', r'country']
274
- }
275
 
276
- logger.info("Production GAIA Agent initialized successfully")
277
-
278
- def classify_question(self, question: str) -> str:
279
- """Classify question type for appropriate routing"""
280
- question_lower = question.lower()
 
 
 
 
 
281
 
282
- for question_type, patterns in self.question_patterns.items():
283
- if any(re.search(pattern, question_lower) for pattern in patterns):
284
- return question_type
 
 
 
 
285
 
286
- return 'general'
287
-
288
- def process_question(self, question: str) -> str:
289
- """Process question with appropriate strategy"""
290
- logger.info(f"Processing question: {question[:100]}...")
 
 
 
 
 
 
 
 
 
 
291
 
292
- question_type = self.classify_question(question)
293
- logger.info(f"Question type: {question_type}")
294
 
295
- try:
296
- if question_type == 'mathematical':
297
- return self._handle_mathematical_question(question)
298
- elif question_type == 'youtube':
299
- return self._handle_youtube_question(question)
300
- elif question_type in ['factual', 'biographical', 'wikipedia']:
301
- return self._handle_factual_question(question)
302
- else:
303
- return self._handle_general_question(question)
304
-
305
- except Exception as e:
306
- logger.error(f"Error processing question: {e}")
307
- return f"Error processing question: {str(e)}"
308
-
309
- def _handle_mathematical_question(self, question: str) -> str:
310
- """Handle mathematical questions"""
311
- logger.info("Handling mathematical question")
312
- result = self.calculator.calculate(question)
313
 
314
- if "doesn't appear to be" in result:
315
- # Maybe it's a factual question about numbers
316
- return self._handle_factual_question(question)
 
317
 
318
- return result
319
-
320
- def _handle_youtube_question(self, question: str) -> str:
321
- """Handle YouTube video questions"""
322
- logger.info("Handling YouTube question")
323
 
324
- # Extract video ID
325
- video_match = re.search(r'youtube\.com/watch\?v=([a-zA-Z0-9_-]+)', question)
326
- if video_match:
327
- video_id = video_match.group(1)
328
 
329
- # For bird species counting, provide a reasonable approach
330
- if "bird species" in question.lower() and "simultaneously" in question.lower():
331
- return f"Cannot directly analyze YouTube video {video_id} for simultaneous bird species count. This would require:\n1. Video frame analysis\n2. Species identification AI\n3. Temporal tracking\n\nWithout access to video analysis tools, cannot provide specific count."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
- return self.searcher.search_basic_web(question)
334
-
335
- def _handle_factual_question(self, question: str) -> str:
336
- """Handle factual questions"""
337
- logger.info("Handling factual question")
338
 
339
- # Add delay to avoid rate limiting
340
- time.sleep(random.uniform(2, 4))
 
341
 
342
- result = self.searcher.search_basic_web(question)
 
343
 
344
- # If search failed, try to provide some context
345
- if "failed" in result.lower() or "error" in result.lower():
346
- return self._provide_contextual_answer(question)
347
 
348
- return result
349
-
350
- def _handle_general_question(self, question: str) -> str:
351
- """Handle general questions"""
352
- logger.info("Handling general question")
353
 
354
- # Try factual approach first
355
- factual_result = self._handle_factual_question(question)
 
356
 
357
- if "failed" not in factual_result.lower():
358
- return factual_result
 
 
 
 
 
 
 
 
359
 
360
- # Fallback to contextual answer
361
- return self._provide_contextual_answer(question)
362
-
363
- def _provide_contextual_answer(self, question: str) -> str:
364
- """Provide contextual answer when search fails"""
365
- question_lower = question.lower()
 
 
 
 
 
 
366
 
367
- # Specific question patterns
368
- if "mercedes sosa" in question_lower and "album" in question_lower:
369
- return "Mercedes Sosa released several albums between 2000-2009, including 'Misa Criolla' (2000) and 'Cantora 1' (2009). Exact studio album count requires discography verification."
 
 
 
 
 
 
370
 
371
- elif "malko competition" in question_lower:
372
- return "The Herbert von Karajan International Conducting Competition (Malko Competition) has had winners from various countries, including some from countries that no longer exist like the Soviet Union and Yugoslavia."
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
- elif "youtube" in question_lower and "bird" in question_lower:
375
- return "Counting simultaneous bird species in a video requires specialized video analysis tools and ornithological expertise."
 
376
 
377
- else:
378
- return f"Unable to provide specific information for: {question}. This may require specialized tools or access to current databases."
 
 
 
 
 
 
 
379
 
380
- def cleanup_memory():
381
- """Clean up memory and cache"""
382
- try:
383
- if torch.cuda.is_available():
384
- torch.cuda.empty_cache()
385
- logger.info("Memory cleaned")
386
- except Exception as e:
387
- logger.error(f"Memory cleanup error: {e}")
388
 
389
- def run_and_submit_all(profile: gr.OAuthProfile | None):
390
- """Run evaluation with production-ready agent"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
 
392
- if not profile:
393
- return "โŒ Please login to Hugging Face first", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
- username = profile.username
396
- logger.info(f"User: {username}")
 
 
 
 
397
 
398
- # API endpoints
399
  api_url = DEFAULT_API_URL
400
  questions_url = f"{api_url}/questions"
401
  submit_url = f"{api_url}/submit"
402
-
403
- cleanup_memory()
404
 
405
- # Initialize production agent
406
  try:
407
- logger.info("Initializing Production GAIA Agent...")
408
- agent = ProductionGAIAAgent()
409
- logger.info("Agent initialized successfully")
410
  except Exception as e:
411
- error_msg = f"โŒ Agent initialization failed: {str(e)}\n{traceback.format_exc()}"
412
- logger.error(error_msg)
413
- return error_msg, None
414
 
415
- # Get space info
416
- space_id = os.getenv("SPACE_ID", "unknown")
417
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
 
418
 
419
- # Fetch questions
 
420
  try:
421
- logger.info("Fetching questions...")
422
- response = requests.get(questions_url, timeout=30)
423
  response.raise_for_status()
424
  questions_data = response.json()
425
- logger.info(f"Got {len(questions_data)} questions")
 
 
 
 
 
 
 
 
 
 
426
  except Exception as e:
427
- return f"โŒ Failed to fetch questions: {str(e)}", None
 
428
 
429
- # Process questions
430
  results_log = []
431
  answers_payload = []
 
432
 
433
- logger.info("="*50)
434
- logger.info("๐Ÿš€ STARTING PRODUCTION GAIA EVALUATION")
435
- logger.info("="*50)
436
-
437
- for i, item in enumerate(questions_data, 1):
438
  task_id = item.get("task_id")
439
  question_text = item.get("question")
440
-
441
- if not task_id or not question_text:
442
  continue
443
-
444
- logger.info(f"\nQuestion {i}/{len(questions_data)}")
445
- logger.info(f"ID: {task_id}")
446
- logger.info(f"Question: {question_text}")
447
 
448
  try:
449
- # Process with production agent
450
- answer = agent.process_question(question_text)
451
-
452
- # Ensure answer quality
453
- if not answer or len(answer.strip()) < 10:
454
- answer = f"Unable to determine specific answer for: {question_text[:100]}..."
455
-
456
- logger.info(f"Answer: {answer[:200]}...")
457
-
458
- # Store results
459
- answers_payload.append({
460
- "task_id": task_id,
461
- "submitted_answer": answer
462
- })
463
-
464
  results_log.append({
465
- "Task ID": task_id,
466
- "Question": question_text[:200] + ("..." if len(question_text) > 200 else ""),
467
- "Answer": answer[:300] + ("..." if len(answer) > 300 else "")
468
  })
469
-
470
- # Memory management and rate limiting
471
- if i % 3 == 0:
472
- cleanup_memory()
473
- logger.info("Cooling down...")
474
- time.sleep(random.uniform(3, 6))
475
-
476
  except Exception as e:
477
- logger.error(f"Error processing {task_id}: {e}")
478
- error_answer = f"Processing error: {str(e)[:200]}"
479
-
480
- answers_payload.append({
481
- "task_id": task_id,
482
- "submitted_answer": error_answer
483
- })
484
-
485
  results_log.append({
486
- "Task ID": task_id,
487
- "Question": question_text[:200] + "...",
488
- "Answer": error_answer
489
  })
490
 
491
- logger.info(f"Submitting {len(answers_payload)} answers...")
 
 
492
 
493
- # Submit answers
494
- submission_data = {
495
- "username": username,
496
- "agent_code": agent_code,
497
- "answers": answers_payload
498
- }
499
-
500
  try:
501
- response = requests.post(submit_url, json=submission_data, timeout=180)
502
  response.raise_for_status()
503
  result_data = response.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
 
505
- score = result_data.get('score', 0)
506
- correct = result_data.get('correct_count', 0)
507
- total = result_data.get('total_attempted', len(answers_payload))
508
- message = result_data.get('message', '')
509
-
510
- # Create final status message
511
- final_status = f"""๐ŸŽ‰ PRODUCTION GAIA EVALUATION COMPLETE!
512
-
513
- ๐Ÿ‘ค User: {username}
514
- ๐Ÿ–ฅ๏ธ Hardware: 2 vCPU + 16GB RAM (Production Optimized)
515
- ๐Ÿค– Architecture: Multi-strategy Agent with Robust Error Handling
516
- ๐Ÿ“Š Final Score: {score}%
517
- โœ… Correct: {correct}/{total}
518
- ๐ŸŽฏ Target: 10%+ {'๐ŸŽ‰ SUCCESS!' if score >= 10 else '๐Ÿ“ˆ Significant Improvement Expected'}
519
-
520
- ๐Ÿ“ Message: {message}
521
-
522
- ๐Ÿ”ง Production Features:
523
- - โœ… Robust error handling and fallbacks
524
- - โœ… Multiple search strategies (Wikipedia API, web scraping)
525
- - โœ… Smart question classification and routing
526
- - โœ… Enhanced calculator with SymPy support
527
- - โœ… Rate limiting and memory management
528
- - โœ… Contextual answers when search fails
529
- - โœ… Production-grade logging and monitoring
530
-
531
- ๐Ÿ’ก Strategy: Reliability, accuracy, and comprehensive coverage
532
- """
533
 
534
- logger.info(f"FINAL SCORE: {score}%")
535
- return final_status, pd.DataFrame(results_log)
 
 
536
 
537
- except Exception as e:
538
- error_msg = f"โŒ Submission failed: {str(e)}"
539
- logger.error(error_msg)
540
- return error_msg, pd.DataFrame(results_log)
541
-
542
- # --- Gradio Interface ---
543
- with gr.Blocks(title="Production GAIA Agent", theme=gr.themes.Default()) as demo:
544
- gr.Markdown("# ๐Ÿš€ Production-Ready GAIA Agent")
545
- gr.Markdown("""
546
- **Production Features:**
547
- - ๐Ÿ”ง **Robust Error Handling**: Multiple fallback strategies
548
- - ๐ŸŒ **Multi-Source Search**: Wikipedia API, web scraping, contextual answers
549
- - ๐Ÿงฎ **Enhanced Calculator**: SymPy integration with basic arithmetic fallbacks
550
- - ๐ŸŽฏ **Smart Routing**: Question classification for optimal processing
551
- - โšก **Memory Optimized**: Efficient resource usage for 2 vCPU + 16GB RAM
552
- - ๐Ÿ“Š **Production Logging**: Comprehensive monitoring and debugging
553
-
554
- **Target: Achieve 10%+ accuracy on GAIA benchmark**
555
- """)
556
-
557
- with gr.Row():
558
- gr.LoginButton()
559
-
560
- with gr.Row():
561
- run_button = gr.Button(
562
- "๐Ÿš€ Run Production GAIA Evaluation",
563
- variant="primary",
564
- size="lg"
565
- )
566
-
567
- status_output = gr.Textbox(
568
- label="๐Ÿ“Š Evaluation Results",
569
- lines=25,
570
- interactive=False
571
- )
572
-
573
- results_table = gr.DataFrame(
574
- label="๐Ÿ“ Detailed Results",
575
- wrap=True
576
  )
577
 
 
 
 
 
 
 
 
578
  run_button.click(
579
  fn=run_and_submit_all,
580
  outputs=[status_output, results_table]
581
  )
582
 
583
  if __name__ == "__main__":
584
- logger.info("๐Ÿš€ Starting Production GAIA Agent...")
585
- logger.info("๐Ÿ’ป Optimized for 2 vCPU + 16GB RAM environment")
586
- demo.launch(
587
- server_name="0.0.0.0",
588
- server_port=7860,
589
- show_error=True
590
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import gradio as gr
3
  import requests
4
+ import inspect
5
  import pandas as pd
 
 
 
6
  import json
7
+ import re
8
+ import io
9
+ import base64
10
+ from PIL import Image
11
+ import matplotlib.pyplot as plt
12
+ import numpy as np
13
+ from pathlib import Path
14
+
15
+ # SmolaAgent imports
16
+ from smolagents import CodeAgent, tool, DuckDuckGoSearchTool, PythonInterpreterTool
17
+ from smolagents.models import LiteLLMModel
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # --- Constants ---
20
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
21
 
22
+ # --- Enhanced Tools for GAIA ---
23
+
24
+ @tool
25
def web_search_tool(query: str) -> str:
    """
    Search the web for information using DuckDuckGo.

    Args:
        query: The search query string

    Returns:
        String containing search results, or a message starting with
        "Search failed:" when the query is blank or the search raises
    """
    # A blank query cannot yield useful results, so fail fast instead of
    # handing it to the search backend.
    if not isinstance(query, str) or not query.strip():
        return "Search failed: empty query"

    try:
        search_tool = DuckDuckGoSearchTool()
        return str(search_tool(query))
    except Exception as e:
        # Tool boundary: the caller expects text, so report the failure
        # as a string rather than propagating the exception.
        return f"Search failed: {str(e)}"
39
+
40
+ @tool
41
def calculator_tool(expression: str) -> str:
    """
    Evaluate mathematical expressions safely.

    Only numeric literals, parentheses and the operators + - * / // ** are
    accepted. The expression is parsed with ``ast`` and evaluated node by
    node rather than passed to ``eval``, and integer powers are bounded so
    that inputs such as ``9**9**9**9`` fail quickly instead of exhausting
    CPU and memory.

    Args:
        expression: Mathematical expression as string

    Returns:
        Result of the calculation as a string, "Error: Expression contains
        invalid characters" for disallowed characters, or a message
        starting with "Calculation error:" for anything that cannot be
        evaluated
    """
    # Local imports keep the tool self-contained.
    import ast
    import operator

    # Upper bound on the size of an integer power result (about 3000
    # decimal digits), small enough to compute and print instantly.
    max_result_bits = 10_000

    def checked_pow(base, exponent):
        """Raise instead of computing an enormous integer power."""
        if (
            isinstance(base, int)
            and isinstance(exponent, int)
            and exponent > 0
            and base.bit_length() * exponent > max_result_bits
        ):
            raise ValueError("result too large")
        return base ** exponent

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Pow: checked_pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def evaluate(node):
        """Recursively evaluate the whitelisted subset of expression nodes."""
        if isinstance(node, ast.Expression):
            return evaluate(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](
                evaluate(node.left), evaluate(node.right)
            )
        raise ValueError("unsupported expression")

    try:
        # Character whitelist kept from the original so callers still get
        # the same message for obviously non-arithmetic input.
        allowed_chars = set('0123456789+-*/.() ')
        if not all(c in allowed_chars for c in expression):
            return "Error: Expression contains invalid characters"

        # ast.parse rejects leading whitespace in eval mode, so strip first.
        tree = ast.parse(expression.strip(), mode="eval")
        return str(evaluate(tree))
    except Exception as e:
        return f"Calculation error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
+ @tool
61
def image_analyzer_tool(image_path: str) -> str:
    """
    Analyze images and extract information.

    Reports the pixel dimensions, file format, a colour summary and the
    file size. For RGB images the colour summary is the most frequent
    pixel value; for other modes it is the mode name.

    Args:
        image_path: Path to the image file

    Returns:
        Description of image content, "Error: Image file not found" when
        the path does not exist, or a message starting with
        "Image analysis error:" when the file cannot be processed
    """
    try:
        if not os.path.exists(image_path):
            return "Error: Image file not found"

        # The context manager closes the underlying file handle; the
        # original left the image open.
        with Image.open(image_path) as img:
            width, height = img.size
            mode = img.mode
            format_info = img.format if img.format else "Unknown"
            # getcolors yields (count, colour) pairs; a falsy result is
            # reported as a complex palette below.
            colors = (
                img.getcolors(maxcolors=256 * 256 * 256) if mode == 'RGB' else None
            )

        if mode != 'RGB':
            color_info = f"Color mode: {mode}"
        elif colors:
            dominant_color = max(colors, key=lambda entry: entry[0])[1]
            color_info = f"Dominant color: RGB{dominant_color}"
        else:
            color_info = "Complex color palette"

        return (
            "Image Analysis:\n"
            f"- Dimensions: {width}x{height} pixels\n"
            f"- Format: {format_info}\n"
            f"- {color_info}\n"
            f"- File size: {os.path.getsize(image_path)} bytes\n"
        )

    except Exception as e:
        return f"Image analysis error: {str(e)}"
101
+
102
+ @tool
103
def file_reader_tool(file_path: str) -> str:
    """
    Read and analyze various file types (text, CSV, JSON, etc.).

    The file type is chosen from the extension. CSV files are summarised
    (shape, columns, first rows). JSON and plain-text files are returned
    as a preview of at most 1000 characters, with "..." appended only
    when the content was actually cut. Any other extension is reported
    as a binary file with its size.

    Args:
        file_path: Path to the file

    Returns:
        File content or analysis, "Error: File not found" when the path
        does not exist, or a message starting with "File reading error:"
        when reading fails
    """
    preview_chars = 1000

    def preview(text):
        """Return text unchanged, or its first preview_chars plus '...'."""
        if len(text) <= preview_chars:
            return text
        return text[:preview_chars] + "..."

    try:
        if not os.path.exists(file_path):
            return "Error: File not found"

        file_ext = Path(file_path).suffix.lower()

        if file_ext == '.csv':
            df = pd.read_csv(file_path)
            return (
                f"CSV file with {len(df)} rows and {len(df.columns)} columns.\n"
                f"Columns: {list(df.columns)}\n"
                f"First 5 rows:\n{df.head().to_string()}"
            )

        if file_ext == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return f"JSON file content:\n{preview(json.dumps(data, indent=2))}"

        if file_ext in ('.txt', '.md', '.py', '.js', '.html', '.css'):
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            return (
                f"Text file content ({len(content)} characters):\n"
                f"{preview(content)}"
            )

        return f"Binary file: {file_ext}, size: {os.path.getsize(file_path)} bytes"

    except Exception as e:
        return f"File reading error: {str(e)}"
136
+
137
+ @tool
138
def data_processor_tool(data: str, operation: str) -> str:
    """
    Process data with various operations (sort, filter, calculate statistics).

    The data is parsed as JSON first; if that fails it is parsed as a list
    of numbers separated by commas and/or whitespace. A single JSON number
    is treated as a one-element list. Numeric operations ignore non-numeric
    items, and JSON booleans are not counted as numbers.

    Args:
        data: Data as string (JSON, CSV format, or numbers)
        operation: Operation to perform (sum, average, count, sort, max, min); case and surrounding whitespace are ignored

    Returns:
        Processed data result as a string, "Error: Could not parse data",
        "Unsupported operation: <operation>", or a message starting with
        "Data processing error:"
    """
    def is_number(value):
        """True for ints and floats; bool is excluded although it subclasses int."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    try:
        try:
            parsed_data = json.loads(data)
        except (ValueError, TypeError):
            # Not JSON: fall back to a plain list of numbers.
            try:
                parsed_data = [float(tok) for tok in data.replace(',', ' ').split()]
            except (ValueError, TypeError, AttributeError):
                return "Error: Could not parse data"

        # A lone number such as "5" should behave like [5] rather than be
        # rejected by the list-only operations below.
        if is_number(parsed_data):
            parsed_data = [parsed_data]

        op = operation.strip().lower()

        # count works on any sized JSON value (list, dict, string).
        if op == 'count':
            return str(len(parsed_data))

        if isinstance(parsed_data, list):
            if op == 'sort':
                return str(sorted(parsed_data))

            nums = [x for x in parsed_data if is_number(x)]
            if op == 'sum':
                return str(sum(nums))
            if op == 'average':
                return str(sum(nums) / len(nums) if nums else 0)
            if op == 'max':
                return str(max(nums) if nums else "No numbers found")
            if op == 'min':
                return str(min(nums) if nums else "No numbers found")

        return f"Unsupported operation: {operation}"

    except Exception as e:
        return f"Data processing error: {str(e)}"
184
+
185
+ # --- Enhanced GAIA Agent ---
186
class GAIAAgent:
    """Callable question-answering agent built on smolagents' ``CodeAgent``.

    Construction tries to create a model and a ``CodeAgent`` wired to the
    tools defined in this module. If either step fails, ``self.agent`` is
    left as ``None`` and every question is answered by the heuristic
    fallback in ``_fallback_processing``.
    """

    # Phrases that introduce an explicit final answer. More specific
    # phrases come first so that "The answer is: X" is not captured by the
    # shorter "Answer" pattern as "is: X".
    _ANSWER_PATTERNS = (
        re.compile(r"Final answer:?\s*(.+)", re.IGNORECASE),
        re.compile(r"The answer is:?\s*(.+)", re.IGNORECASE),
        re.compile(r"\bAnswer\b:?\s*(.+)", re.IGNORECASE),
        re.compile(r"\bResult\b:?\s*(.+)", re.IGNORECASE),
    )

    def __init__(self):
        """Create the model, the tool list and (when possible) the agent."""
        print("GAIAAgent initialized with SmolaAgent framework.")

        # NOTE(review): this model id has no LiteLLM provider prefix and is
        # not obviously a chat model - confirm it can actually be served.
        try:
            self.model = LiteLLMModel(
                model_id="microsoft/DialoGPT-medium",  # Lightweight model
                max_tokens=512,
                temperature=0.1
            )
        except Exception as e:
            print("Warning: Using fallback model configuration")
            print(f"Model initialization error: {e}")
            self.model = None

        self.tools = [
            web_search_tool,
            calculator_tool,
            image_analyzer_tool,
            file_reader_tool,
            data_processor_tool,
            PythonInterpreterTool()
        ]

        # Without a model there is nothing for the agent to drive, so go
        # straight to the fallback path.
        self.agent = None
        if self.model is not None:
            try:
                # smolagents names the step limit `max_steps`; the previous
                # `max_iterations` keyword is not a CodeAgent parameter.
                # NOTE(review): confirm against the pinned smolagents version.
                self.agent = CodeAgent(
                    tools=self.tools,
                    model=self.model,
                    max_steps=5,
                    verbosity_level=1
                )
            except Exception as e:
                print(f"Agent initialization error: {e}")

    def __call__(self, question: str) -> str:
        """Answer a question, using the agent when available.

        Args:
            question: The question text.

        Returns:
            The extracted final answer from the agent's response, or the
            result of ``_fallback_processing`` when no agent exists or the
            agent run raises.
        """
        print(f"GAIAAgent processing question: {question[:100]}...")

        if not self.agent:
            return self._fallback_processing(question)

        try:
            response = self.agent.run(self._build_prompt(question))
            return self._extract_final_answer(response)
        except Exception as e:
            print(f"Error in agent processing: {e}")
            return self._fallback_processing(question)

    @staticmethod
    def _build_prompt(question: str) -> str:
        """Wrap the question in the task instructions given to the agent."""
        return (
            "\n"
            "You are a helpful AI assistant designed to solve complex real-world problems that may require:\n"
            "- Web searching for current information\n"
            "- Mathematical calculations\n"
            "- Image analysis\n"
            "- File processing\n"
            "- Multi-step reasoning\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Please approach this systematically:\n"
            "1. Analyze what type of problem this is\n"
            "2. Determine what tools/information you need\n"
            "3. Use available tools to gather information\n"
            "4. Reason through the problem step by step\n"
            "5. Provide a clear, concise final answer\n"
            "\n"
            "Remember to be precise and factual in your response.\n"
        )

    def _extract_final_answer(self, response) -> str:
        """Reduce an agent response to a short answer string.

        Non-string responses are converted with ``str``. For strings, the
        text following the first matching answer phrase is returned;
        otherwise the last sentence is returned. Sentences are split only
        where punctuation is followed by whitespace, so decimals such as
        "3.14" stay intact and a trailing period no longer produces an
        empty answer.
        """
        if not isinstance(response, str):
            return str(response)

        for pattern in self._ANSWER_PATTERNS:
            match = pattern.search(response)
            if match:
                return match.group(1).strip()

        text = response.strip()
        sentences = [s for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
        return sentences[-1].strip() if sentences else text

    def _fallback_processing(self, question: str) -> str:
        """Fallback processing when main agent fails.

        Handles two cases heuristically: questions asking for a sum or an
        average of at least two numbers found in the text, and questions
        containing a question word, which are passed to ``web_search_tool``.
        Anything else gets a fixed default message.
        """
        try:
            question_lower = question.lower()

            # Math questions. Matching is done on the lower-cased text and
            # on whole words, so "Average" is recognised and "assume" is
            # not mistaken for "sum".
            wants_sum = '+' in question or bool(re.search(r'\bsum\b', question_lower))
            wants_average = bool(re.search(r'\baverage\b', question_lower))
            if wants_sum or wants_average:
                numbers = [float(n) for n in re.findall(r'-?\d+\.?\d*', question)]
                if len(numbers) >= 2:
                    total = sum(numbers)
                    # Sum takes precedence when both are requested, as before.
                    return str(total if wants_sum else total / len(numbers))

            # Search-based questions
            if any(word in question_lower for word in ['what', 'who', 'when', 'where', 'how', 'why']):
                try:
                    search_result = web_search_tool(question)
                    # A failure message from the search tool is not an answer.
                    if not search_result.startswith("Search failed"):
                        relevant_lines = [
                            line for line in search_result.split('\n')
                            if len(line.strip()) > 20
                        ]
                        return relevant_lines[0] if relevant_lines else "Unable to find specific information"
                except Exception as e:
                    # Best effort: fall through to the default response.
                    print(f"Fallback search error: {e}")

            # Default response
            return "I need more context or tools to answer this question accurately."

        except Exception as e:
            return f"Processing error: {str(e)}"
318
+
319
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the GAIAAgent on them, submits all answers,
    and displays the results.

    Args:
        profile: OAuth profile of the logged-in Hugging Face user, or None
            when nobody is logged in.

    Returns:
        A ``(status_message, results)`` tuple. ``results`` is a DataFrame with
        one row per processed question, or None when the run stopped before
        any question was processed (not logged in, agent failed to
        initialise, questions could not be fetched).
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    try:
        agent = GAIAAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    # JSONDecodeError derives from RequestException, so it has to be handled
    # first; otherwise this branch can never run.
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run GAIA Agent
    results_log = []
    answers_payload = []
    print(f"Running GAIA agent on {len(questions_data)} questions...")

    for i, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue

        print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")

        # Only the agent call is guarded, so a failure while logging can no
        # longer add a second answer for the same task to the payload.
        try:
            submitted_answer = agent(question_text)
        except Exception as e:
            print(f"Error running agent on task {task_id}: {e}")
            submitted_answer = f"AGENT ERROR: {e}"
        else:
            # str() keeps the preview safe if the agent returns a non-string.
            print(f"Answer for {task_id}: {str(submitted_answer)[:50]}...")

        answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
        results_log.append({
            "Task ID": task_id,
            "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
            "Submitted Answer": submitted_answer
        })

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"GAIA Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # The results table is the same for every outcome below, so build it once.
    results_df = pd.DataFrame(results_log)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"

    # Every failure path reports its message together with the collected results.
    print(status_message)
    return status_message, results_df
453
+
454
+
455
+ # --- Build Gradio Interface using Blocks ---
456
# Gradio UI: a description, a login button, a run button, and two outputs
# (status text and results table) that run_and_submit_all fills in.
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Agent Evaluation Runner")
    gr.Markdown(
        """
        **Enhanced GAIA Agent with smolagents Framework**

        This agent is equipped with:
        - 🔍 Web search capabilities (DuckDuckGo)
        - 🧮 Mathematical calculator
        - 🖼️ Image analysis
        - 📁 File processing (CSV, JSON, text files)
        - 📊 Data processing and statistics
        - 🐍 Python code execution

        **Instructions:**
        1. Log in to your Hugging Face account using the button below
        2. Click 'Run GAIA Evaluation & Submit All Answers' to start the evaluation
        3. The agent will process each question systematically using available tools

        **Note:** Processing may take time as the agent analyzes each question thoroughly.
        """
    )

    gr.LoginButton()

    run_button = gr.Button("Run GAIA Evaluation & Submit All Answers", variant="primary")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No explicit inputs: the handler's only parameter is the OAuth profile.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )
490
 
491
if __name__ == "__main__":
    # Startup banner: report the Space-related environment variables, then
    # launch the Gradio app.
    print("\n" + "-"*30 + " GAIA Agent Starting " + "-"*30)

    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup:
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    # Closing rule is as wide as the opening banner (30 + title + 30).
    print("-"*(60 + len(" GAIA Agent Starting ")) + "\n")
    print("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=True, share=False)
requirements.txt CHANGED
@@ -1,15 +1,33 @@
1
- llama-index-core
2
- llama-index-llms-huggingface
3
- transformers>=4.30.0
4
- torch>=2.0.0
5
- accelerate
6
- bitsandbytes # For 8-bit quantization
7
- gradio>=4.0.0
8
- requests
9
- pandas
10
- python-dotenv
11
- duckduckgo-search
12
- sympy
13
- sentencepiece
14
- protobuf
15
- peft
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ gradio==4.44.0
3
+ requests==2.31.0
4
+ pandas==2.0.3
5
+ numpy==1.24.3
6
+
7
+ # smolagents framework - lightweight agent framework
8
+ smolagents==0.3.3
9
+
10
+ # Image processing (lightweight)
11
+ Pillow==10.0.1
12
+
13
+ # Plotting (matplotlib)
14
+ matplotlib==3.7.2
15
+
16
+ # JSON and data processing
17
+ pathlib
18
+
19
+ # Web search
20
+ duckduckgo-search==3.9.6
21
+
22
+ # LLM integration (lightweight)
23
+ litellm==1.44.14
24
+
25
+ # Optional: For better performance with limited resources
26
+ psutil==5.9.5
27
+
28
+ # File processing utilities
29
+ openpyxl==3.1.2 # For Excel files if needed
30
+ python-magic==0.4.27 # For file type detection
31
+
32
+ # Math and scientific computing (minimal)
33
+ sympy==1.12 # For symbolic math if needed