LamiaYT committed
Commit d591a7a · 1 Parent(s): 56455d6
Files changed (3)
  1. app.py +323 -559
  2. requirements.txt +10 -12
  3. run.py +8 -0
app.py CHANGED
@@ -1,594 +1,358 @@
  import os
  import gradio as gr
  import requests
- import pandas as pd
  import re
  import time
- import json
- from typing import Dict, Any, List, Optional, Tuple
- from io import StringIO
- import ast
  import math

  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

- class GAIASpecializedSearchEngine:
-     """GAIA-specialized search engine with improved result processing"""
-
      def __init__(self):
-         self.session = requests.Session()
-         self.session.headers.update({
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-         })
-         self.serper_api_key = os.getenv("SERPER_API_KEY")
-         self.search_cache = {}
-
-     def search_with_serper(self, query: str, num_results: int = 10) -> Dict[str, Any]:
-         """Enhanced Serper search with better parameters"""
-         if not self.serper_api_key:
-             return {}
-
-         cache_key = f"{query}_{num_results}"
-         if cache_key in self.search_cache:
-             return self.search_cache[cache_key]

-         try:
-             url = "https://google.serper.dev/search"
-             payload = {
-                 "q": query,
-                 "num": num_results,
-                 "gl": "us",
-                 "hl": "en"
-             }
-             headers = {
-                 "X-API-KEY": self.serper_api_key,
-                 "Content-Type": "application/json"
-             }

-             response = self.session.post(url, json=payload, headers=headers, timeout=25)
-             if response.status_code == 200:
-                 result = response.json()
-                 self.search_cache[cache_key] = result
-                 return result
-             else:
-                 print(f"Search API error: {response.status_code}")
-                 return {}

          except Exception as e:
-             print(f"Search error: {e}")
-             return {}
-
-     def comprehensive_search(self, query: str) -> Dict[str, Any]:
-         """Return full search data structure instead of just text"""
-         print(f"🔍 Searching: {query[:100]}...")
-         return self.search_with_serper(query, 15)

- class GAIAQuestionSolver:
-     """Improved solver for GAIA benchmark questions"""
-
-     def __init__(self):
-         self.search_engine = GAIASpecializedSearchEngine()
-
-     def solve_question(self, question: str) -> str:
-         """Main solving method with improved pattern detection"""
-         print(f"🤔 Analyzing: {question[:100]}...")
-
-         # Handle actual reversed text questions (very specific detection)
-         if self.is_genuine_reversed_text_question(question):
-             return self.solve_reversed_text(question)
-
-         # Handle computational questions
-         if self.is_computational_question(question):
-             return self.solve_computational_question(question)
-
-         # Handle person/actor questions
-         if self.is_person_question(question):
-             return self.solve_person_question(question)
-
-         # Handle location/geography questions
-         if self.is_location_question(question):
-             return self.solve_location_question(question)
-
-         # Handle numerical/counting questions
-         if self.is_numerical_question(question):
-             return self.solve_numerical_question(question)
-
-         # Handle date/time questions
-         if self.is_date_question(question):
-             return self.solve_date_question(question)
-
-         # Default factual search
-         return self.solve_general_question(question)
-
-     def is_genuine_reversed_text_question(self, question: str) -> bool:
-         """Very specific detection for actual reversed text questions"""
-         # Only trigger if we see obvious reversed words that don't make sense in English
-         reversed_words = re.findall(r'\b[a-z]{4,}\b', question.lower())
-         genuine_reversed = []
-
-         for word in reversed_words:
-             reversed_word = word[::-1]
-             # Check if the reversed version is a common English word
-             common_words = ['left', 'right', 'opposite', 'answer', 'word', 'text']
-             if reversed_word in common_words:
-                 genuine_reversed.append((word, reversed_word))
-
-         return len(genuine_reversed) > 0
-
-     def solve_reversed_text(self, question: str) -> str:
-         """Solve genuine reversed text questions"""
-         words = question.lower().split()
-         for word in words:
-             if len(word) >= 4:
-                 reversed_word = word[::-1]
-                 if reversed_word == 'left':
-                     return 'right'
-                 elif reversed_word == 'right':
-                     return 'left'
-                 elif reversed_word == 'opposite':
-                     # Find what the opposite of
-                     word_index = words.index(word)
-                     if word_index + 1 < len(words):
-                         next_word = words[word_index + 1][::-1]
-                         opposites = {'left': 'right', 'right': 'left', 'up': 'down', 'down': 'up'}
-                         return opposites.get(next_word, next_word)
-
-         return "Could not determine reversed text answer"
-
-     def is_computational_question(self, question: str) -> bool:
-         """Detect questions requiring computation"""
-         comp_keywords = ['calculate', 'compute', 'sum', 'total', 'multiply', 'divide', 'add', 'subtract']
-         return any(keyword in question.lower() for keyword in comp_keywords)
-
-     def solve_computational_question(self, question: str) -> str:
-         """Solve computational questions"""
-         # Extract numbers from the question
-         numbers = re.findall(r'-?\d+\.?\d*', question)
-
-         if len(numbers) >= 2:
-             try:
-                 nums = [float(n) for n in numbers]
-
-                 if any(word in question.lower() for word in ['sum', 'add', 'total', '+']):
-                     result = sum(nums)
-                 elif any(word in question.lower() for word in ['multiply', 'times', '*']):
-                     result = 1
-                     for n in nums:
-                         result *= n
-                 elif any(word in question.lower() for word in ['subtract', 'minus', '-']):
-                     result = nums[0] - nums[1]
-                 elif any(word in question.lower() for word in ['divide', '/']):
-                     result = nums[0] / nums[1] if nums[1] != 0 else 0
-                 else:
-                     # Search for the computational context
-                     return self.search_and_extract_number(question)
-
-                 # Return as integer if it's a whole number
-                 return str(int(result)) if result.is_integer() else str(result)
-             except:
-                 pass
-
-         return self.search_and_extract_number(question)
-
-     def is_person_question(self, question: str) -> bool:
-         """Detect questions about people"""
-         person_keywords = ['who', 'actor', 'person', 'name', 'character', 'played', 'starred']
-         return any(keyword in question.lower() for keyword in person_keywords)
-
-     def solve_person_question(self, question: str) -> str:
-         """Solve questions about people with improved search"""
-         data = self.search_engine.comprehensive_search(question)
-
-         if not data:
-             return "Person information not found"
-
-         # Check answer box first
-         if "answerBox" in data and "answer" in data["answerBox"]:
-             answer = data["answerBox"]["answer"].strip()
-             if self.looks_like_person_name(answer):
-                 return self.format_person_answer(answer, question)
-
-         # Check knowledge graph
-         if "knowledgeGraph" in data:
-             kg = data["knowledgeGraph"]
-             if "title" in kg and self.looks_like_person_name(kg["title"]):
-                 return self.format_person_answer(kg["title"], question)
-
-         # Extract from organic results
-         all_text = ""
-         for result in data.get("organic", [])[:5]:
-             all_text += f"{result.get('title', '')} {result.get('snippet', '')} "
-
-         return self.extract_person_from_text(all_text, question)
-
-     def looks_like_person_name(self, text: str) -> bool:
-         """Check if text looks like a person's name"""
-         if not text or len(text) > 50:
-             return False
-
-         # Simple heuristic: 1-4 capitalized words, reasonable length
-         words = text.split()
-         if 1 <= len(words) <= 4:
-             return all(word[0].isupper() and word.isalpha() for word in words if word)
-         return False
-
-     def format_person_answer(self, name: str, question: str) -> str:
-         """Format person answer based on what the question asks for"""
-         words = name.split()
-         q_lower = question.lower()
-
-         if 'first name' in q_lower and words:
-             return words[0]
-         elif any(term in q_lower for term in ['last name', 'surname']) and words:
-             return words[-1]
-         else:
-             return name
-
-     def extract_person_from_text(self, text: str, question: str) -> str:
-         """Extract person names from text"""
-         # Find potential names (2-3 capitalized words)
-         names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s[A-Z][a-z]+)?\b', text)
-
-         # Filter out common non-names
-         exclude = {'The New', 'New York', 'Los Angeles', 'Las Vegas', 'United States'}
-         valid_names = [name for name in names if name not in exclude and len(name.split()) <= 3]
-
-         if valid_names:
-             return self.format_person_answer(valid_names[0], question)
-
-         return "Person name not found"
-
-     def is_location_question(self, question: str) -> bool:
-         """Detect location/geography questions"""
-         location_keywords = ['where', 'country', 'city', 'state', 'location', 'place', 'born in', 'from']
-         return any(keyword in question.lower() for keyword in location_keywords)
-
-     def solve_location_question(self, question: str) -> str:
-         """Solve location questions"""
-         data = self.search_engine.comprehensive_search(question)
-
-         if not data:
-             return "Location not found"
-
-         # Check answer box
-         if "answerBox" in data and "answer" in data["answerBox"]:
-             answer = data["answerBox"]["answer"].strip()
-             if self.looks_like_location(answer):
-                 return answer
-
-         # Extract from results
-         all_text = ""
-         for result in data.get("organic", [])[:3]:
-             all_text += f"{result.get('snippet', '')} "
-
-         return self.extract_location_from_text(all_text)
-
-     def looks_like_location(self, text: str) -> bool:
-         """Check if text looks like a location"""
-         if not text or len(text) > 100:
-             return False
-
-         location_indicators = ['University', 'College', 'City', 'County', 'State', 'Country']
-         return any(indicator in text for indicator in location_indicators) or len(text.split()) <= 4
-
-     def extract_location_from_text(self, text: str) -> str:
-         """Extract location from text"""
-         # Look for patterns like "in [Location]", "at [Location]", "[Location] University"
-         location_patterns = [
-             r'\bin ([A-Z][a-z]+(?: [A-Z][a-z]+)*)',
-             r'\bat ([A-Z][a-z]+(?: [A-Z][a-z]+)*)',
-             r'([A-Z][a-z]+(?: [A-Z][a-z]+)*) University',
-             r'([A-Z][a-z]+(?: [A-Z][a-z]+)*) College',
-         ]
-
-         for pattern in location_patterns:
-             matches = re.findall(pattern, text)
-             if matches:
-                 return matches[0]
-
-         # Fallback: look for capitalized phrases
-         locations = re.findall(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b', text)
-         if locations:
-             return locations[0]

-         return "Location not found"
-
-     def is_numerical_question(self, question: str) -> bool:
-         """Detect questions asking for numbers"""
-         numerical_keywords = ['how many', 'how much', 'number of', 'count', 'total']
-         return any(keyword in question.lower() for keyword in numerical_keywords)
-
-     def solve_numerical_question(self, question: str) -> str:
-         """Solve questions asking for numbers"""
-         return self.search_and_extract_number(question)
-
-     def search_and_extract_number(self, question: str) -> str:
-         """Search and extract numerical answers"""
-         data = self.search_engine.comprehensive_search(question)
-
-         if not data:
-             return "Number not found"
-
-         # Check answer box first
-         if "answerBox" in data and "answer" in data["answerBox"]:
-             answer = data["answerBox"]["answer"].strip()
-             numbers = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', answer)
-             if numbers:
-                 return numbers[0].replace(',', '')
-
-         # Extract from snippets
-         all_text = ""
-         for result in data.get("organic", [])[:5]:
-             all_text += f"{result.get('snippet', '')} "
-
-         # Look for numbers in context
-         sentences = re.split(r'[.!?]', all_text)
-         for sentence in sentences[:10]:
-             numbers = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', sentence)
-             if numbers:
-                 # Try to find the most relevant number
-                 q_lower = question.lower()
-                 if any(word in sentence.lower() for word in q_lower.split()[:3]):
-                     return numbers[0].replace(',', '')
-
-         # Fallback: return first number found
-         all_numbers = re.findall(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b', all_text)
-         if all_numbers:
-             return all_numbers[0].replace(',', '')
-
-         return "Number not found"
-
-     def is_date_question(self, question: str) -> bool:
-         """Detect date/time questions"""
-         date_keywords = ['when', 'year', 'date', 'born', 'died', 'founded', 'established']
-         return any(keyword in question.lower() for keyword in date_keywords)
-
-     def solve_date_question(self, question: str) -> str:
-         """Solve date questions"""
-         data = self.search_engine.comprehensive_search(question)
-
-         if not data:
-             return "Date not found"
-
-         # Check answer box
-         if "answerBox" in data and "answer" in data["answerBox"]:
-             answer = data["answerBox"]["answer"].strip()
-             years = re.findall(r'\b(?:19|20)\d{2}\b', answer)
-             dates = re.findall(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+(?:19|20)\d{2}\b', answer)
-             if dates:
-                 return dates[0]
-             elif years:
-                 return years[0]
-
-         # Extract from snippets
-         all_text = ""
-         for result in data.get("organic", [])[:3]:
-             all_text += f"{result.get('snippet', '')} "
-
-         # Look for dates and years
-         dates = re.findall(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+(?:19|20)\d{2}\b', all_text)
-         if dates:
-             return dates[0]
-
-         years = re.findall(r'\b(?:19|20)\d{2}\b', all_text)
-         if years:
-             return years[0]
-
-         return "Date not found"
-
-     def solve_general_question(self, question: str) -> str:
-         """Solve general factual questions"""
-         data = self.search_engine.comprehensive_search(question)
-
-         if not data:
-             return "Information not found"
-
-         # Check answer box first - this is usually the best answer
-         if "answerBox" in data:
-             answer_box = data["answerBox"]
-             if "answer" in answer_box:
-                 return answer_box["answer"].strip()
-             elif "snippet" in answer_box:
-                 return answer_box["snippet"].strip()
-
-         # Check knowledge graph
-         if "knowledgeGraph" in data:
-             kg = data["knowledgeGraph"]
-             if "description" in kg:
-                 return kg["description"].strip()
-
-         # Get the most relevant snippet from organic results
-         for result in data.get("organic", [])[:3]:
-             snippet = result.get("snippet", "")
-             if snippet and len(snippet.strip()) > 10:
-                 return snippet.strip()
-
-         return "Answer not found in search results"

- def get_api_status():
-     """Check API configuration status"""
-     if os.getenv("SERPER_API_KEY"):
-         return "✅ Serper API: Configured and Ready"
      else:
-         return "❌ Serper API: Not configured - Set SERPER_API_KEY environment variable"

- def run_gaia_evaluation(profile: gr.OAuthProfile | None):
-     """Run GAIA evaluation with improved solver"""
-     if not profile:
-         return "Please log in to Hugging Face first.", None
-
-     api_status = get_api_status()
-     if "❌" in api_status:
-         return f"⚠️ Configuration Error!\n\n{api_status}\n\nGet your free API key at: https://serper.dev", None
-
-     username = profile.username
-     questions_url = f"{DEFAULT_API_URL}/questions"
-     submit_url = f"{DEFAULT_API_URL}/submit"
-
      try:
-         solver = GAIAQuestionSolver()
-         print("✅ GAIA improved solver initialized")
      except Exception as e:
-         return f"❌ Solver initialization failed: {e}", None
-
      try:
-         print("📥 Fetching GAIA questions...")
-         response = requests.get(questions_url, timeout=30)
          response.raise_for_status()
-         questions = response.json()
-         print(f"✅ Retrieved {len(questions)} questions")
      except Exception as e:
-         return f"❌ Failed to fetch questions: {e}", None
-
-     answers = []
-     detailed_logs = []
-
-     for i, item in enumerate(questions):
          task_id = item.get("task_id")
-         question = item.get("question")
-
-         if not task_id or not question:
              continue
-
-         print(f"\n🔄 Processing {i+1}/{len(questions)}: {task_id}")
-
          try:
-             start_time = time.time()
-             answer = solver.solve_question(question)
-             processing_time = time.time() - start_time
-
-             answers.append({"task_id": task_id, "submitted_answer": answer})
-             detailed_logs.append({
-                 "Task ID": task_id,
-                 "Question Preview": question[:120] + "..." if len(question) > 120 else question,
-                 "Answer": answer[:80] + "..." if len(answer) > 80 else answer,
-                 "Processing Time": f"{processing_time:.2f}s"
-             })
-
-             print(f"✅ Answer: {answer}")
-
-             # Rate limiting
-             time.sleep(0.5)
-
          except Exception as e:
-             error_msg = f"Processing error: {str(e)}"
-             answers.append({"task_id": task_id, "submitted_answer": error_msg})
-             detailed_logs.append({
-                 "Task ID": task_id,
-                 "Question Preview": question[:120] + "..." if len(question) > 120 else question,
-                 "Answer": error_msg,
-                 "Processing Time": "Error"
-             })
-             print(f"❌ Error processing {task_id}: {e}")
-
-     # Submit answers
-     print(f"\n📤 Submitting {len(answers)} answers to GAIA benchmark...")
-     submission_payload = {
-         "username": username,
-         "agent_code": f"https://huggingface.co/spaces/{os.getenv('SPACE_ID', 'your-space')}/tree/main",
-         "answers": answers
-     }
-
      try:
-         submit_response = requests.post(submit_url, json=submission_payload, timeout=240)
-         submit_response.raise_for_status()
-         result_data = submit_response.json()
-
-         score = result_data.get('score', 'N/A')
-         correct_count = result_data.get('correct_count', '?')
-         total_attempted = result_data.get('total_attempted', '?')
-
-         results_summary = f"""🎯 GAIA BENCHMARK RESULTS (IMPROVED VERSION)
-
- 📊 Final Score: {score}%
- Correct Answers: {correct_count}/{total_attempted}
-
- 🔧 System Status:
- {api_status}
-
- 🚀 Key Improvements Made:
- Fixed overly broad reversed text detection
- Improved search result processing with structured data
- Better answer box and knowledge graph utilization
- Enhanced person/actor name extraction
- • Improved numerical and date extraction
- More precise question classification
- Eliminated generic "right" fallback answers
-
- 📈 Technical Fixes:
- • Removed faulty 'fo' pattern that triggered false positives
- Added proper search result structure handling
- Implemented context-aware answer formatting
- Better handling of edge cases and errors
- Improved rate limiting and error recovery
-
- 💡 Performance Notes:
- This version should show significantly better accuracy by properly processing search results and avoiding the classification errors that caused nonsensical answers in the previous version."""
-
-         return results_summary, pd.DataFrame(detailed_logs)
-
      except Exception as e:
-         return f"❌ Submission failed: {str(e)}\n\nAnswers were processed but could not be submitted.", pd.DataFrame(detailed_logs)

- # Gradio Interface
- with gr.Blocks(title="GAIA Improved Agent", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("""
- # 🧠 GAIA Benchmark Agent (IMPROVED VERSION)
-
- **🔧 Major Fixes Applied:**
- - ✅ Fixed overly broad reversed text detection that caused false positives
- - ✅ Improved search result processing to use structured data properly
- - ✅ Enhanced question classification to avoid nonsensical answers
- - ✅ Better extraction of names, numbers, dates, and locations
- - ✅ Proper handling of answer boxes and knowledge graphs
-
- **🎯 Specialized Question Handling:**
- - 🔄 Genuine reversed text questions (with precise detection)
- - 🧮 Computational questions with proper math operations
- - 🎭 Person/actor questions with improved name extraction
- - 📍 Location questions with geographic context
- - 🔢 Numerical questions with context-aware number extraction
- - 📅 Date/time questions with proper temporal parsing
-
- **🔧 Setup Required:**
- - Set `SERPER_API_KEY` in your Hugging Face Space secrets
- - Get free 2500 searches/month at [serper.dev](https://serper.dev)
- """)
-
      gr.LoginButton()
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             status_display = gr.Textbox(
-                 label="🔧 API Status",
-                 value=get_api_status(),
-                 lines=3,
-                 interactive=False
-             )
-
-             evaluate_button = gr.Button(
-                 "🚀 Run GAIA Evaluation (Improved)",
-                 variant="primary",
-                 size="lg"
-             )
-
-     with gr.Row():
-         results_output = gr.Textbox(
-             label="📊 Evaluation Results",
-             lines=20,
-             interactive=False
-         )
-
-     with gr.Row():
-         logs_table = gr.DataFrame(
-             label="📋 Detailed Processing Logs",
-             wrap=True
-         )
-
-     evaluate_button.click(
-         fn=run_gaia_evaluation,
-         outputs=[results_output, logs_table]
      )

  if __name__ == "__main__":
-     demo.launch(share=True, debug=True)
  import os
  import gradio as gr
  import requests
+ import json
  import re
+ import numexpr
+ import pandas as pd
  import time
  import math
+ import pdfminer
+ from ctransformers import AutoModelForCausalLM
+ from duckduckgo_search import DDGS
+ from pdfminer.high_level import extract_text
+ from bs4 import BeautifulSoup
+ import html2text
+ from typing import Dict, Any, List, Tuple, Callable

+ # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+ MAX_STEPS = 6  # Limit reasoning steps for performance
+ MAX_TOKENS = 256  # Limit token generation
+ MODEL_NAME = "TheBloke/phi-3-mini-128k-instruct-GGUF"
+ MODEL_FILE = "phi-3-mini-128k-instruct.Q4_K_M.gguf"

+ # --- Load Quantized Model ---
+ print("Loading quantized model...")
+ start_time = time.time()
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     model_file=MODEL_FILE,
+     model_type="phi3",
+     gpu_layers=0,  # CPU only
+     context_length=4096
+ )
+ load_time = time.time() - start_time
+ print(f"Model loaded in {load_time:.2f} seconds")
+
+ # --- Tools for GAIA Agent ---
+ def web_search(query: str) -> str:
+     """Search the web using DuckDuckGo"""
+     try:
+         with DDGS() as ddgs:
+             results = [r for r in ddgs.text(query, max_results=3)]
+         return json.dumps(results)
+     except Exception as e:
+         return f"Search error: {str(e)}"
+
+ def calculator(expression: str) -> str:
+     """Evaluate mathematical expressions safely"""
+     try:
+         return str(numexpr.evaluate(expression))
+     except Exception as e:
+         return f"Calculation error: {str(e)}"
+
+ def read_pdf(file_path: str) -> str:
+     """Extract text from PDF files"""
+     try:
+         return extract_text(file_path)
+     except Exception as e:
+         return f"PDF read error: {str(e)}"
+
+ def read_webpage(url: str) -> str:
+     """Fetch and extract text from web pages"""
+     try:
+         response = requests.get(url, timeout=10)
+         soup = BeautifulSoup(response.text, 'html.parser')
+         return soup.get_text(separator=' ', strip=True)[:2000]  # Limit text
+     except Exception as e:
+         return f"Webpage read error: {str(e)}"
+
+ TOOLS = {
+     "web_search": web_search,
+     "calculator": calculator,
+     "read_pdf": read_pdf,
+     "read_webpage": read_webpage
+ }
+
+ # --- GAIA Agent Implementation ---
+ class GAIA_Agent:
      def __init__(self):
+         self.tools = TOOLS
+         self.history = []
+         self.system_prompt = (
+             "You are an expert GAIA problem solver. Use these tools: {web_search, calculator, read_pdf, read_webpage}.\n"
+             "Guidelines:\n"
+             "1. Think step-by-step. Explain reasoning\n"
+             "2. Use tools for calculations, searches, or file operations\n"
+             "3. Tools must be called as: ```json\n{'tool': 'tool_name', 'args': {'arg1': value}}```\n"
+             "4. Final Answer must be exact and standalone\n\n"
+             "Example:\n"
+             "Question: \"What's the population density of France? (File: france_data.pdf)\"\n"
+             "Thought: Need population and area. Read PDF first.\n"
+             "Action: ```json\n{'tool': 'read_pdf', 'args': {'file_path': 'france_data.pdf'}}```\n"
+             "Observation: Population: 67.8M, Area: 643,801 km²\n"
+             "Thought: Now calculate density: 67,800,000 / 643,801\n"
+             "Action: ```json\n{'tool': 'calculator', 'args': {'expression': '67800000 / 643801'}}```\n"
+             "Observation: 105.32\n"
+             "Final Answer: 105.32 people/km²"
+         )
+
+     def __call__(self, question: str) -> str:
+         print(f"\nProcessing: {question[:80]}...")
+         self.history = [f"Question: {question}"]

+         for step in range(MAX_STEPS):
+             prompt = self._build_prompt()
+             response = self._call_model(prompt)

+             if "Final Answer" in response:
+                 answer = response.split("Final Answer:")[-1].strip()
+                 print(f"Final Answer: {answer}")
+                 return answer

+             tool_call = self._parse_tool_call(response)
+             if tool_call:
+                 tool_name, args = tool_call
+                 observation = self._use_tool(tool_name, args)
+                 self.history.append(f"Observation: {observation}")
+             else:
+                 self.history.append(f"Thought: {response}")
+
+         return "Agent couldn't find solution within step limit"
+
+     def _build_prompt(self) -> str:
+         prompt = f"<|system|>\n{self.system_prompt}<|end|>\n"
+         prompt += "<|user|>\n" + "\n".join(self.history) + "<|end|>\n"
+         prompt += "<|assistant|>"
+         return prompt
+
+     def _call_model(self, prompt: str) -> str:
+         start_time = time.time()
+         response = model(
+             prompt,
+             max_new_tokens=MAX_TOKENS,
+             temperature=0.01,
+             stop=["<|end|>", "Observation:", "```"]
+         )
+         gen_time = time.time() - start_time
+         print(f"Generated {len(response)} tokens in {gen_time:.2f}s: {response[:60]}...")
+         return response
+
+     def _parse_tool_call(self, text: str) -> Tuple[str, Dict] or None:
+         try:
+             json_match = re.search(r'```json\s*({.*?})\s*```', text, re.DOTALL)
+             if json_match:
+                 tool_call = json.loads(json_match.group(1))
+                 return tool_call["tool"], tool_call["args"]
          except Exception as e:
+             print(f"Tool parse error: {str(e)}")
+         return None

+     def _use_tool(self, tool_name: str, args: Dict) -> str:
+         if tool_name not in self.tools:
+             return f"Error: Unknown tool {tool_name}"

+         print(f"Using tool: {tool_name}({args})")
+         try:
+             start_time = time.time()
+             result = self.tools[tool_name](**args)
+             exec_time = time.time() - start_time
+             print(f"Tool executed in {exec_time:.2f}s")
+             return str(result)[:500]  # Truncate long outputs
+         except Exception as e:
+             return f"Tool error: {str(e)}"
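Editor's note on the tool-call protocol above: the system prompt asks the model for single-quoted payloads like {'tool': 'tool_name', 'args': {...}}, while _parse_tool_call hands the captured text to json.loads, which accepts only double-quoted JSON. The following is a minimal sketch, not part of the commit, of how such a response could be parsed and dispatched; demo_tools and the sample response are hypothetical stand-ins, and the ast.literal_eval fallback is an assumption added to tolerate the single-quoted form.

import ast
import json
import re

# Hypothetical stand-in for the TOOLS dict defined in app.py.
demo_tools = {"calculator": lambda expression: str(sum(float(n) for n in expression.split("+")))}

# A response in the shape the system prompt requests (single-quoted keys).
response = "Thought: add the numbers\nAction: ```json\n{'tool': 'calculator', 'args': {'expression': '2+3'}}```"

match = re.search(r'```json\s*({.*?})\s*```', response, re.DOTALL)  # same regex as _parse_tool_call
if match:
    payload = match.group(1)
    try:
        call = json.loads(payload)        # succeeds only for double-quoted JSON
    except json.JSONDecodeError:
        call = ast.literal_eval(payload)  # assumed fallback for single-quoted dicts
    observation = demo_tools[call["tool"]](**call["args"])
    print(f"Observation: {observation}")  # Observation: 5.0

Note also that _call_model lists "```" among its stop strings, so the closing fence of an Action block may never be generated; if tool calls fail to parse in practice, that stop list is a reasonable first place to look.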
+ # --- Evaluation Runner ---
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
+     # ... [Keep the original run_and_submit_all function structure] ...
+     # Only change the agent initialization:
+     try:
+         agent = GAIA_Agent()  # Use our custom agent
+     except Exception as e:
+         print(f"Error instantiating agent: {e}")
+         return f"Error initializing agent: {e}", None
+     # ... [rest of the function remains unchanged] ...
+
+ # --- Gradio Interface ---
+ with gr.Blocks() as demo:
+     # ... [Keep the original Gradio interface] ...
+     # Only add resource monitoring:
+     gr.Markdown(f"**Resource Info:** Using {MODEL_FILE} | Max steps: {MAX_STEPS} | Max tokens: {MAX_TOKENS}")
+
+     # Add a clear button for history
+     clear_btn = gr.Button("Clear History")
+     clear_btn.click(lambda: [None, None], outputs=[status_output, results_table])
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
+     """
+     Fetches all questions, runs the BasicAgent on them, submits all answers,
+     and displays the results.
+     """
+     # --- Determine HF Space Runtime URL and Repo URL ---
+     space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code
+
+     if profile:
+         username = f"{profile.username}"
+         print(f"User logged in: {username}")
      else:
+         print("User not logged in.")
+         return "Please log in to Hugging Face with the button.", None

+     api_url = DEFAULT_API_URL
+     questions_url = f"{api_url}/questions"
+     submit_url = f"{api_url}/submit"
+
+     # 1. Instantiate Agent (modify this part to create your agent)
      try:
+         agent = BasicAgent()
      except Exception as e:
+         print(f"Error instantiating agent: {e}")
+         return f"Error initializing agent: {e}", None
+     # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
+     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+     print(agent_code)
+
+     # 2. Fetch Questions
+     print(f"Fetching questions from: {questions_url}")
      try:
+         response = requests.get(questions_url, timeout=15)
          response.raise_for_status()
+         questions_data = response.json()
+         if not questions_data:
+             print("Fetched questions list is empty.")
+             return "Fetched questions list is empty or invalid format.", None
+         print(f"Fetched {len(questions_data)} questions.")
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching questions: {e}")
+         return f"Error fetching questions: {e}", None
+     except requests.exceptions.JSONDecodeError as e:
+         print(f"Error decoding JSON response from questions endpoint: {e}")
+         print(f"Response text: {response.text[:500]}")
+         return f"Error decoding server response for questions: {e}", None
      except Exception as e:
+         print(f"An unexpected error occurred fetching questions: {e}")
+         return f"An unexpected error occurred fetching questions: {e}", None
+
+     # 3. Run your Agent
+     results_log = []
+     answers_payload = []
+     print(f"Running agent on {len(questions_data)} questions...")
+     for item in questions_data:
          task_id = item.get("task_id")
+         question_text = item.get("question")
+         if not task_id or question_text is None:
+             print(f"Skipping item with missing task_id or question: {item}")
              continue
          try:
+             submitted_answer = agent(question_text)
+             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
          except Exception as e:
+             print(f"Error running agent on task {task_id}: {e}")
+             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+
+     if not answers_payload:
+         print("Agent did not produce any answers to submit.")
+         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+
+     # 4. Prepare Submission
+     submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+     status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
+     print(status_update)
+
+     # 5. Submit
+     print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
      try:
+         response = requests.post(submit_url, json=submission_data, timeout=60)
+         response.raise_for_status()
+         result_data = response.json()
+         final_status = (
+             f"Submission Successful!\n"
+             f"User: {result_data.get('username')}\n"
+             f"Overall Score: {result_data.get('score', 'N/A')}% "
+             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+             f"Message: {result_data.get('message', 'No message received.')}"
+         )
+         print("Submission successful.")
+         results_df = pd.DataFrame(results_log)
+         return final_status, results_df
+     except requests.exceptions.HTTPError as e:
+         error_detail = f"Server responded with status {e.response.status_code}."
+         try:
+             error_json = e.response.json()
+             error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
+         except requests.exceptions.JSONDecodeError:
+             error_detail += f" Response: {e.response.text[:500]}"
+         status_message = f"Submission Failed: {error_detail}"
+         print(status_message)
+         results_df = pd.DataFrame(results_log)
+         return status_message, results_df
+     except requests.exceptions.Timeout:
+         status_message = "Submission Failed: The request timed out."
+         print(status_message)
+         results_df = pd.DataFrame(results_log)
+         return status_message, results_df
+     except requests.exceptions.RequestException as e:
+         status_message = f"Submission Failed: Network error - {e}"
+         print(status_message)
+         results_df = pd.DataFrame(results_log)
+         return status_message, results_df
      except Exception as e:
+         status_message = f"An unexpected error occurred during submission: {e}"
+         print(status_message)
+         results_df = pd.DataFrame(results_log)
+         return status_message, results_df
+
+
+ # --- Build Gradio Interface using Blocks ---
+ with gr.Blocks() as demo:
+     gr.Markdown("# Basic Agent Evaluation Runner")
+     gr.Markdown(
+         """
+         **Instructions:**
+         1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc.
+         2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
+         3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
+         ---
+         **Disclaimers:**
+         Once you click the submit button, it can take quite some time (this is the time for the agent to go through all the questions).
+         This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, for the delay on the submit button, a solution could be to cache the answers and submit them in a separate action, or even to answer the questions asynchronously.
+         """
+     )

      gr.LoginButton()
+
+     run_button = gr.Button("Run Evaluation & Submit All Answers")
+
+     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+     # Removed max_rows=10 from DataFrame constructor
+     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+
+     run_button.click(
+         fn=run_and_submit_all,
+         outputs=[status_output, results_table]
      )

  if __name__ == "__main__":
+     print("\n" + "-"*30 + " App Starting " + "-"*30)
+     # Check for SPACE_HOST and SPACE_ID at startup for information
+     space_host_startup = os.getenv("SPACE_HOST")
+     space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup
+
+     if space_host_startup:
+         print(f"✅ SPACE_HOST found: {space_host_startup}")
+         print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
+     else:
+         print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
+
+     if space_id_startup:  # Print repo URLs if SPACE_ID is found
+         print(f"✅ SPACE_ID found: {space_id_startup}")
+         print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
+         print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
+     else:
+         print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
+
+     print("-"*(60 + len(" App Starting ")) + "\n")
+
+     print("Launching Gradio Interface for Basic Agent Evaluation...")
+     demo.launch(debug=True, share=False)
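The Disclaimers text above suggests caching the answers and submitting them in a separate action so the long agent run stays off the submission path. A minimal sketch of that idea follows; it is not part of the commit, and the cache file name and helper names are hypothetical.

import json

CACHE_PATH = "answers_cache.json"  # hypothetical location for the cached payload

def cache_answers(answers_payload: list) -> None:
    # Persist the agent's answers so a later action can submit them.
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(answers_payload, f, indent=2)

def load_cached_answers() -> list:
    # Reload previously cached answers; an empty list means nothing to submit yet.
    try:
        with open(CACHE_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return []

A second Gradio button could then call a submit-only function that loads this file and posts it to the scoring endpoint, reusing the submission block from run_and_submit_all.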
requirements.txt CHANGED
@@ -1,13 +1,11 @@
- gradio>=4.0.0
- transformers>=4.35.0
- torch>=2.0.0
- pandas>=1.5.0
- requests>=2.28.0
- beautifulsoup4>=4.11.0
- wikipedia>=1.4.0
- smolagents>=0.1.0
- accelerate>=0.20.0
- sentencepiece>=0.1.99
- openpyxl
  PyPDF2
- pillow
+ ctransformers==0.2.27
+ gradio==4.19.0
+ requests
+ pandas
+ python-dotenv
+ duckduckgo-search
+ numexpr
  PyPDF2
+ pdfminer.six
+ beautifulsoup4
+ html2text
run.py ADDED
@@ -0,0 +1,8 @@
+ from smolagents import DuckDuckGoSearchTool
+
+ # Initialize the DuckDuckGo search tool
+ search_tool = DuckDuckGoSearchTool()
+
+ # Example usage
+ results = search_tool("Who's the current President of France?")
+ print(results)