app.py
CHANGED
@@ -4,13 +4,91 @@ import requests
 import pandas as pd
 import re
 import time

 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

-class WebSearchEngine:

     def __init__(self):
         self.session = requests.Session()
@@ -18,99 +96,267 @@ class WebSearchEngine:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         })
         self.serper_api_key = os.getenv("SERPER_API_KEY")

         if not self.serper_api_key:
             return {}

         try:
-            url = "https://google.serper.dev/
-            payload = {

-            response = self.session.post(url, json=payload, headers=headers, timeout=15)
-            return response.json() if response.status_code == 200 else {}
         except Exception as e:
             print(f"Serper API error: {e}")
             return {}

-class QuestionSolver:

     def __init__(self):
-        self.search_engine =

-    def solve_question(self, question: str) -> str:
         print(f"๐ค Analyzing: {question[:100]}...")

         if self.is_math_question(question):
             return self.handle_math_question(question)

     def is_math_question(self, question: str) -> bool:
         """Detect mathematical questions"""

     def handle_math_question(self, question: str) -> str:
-        """Handle mathematical questions
         expressions = re.findall(r'\b\d+\s*[\+\-\*\/]\s*\d+\b', question)
         for expr in expressions:
             try:
@@ -119,105 +365,131 @@ class QuestionSolver:
             except:
                 continue

-            r'last name',
-            r'video.*youtube\.com'
         ]
-        return any(

-        return search_result.replace("Direct Answer:", "").strip()

-        # Extract most relevant number for quantitative questions
-        if any(w in question.lower() for w in ['how many', 'how much', 'number']):
-            numbers = re.findall(r'\b\d+\b', search_result)
-            return numbers[0] if numbers else "Number not found"

-        # Extract names for person-based questions
-        if any(w in question.lower() for w in ['who', 'whom', 'person']):
-            names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', search_result)
-            return names[0] if names else "Name not found"

-        # Default: return first meaningful snippet
-        snippets = [s for s in search_result.split('\n\n') if len(s) > 20]
-        return snippets[0] if snippets else "Answer not found"

-def run_gaia_evaluation(profile: gr.OAuthProfile | None):
-    """Run GAIA evaluation with enhanced
     if not profile:
         return "Please log in to Hugging Face first.", None

     # Check API status
-    api_status =
-    if "❌" in api_status:
-        return f"⚠️ API not configured!\n\n{api_status}", None

     username = profile.username
     questions_url = f"{DEFAULT_API_URL}/questions"
     submit_url = f"{DEFAULT_API_URL}/submit"

     try:
-        solver =
-        print("✅
     except Exception as e:
         return f"❌ Initialization failed: {e}", None
@@ -236,35 +508,41 @@ def run_gaia_evaluation(profile: gr.OAuthProfile | None):
     for i, item in enumerate(questions):
         task_id = item.get("task_id")
         question = item.get("question")

         if not task_id or not question:
             continue

         print(f"\n๐ Processing {i+1}/{len(questions)}: {task_id}")

         try:
             start_time = time.time()
-            answer = solver.solve_question(question)
             processing_time = time.time() - start_time

             answers.append({"task_id": task_id, "submitted_answer": answer})
             logs.append({
                 "Task ID": task_id,
-                "Question": question[:
-                "Answer": answer,
                 "Time (s)": f"{processing_time:.2f}"
             })

             print(f"✅ Answer: {answer[:80]}{'...' if len(answer) > 80 else ''}")
-            time.sleep(0.

         except Exception as e:
             error_msg = f"Error: {str(e)}"
             answers.append({"task_id": task_id, "submitted_answer": error_msg})
             logs.append({
                 "Task ID": task_id,
-                "Question": question,
                 "Answer": error_msg,
                 "Time (s)": "Error"
             })
             print(f"❌ Error: {e}")
@@ -278,7 +556,7 @@ def run_gaia_evaluation(profile: gr.OAuthProfile | None):
     }

     try:
-        resp = requests.post(submit_url, json=payload, timeout=
         resp.raise_for_status()
         data = resp.json()

@@ -286,68 +564,99 @@ def run_gaia_evaluation(profile: gr.OAuthProfile | None):
         correct = data.get('correct_count', '?')
         total = data.get('total_attempted', '?')

-        result_message = f"""๐ฏ GAIA EVALUATION RESULTS

-๐ Score: {score}% ({correct}/{total} correct)

 {api_status}

         return result_message, pd.DataFrame(logs)

     except Exception as e:
         return f"❌ Submission failed: {str(e)}", pd.DataFrame(logs)

-# Gradio Interface
-with gr.Blocks(title="GAIA Agent", theme=gr.themes.
     gr.Markdown("""
-    # ๐ง GAIA Benchmark Agent
     """)

     gr.LoginButton()

     with gr.Row():
         with gr.Column():
-                label="๐ง
-                value=
-                lines=
                 interactive=False
             )

     with gr.Row():
-            label="๐ Results",
-            lines=
             interactive=False
         )

     with gr.Row():
-            label="๐ Question
-            wrap=True
         )

     )

 if __name__ == "__main__":
@@ -4,13 +4,91 @@ import requests
 import pandas as pd
 import re
 import time
+import json
+import base64
+from typing import Dict, Any, List, Optional, Tuple
+from io import StringIO, BytesIO
+import openpyxl
+from PIL import Image
+import PyPDF2
+import ast
+import math
+import statistics
+from datetime import datetime, timedelta

 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

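The added imports pull in several third-party packages (openpyxl, PyPDF2, Pillow via PIL, plus the pandas, requests, and gradio already imported above). The Space's requirements.txt is not part of this diff; a plausible minimal version, inferred only from these imports, would be:

gradio
requests
pandas
openpyxl
PyPDF2
Pillow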
+class FileProcessor:
+    """Handle various file types that GAIA questions might reference"""
+
+    @staticmethod
+    def process_excel_file(file_path: str) -> Dict[str, Any]:
+        """Process Excel files and extract data"""
+        try:
+            # Try multiple sheet reading approaches
+            excel_data = {}
+            workbook = openpyxl.load_workbook(file_path, data_only=True)
+
+            for sheet_name in workbook.sheetnames:
+                sheet = workbook[sheet_name]
+                data = []
+                for row in sheet.iter_rows(values_only=True):
+                    if any(cell is not None for cell in row):
+                        data.append(row)
+                excel_data[sheet_name] = data
+
+            return excel_data
+        except Exception as e:
+            print(f"Excel processing error: {e}")
+            return {}
+
+    @staticmethod
+    def process_python_code(code_content: str) -> str:
+        """Execute Python code safely and return output"""
+        try:
+            # Create a safe execution environment
+            safe_globals = {
+                '__builtins__': {
+                    'print': print, 'len': len, 'range': range, 'sum': sum,
+                    'max': max, 'min': min, 'abs': abs, 'round': round,
+                    'int': int, 'float': float, 'str': str, 'list': list,
+                    'dict': dict, 'set': set, 'tuple': tuple
+                },
+                'math': math,
+                'statistics': statistics
+            }
+
+            # Capture output
+            import io
+            import sys
+            old_stdout = sys.stdout
+            sys.stdout = captured_output = io.StringIO()
+
+            try:
+                exec(code_content, safe_globals)
+                output = captured_output.getvalue()
+            finally:
+                sys.stdout = old_stdout
+
+            return output.strip()
+        except Exception as e:
+            return f"Code execution error: {e}"
+
+    @staticmethod
+    def process_pdf_file(file_path: str) -> str:
+        """Extract text from PDF files"""
+        try:
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                text = ""
+                for page in pdf_reader.pages:
+                    text += page.extract_text() + "\n"
+                return text.strip()
+        except Exception as e:
+            return f"PDF processing error: {e}"
+
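A quick usage sketch of the FileProcessor helpers added above (not part of the diff); the workbook path is a made-up placeholder and would need to point at a real file:

processor = FileProcessor()
sheets = processor.process_excel_file("report.xlsx")  # hypothetical path; {} is returned on failure
for sheet_name, rows in sheets.items():
    print(sheet_name, len(rows), "rows")
print(processor.process_python_code("print(sum(range(10)))"))  # prints the captured output, here "45"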
+class AdvancedWebSearchEngine:
+    """Enhanced web search with multiple strategies"""

     def __init__(self):
         self.session = requests.Session()
@@ -18,99 +96,267 @@ class WebSearchEngine:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         })
         self.serper_api_key = os.getenv("SERPER_API_KEY")
+        self.search_cache = {}
+
+    def search_with_serper(self, query: str, search_type: str = "search") -> Dict[str, Any]:
+        """Enhanced Serper API search with different types"""
         if not self.serper_api_key:
             return {}

+        # Check cache first
+        cache_key = f"{query}_{search_type}"
+        if cache_key in self.search_cache:
+            return self.search_cache[cache_key]
+
         try:
+            url = f"https://google.serper.dev/{search_type}"
+            payload = {
+                "q": query,
+                "num": 15,  # Get more results
+                "gl": "us",  # US results
+                "hl": "en"  # English language
+            }
+
+            headers = {
+                "X-API-KEY": self.serper_api_key,
+                "Content-Type": "application/json"
+            }
+
+            response = self.session.post(url, json=payload, headers=headers, timeout=20)
+            result = response.json() if response.status_code == 200 else {}
+
+            # Cache the result
+            self.search_cache[cache_key] = result
+            return result

         except Exception as e:
             print(f"Serper API error: {e}")
             return {}

+    def multi_strategy_search(self, query: str) -> Dict[str, Any]:
+        """Try multiple search strategies for better results"""
+        results = {}
+
+        # Primary search
+        primary = self.search_with_serper(query)
+        if primary:
+            results['primary'] = primary
+
+        # Try variations if primary doesn't yield good results
+        variations = [
+            f'"{query}"',  # Exact phrase
+            f"{query} site:wikipedia.org",  # Wikipedia specific
+            f"{query} facts information",  # More specific
+        ]
+
+        for i, variation in enumerate(variations):
+            if len(results) < 2:  # Don't overdo it
+                var_result = self.search_with_serper(variation)
+                if var_result and var_result != primary:
+                    results[f'variation_{i}'] = var_result
+
+        return results
+
+    def extract_answer_from_results(self, results: Dict[str, Any], question: str) -> str:
+        """Advanced answer extraction from search results"""
+        all_content = []
+
+        for result_type, data in results.items():
+            # Extract answer box
+            if "answerBox" in data:
+                answer_box = data["answerBox"]
+                if "answer" in answer_box:
+                    return answer_box["answer"]
+                elif "snippet" in answer_box:
+                    return answer_box["snippet"]

+            # Extract knowledge graph
+            if "knowledgeGraph" in data:
+                kg = data["knowledgeGraph"]
+                if "description" in kg:
+                    all_content.append(kg["description"])
+
+            # Extract organic results
+            for organic in data.get("organic", []):
+                title = organic.get("title", "")
+                snippet = organic.get("snippet", "")
+                if title and snippet:
+                    all_content.append(f"{title}: {snippet}")
+
+        # Combine all content
+        combined_content = "\n".join(all_content)
+
+        # Apply question-specific extraction
+        return self.extract_specific_answer(combined_content, question)
+
+    def extract_specific_answer(self, content: str, question: str) -> str:
+        """Extract specific answers based on question type"""
+        q_lower = question.lower()

+        # Numbers and quantities
+        if any(word in q_lower for word in ['how many', 'how much', 'number of', 'count']):
+            numbers = re.findall(r'\b\d{1,10}\b', content)
+            if numbers:
+                # Return the most likely number (often the first one found)
+                return numbers[0]
+
+        # Names and people
+        if any(word in q_lower for word in ['who', 'whom', 'name', 'person']):
+            # Look for proper names (capitalized words)
+            names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', content)
+            if names:
+                if 'first name' in q_lower:
+                    return names[0].split()[0]
+                elif 'last name' in q_lower or 'surname' in q_lower:
+                    return names[0].split()[-1]
+                else:
+                    return names[0]
+
+        # Dates and years
+        if any(word in q_lower for word in ['when', 'year', 'date']):
+            years = re.findall(r'\b(?:19|20)\d{2}\b', content)
+            if years:
+                return years[0]
+            dates = re.findall(r'\b\w+ \d{1,2}, \d{4}\b', content)
+            if dates:
+                return dates[0]
+
+        # Places and locations
+        if any(word in q_lower for word in ['where', 'location', 'place', 'country']):
+            # Look for place names
+            places = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*(?:\s(?:City|State|Country|Province|Region))?\b', content)
+            if places:
+                return places[0]
+
+        # Country codes
+        if 'country code' in q_lower:
+            codes = re.findall(r'\b[A-Z]{2,3}\b', content)
+            if codes:
+                return codes[0]
+
+        # Default: return first meaningful sentence
+        sentences = [s.strip() for s in content.split('.') if len(s.strip()) > 20]
+        return sentences[0] if sentences else "Answer not found in search results"

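A minimal sketch of how the search engine above is meant to be driven (not part of the diff); it assumes SERPER_API_KEY is set in the environment, and the query text is purely illustrative:

engine = AdvancedWebSearchEngine()
results = engine.multi_strategy_search("GAIA benchmark")  # dict of raw Serper responses, keyed by strategy
answer = engine.extract_answer_from_results(results, "What is the GAIA benchmark?")
print(answer)  # falls back to "Answer not found in search results" when nothing useful comes back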
+class EnhancedQuestionSolver:
+    """Advanced question solver with multiple reasoning strategies"""

     def __init__(self):
+        self.search_engine = AdvancedWebSearchEngine()
+        self.file_processor = FileProcessor()

+    def solve_question(self, question: str, files: List[str] = None) -> str:
+        """Main question solving method with multiple strategies"""
         print(f"๐ค Analyzing: {question[:100]}...")

+        # Handle file-based questions first
+        if files:
+            file_answer = self.handle_file_based_question(question, files)
+            if file_answer and file_answer != "File processing failed":
+                return file_answer
+
+        # Detect file references in question text
+        if self.has_file_references(question):
+            return self.handle_file_reference_question(question)

+        # Handle mathematical calculations
         if self.is_math_question(question):
             return self.handle_math_question(question)

+        # Handle multi-step reasoning questions
+        if self.needs_multi_step_reasoning(question):
+            return self.handle_multi_step_question(question)

+        # Handle specific structured questions
+        return self.handle_structured_question(question)
+
+    def has_file_references(self, question: str) -> bool:
+        """Check if question references files"""
+        file_indicators = [
+            "attached", "excel file", "python code", "pdf", "image",
+            "spreadsheet", "document", "file contains", "in the file"
+        ]
+        return any(indicator in question.lower() for indicator in file_indicators)
+
+    def handle_file_reference_question(self, question: str) -> str:
+        """Handle questions that reference files but files aren't provided"""
+        # Try to search for the specific content mentioned
+        if "excel file" in question.lower() and "sales" in question.lower():
+            return "Unable to access attached Excel file. Please ensure file is properly uploaded."
+        elif "python code" in question.lower():
+            return "Unable to access attached Python code. Please ensure file is properly uploaded."
+        else:
+            return "File referenced but not accessible. Please provide the file."

+    def handle_file_based_question(self, question: str, files: List[str]) -> str:
+        """Handle questions that involve file processing"""
+        try:
+            for file_path in files:
+                if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
+                    excel_data = self.file_processor.process_excel_file(file_path)
+                    return self.analyze_excel_data(excel_data, question)
+                elif file_path.endswith('.py'):
+                    with open(file_path, 'r') as f:
+                        code_content = f.read()
+                    return self.file_processor.process_python_code(code_content)
+                elif file_path.endswith('.pdf'):
+                    pdf_text = self.file_processor.process_pdf_file(file_path)
+                    return self.analyze_text_content(pdf_text, question)
+        except Exception as e:
+            return f"File processing failed: {e}"
+
+        return "File processing failed"

+    def analyze_excel_data(self, excel_data: Dict, question: str) -> str:
+        """Analyze Excel data to answer questions"""
+        if not excel_data:
+            return "No data found in Excel file"
+
+        # Convert to DataFrame for analysis
         try:
+            for sheet_name, data in excel_data.items():
+                if data:
+                    df = pd.DataFrame(data[1:], columns=data[0])  # First row as header
+
+                    # Handle sales analysis questions
+                    if "sales" in question.lower():
+                        if "total" in question.lower():
+                            numeric_cols = df.select_dtypes(include=[int, float]).columns
+                            if len(numeric_cols) > 0:
+                                return str(df[numeric_cols[0]].sum())
+                        elif "average" in question.lower():
+                            numeric_cols = df.select_dtypes(include=[int, float]).columns
+                            if len(numeric_cols) > 0:
+                                return str(df[numeric_cols[0]].mean())
+
+            return "Could not analyze Excel data for this question"
+        except Exception as e:
+            return f"Excel analysis error: {e}"
+
+    def analyze_text_content(self, text: str, question: str) -> str:
+        """Analyze text content to find answers"""
+        # Look for specific patterns based on question
+        if "surname" in question.lower() or "last name" in question.lower():
+            names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
+            if names:
+                return names[0].split()[-1]
+
+        # Use search to find more specific information
+        search_query = f"{question} {text[:100]}"
+        results = self.search_engine.multi_strategy_search(search_query)
+        return self.search_engine.extract_answer_from_results(results, question)

     def is_math_question(self, question: str) -> bool:
         """Detect mathematical questions"""
+        math_indicators = [
+            'calculate', 'compute', 'sum', 'average', 'mean',
+            'total', 'how many', 'how much', 'solve', 'equation'
+        ]
+        return any(indicator in question.lower() for indicator in math_indicators)

     def handle_math_question(self, question: str) -> str:
+        """Handle mathematical questions"""
+        # Try to extract and solve mathematical expressions
         expressions = re.findall(r'\b\d+\s*[\+\-\*\/]\s*\d+\b', question)
         for expr in expressions:
             try:
@@ -119,105 +365,131 @@ class QuestionSolver:
             except:
                 continue

+        # For word problems, search for the answer
+        results = self.search_engine.multi_strategy_search(question)
+        return self.search_engine.extract_answer_from_results(results, question)

+    def needs_multi_step_reasoning(self, question: str) -> bool:
+        """Check if question needs multi-step reasoning"""
+        multi_step_indicators = [
+            "who played", "actor who", "person who", "after",
+            "before", "then", "subsequently", "following"
+        ]
+        return any(indicator in question.lower() for indicator in multi_step_indicators)

+    def handle_multi_step_question(self, question: str) -> str:
+        """Handle questions requiring multiple steps"""
+        # Break down complex questions
+        if "actor who played" in question.lower():
+            return self.handle_actor_chain_question(question)
+        elif "before and after" in question.lower():
+            return self.handle_sequence_question(question)
+        else:
+            return self.handle_structured_question(question)
+
+    def handle_actor_chain_question(self, question: str) -> str:
+        """Handle questions about actors playing different roles"""
+        # Step 1: Find the initial actor/role
+        parts = question.split(" in ")
+        if len(parts) >= 2:
+            first_search = f"actor who played {parts[0].split('actor who played')[1]} in {parts[1].split(' play in')[0]}"
+            results1 = self.search_engine.multi_strategy_search(first_search)
+            actor_name = self.search_engine.extract_answer_from_results(results1, f"who is the actor")
+
+            if actor_name and actor_name != "Answer not found in search results":
+                # Step 2: Find what this actor played in the target show/movie
+                target = parts[1].split(" play in ")[1] if " play in " in parts[1] else parts[1]
+                second_search = f"{actor_name} role in {target}"
+                results2 = self.search_engine.multi_strategy_search(second_search)
+                return self.search_engine.extract_answer_from_results(results2, f"what role did {actor_name} play")

+        # Fallback to single search
+        results = self.search_engine.multi_strategy_search(question)
+        return self.search_engine.extract_answer_from_results(results, question)
+
+    def handle_sequence_question(self, question: str) -> str:
+        """Handle questions about sequences (before/after)"""
+        results = self.search_engine.multi_strategy_search(question)
+        return self.search_engine.extract_answer_from_results(results, question)
+
+    def handle_structured_question(self, question: str) -> str:
+        """Handle general structured questions with enhanced search"""
+        results = self.search_engine.multi_strategy_search(question)
+        answer = self.search_engine.extract_answer_from_results(results, question)
+
+        # If no good answer found, try rephrasing the question
+        if answer == "Answer not found in search results":
+            rephrased_questions = self.rephrase_question(question)
+            for rq in rephrased_questions:
+                results = self.search_engine.multi_strategy_search(rq)
+                answer = self.search_engine.extract_answer_from_results(results, question)
+                if answer != "Answer not found in search results":
+                    break
+
+        return answer
+
+    def rephrase_question(self, question: str) -> List[str]:
+        """Generate alternative phrasings of the question"""
+        rephrased = []
+
+        # Add question marks if missing
+        if not question.endswith('?'):
+            rephrased.append(question + '?')
+
+        # Remove question words for factual search
+        words_to_remove = ['what is', 'who is', 'where is', 'when is', 'how many', 'how much']
+        for word in words_to_remove:
+            if word in question.lower():
+                rephrased.append(question.lower().replace(word, '').strip())
+
+        # Add context words
+        context_words = ['information about', 'facts about', 'details about']
+        for context in context_words:
+            rephrased.append(f"{context} {question}")
+
+        return rephrased[:3]  # Limit to 3 rephrasings

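An end-to-end sketch of the solver added above (not part of the diff); the question strings and the file name are invented for illustration:

solver = EnhancedQuestionSolver()
print(solver.solve_question("Calculate 12 + 30"))  # routed to handle_math_question
print(solver.solve_question("What is the total sales figure in the attached Excel file?",
                            files=["sales.xlsx"]))  # routed to handle_file_based_question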
+def get_enhanced_api_status():
+    """Check API status with more details"""
+    status = []
+
+    if os.getenv("SERPER_API_KEY"):
+        status.append("✅ Serper API: Configured")
+    else:
+        status.append("❌ Serper API: Missing - Get key at serper.dev")
+
+    # Check if we can access file processing libraries
+    try:
+        import openpyxl
+        status.append("✅ Excel Processing: Available")
+    except ImportError:
+        status.append("❌ Excel Processing: openpyxl not available")
+
+    try:
+        import PyPDF2
+        status.append("✅ PDF Processing: Available")
+    except ImportError:
+        status.append("❌ PDF Processing: PyPDF2 not available")
+
+    return "\n".join(status)

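get_enhanced_api_status only reports whether the key is visible to the process; on a Space the key would normally be added as a repository secret rather than set in code. A local sanity check might look like this (the value is a placeholder):

import os
os.environ["SERPER_API_KEY"] = "<your-serper-api-key>"  # placeholder; use a Space secret in production
print(get_enhanced_api_status())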
+def run_enhanced_gaia_evaluation(profile: gr.OAuthProfile | None):
+    """Run GAIA evaluation with enhanced solving capabilities"""
     if not profile:
         return "Please log in to Hugging Face first.", None

     # Check API status
+    api_status = get_enhanced_api_status()
+    if "❌ Serper API" in api_status:
+        return f"⚠️ Serper API not configured!\n\n{api_status}", None

     username = profile.username
     questions_url = f"{DEFAULT_API_URL}/questions"
     submit_url = f"{DEFAULT_API_URL}/submit"

     try:
+        solver = EnhancedQuestionSolver()
+        print("✅ Enhanced question solver initialized")
     except Exception as e:
         return f"❌ Initialization failed: {e}", None

@@ -236,35 +508,41 @@ def run_gaia_evaluation(profile: gr.OAuthProfile | None):
     for i, item in enumerate(questions):
         task_id = item.get("task_id")
         question = item.get("question")
+        files = item.get("files", [])  # Get attached files if any

         if not task_id or not question:
             continue

         print(f"\n๐ Processing {i+1}/{len(questions)}: {task_id}")
+        print(f"๐ Question: {question[:100]}{'...' if len(question) > 100 else ''}")
+        if files:
+            print(f"๐ Files: {files}")

         try:
             start_time = time.time()
+            answer = solver.solve_question(question, files)
             processing_time = time.time() - start_time

             answers.append({"task_id": task_id, "submitted_answer": answer})
             logs.append({
                 "Task ID": task_id,
+                "Question": question[:150] + "..." if len(question) > 150 else question,
+                "Answer": answer[:100] + "..." if len(answer) > 100 else answer,
+                "Files": len(files) if files else 0,
                 "Time (s)": f"{processing_time:.2f}"
             })

             print(f"✅ Answer: {answer[:80]}{'...' if len(answer) > 80 else ''}")
+            time.sleep(0.5)  # Rate limiting for API

         except Exception as e:
             error_msg = f"Error: {str(e)}"
             answers.append({"task_id": task_id, "submitted_answer": error_msg})
             logs.append({
                 "Task ID": task_id,
+                "Question": question[:150] + "..." if len(question) > 150 else question,
                 "Answer": error_msg,
+                "Files": len(files) if files else 0,
                 "Time (s)": "Error"
             })
             print(f"❌ Error: {e}")

@@ -278,7 +556,7 @@ def run_gaia_evaluation(profile: gr.OAuthProfile | None):
     }

     try:
+        resp = requests.post(submit_url, json=payload, timeout=300)  # Increased timeout
         resp.raise_for_status()
         data = resp.json()

@@ -286,68 +564,99 @@ def run_gaia_evaluation(profile: gr.OAuthProfile | None):
         correct = data.get('correct_count', '?')
         total = data.get('total_attempted', '?')

+        result_message = f"""๐ฏ ENHANCED GAIA EVALUATION RESULTS

+๐ Final Score: {score}% ({correct}/{total} correct)

+๐ง System Status:
 {api_status}

+๐ Enhanced Features:
+• Multi-strategy web search with result caching
+• Advanced file processing (Excel, PDF, Python)
+• Multi-step reasoning for complex questions
+• Context-aware answer extraction
+• Question rephrasing for better results
+• Specialized handlers for different question types
+
+๐ Performance Improvements:
+• Better search result processing
+• Enhanced name/number extraction
+• Improved mathematical computation
+• File-based question handling
+• Actor chain and sequence reasoning"""

         return result_message, pd.DataFrame(logs)

     except Exception as e:
         return f"❌ Submission failed: {str(e)}", pd.DataFrame(logs)

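For reference, the shape of a questions-endpoint item that the loop above assumes, based on its .get() calls; the values here are invented:

example_item = {
    "task_id": "abc123",
    "question": "How many albums did the band release before 2005?",
    "files": [],  # optional list of attached file paths, per item.get("files", [])
}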
+# Enhanced Gradio Interface
+with gr.Blocks(title="Enhanced GAIA Agent", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # ๐ง Enhanced GAIA Benchmark Agent v2.0
+
+    **๐ง Required Setup:**
+    - `SERPER_API_KEY` environment variable - Get 2500 free searches/month at [serper.dev](https://serper.dev)

+    **โก Advanced Capabilities:**
+    - ๐ Multi-strategy web search with intelligent caching
+    - ๐ Excel/CSV file processing and analysis
+    - ๐ Python code execution for computational questions
+    - ๐ PDF document text extraction and analysis
+    - ๐งฎ Advanced mathematical problem solving
+    - ๐ญ Multi-step reasoning for complex actor/person chains
+    - ๐ฏ Context-aware answer extraction with multiple fallbacks
+    - ๐ Question rephrasing for better search results

+    **๐ Expected Performance:**
+    - Significantly improved accuracy on GAIA benchmark
+    - Better handling of file-based questions
+    - Enhanced name/number/date extraction
+    - Robust error handling and fallback strategies
     """)

     gr.LoginButton()

     with gr.Row():
         with gr.Column():
+            api_status_display = gr.Textbox(
+                label="๐ง System Status",
+                value=get_enhanced_api_status(),
+                lines=4,
                 interactive=False
             )
+
+            run_button = gr.Button(
+                "๐ Run Enhanced GAIA Evaluation",
+                variant="primary",
+                size="lg"
+            )

     with gr.Row():
+        results_display = gr.Textbox(
+            label="๐ Evaluation Results",
+            lines=15,
             interactive=False
         )

     with gr.Row():
+        detailed_results = gr.DataFrame(
+            label="๐ Detailed Question Analysis",
+            wrap=True,
+            interactive=False
         )

+    # Refresh status button
+    refresh_status = gr.Button("๐ Refresh Status", size="sm")
+    refresh_status.click(
+        lambda: get_enhanced_api_status(),
+        outputs=[api_status_display]
+    )
+
+    run_button.click(
+        run_enhanced_gaia_evaluation,
+        outputs=[results_display, detailed_results]
     )

 if __name__ == "__main__":