Spaces: Runtime error
Commit: fixing
app.py (CHANGED)
@@ -17,6 +17,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 import torch
 import time
 import gc
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 # --- Load Environment Variables ---
 load_dotenv()
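A note on the two new imports: in the hunks shown below, neither `threading` nor `ThreadPoolExecutor`/`as_completed` is ever used; the question loop in `run_and_submit_all` stays sequential even though its new docstring mentions parallel processing. A minimal sketch of what a parallel variant could look like (hypothetical, not part of this commit; on a 2 vCPU Space the single shared model would limit any speedup):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def answer_all_parallel(agent, questions, max_workers=2):
    """Run the agent over all questions concurrently (illustrative only)."""
    answers = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit one job per question; futures map back to their task ids.
        futures = {pool.submit(agent, q["question"]): q["task_id"] for q in questions}
        for future in as_completed(futures):
            task_id = futures[future]
            try:
                answers[task_id] = future.result()
            except Exception as e:
                answers[task_id] = f"ERROR: {e}"
    return answers
```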
@@ -24,109 +26,96 @@ SERPER_API_KEY = os.getenv("SERPER_API_KEY")
 
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-MAX_STEPS = 6
-MAX_TOKENS = 256
+MAX_STEPS = 4  # Reduced from 6
+MAX_TOKENS = 128  # Reduced from 256
 MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
+TIMEOUT_PER_QUESTION = 30  # 30 seconds max per question
 
 # --- Configure Environment for Hugging Face Spaces ---
 os.environ["PIP_BREAK_SYSTEM_PACKAGES"] = "1"
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
 os.environ["BITSANDBYTES_NOWELCOME"] = "1"
 
-print("Loading model (CPU-…
+print("Loading model (CPU-optimized)...")
 start_time = time.time()
 
-# Load model with …
+# Load model with aggressive optimization
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     trust_remote_code=True,
-    torch_dtype=torch.float32,
-    device_map="cpu",
-    low_cpu_mem_usage=True,
-    use_cache=False
+    torch_dtype=torch.float32,
+    device_map="cpu",
+    low_cpu_mem_usage=True,
+    use_cache=False,
+    attn_implementation="eager"  # Use eager attention for better CPU performance
 )
 
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_NAME,
-    use_fast=…
+    use_fast=True,  # Changed to True for faster tokenization
     trust_remote_code=True
 )
 
-# Ensure pad token is set
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 
 load_time = time.time() - start_time
 print(f"Model loaded in {load_time:.2f} seconds")
 
-# --- Tools…
+# --- Optimized Tools ---
 def web_search(query: str) -> str:
-    """Search the web…
+    """Search the web with timeout and result limiting"""
     try:
         if SERPER_API_KEY:
-            …
-            …
-                'q': query,
-                'num': 3,
-                'hl': 'en',
-                'gl': 'us'
-            }
-            headers = {
-                'X-API-KEY': SERPER_API_KEY,
-                'Content-Type': 'application/json'
-            }
+            params = {'q': query, 'num': 2, 'hl': 'en', 'gl': 'us'}
+            headers = {'X-API-KEY': SERPER_API_KEY, 'Content-Type': 'application/json'}
             response = requests.post(
                 'https://google.serper.dev/search',
                 headers=headers,
                 json=params,
-                timeout=…
+                timeout=5  # Reduced timeout
             )
             results = response.json()
             if 'organic' in results:
-                return json.dumps([r['title']…
+                return json.dumps([f"{r['title']}: {r['snippet'][:100]}" for r in results['organic'][:2]])
             return "No results found"
         else:
-            # Fallback to DuckDuckGo
             with DDGS() as ddgs:
-                results = [r for r in ddgs.text(query, max_results=…
-                return json.dumps([r['title']…
+                results = [r for r in ddgs.text(query, max_results=2)]
+                return json.dumps([f"{r['title']}: {r['body'][:100]}" for r in results])
     except Exception as e:
         return f"Search error: {str(e)}"
 
 def calculator(expression: str) -> str:
-    """…
+    """Fast mathematical evaluation"""
     try:
-        # Clean the expression
         expression = re.sub(r'[^\d+\-*/().\s]', '', expression)
         result = numexpr.evaluate(expression)
-        return str(result)
+        return str(float(result))
     except Exception as e:
         return f"Calculation error: {str(e)}"
 
 def read_pdf(file_path: str) -> str:
-    """Extract text from PDF…
+    """Extract text from PDF with length limit"""
     try:
         text = extract_text(file_path)
-        return text[:…
+        return text[:1000] if text else "No text found in PDF"  # Reduced limit
     except Exception as e:
         return f"PDF read error: {str(e)}"
 
 def read_webpage(url: str) -> str:
-    """…
+    """Fast webpage reading with aggressive limits"""
     try:
-        headers = {
-            …
-        }
-        response = requests.get(url, timeout=10, headers=headers)
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+        response = requests.get(url, timeout=5, headers=headers)  # Reduced timeout
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove script and style elements
         for script in soup(["script", "style"]):
             script.decompose()
 
         text = soup.get_text(separator=' ', strip=True)
-        return text[:…
+        return text[:1000] if text else "No text found on webpage"  # Reduced limit
     except Exception as e:
         return f"Webpage read error: {str(e)}"
 
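The `calculator` change is easy to miss: `numexpr.evaluate` returns a zero-dimensional NumPy array for scalar expressions, and the new `float(...)` cast normalizes it to a plain Python float before formatting. A self-contained sketch of the same sanitize-then-evaluate pattern (assumes `numexpr` is installed):

```python
import re
import numexpr

def safe_calculate(expression: str) -> str:
    # Whitelist digits, arithmetic operators, parentheses, dot, and whitespace;
    # anything else (names, quotes, semicolons) is stripped before evaluation.
    cleaned = re.sub(r'[^\d+\-*/().\s]', '', expression)
    # numexpr.evaluate returns a zero-dimensional NumPy array here;
    # float() turns it into a plain Python number.
    return str(float(numexpr.evaluate(cleaned)))

print(safe_calculate("67800000 / 643801"))   # -> "105.312..." (the density example)
print(safe_calculate("import os; 1 + 1"))    # letters and ';' stripped -> "1 + 1" -> "2.0"
```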
@@ -137,115 +126,98 @@ TOOLS = {
     "read_webpage": read_webpage
 }
 
-# --- GAIA Agent…
+# --- Optimized GAIA Agent ---
 class GAIA_Agent:
     def __init__(self):
         self.tools = TOOLS
-        self.history = []
         self.system_prompt = (
-            "You are …
-            "…
-            "…
-            "…
-            "3. Tools must be called as: ```json\n{'tool': 'tool_name', 'args': {'arg1': value}}```\n"
-            "4. Final Answer must be exact and standalone\n\n"
-            "Example:\n"
-            "Question: \"What's the population density of France? (File: france_data.pdf)\"\n"
-            "Thought: Need population and area. Read PDF first.\n"
-            "Action: ```json\n{'tool': 'read_pdf', 'args': {'file_path': 'france_data.pdf'}}```\n"
-            "Observation: Population: 67.8M, Area: 643,801 km²\n"
-            "Thought: Now calculate density: 67,800,000 / 643,801\n"
-            "Action: ```json\n{'tool': 'calculator', 'args': {'expression': '67800000 / 643801'}}```\n"
-            "Observation: 105.32\n"
-            "Final Answer: 105.32 people/km²"
+            "You are a GAIA problem solver. Tools: {web_search, calculator, read_pdf, read_webpage}.\n"
+            "Be concise and direct. Use tools efficiently.\n"
+            "Tool format: ```json\n{'tool': 'tool_name', 'args': {'arg1': value}}```\n"
+            "End with: Final Answer: [exact answer]"
        )
 
     def __call__(self, question: str) -> str:
-        …
-        …
+        start_time = time.time()
+        print(f"Processing: {question[:50]}...")
 
         try:
+            history = [f"Question: {question}"]
+
             for step in range(MAX_STEPS):
-                …
+                # Check timeout
+                if time.time() - start_time > TIMEOUT_PER_QUESTION:
+                    return "TIMEOUT: Question took too long"
+
+                prompt = self._build_prompt(history)
                 response = self._call_model(prompt)
 
                 if "Final Answer" in response:
                     answer = response.split("Final Answer:")[-1].strip()
-                    …
+                    elapsed = time.time() - start_time
+                    print(f"Completed in {elapsed:.1f}s: {answer[:30]}...")
                     return answer
 
                 tool_call = self._parse_tool_call(response)
                 if tool_call:
                     tool_name, args = tool_call
                     observation = self._use_tool(tool_name, args)
-                    …
+                    history.append(f"Action: {tool_name}")
+                    history.append(f"Result: {observation}")
                 else:
-                    …
-                    …
-                # …
-                …
-                gc.collect()
+                    history.append(f"Thought: {response}")
+
+                # Aggressive memory cleanup
+                gc.collect()
 
-            return "…
+            return "Could not solve within step limit"
 
         except Exception as e:
-            print(f"…
-            return f"…
+            print(f"Agent error: {str(e)}")
+            return f"Error: {str(e)}"
 
-    def _build_prompt(self) -> str:
+    def _build_prompt(self, history: List[str]) -> str:
         prompt = "<|system|>\n" + self.system_prompt + "<|end|>\n"
-        prompt += "<|user|>\n" + "\n".join(…
+        prompt += "<|user|>\n" + "\n".join(history) + "<|end|>\n"
         prompt += "<|assistant|>"
         return prompt
 
     def _call_model(self, prompt: str) -> str:
-        start_time = time.time()
-
         try:
-            # Tokenize input
             inputs = tokenizer(
                 prompt,
                 return_tensors="pt",
-                return_attention_mask=True,
                 truncation=True,
-                max_length=…
+                max_length=2048,  # Reduced context
+                padding=False
             )
 
-            # Move to same device as model
-            inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-            # Create generation config
             generation_config = GenerationConfig(
                 max_new_tokens=MAX_TOKENS,
-                temperature=0.…
+                temperature=0.1,  # Less randomness for faster convergence
                 do_sample=True,
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
-                use_cache=False
+                use_cache=False
             )
 
-            # Generate response
             with torch.no_grad():
                 outputs = model.generate(
-                    …
-                    generation_config=generation_config
+                    inputs.input_ids,
+                    generation_config=generation_config,
+                    attention_mask=inputs.attention_mask
                 )
 
-            # Decode response
             full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
             response = full_response.split("<|assistant|>")[-1].strip()
 
-            …
-            print(f"Generated {len(response)} tokens in {gen_time:.2f}s: {response[:60]}...")
-
-            # Clean up
+            # Immediate cleanup
             del inputs, outputs
-            …
+            torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
             return response
 
         except Exception as e:
-            print(f"Model generation error: {str(e)}")
             return f"Generation error: {str(e)}"
 
     def _parse_tool_call(self, text: str) -> Optional[Tuple[str, Dict]]:
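The core behavioral change in `__call__` is the wall-clock budget checked at the top of every reasoning step. A stripped-down sketch of that pattern (names mirror the constants above; note the check only fires between steps, so a single slow `model.generate` call can still overshoot the 30 s budget):

```python
import time

MAX_STEPS = 4
TIMEOUT_PER_QUESTION = 30  # seconds

def run_with_deadline(step_fn):
    """Run up to MAX_STEPS steps, aborting once the time budget is spent."""
    start = time.time()
    for step in range(MAX_STEPS):
        if time.time() - start > TIMEOUT_PER_QUESTION:
            return "TIMEOUT: Question took too long"
        answer = step_fn(step)
        if answer is not None:  # a step produced a Final Answer
            return answer
    return "Could not solve within step limit"

# Toy usage: the "agent" answers on its third step.
print(run_with_deadline(lambda step: "42" if step == 2 else None))
```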
@@ -255,36 +227,29 @@ class GAIA_Agent:
             tool_call = json.loads(json_match.group(1))
             if "tool" in tool_call and "args" in tool_call:
                 return tool_call["tool"], tool_call["args"]
-        except …
-            …
+        except:
+            pass
         return None
 
     def _use_tool(self, tool_name: str, args: Dict) -> str:
         if tool_name not in self.tools:
-            return f"…
+            return f"Unknown tool: {tool_name}"
 
-        print(f"Using tool: {tool_name}({args})")
         try:
-            start_time = time.time()
             result = self.tools[tool_name](**args)
-            …
-            print(f"Tool executed in {exec_time:.2f}s")
-            return str(result)[:500]  # Truncate long outputs
+            return str(result)[:300]  # Truncate results
         except Exception as e:
             return f"Tool error: {str(e)}"
 
-# --- Evaluation Runner ---
+# --- Optimized Evaluation Runner ---
 def run_and_submit_all(profile: gr.OAuthProfile | None):
-    """…
+    """Fast evaluation with parallel processing where possible"""
     space_id = os.getenv("SPACE_ID")
 
-    if profile:
-        username = f"{profile.username}"
-        print(f"User logged in: {username}")
-    else:
-        print("User not logged in.")
+    if not profile:
         return "Please Login to Hugging Face with the button.", None
 
+    username = profile.username
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     submit_url = f"{api_url}/submit"
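One caveat worth flagging in `_parse_tool_call`: the system prompt's example emits single-quoted keys (`{'tool': ...}`), which `json.loads` rejects, so the new bare `except: pass` will silently drop any tool call the model formats exactly as instructed. A sketch of a more forgiving parser (the fenced-block regex is assumed; the diff does not show the actual `json_match` pattern):

```python
import ast
import json
import re
from typing import Dict, Optional, Tuple

def parse_tool_call(text: str) -> Optional[Tuple[str, Dict]]:
    # Extract the body of a ```json ... ``` fenced block, if present.
    match = re.search(r"```json\s*(.*?)\s*```", text, re.DOTALL)
    if not match:
        return None
    raw = match.group(1)
    try:
        call = json.loads(raw)            # strict JSON first
    except json.JSONDecodeError:
        try:
            call = ast.literal_eval(raw)  # tolerate Python-literal (single-quote) dialect
        except (ValueError, SyntaxError):
            return None
    if isinstance(call, dict) and "tool" in call and "args" in call:
        return call["tool"], call["args"]
    return None

print(parse_tool_call("Action: ```json\n{'tool': 'calculator', 'args': {'expression': '1+1'}}```"))
```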
@@ -292,127 +257,103 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     try:
         agent = GAIA_Agent()
     except Exception as e:
-        print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
 
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(f"Agent code URL: {agent_code}")
 
     # Fetch Questions
-    print(f"Fetching questions from: {questions_url}")
     try:
-        response = requests.get(questions_url, timeout=…
+        response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
-            …
-            …
-        print(f"Fetched {len(questions_data)} questions.")
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching questions: {e}")
-        return f"Error fetching questions: {e}", None
+            return "No questions found.", None
+        print(f"Processing {len(questions_data)} questions...")
     except Exception as e:
-        …
-        return f"An unexpected error occurred fetching questions: {e}", None
+        return f"Error fetching questions: {e}", None
 
-    # …
+    # Process questions with progress tracking
     results_log = []
     answers_payload = []
-    …
+    total_start = time.time()
 
     for i, item in enumerate(questions_data):
         task_id = item.get("task_id")
         question_text = item.get("question")
 
         if not task_id or question_text is None:
-            print(f"Skipping item with missing task_id or question: {item}")
             continue
 
         try:
-            print(f"…
+            print(f"[{i+1}/{len(questions_data)}] Processing {task_id}...")
             submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({
                 "Task ID": task_id,
-                "Question": question_text[:…
-                "…
+                "Question": question_text[:80] + "..." if len(question_text) > 80 else question_text,
+                "Answer": submitted_answer[:100] + "..." if len(submitted_answer) > 100 else submitted_answer
             })
 
-            # …
-            if i % …
+            # Memory cleanup every few questions
+            if i % 3 == 0:
                 gc.collect()
 
         except Exception as e:
-            …
-            error_answer = f"AGENT ERROR: {str(e)}"
+            error_answer = f"ERROR: {str(e)}"
             answers_payload.append({"task_id": task_id, "submitted_answer": error_answer})
             results_log.append({
                 "Task ID": task_id,
-                "Question": question_text[:…
-                "…
+                "Question": question_text[:80] + "..." if len(question_text) > 80 else question_text,
+                "Answer": error_answer
             })
 
+    total_time = time.time() - total_start
+    print(f"All questions processed in {total_time:.1f} seconds")
+
     if not answers_payload:
-        …
-        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+        return "No answers generated.", pd.DataFrame(results_log)
 
-    # …
+    # Submit results
     submission_data = {
         "username": username.strip(),
         "agent_code": agent_code,
         "answers": answers_payload
     }
-    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
-    print(status_update)
 
-    # Submit…
-    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
-        response = requests.post(submit_url, json=submission_data, timeout=…
+        response = requests.post(submit_url, json=submission_data, timeout=60)
         response.raise_for_status()
         result_data = response.json()
+
         final_status = (
-            f"Submission Successful!\n"
+            f"✅ Submission Successful!\n"
             f"User: {result_data.get('username')}\n"
-            f"…
+            f"Score: {result_data.get('score', 'N/A')}% "
             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
-            f"…
+            f"Processing Time: {total_time:.1f}s\n"
+            f"Message: {result_data.get('message', 'No message')}"
         )
-        …
+
         results_df = pd.DataFrame(results_log)
         return final_status, results_df
-    …
-        error_detail = f"Server responded with status {e.response.status_code}."
-        try:
-            error_json = e.response.json()
-            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
-        except requests.exceptions.JSONDecodeError:
-            error_detail += f" Response: {e.response.text[:500]}"
-        status_message = f"Submission Failed: {error_detail}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
+
     except Exception as e:
-        …
-        print(status_message)
+        error_msg = f"❌ Submission Failed: {str(e)}"
         results_df = pd.DataFrame(results_log)
-        return …
+        return error_msg, results_df
 
 # --- Gradio Interface ---
-with gr.Blocks(title="GAIA Agent…
-    gr.Markdown("# GAIA Agent Evaluation…
+with gr.Blocks(title="GAIA Agent - Fast Mode") as demo:
+    gr.Markdown("# 🚀 GAIA Agent Evaluation (Optimized)")
     gr.Markdown(
         """
-        **…
-        …
-        …
-        …
+        **Fast Mode Optimizations:**
+        - Reduced max steps: 4 per question
+        - Shorter token generation: 128 tokens max
+        - 30s timeout per question
+        - Aggressive memory management
 
-        **…
-        - Model: Phi-3-mini-4k-instruct (CPU optimized)
-        - Tools: Web Search, Calculator, PDF Reader, Webpage Reader
-        - Max Steps: 6 per question
-        - Memory: Optimized for 2vCPU/16GB environment
+        **Usage:** Login → Click Run → View Results
         """
     )
 
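A small readability note on this hunk: the new `results_log` entries repeat the inline truncation conditional (`text[:80] + "..." if len(text) > 80 else text`) four times. A tiny helper, shown here only as a possible follow-up refactor, would centralize it:

```python
def clip(text: str, limit: int) -> str:
    """Clip text to `limit` characters, appending an ellipsis when clipped."""
    return text[:limit] + "..." if len(text) > limit else text

# Equivalent to the inline conditionals in the loop above:
row = {
    "Task ID": "task-001",
    "Question": clip("What's the population density of France?", 80),
    "Answer": clip("105.32 people/km²", 100),
}
print(row)
```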
@@ -420,19 +361,19 @@ with gr.Blocks(title="GAIA Agent Evaluation") as demo:
     gr.LoginButton()
 
     with gr.Row():
-        run_button = gr.Button("Run Evaluation…
+        run_button = gr.Button("🏃‍♂️ Run Fast Evaluation", variant="primary", size="lg")
 
     with gr.Row():
         status_output = gr.Textbox(
-            label="…
-            lines=…
+            label="📊 Status & Results",
+            lines=6,
             interactive=False,
-            placeholder="…
+            placeholder="Ready to run evaluation..."
         )
 
     with gr.Row():
         results_table = gr.DataFrame(
-            label="Questions…
+            label="📝 Questions & Answers",
             wrap=True,
             interactive=False
         )
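The hunk that wires the button to the runner falls between this region and the next one (the lone `)` opening the next hunk is its tail). For context, the standard wiring from the course template looks roughly like this (assumed; not visible in this diff). Gradio injects the OAuth profile automatically when the handler declares a `gr.OAuthProfile` parameter and a `gr.LoginButton()` is present:

```python
# Assumed wiring between run_button and the evaluation runner.
run_button.click(
    fn=run_and_submit_all,
    outputs=[status_output, results_table]
)
```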
@@ -444,27 +385,12 @@ with gr.Blocks(title="GAIA Agent Evaluation") as demo:
     )
 
 if __name__ == "__main__":
-    print("…
-    print("…
-    print("…
-
-    space_host = os.getenv("SPACE_HOST")
-    space_id = os.getenv("SPACE_ID")
-
-    if space_host:
-        print(f"✅ SPACE_HOST found: {space_host}")
-    else:
-        print("⚠️ SPACE_HOST not found")
-
-    if space_id:
-        print(f"✅ SPACE_ID found: {space_id}")
-    else:
-        print("⚠️ SPACE_ID not found")
+    print("🚀 GAIA Agent Fast Mode Starting...")
+    print(f"⚙️ Max Steps: {MAX_STEPS}, Max Tokens: {MAX_TOKENS}")
+    print(f"⏱️ Timeout per question: {TIMEOUT_PER_QUESTION}s")
 
-    print("="*50)
-    print("Launching Gradio Interface...")
     demo.launch(
-        debug=False,
+        debug=False,
         share=False,
         server_name="0.0.0.0",
         server_port=7860,
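A closing note on the generation path: `_build_prompt` hand-rolls Phi-3's `<|system|>` / `<|user|>` / `<|assistant|>` markup. The tokenizer loaded above ships the same template, so an equivalent and less brittle construction would go through `apply_chat_template`. A sketch, assuming the `tokenizer` from this file (whether this checkpoint's template accepts a system turn varies by revision, so only a user message is shown):

```python
messages = [
    {"role": "user", "content": "Question: What is 2 + 2?"},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return the formatted string, not token ids
    add_generation_prompt=True,  # append the assistant turn opener
)
# Yields Phi-3-style markup, roughly "<|user|>\n...<|end|>\n<|assistant|>\n"
print(prompt)
```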