LamiaYT committed on
Commit
26e4907
·
1 Parent(s): c549c70

Initial commit with LlamaIndex-based agent

Browse files
Files changed (1) hide show
  1. app.py +319 -343
app.py CHANGED
@@ -1,14 +1,17 @@
1
- # app.py
2
  from llama_index.llms.huggingface import HuggingFaceLLM
3
  from llama_index.core.agent import ReActAgent
4
  from llama_index.core.tools import FunctionTool
5
- from transformers import AutoTokenizer
6
  import os
7
  import gradio as gr
8
  import requests
9
  import pandas as pd
10
  import traceback
11
  import torch
 
 
 
 
12
 
13
  # Import real tool dependencies
14
  try:
@@ -27,262 +30,317 @@ except ImportError:
27
 
28
  # --- Constants ---
29
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
30
 
31
  # --- Advanced Agent Definition ---
32
  class SmartAgent:
33
  def __init__(self):
34
- print("Initializing Local LLM Agent...")
 
35
 
36
- # Check available memory and CUDA
37
- if torch.cuda.is_available():
38
- print(f"CUDA available. GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
39
- else:
40
- print("CUDA not available, using CPU")
41
-
42
- # Use a smaller, more efficient model for Hugging Face Spaces
43
  model_options = [
44
- "microsoft/DialoGPT-medium", # Much smaller, works well for chat
45
- "google/flan-t5-base", # Good for reasoning tasks
46
- "microsoft/DialoGPT-small", # Smallest fallback
47
- "HuggingFaceH4/zephyr-7b-beta" # Original (may fail in limited memory)
48
  ]
49
 
50
- model_name = model_options[1] # Start with flan-t5-base
51
- print(f"Attempting to load model: {model_name}")
 
 
 
52
 
53
- try:
54
- # Initialize with memory-efficient settings
55
- self.llm = HuggingFaceLLM(
56
- model_name=model_name,
57
- tokenizer_name=model_name,
58
- context_window=1024, # Increased for better reasoning
59
- max_new_tokens=256, # Increased for better responses
60
- generate_kwargs={
61
- "temperature": 0.3, # Lower temperature for more focused responses
62
- "do_sample": True,
63
- "top_p": 0.9,
64
- "repetition_penalty": 1.1
65
- },
66
- device_map="auto",
67
- # Add memory optimization parameters
68
- model_kwargs={
69
- "torch_dtype": torch.float16, # Use half precision
70
- "low_cpu_mem_usage": True,
71
- "load_in_8bit": True, # Enable 8-bit quantization if available
72
- },
73
- # Add system message for better instruction following
74
- system_message="You are a helpful AI assistant that can search the web and perform calculations. Always provide detailed, accurate answers."
75
- )
76
- print(f"Successfully loaded model: {model_name}")
77
-
78
- except Exception as e:
79
- print(f"Failed to load {model_name}: {e}")
80
- # Fallback to an even smaller model
81
- try:
82
- fallback_model = "microsoft/DialoGPT-small"
83
- print(f"Falling back to: {fallback_model}")
84
- self.llm = HuggingFaceLLM(
85
- model_name=fallback_model,
86
- tokenizer_name=fallback_model,
87
- context_window=256,
88
- max_new_tokens=64,
89
- generate_kwargs={"temperature": 0.7, "do_sample": True},
90
- device_map="cpu", # Force CPU to avoid memory issues
91
- model_kwargs={"low_cpu_mem_usage": True}
92
- )
93
- print(f"Successfully loaded fallback model: {fallback_model}")
94
- except Exception as e2:
95
- print(f"Flan-T5 also failed: {e2}")
96
- # Try an even more basic approach with a very small model
97
- try:
98
- basic_model = "microsoft/DialoGPT-small"
99
- print(f"Final fallback to: {basic_model}")
100
- self.llm = HuggingFaceLLM(
101
- model_name=basic_model,
102
- tokenizer_name=basic_model,
103
- context_window=512,
104
- max_new_tokens=128,
105
- generate_kwargs={"temperature": 0.3, "do_sample": True},
106
- device_map="cpu", # Force CPU to avoid memory issues
107
- model_kwargs={"low_cpu_mem_usage": True}
108
- )
109
- print(f"Successfully loaded final fallback: {basic_model}")
110
- except Exception as e3:
111
- print(f"All model loading attempts failed: {e3}")
112
- raise Exception("Unable to load any language model")
113
 
114
- # Define tools with real implementations
115
  self.tools = [
116
  FunctionTool.from_defaults(
117
- fn=self.web_search,
118
  name="web_search",
119
- description="Searches the web for current information using DuckDuckGo when questions require up-to-date knowledge"
120
  ),
121
  FunctionTool.from_defaults(
122
- fn=self.math_calculator,
123
  name="math_calculator",
124
- description="Performs mathematical calculations and symbolic math using SymPy when questions involve numbers or equations"
125
  )
126
  ]
127
 
128
- # Create ReAct agent with tools
129
  try:
130
  self.agent = ReActAgent.from_tools(
131
  tools=self.tools,
132
  llm=self.llm,
133
  verbose=True,
134
- max_iterations=3 # Limit iterations to prevent infinite loops
 
 
 
 
135
  )
136
- print("Local LLM Agent initialized successfully.")
137
  except Exception as e:
138
- print(f"Error creating ReAct agent: {e}")
139
- # Create a simple fallback agent
140
  self.agent = None
141
- print("Using fallback direct tool calling approach")
142
 
143
- def web_search(self, query: str) -> str:
144
- """Real web search using DuckDuckGo"""
145
- print(f"Web search triggered for: {query[:50]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  if not DDGS:
148
- return "Web search unavailable - duckduckgo_search not installed"
149
 
150
  try:
151
  with DDGS() as ddgs:
152
- results = list(ddgs.text(query, max_results=5)) # Get more results
153
- if results:
154
- formatted_results = []
155
- for i, r in enumerate(results, 1):
156
- title = r.get('title', 'No title')
157
- body = r.get('body', 'No description')[:300] # More context
158
- url = r.get('href', '')
159
- formatted_results.append(f"{i}. **{title}**\n{body}...\nSource: {url}")
 
 
 
 
160
 
161
- return "\n\n".join(formatted_results)
162
- else:
163
- return f"No search results found for '{query}'. Try rephrasing your search terms."
 
 
 
 
 
 
 
 
164
  except Exception as e:
165
- print(f"Web search error: {e}")
166
- return f"Error during web search for '{query}': {str(e)}"
167
 
168
- def math_calculator(self, expression: str) -> str:
169
- """Safe math evaluation using SymPy"""
170
- print(f"Math calculation triggered for: {expression}")
 
171
 
172
- if not sympify:
173
- # Fallback to basic eval with safety checks
174
- try:
175
- # Only allow basic math operations
176
- allowed_chars = set('0123456789+-*/().^ ')
177
- if not all(c in allowed_chars for c in expression.replace(' ', '')):
178
- return "Error: Only basic math operations are allowed"
179
- result = eval(expression.replace('^', '**'))
180
- return str(result)
181
- except Exception as e:
182
- return f"Error: Could not evaluate the mathematical expression - {str(e)}"
183
 
184
- try:
185
- # Use SymPy for safe evaluation
186
- result = sympify(expression).evalf()
187
- return str(result)
188
- except SympifyError as e:
189
- return f"Error: Could not parse the mathematical expression - {str(e)}"
190
- except Exception as e:
191
- return f"Error: Calculation failed - {str(e)}"
 
 
 
 
192
 
193
- def __call__(self, question: str) -> str:
194
- print(f"Processing question (first 50 chars): {question[:50]}...")
 
195
 
196
- # Enhanced reasoning approach
197
- question_lower = question.lower()
198
 
199
- # Check if we need to analyze files
200
- if any(word in question_lower for word in ['file', 'excel', 'csv', 'spreadsheet', 'data', 'attached']):
201
- return "I cannot access attached files in this environment. Please ensure the file is accessible via a direct URL or describe the data content directly in your question."
 
 
 
 
202
 
203
- # Check if we need web search
204
- needs_web_search = any(word in question_lower for word in [
205
- 'current', 'latest', 'recent', 'today', 'news', 'who is', 'what is',
206
- 'competition', 'winner', 'recipient', 'nationality', 'country',
207
- 'malko', 'century', 'award', 'born', 'died'
208
- ])
209
 
210
- # Check if we need math calculation
211
- needs_calculation = any(word in question_lower for word in [
212
- 'calculate', 'compute', 'sum', 'total', 'average', 'percentage',
213
- 'equation', 'solve', 'math', 'number'
214
- ]) or any(char in question for char in '+-*/=()0123456789')
215
 
216
  try:
217
- if self.agent:
218
- # Try using the ReAct agent first
219
- response = self.agent.query(question)
220
- response_str = str(response)
221
-
222
- # Check if the response is too short or nonsensical
223
- if len(response_str.strip()) < 3 or response_str.strip() in ['!', '?', 'what', 'I', 'The', 'A']:
224
- print("Agent gave a poor response, trying direct tool approach...")
225
- return self._direct_tool_approach(question, needs_web_search, needs_calculation)
226
-
227
- return response_str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  else:
229
- # Use direct tool approach
230
- return self._direct_tool_approach(question, needs_web_search, needs_calculation)
231
 
232
  except Exception as e:
233
- print(f"Agent error: {str(e)}")
234
- print(f"Full traceback: {traceback.format_exc()}")
235
- # Try direct tool approach as fallback
236
- try:
237
- return self._direct_tool_approach(question, needs_web_search, needs_calculation)
238
- except:
239
- return f"I apologize, but I'm having technical difficulties processing your question. The question appears to be: {question[:100]}..."
240
-
241
- def _direct_tool_approach(self, question: str, needs_web_search: bool, needs_calculation: bool) -> str:
242
- """Direct tool usage when agent fails"""
 
 
 
243
 
244
- if needs_web_search:
245
- # Extract key search terms
246
- search_terms = []
247
- important_words = question.split()
248
- for word in important_words:
249
- if len(word) > 3 and word.lower() not in ['what', 'when', 'where', 'who', 'how', 'the', 'and', 'or', 'but', 'from', 'with']:
250
- search_terms.append(word)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- search_query = ' '.join(search_terms[:5]) # Limit to 5 key terms
253
- print(f"Performing web search for: {search_query}")
254
 
255
- search_result = self.web_search(search_query)
256
- return f"Based on my web search for '{search_query}':\n\n{search_result}\n\nPlease review the search results above to find the specific information you're looking for."
 
 
 
257
 
258
- if needs_calculation:
259
- # Try to extract mathematical expressions
260
- import re
261
- # Look for mathematical expressions
262
- math_patterns = re.findall(r'[\d+\-*/().\s]+', question)
263
- for pattern in math_patterns:
264
- if any(char in pattern for char in '+-*/') and any(char.isdigit() for char in pattern):
265
- result = self.math_calculator(pattern.strip())
266
- return f"Mathematical calculation result: {result}"
267
 
268
- # Default response with better reasoning
269
- return f"I understand you're asking about: {question[:150]}... However, I need more specific information or context to provide an accurate answer. Could you please rephrase your question or provide additional details?"
 
 
 
 
 
 
 
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
- # --- Memory cleanup function ---
273
- def cleanup_memory():
274
- """Clean up GPU memory"""
275
- if torch.cuda.is_available():
276
- torch.cuda.empty_cache()
277
- print("GPU memory cleared")
278
 
279
 
280
  # --- Submission Logic ---
281
  def run_and_submit_all(profile: gr.OAuthProfile | None):
282
- """
283
- Fetches all questions, runs the agent on them, submits all answers,
284
- and displays the results.
285
- """
286
  space_id = os.getenv("SPACE_ID")
287
 
288
  if profile:
@@ -296,15 +354,11 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
296
  questions_url = f"{api_url}/questions"
297
  submit_url = f"{api_url}/submit"
298
 
299
- # Clean memory before starting
300
- cleanup_memory()
301
-
302
- # Instantiate Agent
303
  try:
304
  agent = SmartAgent()
305
  except Exception as e:
306
- print(f"Error instantiating agent: {e}")
307
- print(f"Full traceback: {traceback.format_exc()}")
308
  return f"Error initializing agent: {e}", None
309
 
310
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
@@ -317,197 +371,119 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
317
  response.raise_for_status()
318
  questions_data = response.json()
319
  if not questions_data:
320
- print("Fetched questions list is empty.")
321
- return "Fetched questions list is empty or invalid format.", None
322
  print(f"Fetched {len(questions_data)} questions.")
323
- except requests.exceptions.RequestException as e:
324
- print(f"Error fetching questions: {e}")
325
- return f"Error fetching questions: {e}", None
326
- except requests.exceptions.JSONDecodeError as e:
327
- print(f"Error decoding JSON response from questions endpoint: {e}")
328
- return f"Error decoding server response for questions: {e}", None
329
  except Exception as e:
330
- print(f"An unexpected error occurred fetching questions: {e}")
331
- return f"An unexpected error occurred fetching questions: {e}", None
332
 
333
- # Run Agent on all questions
334
  results_log = []
335
  answers_payload = []
336
- print(f"Running agent on {len(questions_data)} questions...")
337
 
338
  for i, item in enumerate(questions_data, 1):
339
  task_id = item.get("task_id")
340
- question_text = item.get("question")
341
 
342
- if not task_id or question_text is None:
343
- print(f"Skipping item with missing task_id or question: {item}")
344
  continue
345
 
346
- print(f"Processing question {i}/{len(questions_data)}: {task_id}")
347
 
348
  try:
349
- submitted_answer = agent(question_text)
350
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
 
 
351
  results_log.append({
352
- "Task ID": task_id,
353
- "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
354
- "Submitted Answer": submitted_answer[:200] + "..." if len(submitted_answer) > 200 else submitted_answer
355
  })
356
- print(f"✅ Completed question {i}: {task_id}")
357
 
358
- # Clean memory after each question
359
- if i % 5 == 0: # Every 5 questions
360
- cleanup_memory()
361
 
362
  except Exception as e:
363
- print(f"Error running agent on task {task_id}: {e}")
364
- error_answer = f"AGENT ERROR: {str(e)}"
365
- answers_payload.append({"task_id": task_id, "submitted_answer": error_answer})
 
 
366
  results_log.append({
367
- "Task ID": task_id,
368
- "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
369
- "Submitted Answer": error_answer
370
  })
371
 
372
- if not answers_payload:
373
- print("Agent did not produce any answers to submit.")
374
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
375
-
376
- # Prepare submission
377
  submission_data = {
378
- "username": username.strip(),
379
- "agent_code": agent_code,
380
  "answers": answers_payload
381
  }
382
 
383
- status_update = f"Agent finished processing. Submitting {len(answers_payload)} answers for user '{username}'..."
384
- print(status_update)
385
-
386
- # Submit answers
387
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
388
  try:
389
  response = requests.post(submit_url, json=submission_data, timeout=60)
390
  response.raise_for_status()
391
- result_data = response.json()
392
 
393
- final_status = (
394
- f"🎉 Submission Successful!\n\n"
395
- f"User: {result_data.get('username')}\n"
396
- f"Overall Score: {result_data.get('score', 'N/A')}% "
397
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
398
- f"Message: {result_data.get('message', 'No message received.')}"
399
  )
400
- print("✅ Submission successful!")
401
- results_df = pd.DataFrame(results_log)
402
- return final_status, results_df
403
-
404
- except requests.exceptions.HTTPError as e:
405
- error_detail = f"Server responded with status {e.response.status_code}."
406
- try:
407
- error_json = e.response.json()
408
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
409
- except requests.exceptions.JSONDecodeError:
410
- error_detail += f" Response: {e.response.text[:500]}"
411
- status_message = f"❌ Submission Failed: {error_detail}"
412
- print(status_message)
413
- results_df = pd.DataFrame(results_log)
414
- return status_message, results_df
415
-
416
- except requests.exceptions.Timeout:
417
- status_message = "❌ Submission Failed: The request timed out."
418
- print(status_message)
419
- results_df = pd.DataFrame(results_log)
420
- return status_message, results_df
421
-
422
- except requests.exceptions.RequestException as e:
423
- status_message = f"❌ Submission Failed: Network error - {e}"
424
- print(status_message)
425
- results_df = pd.DataFrame(results_log)
426
- return status_message, results_df
427
 
428
  except Exception as e:
429
- status_message = f"❌ An unexpected error occurred during submission: {e}"
430
- print(status_message)
431
- results_df = pd.DataFrame(results_log)
432
- return status_message, results_df
433
 
434
 
435
  # --- Gradio UI ---
436
  with gr.Blocks(title="Local LLM Agent Evaluation") as demo:
437
- gr.Markdown("# 🤖 Local LLM Agent Evaluation Runner")
438
- gr.Markdown(
439
- """
440
- **Instructions:**
441
- 1. 🔐 Log in to your Hugging Face account using the button below
442
- 2. 🚀 Click 'Run Evaluation & Submit All Answers'
443
- 3. ⏳ Wait for the local LLM to process all questions (using memory-optimized smaller model)
444
- 4. 📊 View your results and submission status
445
-
446
- **Features:**
447
- - 🔍 Real web search using DuckDuckGo
448
- - 🧮 Advanced math calculations with SymPy
449
- - 🧠 Memory-optimized language model with fallback options
450
- - 🛡️ Error handling and recovery mechanisms
451
- """
452
- )
453
-
454
  with gr.Row():
455
  gr.LoginButton()
456
 
457
- with gr.Row():
458
- run_button = gr.Button(
459
- "🚀 Run Evaluation & Submit All Answers",
460
- variant="primary",
461
- size="lg"
462
- )
463
 
464
- status_output = gr.Textbox(
465
- label="📋 Run Status / Submission Result",
466
- lines=8,
467
- interactive=False,
468
- placeholder="Click the button above to start the evaluation..."
469
  )
470
 
471
  results_table = gr.DataFrame(
472
- label="📊 Questions and Agent Answers",
473
- wrap=True,
474
- interactive=False
475
  )
476
 
477
- # Wire up the button
478
- run_button.click(
479
  fn=run_and_submit_all,
480
- outputs=[status_output, results_table]
481
  )
482
 
483
 
484
  if __name__ == "__main__":
485
  print("\n" + "="*60)
486
- print("🚀 Application Startup at", pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"))
 
487
  print("="*60)
488
 
489
- space_host_startup = os.getenv("SPACE_HOST")
490
- space_id_startup = os.getenv("SPACE_ID")
491
-
492
- if space_host_startup:
493
- print(f"✅ SPACE_HOST found: {space_host_startup}")
494
- print(f" Runtime URL should be: https://{space_host_startup}")
495
- else:
496
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
497
-
498
- if space_id_startup:
499
- print(f"✅ SPACE_ID found: {space_id_startup}")
500
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
501
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
502
- else:
503
- print("ℹ️ SPACE_ID environment variable not found (running locally?).")
504
-
505
- print("-" * 60)
506
- print("🎯 Launching Gradio Interface for Local LLM Agent Evaluation...")
507
-
508
- # Launch without share=True for Hugging Face Spaces
509
  demo.launch(
510
  server_name="0.0.0.0",
511
- server_port=7860,
512
- show_error=True
513
  )
 
 
1
  from llama_index.llms.huggingface import HuggingFaceLLM
2
  from llama_index.core.agent import ReActAgent
3
  from llama_index.core.tools import FunctionTool
4
+ from transformers import AutoTokenizer, pipeline
5
  import os
6
  import gradio as gr
7
  import requests
8
  import pandas as pd
9
  import traceback
10
  import torch
11
+ import re
12
+ import gc
13
+ from typing import List, Dict
14
+ from datetime import datetime
15
 
16
  # Import real tool dependencies
17
  try:
 
30
 
31
  # --- Constants ---
32
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
33
+ MEMORY_LIMIT_GB = 16 # Your system's memory limit
34
 
35
  # --- Advanced Agent Definition ---
36
  class SmartAgent:
37
  def __init__(self):
38
+ print(f"Initializing Local LLM Agent (Memory Limit: {MEMORY_LIMIT_GB}GB)...")
39
+ self.model_loaded = False
40
 
41
+ # Model options sorted by capability (name, approx size in GB, quantization)
 
 
 
 
 
 
42
  model_options = [
43
+ ("google/flan-t5-large", 3, "8-bit"), # Best balance for 16GB
44
+ ("google/flan-t5-base", 1, "8-bit"), # Smaller fallback
45
+ ("facebook/opt-1.3b", 2.5, "8-bit") # Alternative option
 
46
  ]
47
 
48
+ # Try loading models until success
49
+ for model_name, size_gb, quantization in model_options:
50
+ if size_gb <= MEMORY_LIMIT_GB and self._try_load_model(model_name, quantization):
51
+ self.model_loaded = True
52
+ break
53
 
54
+ if not self.model_loaded:
55
+ raise RuntimeError("Failed to load any suitable model within memory constraints")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ # Initialize tools with enhanced implementations
58
  self.tools = [
59
  FunctionTool.from_defaults(
60
+ fn=self.smart_web_search,
61
  name="web_search",
62
+ description="Searches the web for current information. Use for questions about recent events, people, or facts not in the model's training data."
63
  ),
64
  FunctionTool.from_defaults(
65
+ fn=self.robust_math_calculator,
66
  name="math_calculator",
67
+ description="Solves mathematical expressions and equations. Use for calculations, arithmetic, algebra, or numerical problems."
68
  )
69
  ]
70
 
71
+ # Initialize ReAct agent with memory optimization
72
  try:
73
  self.agent = ReActAgent.from_tools(
74
  tools=self.tools,
75
  llm=self.llm,
76
  verbose=True,
77
+ max_iterations=4,
78
+ react_context="""Think step by step. Use tools when needed:
79
+ - For current/recent information: web_search
80
+ - For calculations: math_calculator
81
+ - Be concise but accurate"""
82
  )
83
+ print("ReAct Agent initialized successfully")
84
  except Exception as e:
85
+ print(f"ReAct Agent init failed: {e}")
 
86
  self.agent = None
 
87
 
88
+ def _try_load_model(self, model_name: str, quantization: str) -> bool:
89
+ """Attempt to load model with memory constraints"""
90
+ try:
91
+ print(f"Loading {model_name} with {quantization} quantization...")
92
+
93
+ model_kwargs = {
94
+ "torch_dtype": torch.float16,
95
+ "low_cpu_mem_usage": True,
96
+ }
97
+
98
+ if quantization == "8-bit":
99
+ model_kwargs["load_in_8bit"] = True
100
+ elif quantization == "4-bit":
101
+ model_kwargs["load_in_4bit"] = True
102
+
103
+ self.llm = HuggingFaceLLM(
104
+ model_name=model_name,
105
+ tokenizer_name=model_name,
106
+ context_window=2048,
107
+ max_new_tokens=256,
108
+ generate_kwargs={
109
+ "temperature": 0.4,
110
+ "do_sample": True,
111
+ "top_p": 0.9,
112
+ "repetition_penalty": 1.1
113
+ },
114
+ device_map="auto" if torch.cuda.is_available() else "cpu",
115
+ model_kwargs=model_kwargs
116
+ )
117
+
118
+ # Test the model
119
+ test_response = self.llm.complete("Test response:")
120
+ if not test_response:
121
+ raise ValueError("Model failed test response")
122
+
123
+ print(f"Successfully loaded {model_name}")
124
+ return True
125
+
126
+ except Exception as e:
127
+ print(f"Failed to load {model_name}: {str(e)}")
128
+ self.cleanup_memory()
129
+ return False
130
+
131
+ def smart_web_search(self, query: str) -> str:
132
+ """Enhanced web search with focused results"""
133
+ print(f"Searching: {query[:60]}...")
134
 
135
  if not DDGS:
136
+ return "Web search unavailable (duckduckgo_search not installed)"
137
 
138
  try:
139
  with DDGS() as ddgs:
140
+ # Get focused results with longer snippets
141
+ results = list(ddgs.text(query, max_results=3))
142
+
143
+ if not results:
144
+ return "No results found"
145
+
146
+ # Process results for key information
147
+ processed = []
148
+ for i, res in enumerate(results, 1):
149
+ title = res.get('title', 'No title')
150
+ body = res.get('body', 'No description')
151
+ url = res.get('href', '')
152
 
153
+ # Extract most relevant part for the query
154
+ key_info = self._extract_relevant_info(query, body)
155
+
156
+ processed.append(
157
+ f"🔍 Result {i}:\n"
158
+ f"Title: {title}\n"
159
+ f"Info: {key_info[:250]}\n"
160
+ f"Source: {url}\n"
161
+ )
162
+
163
+ return "\n".join(processed)
164
  except Exception as e:
165
+ return f"Search error: {str(e)}"
 
166
 
167
+ def _extract_relevant_info(self, query: str, text: str) -> str:
168
+ """Extract the most relevant portion of text for the query"""
169
+ query_lower = query.lower()
170
+ text_lower = text.lower()
171
 
172
+ # Handle different question types
173
+ if any(w in query_lower for w in ['who is', 'biography', 'born']):
174
+ # Look for birth/death info
175
+ match = re.search(r"(born [^.]+? in [^.]+?\.)", text, re.I)
176
+ return match.group(1) if match else text[:250]
 
 
 
 
 
 
177
 
178
+ elif any(w in query_lower for w in ['died', 'death']):
179
+ match = re.search(r"(died [^.]+?\.)", text, re.I)
180
+ return match.group(1) if match else text[:250]
181
+
182
+ elif any(w in query_lower for w in ['award', 'prize', 'won']):
183
+ match = re.search(r"(awarded [^.]+? in [^.]+?\.)", text, re.I)
184
+ return match.group(1) if match else text[:250]
185
+
186
+ # Default: return first 250 chars with important sentences
187
+ sentences = re.split(r'(?<=[.!?]) +', text)
188
+ important = [s for s in sentences if any(w in s.lower() for w in query.lower().split())]
189
+ return " ".join(important[:3]) if important else text[:250]
190
 
191
+ def robust_math_calculator(self, expression: str) -> str:
192
+ """Improved math calculator with better parsing"""
193
+ print(f"Calculating: {expression}")
194
 
195
+ # Clean and preprocess the expression
196
+ expr = expression.strip("'\"")
197
 
198
+ # Replace words with operators
199
+ replacements = {
200
+ 'plus': '+', 'minus': '-', 'times': '*', 'divided by': '/',
201
+ '^': '**', 'percent': '/100', 'modulo': '%'
202
+ }
203
+ for word, op in replacements.items():
204
+ expr = expr.replace(word, op)
205
 
206
+ # Extract math expression from text
207
+ math_match = re.search(r"([-+]?\d*\.?\d+[+\-*/%^()\s]+\d+\.?\d*)", expr)
208
+ if math_match:
209
+ expr = math_match.group(1)
 
 
210
 
211
+ # Safety check
212
+ allowed_chars = set("0123456789+-*/().%^ ")
213
+ if not all(c in allowed_chars for c in expr.replace(" ", "")):
214
+ return "Error: Invalid characters in expression"
 
215
 
216
  try:
217
+ # Try direct evaluation first
218
+ result = eval(expr)
219
+ return f"Result: {result}"
220
+ except:
221
+ # Fallback to sympy if available
222
+ if sympify:
223
+ try:
224
+ result = sympify(expr).evalf()
225
+ return f"Result: {result}"
226
+ except SympifyError as e:
227
+ return f"Math error: {str(e)}"
228
+ return "Error: Could not evaluate the expression"
229
+
230
+ def __call__(self, question: str) -> str:
231
+ """Main interface for answering questions"""
232
+ print(f"\nQuestion: {question[:100]}...")
233
+
234
+ try:
235
+ # Step 1: Classify question type
236
+ q_type = self._classify_question(question)
237
+
238
+ # Step 2: Use appropriate strategy
239
+ if q_type == "fact":
240
+ return self._answer_fact_question(question)
241
+ elif q_type == "math":
242
+ return self._answer_math_question(question)
243
  else:
244
+ return self._answer_general_question(question)
 
245
 
246
  except Exception as e:
247
+ print(f"Error processing question: {str(e)}")
248
+ return self._fallback_response(question)
249
+
250
+ def _classify_question(self, question: str) -> str:
251
+ """Determine the type of question"""
252
+ q_lower = question.lower()
253
+
254
+ # Math questions
255
+ math_keywords = ['calculate', 'compute', 'sum', 'total', 'average',
256
+ 'percentage', 'equation', 'solve', 'math', 'number',
257
+ '+', '-', '*', '/', '=']
258
+ if any(kw in q_lower for kw in math_keywords):
259
+ return "math"
260
 
261
+ # Fact-based questions
262
+ fact_keywords = ['current', 'latest', 'recent', 'today', 'news',
263
+ 'who is', 'what is', 'when did', 'where is',
264
+ 'competition', 'winner', 'recipient', 'nationality',
265
+ 'country', 'malko', 'century', 'award', 'born', 'died']
266
+ if any(kw in q_lower for kw in fact_keywords):
267
+ return "fact"
268
+
269
+ return "general"
270
+
271
+ def _answer_fact_question(self, question: str) -> str:
272
+ """Handle fact-based questions with web search"""
273
+ # Extract key entities for focused search
274
+ entities = re.findall(r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", question)
275
+ search_query = " ".join(entities[:3]) or question[:50]
276
+
277
+ # Get search results
278
+ search_results = self.smart_web_search(search_query)
279
+
280
+ # Process with LLM if available
281
+ if self.model_loaded:
282
+ prompt = f"""Question: {question}
283
+ Search Results:
284
+ {search_results}
285
 
286
+ Based ONLY on these results, provide a concise answer.
287
+ If the answer isn't there, say so."""
288
 
289
+ try:
290
+ response = self.llm.complete(prompt)
291
+ return str(response).strip()
292
+ except:
293
+ return f"Search results for '{search_query}':\n{search_results}"
294
 
295
+ return f"Search results for '{search_query}':\n{search_results}"
296
+
297
+ def _answer_math_question(self, question: str) -> str:
298
+ """Handle math questions with calculator"""
299
+ # Try to extract math expression
300
+ math_expr = re.search(r"([\d\s+\-*/().^]+)", question)
301
+ if math_expr:
302
+ return self.robust_math_calculator(math_expr.group(1))
 
303
 
304
+ # If no clear expression, use agent reasoning
305
+ if self.agent:
306
+ try:
307
+ response = self.agent.query(question)
308
+ return str(response)
309
+ except:
310
+ return self._fallback_response(question)
311
+
312
+ return self._fallback_response(question)
313
 
314
+ def _answer_general_question(self, question: str) -> str:
315
+ """Handle general knowledge questions"""
316
+ if self.agent:
317
+ try:
318
+ response = self.agent.query(question)
319
+ return str(response)
320
+ except:
321
+ return self._fallback_response(question)
322
+
323
+ # Fallback to simple LLM response
324
+ try:
325
+ response = self.llm.complete(question)
326
+ return str(response)
327
+ except:
328
+ return self._fallback_response(question)
329
+
330
+ def _fallback_response(self, question: str) -> str:
331
+ """Final fallback when all else fails"""
332
+ return f"I couldn't generate a complete answer for: {question[:150]}... Please try rephrasing or ask about something more specific."
333
 
334
+ def cleanup_memory(self):
335
+ """Clean up memory resources"""
336
+ if torch.cuda.is_available():
337
+ torch.cuda.empty_cache()
338
+ gc.collect()
 
339
 
340
 
341
  # --- Submission Logic ---
342
  def run_and_submit_all(profile: gr.OAuthProfile | None):
343
+ """Handle the full evaluation process"""
 
 
 
344
  space_id = os.getenv("SPACE_ID")
345
 
346
  if profile:
 
354
  questions_url = f"{api_url}/questions"
355
  submit_url = f"{api_url}/submit"
356
 
357
+ # Initialize agent with memory management
 
 
 
358
  try:
359
  agent = SmartAgent()
360
  except Exception as e:
361
+ print(f"Agent initialization failed: {e}")
 
362
  return f"Error initializing agent: {e}", None
363
 
364
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
 
371
  response.raise_for_status()
372
  questions_data = response.json()
373
  if not questions_data:
374
+ return "No questions received from server.", None
 
375
  print(f"Fetched {len(questions_data)} questions.")
 
 
 
 
 
 
376
  except Exception as e:
377
+ return f"Error fetching questions: {e}", None
 
378
 
379
+ # Process Questions
380
  results_log = []
381
  answers_payload = []
 
382
 
383
  for i, item in enumerate(questions_data, 1):
384
  task_id = item.get("task_id")
385
+ question = item.get("question")
386
 
387
+ if not task_id or not question:
 
388
  continue
389
 
390
+ print(f"Processing question {i}/{len(questions_data)} (ID: {task_id})")
391
 
392
  try:
393
+ answer = agent(question)
394
+ answers_payload.append({
395
+ "task_id": task_id,
396
+ "submitted_answer": answer[:2000] # Limit answer length
397
+ })
398
  results_log.append({
399
+ "Task ID": task_id,
400
+ "Question": question[:100] + "..." if len(question) > 100 else question,
401
+ "Answer": answer[:200] + "..." if len(answer) > 200 else answer
402
  })
 
403
 
404
+ # Clean memory every 5 questions
405
+ if i % 5 == 0:
406
+ agent.cleanup_memory()
407
 
408
  except Exception as e:
409
+ print(f"Error on question {task_id}: {e}")
410
+ answers_payload.append({
411
+ "task_id": task_id,
412
+ "submitted_answer": f"Error processing question: {str(e)}"
413
+ })
414
  results_log.append({
415
+ "Task ID": task_id,
416
+ "Question": question[:100] + "..." if len(question) > 100 else question,
417
+ "Answer": f"Error: {str(e)}"
418
  })
419
 
420
+ # Submit Answers
 
 
 
 
421
  submission_data = {
422
+ "username": username.strip(),
423
+ "agent_code": agent_code,
424
  "answers": answers_payload
425
  }
426
 
427
+ print(f"Submitting {len(answers_payload)} answers...")
 
 
 
 
428
  try:
429
  response = requests.post(submit_url, json=submission_data, timeout=60)
430
  response.raise_for_status()
431
+ result = response.json()
432
 
433
+ status = (
434
+ f" Submission Successful!\n\n"
435
+ f"User: {result.get('username')}\n"
436
+ f"Score: {result.get('score', 'N/A')}% "
437
+ f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')})\n"
438
+ f"Message: {result.get('message', '')}"
439
  )
440
+ return status, pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
 
442
  except Exception as e:
443
+ error_msg = f"❌ Submission Failed: {str(e)}"
444
+ print(error_msg)
445
+ return error_msg, pd.DataFrame(results_log)
 
446
 
447
 
448
  # --- Gradio UI ---
449
  with gr.Blocks(title="Local LLM Agent Evaluation") as demo:
450
+ gr.Markdown("""
451
+ # � Local LLM Agent Evaluation
452
+ **Run your local agent against the course evaluation questions**
453
+ """)
454
+
 
 
 
 
 
 
 
 
 
 
 
 
455
  with gr.Row():
456
  gr.LoginButton()
457
 
458
+ run_btn = gr.Button(
459
+ "🚀 Run Evaluation & Submit Answers",
460
+ variant="primary"
461
+ )
 
 
462
 
463
+ status_out = gr.Textbox(
464
+ label="📋 Status",
465
+ interactive=False
 
 
466
  )
467
 
468
  results_table = gr.DataFrame(
469
+ label="📊 Results",
470
+ interactive=False,
471
+ wrap=True
472
  )
473
 
474
+ run_btn.click(
 
475
  fn=run_and_submit_all,
476
+ outputs=[status_out, results_table]
477
  )
478
 
479
 
480
  if __name__ == "__main__":
481
  print("\n" + "="*60)
482
+ print(f"🚀 Starting Agent Evaluation - {datetime.now().strftime('%Y-%m-%d %H:%M')}")
483
+ print(f"Memory Limit: {MEMORY_LIMIT_GB}GB")
484
  print("="*60)
485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  demo.launch(
487
  server_name="0.0.0.0",
488
+ server_port=7860
 
489
  )