Final_Assignment_Template

Runtime error

App Files Files Community

Final_Assignment_Template / app.py

LamiaYT

Initial commit with LlamaIndex-based agent

34c5bf3 about 2 months ago

raw

history blame

19.3 kB

	# app.py - Improved GAIA Agent with GPT-NeoX-20B + LoRA
	import ast
	import json
	import operator
	import os
	import re
	import traceback

	import gradio as gr
	import pandas as pd
	import requests
	import torch
	from llama_index.core.agent import ReActAgent
	from llama_index.core.tools import FunctionTool
	from llama_index.llms.huggingface import HuggingFaceLLM
	from peft import LoraConfig, get_peft_model
	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

	# Import real tool dependencies
	try:
	from duckduckgo_search import DDGS
	except ImportError:
	print("Warning: duckduckgo_search not installed. Web search will be limited.")
	DDGS = None

	try:
	from sympy import sympify, solve, simplify, N, symbols
	from sympy.core.sympify import SympifyError
	except ImportError:
	print("Warning: sympy not installed. Math calculator will be limited.")
	sympify = None
	SympifyError = Exception

	# --- Constants ---
	# Base URL of the scoring service. run_and_submit_all derives its
	# "/questions" and "/submit" endpoints from this value.
	DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

	def print_trainable_parameters(model):
	    """Print how many of the model's parameters are trainable.

	    Args:
	        model: Object exposing ``named_parameters()`` that yields
	            ``(name, param)`` pairs, where each ``param`` provides
	            ``numel()`` and a ``requires_grad`` flag.

	    The summary line reports the trainable count, the total count and the
	    trainable percentage. A model without parameters reports 0.00% rather
	    than dividing by zero.
	    """
	    trainable_parameters = 0
	    all_parameters = 0
	    for _, param in model.named_parameters():
	        count = param.numel()
	        all_parameters += count
	        if param.requires_grad:
	            trainable_parameters += count

	    # Guard the ratio: an empty model must not raise ZeroDivisionError.
	    percent = 100 * trainable_parameters / all_parameters if all_parameters else 0.0
	    print(
	        f"Trainable: {trainable_parameters} || All: {all_parameters} || Trainable %: {percent:.2f}%"
	    )

	class ImprovedGAIAAgent:
	    """ReAct agent for GAIA questions backed by 4-bit GPT-NeoX-20B.

	    Construction loads the tokenizer and the quantized model, wraps the
	    model with a LoRA adapter, registers three tools (web search,
	    calculator, fact checker) and builds a LlamaIndex ``ReActAgent``.
	    Instances are callable: ``agent(question)`` returns the answer text and
	    falls back to calling the tools directly when the agent fails.
	    """

	    # Prefixes of the sentinel strings enhanced_web_search returns when it
	    # has nothing useful to report.
	    _SEARCH_FAILURE_PREFIXES = (
	        "No results found",
	        "Search failed",
	        "Web search unavailable",
	    )

	    # Non-capturing group so findall() yields the whole year, not "19"/"20".
	    _YEAR_RE = re.compile(r'\b(?:19|20)\d{2}\b')
	    _NUMBER_RE = re.compile(r'\b\d+\b')

	    # A number directly followed by "(" means multiplication ("2(3+4)").
	    # The lookbehind keeps digits that end an identifier ("log10(") intact.
	    _IMPLICIT_MUL_RE = re.compile(r'(?<![\w.])(\d+(?:\.\d+)?)\s*\(')

	    # Upper bound on exponents accepted by _safe_eval, so an expression such
	    # as 9**9**9 cannot stall the process.
	    _MAX_EXPONENT = 1000

	    # Arithmetic operators the fallback evaluator is allowed to apply.
	    _BINARY_OPS = {
	        ast.Add: operator.add,
	        ast.Sub: operator.sub,
	        ast.Mult: operator.mul,
	        ast.Div: operator.truediv,
	        ast.FloorDiv: operator.floordiv,
	        ast.Mod: operator.mod,
	        ast.Pow: operator.pow,
	    }
	    _UNARY_OPS = {
	        ast.UAdd: operator.pos,
	        ast.USub: operator.neg,
	    }

	    def __init__(self):
	        """Load the model, register the tools and create the agent.

	        Raises:
	            RuntimeError: If no CUDA device is available.
	        """
	        print("🚀 Initializing Improved GAIA Agent with GPT-NeoX-20B...")

	        if not torch.cuda.is_available():
	            raise RuntimeError("❌ CUDA required for GPT-NeoX-20B. Please use a GPU environment.")

	        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
	        print(f"🔥 GPU Memory: {gpu_memory:.1f}GB")

	        # Model configuration
	        self.model_name = "EleutherAI/gpt-neox-20b"

	        # 4-bit quantization config for memory efficiency
	        self.bnb_config = BitsAndBytesConfig(
	            load_in_4bit=True,
	            bnb_4bit_use_double_quant=True,
	            bnb_4bit_quant_type="nf4",
	            bnb_4bit_compute_dtype=torch.bfloat16,
	        )

	        # LoRA configuration for efficient fine-tuning capability
	        self.lora_config = LoraConfig(
	            r=16,
	            lora_alpha=32,
	            target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
	            lora_dropout=0.1,
	            bias="none",
	            task_type="CAUSAL_LM",
	        )

	        self.load_model()
	        self.setup_tools()
	        self.create_agent()

	    def load_model(self):
	        """Load tokenizer and quantized model, then wrap them for LlamaIndex.

	        Sets ``self.tokenizer``, ``self.model`` (with the LoRA adapter
	        applied) and ``self.llm``.
	        """
	        print("📥 Loading tokenizer...")
	        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

	        # Add padding token if not present
	        if self.tokenizer.pad_token is None:
	            self.tokenizer.pad_token = self.tokenizer.eos_token

	        print("📥 Loading model with 4-bit quantization...")
	        # NOTE(review): trust_remote_code=True allows code shipped with the
	        # checkpoint to run; confirm it is actually needed for this model.
	        self.model = AutoModelForCausalLM.from_pretrained(
	            self.model_name,
	            quantization_config=self.bnb_config,
	            device_map="auto",
	            trust_remote_code=True,
	            torch_dtype=torch.bfloat16,
	        )

	        print("🔧 Applying LoRA configuration...")
	        self.model = get_peft_model(self.model, self.lora_config)
	        print_trainable_parameters(self.model)

	        # The prompt names every tool registered in setup_tools so the model
	        # knows fact_checker exists as well.
	        system_prompt = (
	            "You are a helpful AI assistant that can search the web and perform calculations.\n"
	            "When answering questions:\n"
	            "1. Think step by step\n"
	            "2. Use tools when you need current information or calculations\n"
	            "3. Be precise and factual\n"
	            "4. For numerical answers, provide exact numbers when possible\n"
	            "5. Always show your reasoning\n"
	            "\n"
	            "Available tools: web_search, math_calculator, fact_checker"
	        )

	        # Create LlamaIndex LLM wrapper
	        print("🔗 Creating LlamaIndex LLM wrapper...")
	        self.llm = HuggingFaceLLM(
	            model=self.model,
	            tokenizer=self.tokenizer,
	            context_window=2048,
	            max_new_tokens=512,
	            generate_kwargs={
	                "temperature": 0.1,
	                "do_sample": True,
	                "top_p": 0.9,
	                "repetition_penalty": 1.1,
	                "pad_token_id": self.tokenizer.eos_token_id,
	            },
	            # HuggingFaceLLM takes the instruction text as `system_prompt`;
	            # `system_message` is not one of its parameters.
	            system_prompt=system_prompt,
	        )

	    def setup_tools(self):
	        """Register the three tool functions as ``self.tools``."""
	        self.tools = [
	            FunctionTool.from_defaults(
	                fn=self.enhanced_web_search,
	                name="web_search",
	                description="Search the web for current information, facts, people, events, or recent data. Use specific keywords.",
	            ),
	            FunctionTool.from_defaults(
	                fn=self.advanced_calculator,
	                name="math_calculator",
	                description="Perform mathematical calculations, solve equations, handle percentages, averages, and complex math operations.",
	            ),
	            FunctionTool.from_defaults(
	                fn=self.fact_checker,
	                name="fact_checker",
	                description="Verify facts and get detailed information about people, places, events, or concepts.",
	            ),
	        ]

	    @classmethod
	    def _extract_hints(cls, query, results):
	        """Return the "Extracted years/numbers" suffix for a result set.

	        Only question-style queries get hints. Values are de-duplicated in
	        order of first appearance so the output is deterministic.
	        """
	        lowered = query.lower()
	        if not any(key in lowered for key in ('how many', 'when was', 'who is', 'what year')):
	            return ""

	        all_text = " ".join((item.get('body') or '') for item in results)
	        hints = ""

	        if 'when' in lowered or 'what year' in lowered:
	            years = list(dict.fromkeys(cls._YEAR_RE.findall(all_text)))
	            if years:
	                hints += f"\n\nExtracted years: {', '.join(years)}"

	        if 'how many' in lowered:
	            # Slice a list: a set has no order and cannot be sliced.
	            numbers = list(dict.fromkeys(cls._NUMBER_RE.findall(all_text)))[:5]
	            if numbers:
	                hints += f"\n\nExtracted numbers: {', '.join(numbers)}"

	        return hints

	    @classmethod
	    def _format_search_results(cls, query, results):
	        """Render result dicts (``title``/``body``/``href``) as one text block."""
	        entries = []
	        for index, item in enumerate(results, 1):
	            title = item.get('title', 'No title')
	            body = (item.get('body') or '').strip()
	            url = item.get('href', '')

	            # Keep each snippet short so several results fit in the context.
	            if len(body) > 300:
	                body = body[:300] + "..."

	            entries.append(f"Result {index}: {title}\nContent: {body}\nSource: {url}\n")

	        summary = f"Search results for '{query}':\n\n" + "\n".join(entries)
	        return summary + cls._extract_hints(query, results)

	    def enhanced_web_search(self, query: str) -> str:
	        """Search DuckDuckGo and return formatted results.

	        Args:
	            query: Search keywords.

	        Returns:
	            The formatted results, or a message starting with
	            "Web search unavailable", "No results found" or "Search failed"
	            when nothing useful could be retrieved. Never raises.
	        """
	        print(f"🔍 Enhanced search: {query}")

	        if not DDGS:
	            return "Web search unavailable - duckduckgo_search not installed"

	        try:
	            with DDGS() as ddgs:
	                results = list(ddgs.text(query, max_results=8, region='wt-wt'))

	            if not results:
	                return f"No results found for: {query}"

	            return self._format_search_results(query, results)

	        except Exception as e:
	            # Tool boundary: report the failure as text so the agent can react.
	            print(f"❌ Search error: {e}")
	            return f"Search failed: {str(e)}"

	    @classmethod
	    def _normalize_expression(cls, expression):
	        """Rewrite common math notation into Python/SymPy syntax.

	        ``^`` becomes ``**``, ``×`` becomes ``*``, ``÷`` becomes ``/`` and a
	        number immediately followed by ``(`` gets an explicit ``*``.
	        """
	        cleaned = expression.replace('^', '**').replace('×', '*').replace('÷', '/')
	        return cls._IMPLICIT_MUL_RE.sub(r'\1*(', cleaned)

	    @classmethod
	    def _safe_eval(cls, expression):
	        """Evaluate a plain arithmetic expression without ``eval``.

	        The text reaching the calculator comes from the language model and
	        from remotely fetched questions, so it is parsed with ``ast`` and
	        only numeric literals and the operators in ``_BINARY_OPS`` and
	        ``_UNARY_OPS`` are evaluated.

	        Raises:
	            SyntaxError: If the text is not a valid expression.
	            ValueError: If it contains anything but supported arithmetic,
	                or an exponent larger than ``_MAX_EXPONENT``.
	        """
	        def evaluate(node):
	            if isinstance(node, ast.Expression):
	                return evaluate(node.body)
	            if isinstance(node, ast.Constant):
	                value = node.value
	                if isinstance(value, bool) or not isinstance(value, (int, float)):
	                    raise ValueError(f"Unsupported constant: {value!r}")
	                return value
	            if isinstance(node, ast.BinOp) and type(node.op) in cls._BINARY_OPS:
	                left = evaluate(node.left)
	                right = evaluate(node.right)
	                if isinstance(node.op, ast.Pow) and abs(right) > cls._MAX_EXPONENT:
	                    raise ValueError("Exponent too large")
	                return cls._BINARY_OPS[type(node.op)](left, right)
	            if isinstance(node, ast.UnaryOp) and type(node.op) in cls._UNARY_OPS:
	                return cls._UNARY_OPS[type(node.op)](evaluate(node.operand))
	            raise ValueError(f"Unsupported expression element: {type(node).__name__}")

	        return evaluate(ast.parse(expression.strip(), mode="eval"))

	    def advanced_calculator(self, expression: str) -> str:
	        """Evaluate a math expression, symbolically when SymPy is available.

	        Args:
	            expression: Expression text; ``^``, ``×``, ``÷`` and implicit
	                multiplication before ``(`` are accepted.

	        Returns:
	            "Calculation: <expression> = <result>" on success, otherwise
	            "Could not calculate '<expression>': <reason>". Never raises.
	        """
	        print(f"🧮 Advanced calculation: {expression}")

	        try:
	            clean_expr = self._normalize_expression(expression)

	            if sympify:
	                try:
	                    # NOTE(review): sympify() parses by evaluating the string
	                    # itself, so it is not a sandbox for hostile input.
	                    expr = sympify(clean_expr, evaluate=False)
	                    result = simplify(expr)
	                    numerical = N(result, 15)

	                    if result.is_number:
	                        return f"Calculation: {expression} = {numerical}"
	                    return f"Calculation: {expression} = {result} ≈ {numerical}"

	                except SympifyError:
	                    # Fall through to the restricted arithmetic evaluator.
	                    pass

	            result = self._safe_eval(clean_expr)
	            return f"Calculation: {expression} = {result}"

	        except Exception as e:
	            # Tool boundary: report the failure as text so the agent can react.
	            return f"Could not calculate '{expression}': {str(e)}"

	    def fact_checker(self, query: str) -> str:
	        """Search for a claim using two query variations and combine them.

	        Args:
	            query: The fact or entity to look up.

	        Returns:
	            The successful search outputs separated by a rule of "="
	            characters, or "Could not verify facts about: <query>" when no
	            variation produced results.
	        """
	        print(f"✅ Fact checking: {query}")

	        # Only two variations are issued, to avoid rate limiting.
	        search_variations = [query, f"{query} facts"]

	        all_results = []
	        for search_query in search_variations:
	            result = self.enhanced_web_search(search_query)
	            # Skip every failure sentinel, not only the "no results" one.
	            if not result.startswith(self._SEARCH_FAILURE_PREFIXES):
	                all_results.append(f"Search: {search_query}\n{result}")

	        if not all_results:
	            return f"Could not verify facts about: {query}"

	        # Build the separator first; joining on "\n\n" alone would only
	        # prepend the rule once instead of placing it between sections.
	        separator = "\n\n" + "=" * 50 + "\n\n"
	        return separator.join(all_results)

	    def create_agent(self):
	        """Build the ReAct agent from ``self.tools`` and ``self.llm``.

	        Raises:
	            Exception: Re-raises whatever agent construction raised, after
	                printing the traceback.
	        """
	        print("🤖 Creating ReAct agent...")
	        try:
	            self.agent = ReActAgent.from_tools(
	                tools=self.tools,
	                llm=self.llm,
	                verbose=True,
	                max_iterations=5,
	                react_chat_formatter=None,  # Use default formatter
	            )
	            print("✅ ReAct Agent created successfully")
	        except Exception as e:
	            print(f"❌ Agent creation failed: {e}")
	            traceback.print_exc()
	            raise

	    def __call__(self, question: str) -> str:
	        """Answer a question, falling back to direct tool use on failure.

	        Args:
	            question: The question text.

	        Returns:
	            The agent's answer, or the result of ``_direct_approach`` when
	            the agent raises or returns an empty/placeholder answer.
	        """
	        print(f"\n" + "="*60)
	        print(f"🤔 Processing: {question}")
	        print("="*60)

	        try:
	            response = self.agent.query(question)
	            answer = str(response).strip()

	            # Short answers ("42", "Paris") are legitimate; reject only
	            # empty or placeholder output.
	            if not answer or answer.lower() in {'error', 'none', 'unknown'}:
	                print("⚠️ Poor response, trying direct approach...")
	                return self._direct_approach(question)

	            print(f"✅ Agent response: {answer[:200]}...")
	            return answer

	        except Exception as e:
	            print(f"❌ Agent error: {e}")
	            print("🔄 Falling back to direct approach...")
	            return self._direct_approach(question)

	    def _direct_approach(self, question: str) -> str:
	        """Answer by calling the tools directly, routed by keywords.

	        Questions with math markers try the calculator first; if it cannot
	        evaluate the text, routing continues with the search tools instead
	        of returning the calculator's error message as the answer.
	        """
	        question_lower = question.lower()

	        math_terms = ['calculate', 'compute', 'math', '+', '-', '*', '/', '=', 'percentage', 'average']
	        if any(term in question_lower for term in math_terms):
	            math_result = self.advanced_calculator(question)
	            if not math_result.startswith("Could not calculate"):
	                return math_result

	        factual_terms = ['who is', 'when was', 'where is', 'what is', 'how many']
	        if any(term in question_lower for term in factual_terms):
	            search_result = self.enhanced_web_search(question)
	            fact_result = self.fact_checker(question)
	            return f"{search_result}\n\nFact Check:\n{fact_result}"

	        # General approach
	        return self.enhanced_web_search(question)

	def cleanup_memory():
	    """Release cached CUDA memory when a CUDA device is present."""
	    if not torch.cuda.is_available():
	        return
	    torch.cuda.empty_cache()
	    print("🧹 Memory cleaned")

	def run_and_submit_all(profile: gr.OAuthProfile | None):
	    """Run the agent on every question and submit the answers for scoring.

	    Args:
	        profile: The logged-in Hugging Face profile, or None when the user
	            has not logged in.

	    Returns:
	        A ``(status, table)`` pair. ``status`` is the text shown to the
	        user; ``table`` is a DataFrame of per-question results, or None
	        when the run stopped before any question was processed.
	    """
	    def _shorten(text, limit):
	        # Append an ellipsis only when something was actually cut off.
	        return text[:limit] + ("..." if len(text) > limit else "")

	    if not profile:
	        return "❌ Please login to Hugging Face first", None

	    username = profile.username
	    print(f"👤 User: {username}")

	    # API endpoints
	    api_url = DEFAULT_API_URL
	    questions_url = f"{api_url}/questions"
	    submit_url = f"{api_url}/submit"

	    cleanup_memory()

	    # Initialize improved agent
	    try:
	        print("🚀 Initializing Improved GAIA Agent...")
	        agent = ImprovedGAIAAgent()
	        print("✅ Agent initialized successfully")
	    except Exception as e:
	        error_msg = f"❌ Agent initialization failed: {str(e)}\n{traceback.format_exc()}"
	        print(error_msg)
	        return error_msg, None

	    # Link to this Space's code, sent along with the submission.
	    space_id = os.getenv("SPACE_ID", "unknown")
	    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

	    # Fetch questions
	    try:
	        print("📥 Fetching questions...")
	        response = requests.get(questions_url, timeout=30)
	        response.raise_for_status()
	        questions_data = response.json()
	        print(f"📋 Got {len(questions_data)} questions")
	    except Exception as e:
	        return f"❌ Failed to fetch questions: {str(e)}", None

	    if not questions_data:
	        return "❌ No questions received from the scoring API", None

	    # Process all questions
	    results_log = []
	    answers_payload = []

	    print("\n" + "="*50)
	    print("🚀 STARTING GAIA EVALUATION")
	    print("="*50)

	    for i, item in enumerate(questions_data, 1):
	        task_id = item.get("task_id")
	        question_text = item.get("question")

	        if not task_id or not question_text:
	            continue

	        print(f"\n📝 Question {i}/{len(questions_data)}")
	        print(f"🆔 ID: {task_id}")
	        print(f"❓ Question: {question_text}")

	        try:
	            answer = agent(question_text)

	            # Short answers are valid; substitute a placeholder only when
	            # the agent returned nothing at all.
	            if not answer or not answer.strip():
	                answer = f"Unable to determine answer for: {question_text[:100]}..."

	            print(f"✅ Answer: {answer[:200]}...")
	            logged_answer = _shorten(answer, 200)

	            # Memory cleanup every few questions
	            if i % 3 == 0:
	                cleanup_memory()

	        except Exception as e:
	            # Still submit an entry so the task is not silently dropped.
	            print(f"❌ Error processing {task_id}: {e}")
	            answer = f"Processing error: {str(e)[:150]}"
	            logged_answer = answer

	        answers_payload.append({
	            "task_id": task_id,
	            "submitted_answer": answer,
	        })
	        results_log.append({
	            "Task ID": task_id,
	            "Question": _shorten(question_text, 150),
	            "Answer": logged_answer,
	        })

	    if not answers_payload:
	        return "❌ No answers were produced, nothing to submit", pd.DataFrame(results_log)

	    print(f"\n📤 Submitting {len(answers_payload)} answers...")

	    # Submit answers
	    submission_data = {
	        "username": username,
	        "agent_code": agent_code,
	        "answers": answers_payload,
	    }

	    try:
	        response = requests.post(submit_url, json=submission_data, timeout=180)
	        response.raise_for_status()
	        result_data = response.json()

	        score = result_data.get('score', 0)
	        correct = result_data.get('correct_count', 0)
	        total = result_data.get('total_attempted', len(answers_payload))
	        message = result_data.get('message', '')

	        target_note = '🎉 ACHIEVED!' if score >= 30 else '📈 Significant improvement expected!'

	        # Create final status message
	        final_status = "\n".join([
	            "🎉 IMPROVED GAIA EVALUATION COMPLETE!",
	            "",
	            f"👤 User: {username}",
	            "🤖 Model: GPT-NeoX-20B + LoRA + 4-bit Quantization",
	            f"📊 Final Score: {score}%",
	            f"✅ Correct: {correct}/{total}",
	            f"🎯 Target: 30%+ {target_note}",
	            "",
	            f"📝 Message: {message}",
	            "",
	            "🔧 Improvements Made:",
	            "- ✅ Proper causal LM (GPT-NeoX-20B) instead of encoder-decoder",
	            "- ✅ 4-bit quantization for memory efficiency",
	            "- ✅ LoRA for better parameter efficiency",
	            "- ✅ Enhanced tools with fact checking",
	            "- ✅ Better reasoning prompts",
	            "- ✅ Multi-strategy search approach",
	            "",
	        ])

	        print(f"\n🏆 FINAL SCORE: {score}%")
	        return final_status, pd.DataFrame(results_log)

	    except Exception as e:
	        error_msg = f"❌ Submission failed: {str(e)}"
	        print(error_msg)
	        return error_msg, pd.DataFrame(results_log)

	# --- Gradio Interface ---
	# Builds the page: a title and description, a login button, a run button,
	# and two outputs (status text and a results table) filled in by
	# run_and_submit_all.
	with gr.Blocks(title="Improved GAIA Agent", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# 🚀 Improved GAIA Agent - GPT-NeoX-20B + LoRA")
	gr.Markdown("""
	Major Improvements:
	- 🧠 GPT-NeoX-20B: 20B parameter causal language model (vs 220M FLAN-T5)
	- ⚡ 4-bit Quantization: Memory efficient loading with BitsAndBytes
	- 🎯 LoRA: Parameter-efficient fine-tuning ready
	- 🔍 Enhanced Tools: Multi-strategy search + fact checking + advanced math
	- 🤖 Better ReAct: Improved reasoning prompts and error handling
	- 📈 Expected: Significant improvement over 0% baseline

	Requirements: CUDA GPU with 16GB+ VRAM
	""")

	# run_and_submit_all returns early with a login prompt when it receives no
	# profile, so the login button comes before the run button.
	with gr.Row():
	gr.LoginButton()

	with gr.Row():
	run_button = gr.Button(
	"🚀 Run Improved GAIA Evaluation",
	variant="primary",
	size="lg"
	)

	# Read-only text area for the first value returned by run_and_submit_all.
	status_output = gr.Textbox(
	label="📊 Evaluation Results",
	lines=15,
	interactive=False
	)

	# Table for the second value returned by run_and_submit_all (a DataFrame
	# or None).
	results_table = gr.DataFrame(
	label="📝 Detailed Results",
	wrap=True
	)

	# No `inputs` are wired: the handler's `profile` argument is presumably
	# supplied by Gradio from the login session through its gr.OAuthProfile
	# annotation. TODO confirm against the Gradio OAuth documentation.
	run_button.click(
	fn=run_and_submit_all,
	outputs=[status_output, results_table]
	)

	if __name__ == "__main__":
	    # Print the startup banners, then serve the UI on all interfaces at
	    # port 7860 with error display enabled.
	    for banner in (
	        "🚀 Starting Improved GAIA Agent...",
	        "💪 Using GPT-NeoX-20B + LoRA + 4-bit Quantization",
	    ):
	        print(banner)

	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)