leofltt committed
Commit ec8845c · 1 Parent(s): 85d8289
Files changed (1)
  1. app.py +114 -36
app.py CHANGED
@@ -3,20 +3,29 @@ import gradio as gr
 import requests
 import pandas as pd
 import torch
+import base64
+from io import BytesIO

 from llama_index.core.tools import FunctionTool
 from llama_index.llms.huggingface import HuggingFaceLLM
-
-# This import is correct and works when `llama-index` is installed
 from llama_index.core.agent import ReActAgent
 from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
+from llama_index.tools.python_repl import PythonREPLTool
 from youtube_transcript_api import YouTubeTranscriptApi
+from PIL import Image

 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-
+IMAGE_ANALYSIS_API_URL = (
+    "https://api-inference.huggingface.co/models/llava-hf/llava-1.5-7b-hf"
+)

 # --- Helper Functions for Tools ---
+
+# HF_TOKEN must be set as a Space Secret in Hugging Face
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+
 def get_video_transcript(youtube_url: str):
     """Fetches the transcript of a YouTube video given its URL."""
     try:
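
Note: the body of get_video_transcript is collapsed in this hunk; only its try/except frame is visible. For orientation, a typical implementation of such a helper with youtube_transcript_api might look like the sketch below. The URL parsing and the classic get_transcript call are assumptions, not the committed code.

from urllib.parse import parse_qs, urlparse

from youtube_transcript_api import YouTubeTranscriptApi


def get_video_transcript_sketch(youtube_url: str) -> str:
    """Hypothetical helper: fetch a YouTube transcript as plain text."""
    try:
        # Extract the video ID from either youtu.be/<id> or watch?v=<id> URLs.
        parsed = urlparse(youtube_url)
        if parsed.hostname == "youtu.be":
            video_id = parsed.path.lstrip("/")
        else:
            video_id = parse_qs(parsed.query)["v"][0]
        # Classic youtube_transcript_api call; newer releases also expose an
        # instance-based API (YouTubeTranscriptApi().fetch(video_id)).
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(entry["text"] for entry in entries)
    except Exception as e:
        return f"Error fetching transcript: {e}"
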
@@ -30,33 +39,104 @@ def get_video_transcript(youtube_url: str):
         return f"Error fetching transcript: {e}"


+def analyze_image_url(image_url: str, question: str):
+    """Analyzes an image from a URL using the Hugging Face Inference API."""
+    if not HF_TOKEN:
+        return (
+            "Error: Hugging Face token is not set. Cannot use the image analysis tool."
+        )
+
+    try:
+        # Download the image
+        response = requests.get(image_url)
+        response.raise_for_status()
+
+        # Prepare the raw bytes for the Inference API
+        image_bytes = BytesIO(response.content).getvalue()
+
+        # Call the Inference API. llava's chat format is
+        # "USER: <image>\n{question}\nASSISTANT:", but this endpoint accepts
+        # only raw image bytes, with no clean way to attach a text prompt.
+        # So rather than forcing the question into the request, ask for a
+        # general description and let the agent reason over it; that is a
+        # more robust pattern than squeezing a complex prompt into the API.
+        headers = {
+            "Authorization": f"Bearer {HF_TOKEN}",
+            "Content-Type": "image/png",  # specify the content type
+        }
+        response = requests.post(
+            IMAGE_ANALYSIS_API_URL, headers=headers, data=image_bytes
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        generated_text = result[0].get("generated_text", "").strip()
+        final_answer = generated_text.split("ASSISTANT:")[-1].strip()
+
+        # The agent gets the description, then re-evaluates the original question.
+        return f"The image description is: {final_answer}. Now, answer the original question based on this."
+
+    except Exception as e:
+        return f"Error analyzing image: {e}"
+
+
 # --- Tool Definitions ---
 youtube_tool = FunctionTool.from_defaults(
     fn=get_video_transcript,
     name="youtube_transcript_tool",
-    description="Use this tool to get the transcript of a YouTube video. Provide the full YouTube URL.",
+    description="Use this tool to get the transcript of a YouTube video.",
 )
+image_analyzer_tool = FunctionTool.from_defaults(
+    fn=analyze_image_url,
+    name="image_analyzer_tool",
+    description="Use this tool to analyze an image when you are given a URL. Provide both the image URL and the question about the image.",
+)
+python_repl_tool = PythonREPLTool()


 # --- LlamaIndex Agent Definition ---
 class LlamaIndexAgent:
     def __init__(self):
-        print("Initializing LlamaIndexAgent with Tools...")
+        print("Initializing LlamaIndexAgent with Final Tools...")

-        # Initialize the DuckDuckGo search tool
         ddg_spec = DuckDuckGoSearchToolSpec()

-        self.tools = [youtube_tool] + ddg_spec.to_tool_list()
+        self.tools = [
+            youtube_tool,
+            image_analyzer_tool,
+            python_repl_tool,
+        ] + ddg_spec.to_tool_list()

         system_prompt = """
         You are a helpful assistant tasked with answering questions.
-        You have access to a set of tools, including a web search tool and a YouTube video transcriber. Use them if needed to answer the question.
-        When you have the final answer, you MUST use the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
-        YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
-        Follow the formatting rules for numbers and strings as specified.
+        You have access to a set of tools to help you. These tools include:
+        - A web search tool.
+        - A YouTube video transcriber.
+        - An image analyzer for URLs (this tool provides a description of the image).
+        - A Python code interpreter for math and calculations.
+        Use a tool if it is helpful. When you have the final answer, you MUST use the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
+        YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list.
         """

-        # Load the primary language model for reasoning
         self.llm = HuggingFaceLLM(
             model_name="HuggingFaceH4/zephyr-7b-beta",
             tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
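
The image tool above deliberately ignores its question argument when calling the model: the endpoint receives only raw image bytes, so the tool returns a general description and leaves question answering to the agent. A hypothetical standalone check, assuming the definitions above are imported and HF_TOKEN is set before the module loads (the URL and question here are made up):

# Hypothetical smoke test for analyze_image_url, run outside the agent loop.
description = analyze_image_url(
    "https://example.com/chart.png",                 # made-up image URL
    "What is the largest value shown in the chart?",
)
print(description)
# Expected shape:
# "The image description is: <llava output>. Now, answer the original
#  question based on this."
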
@@ -64,14 +144,13 @@ class LlamaIndexAgent:
             model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
         )

-        # Initialize the ReAct Agent
         self.agent = ReActAgent.from_tools(
             tools=self.tools, llm=self.llm, verbose=True, system_prompt=system_prompt
         )
         print("LlamaIndexAgent initialized successfully.")

     def __call__(self, question: str) -> str:
-        print(f"Agent received question (first 80 chars): {question[:80]}...")
+        print(f"Agent received question: {question[:80]}...")
         response = self.agent.chat(question)
         answer = str(response).strip()

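The load_in_8bit flag relies on the bitsandbytes package and a CUDA GPU. Recent transformers releases deprecate the bare kwarg in favor of an explicit quantization config; an equivalent setup would be the sketch below, assuming (as the committed code does) that HuggingFaceLLM forwards model_kwargs to from_pretrained:

import torch
from transformers import BitsAndBytesConfig

# Equivalent 8-bit load for newer transformers versions; needs bitsandbytes.
model_kwargs = {
    "torch_dtype": torch.float16,
    "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
}
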
@@ -82,17 +161,21 @@ class LlamaIndexAgent:
                 f"Warning: Agent did not use the 'FINAL ANSWER:' template. Raw output: {answer}"
             )
             final_answer = answer
-        return f"FINAL ANSWER: {final_answer}"
+        return final_answer


 # --- Main Gradio App Logic ---
 def run_and_submit_all(profile: gr.OAuthProfile | None):
+    if not HF_TOKEN:
+        return (
+            "ERROR: The `HF_TOKEN` secret is not set in this Space. The image analysis tool will fail. Please set it in Settings > Secrets.",
+            None,
+        )
+
     space_id = os.getenv("SPACE_ID")
     if profile:
         username = f"{profile.username}"
-        print(f"User logged in: {username}")
     else:
-        print("User not logged in.")
         return "Please Login to Hugging Face with the button.", None

     api_url = DEFAULT_API_URL
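
__call__ now returns the bare answer; the extraction of the text after "FINAL ANSWER:" happens in collapsed lines just above this hunk. A typical implementation consistent with the visible warning branch (a sketch, not the committed code):

import re


def extract_final_answer(raw_output: str) -> str:
    """Hypothetical helper: return the text after 'FINAL ANSWER:',
    falling back to the raw output when the template is missing."""
    match = re.search(r"FINAL ANSWER:\s*(.+)", raw_output, re.DOTALL)
    if match:
        return match.group(1).strip()
    print(f"Warning: Agent did not use the 'FINAL ANSWER:' template. Raw output: {raw_output}")
    return raw_output.strip()
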
@@ -102,21 +185,14 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     try:
         agent = LlamaIndexAgent()
     except Exception as e:
-        print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None

     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(agent_code)

-    print(f"Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
         questions_data = response.json()
-        if not questions_data:
-            print("Fetched questions list is empty.")
-            return "Fetched questions list is empty or invalid format.", None
-        print(f"Fetched {len(questions_data)} questions.")
     except Exception as e:
         return f"Error fetching questions: {e}", None

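For reference, the questions endpoint returns a JSON list that the loop below walks; the exact field names are assumptions inferred from the task_id and question_text variables in the surrounding code:

# Hypothetical response from the questions endpoint (field names assumed).
questions_data = [
    {
        "task_id": "c61d22de-0000-0000-0000-000000000000",  # made-up ID
        "question": "What is 2 + 2?",
    },
]
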
@@ -130,17 +206,14 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             continue
         try:
             submitted_answer = agent(question_text)
-            answer_for_submission = submitted_answer.replace(
-                "FINAL ANSWER:", ""
-            ).strip()
             answers_payload.append(
-                {"task_id": task_id, "submitted_answer": answer_for_submission}
+                {"task_id": task_id, "submitted_answer": submitted_answer}
             )
             results_log.append(
                 {
                     "Task ID": task_id,
                     "Question": question_text,
-                    "Submitted Answer": answer_for_submission,
+                    "Submitted Answer": submitted_answer,
                 }
             )
         except Exception as e:
@@ -160,7 +233,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         "agent_code": agent_code,
         "answers": answers_payload,
     }
-    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
+
     try:
         response = requests.post(submit_url, json=submission_data, timeout=180)
         response.raise_for_status()
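
The assembled submission body therefore looks like this (placeholder values; username, agent_code, and submit_url are built from collapsed context earlier in the function):

# Placeholder illustration of the POST body sent to submit_url.
submission_data = {
    "username": "your-hf-username",
    "agent_code": "https://huggingface.co/spaces/<space_id>/tree/main",
    "answers": [
        {"task_id": "c61d22de-0000-0000-0000-000000000000", "submitted_answer": "4"},
    ],
}
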
@@ -181,13 +254,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):

 # --- Build Gradio Interface ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Stable ReAct Agent for GAIA (Text + Web Search)")
+    gr.Markdown("# Final, Fully-Featured GAIA Agent")
     gr.Markdown(
         """
-        **Instructions:**
-        1. This agent is equipped with Web Search (DuckDuckGo) and a YouTube transcript reader.
-        2. The installation issues are now resolved. The agent logic is part of the core `llama-index` package.
-        3. Log in and click 'Run Evaluation'.
+        **Agent Capabilities:** Web Search, YouTube Analysis, Image Analysis (via API), and Python Code Execution.
+        1. **IMPORTANT**: This Space requires a Hugging Face token set as the `HF_TOKEN` secret for the image analysis tool to work.
+        2. Log in to your Hugging Face account using the button below.
+        3. Click 'Run Evaluation & Submit All Answers'. This process will take a long time.
         """
     )
     gr.LoginButton()
@@ -200,6 +273,11 @@ with gr.Blocks() as demo:

 if __name__ == "__main__":
     print("\n" + "-" * 30 + " App Starting " + "-" * 30)
-    print("Agent is configured with DuckDuckGo Search and YouTube tools.")
+    if not HF_TOKEN:
+        print(
+            "⚠️ WARNING: The `HF_TOKEN` secret is not set. The image analysis tool will be unavailable."
+        )
+    else:
+        print("✅ `HF_TOKEN` secret is set.")
     print("Launching Gradio Interface...")
     demo.launch(debug=True, share=False)
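
To exercise the agent outside the Gradio and scoring flow, a minimal smoke run might look like this; it assumes the Space's environment (GPU, bitsandbytes, and the HF_TOKEN secret) and is not part of the commit:

# Hypothetical local smoke test of the agent defined in app.py.
from app import LlamaIndexAgent

agent = LlamaIndexAgent()
answer = agent("What is 17 * 23? Use the Python tool if helpful.")
print(answer)  # expected: "391", with the FINAL ANSWER prefix already stripped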
 