leofltt committed
Commit ec8845c · 1 Parent(s): 85d8289
Files changed (1)
  1. app.py +114 -36
app.py CHANGED
@@ -3,20 +3,29 @@ import gradio as gr
 import requests
 import pandas as pd
 import torch
+import base64
+from io import BytesIO

 from llama_index.core.tools import FunctionTool
 from llama_index.llms.huggingface import HuggingFaceLLM
-
-# This import is correct and works when `llama-index` is installed
 from llama_index.core.agent import ReActAgent
 from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
+from llama_index.tools.python_repl import PythonREPLTool
 from youtube_transcript_api import YouTubeTranscriptApi
+from PIL import Image

 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-
+IMAGE_ANALYSIS_API_URL = (
+    "https://api-inference.huggingface.co/models/llava-hf/llava-1.5-7b-hf"
+)

 # --- Helper Functions for Tools ---
+
+# HF_TOKEN must be set as a Space Secret in Hugging Face
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+
 def get_video_transcript(youtube_url: str):
     """Fetches the transcript of a YouTube video given its URL."""
     try:
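
Note: the body of get_video_transcript is collapsed in this hunk; only its try/except frame is visible. For orientation, a typical implementation of such a helper with youtube_transcript_api might look like the sketch below. The URL parsing and the classic get_transcript call are assumptions, not the committed code.

from urllib.parse import parse_qs, urlparse

from youtube_transcript_api import YouTubeTranscriptApi


def get_video_transcript_sketch(youtube_url: str) -> str:
    """Hypothetical helper: fetch a YouTube transcript as plain text."""
    try:
        # Extract the video ID from either youtu.be/<id> or watch?v=<id> URLs.
        parsed = urlparse(youtube_url)
        if parsed.hostname == "youtu.be":
            video_id = parsed.path.lstrip("/")
        else:
            video_id = parse_qs(parsed.query)["v"][0]
        # Classic youtube_transcript_api call; newer releases also expose an
        # instance-based API (YouTubeTranscriptApi().fetch(video_id)).
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(entry["text"] for entry in entries)
    except Exception as e:
        return f"Error fetching transcript: {e}"
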
@@ -30,33 +39,104 @@ def get_video_transcript(youtube_url: str):
         return f"Error fetching transcript: {e}"


+def analyze_image_url(image_url: str, question: str):
+    """Analyzes an image from a URL using the Hugging Face Inference API."""
+    if not HF_TOKEN:
+        return (
+            "Error: Hugging Face token is not set. Cannot use the image analysis tool."
+        )
+
+    try:
+        # Download the image
+        response = requests.get(image_url)
+        response.raise_for_status()
+
+        # Prepare the raw bytes for the Inference API
+        image_bytes = BytesIO(response.content).getvalue()
+
+        # Call the Inference API. llava's chat format is
+        # "USER: <image>\n{question}\nASSISTANT:", but this endpoint accepts
+        # only raw image bytes, with no clean way to attach a text prompt.
+        # So rather than forcing the question into the request, ask for a
+        # general description and let the agent reason over it; that is a
+        # more robust pattern than squeezing a complex prompt into the API.
+        headers = {
+            "Authorization": f"Bearer {HF_TOKEN}",
+            "Content-Type": "image/png",  # specify the content type
+        }
+        response = requests.post(
+            IMAGE_ANALYSIS_API_URL, headers=headers, data=image_bytes
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        generated_text = result[0].get("generated_text", "").strip()
+        final_answer = generated_text.split("ASSISTANT:")[-1].strip()
+
+        # The agent gets the description, then re-evaluates the original question.
+        return f"The image description is: {final_answer}. Now, answer the original question based on this."
+
+    except Exception as e:
+        return f"Error analyzing image: {e}"
+
+
 # --- Tool Definitions ---
 youtube_tool = FunctionTool.from_defaults(
     fn=get_video_transcript,
     name="youtube_transcript_tool",
-    description="Use this tool to get the transcript of a YouTube video. Provide the full YouTube URL.",
+    description="Use this tool to get the transcript of a YouTube video.",
 )
+image_analyzer_tool = FunctionTool.from_defaults(
+    fn=analyze_image_url,
+    name="image_analyzer_tool",
+    description="Use this tool to analyze an image when you are given a URL. Provide both the image URL and the question about the image.",
+)
+python_repl_tool = PythonREPLTool()


 # --- LlamaIndex Agent Definition ---
 class LlamaIndexAgent:
     def __init__(self):
-        print("Initializing LlamaIndexAgent with Tools...")
+        print("Initializing LlamaIndexAgent with Final Tools...")

-        # Initialize the DuckDuckGo search tool
         ddg_spec = DuckDuckGoSearchToolSpec()

-        self.tools = [youtube_tool] + ddg_spec.to_tool_list()
+        self.tools = [
+            youtube_tool,
+            image_analyzer_tool,
+            python_repl_tool,
+        ] + ddg_spec.to_tool_list()

         system_prompt = """
         You are a helpful assistant tasked with answering questions.
-        You have access to a set of tools, including a web search tool and a YouTube video transcriber. Use them if needed to answer the question.
-        When you have the final answer, you MUST use the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
-        YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
-        Follow the formatting rules for numbers and strings as specified.
+        You have access to a set of tools to help you. These tools include:
+        - A web search tool.
+        - A YouTube video transcriber.
+        - An image analyzer for URLs (this tool provides a description of the image).
+        - A Python code interpreter for math and calculations.
+        Use a tool if it is helpful. When you have the final answer, you MUST use the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
+        YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list.
         """

-        # Load the primary language model for reasoning
         self.llm = HuggingFaceLLM(
             model_name="HuggingFaceH4/zephyr-7b-beta",
             tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
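
The image tool above deliberately ignores its question argument when calling the model: the endpoint receives only raw image bytes, so the tool returns a general description and leaves question answering to the agent. A hypothetical standalone check, assuming the definitions above are imported and HF_TOKEN is set before the module loads (the URL and question here are made up):

# Hypothetical smoke test for analyze_image_url, run outside the agent loop.
description = analyze_image_url(
    "https://example.com/chart.png",                 # made-up image URL
    "What is the largest value shown in the chart?",
)
print(description)
# Expected shape:
# "The image description is: <llava output>. Now, answer the original
#  question based on this."
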
@@ -64,14 +144,13 @@ class LlamaIndexAgent:
             model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
         )

-        # Initialize the ReAct Agent
         self.agent = ReActAgent.from_tools(
             tools=self.tools, llm=self.llm, verbose=True, system_prompt=system_prompt
         )
         print("LlamaIndexAgent initialized successfully.")

     def __call__(self, question: str) -> str:
-        print(f"Agent received question (first 80 chars): {question[:80]}...")
+        print(f"Agent received question: {question[:80]}...")
         response = self.agent.chat(question)
         answer = str(response).strip()

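The load_in_8bit flag relies on the bitsandbytes package and a CUDA GPU. Recent transformers releases deprecate the bare kwarg in favor of an explicit quantization config; an equivalent setup would be the sketch below, assuming (as the committed code does) that HuggingFaceLLM forwards model_kwargs to from_pretrained:

import torch
from transformers import BitsAndBytesConfig

# Equivalent 8-bit load for newer transformers versions; needs bitsandbytes.
model_kwargs = {
    "torch_dtype": torch.float16,
    "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
}
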
@@ -82,17 +161,21 @@ class LlamaIndexAgent:
                 f"Warning: Agent did not use the 'FINAL ANSWER:' template. Raw output: {answer}"
             )
             final_answer = answer
-        return f"FINAL ANSWER: {final_answer}"
+        return final_answer


 # --- Main Gradio App Logic ---
 def run_and_submit_all(profile: gr.OAuthProfile | None):
+    if not HF_TOKEN:
+        return (
+            "ERROR: The `HF_TOKEN` secret is not set in this Space. The image analysis tool will fail. Please set it in Settings > Secrets.",
+            None,
+        )
+
     space_id = os.getenv("SPACE_ID")
     if profile:
         username = f"{profile.username}"
-        print(f"User logged in: {username}")
     else:
-        print("User not logged in.")
         return "Please Login to Hugging Face with the button.", None

     api_url = DEFAULT_API_URL
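
__call__ now returns the bare answer; the extraction of the text after "FINAL ANSWER:" happens in collapsed lines just above this hunk. A typical implementation consistent with the visible warning branch (a sketch, not the committed code):

import re


def extract_final_answer(raw_output: str) -> str:
    """Hypothetical helper: return the text after 'FINAL ANSWER:',
    falling back to the raw output when the template is missing."""
    match = re.search(r"FINAL ANSWER:\s*(.+)", raw_output, re.DOTALL)
    if match:
        return match.group(1).strip()
    print(f"Warning: Agent did not use the 'FINAL ANSWER:' template. Raw output: {raw_output}")
    return raw_output.strip()
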
@@ -102,21 +185,14 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     try:
         agent = LlamaIndexAgent()
     except Exception as e:
-        print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None

     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(agent_code)

-    print(f"Fetching questions from: {questions_url}")
     try:
         response = requests.get(questions_url, timeout=15)
         response.raise_for_status()
         questions_data = response.json()
-        if not questions_data:
-            print("Fetched questions list is empty.")
-            return "Fetched questions list is empty or invalid format.", None
-        print(f"Fetched {len(questions_data)} questions.")
     except Exception as e:
         return f"Error fetching questions: {e}", None

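For reference, the questions endpoint returns a JSON list that the loop below walks; the exact field names are assumptions inferred from the task_id and question_text variables in the surrounding code:

# Hypothetical response from the questions endpoint (field names assumed).
questions_data = [
    {
        "task_id": "c61d22de-0000-0000-0000-000000000000",  # made-up ID
        "question": "What is 2 + 2?",
    },
]
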
@@ -130,17 +206,14 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             continue
         try:
             submitted_answer = agent(question_text)
-            answer_for_submission = submitted_answer.replace(
-                "FINAL ANSWER:", ""
-            ).strip()
             answers_payload.append(
-                {"task_id": task_id, "submitted_answer": answer_for_submission}
+                {"task_id": task_id, "submitted_answer": submitted_answer}
             )
             results_log.append(
                 {
                     "Task ID": task_id,
                     "Question": question_text,
-                    "Submitted Answer": answer_for_submission,
+                    "Submitted Answer": submitted_answer,
                 }
             )
         except Exception as e:
@@ -160,7 +233,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         "agent_code": agent_code,
         "answers": answers_payload,
     }
-    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
+
     try:
         response = requests.post(submit_url, json=submission_data, timeout=180)
         response.raise_for_status()
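
The assembled submission body therefore looks like this (placeholder values; username, agent_code, and submit_url are built from collapsed context earlier in the function):

# Placeholder illustration of the POST body sent to submit_url.
submission_data = {
    "username": "your-hf-username",
    "agent_code": "https://huggingface.co/spaces/<space_id>/tree/main",
    "answers": [
        {"task_id": "c61d22de-0000-0000-0000-000000000000", "submitted_answer": "4"},
    ],
}
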
@@ -181,13 +254,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):

 # --- Build Gradio Interface ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Stable ReAct Agent for GAIA (Text + Web Search)")
+    gr.Markdown("# Final, Fully-Featured GAIA Agent")
     gr.Markdown(
         """
-        **Instructions:**
-        1. This agent is equipped with Web Search (DuckDuckGo) and a YouTube transcript reader.
-        2. The installation issues are now resolved. The agent logic is part of the core `llama-index` package.
-        3. Log in and click 'Run Evaluation'.
+        **Agent Capabilities:** Web Search, YouTube Analysis, Image Analysis (via API), and Python Code Execution.
+        1. **IMPORTANT**: This Space requires a Hugging Face token set as the `HF_TOKEN` secret for the image analysis tool to work.
+        2. Log in to your Hugging Face account using the button below.
+        3. Click 'Run Evaluation & Submit All Answers'. This process will take a long time.
         """
     )
     gr.LoginButton()
@@ -200,6 +273,11 @@ with gr.Blocks() as demo:

 if __name__ == "__main__":
     print("\n" + "-" * 30 + " App Starting " + "-" * 30)
-    print("Agent is configured with DuckDuckGo Search and YouTube tools.")
+    if not HF_TOKEN:
+        print(
+            "⚠️ WARNING: The `HF_TOKEN` secret is not set. The image analysis tool will be unavailable."
+        )
+    else:
+        print("✅ `HF_TOKEN` secret is set.")
     print("Launching Gradio Interface...")
     demo.launch(debug=True, share=False)
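
To exercise the agent outside the Gradio and scoring flow, a minimal smoke run might look like this; it assumes the Space's environment (GPU, bitsandbytes, and the HF_TOKEN secret) and is not part of the commit:

# Hypothetical local smoke test of the agent defined in app.py.
from app import LlamaIndexAgent

agent = LlamaIndexAgent()
answer = agent("What is 17 * 23? Use the Python tool if helpful.")
print(answer)  # expected: "391", with the FINAL ANSWER prefix already stripped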
 