paultltc committed on
Commit
c502633
·
1 Parent(s): 0871236

Fix utils bug

Browse files
Files changed (2) hide show
  1. tool.py +130 -11
  2. utils.py +0 -124
tool.py CHANGED
@@ -29,6 +29,26 @@ class VisualRAGTool(Tool):
29
 
30
  model_name: str = "vidore/colqwen2-v1.0"
31
  api_key: str = os.getenv("OPENAI_KEY")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  def __init__(self, *args, **kwargs):
34
  self.is_initialized = False
@@ -58,20 +78,103 @@ class VisualRAGTool(Tool):
58
 
59
  self.is_initialized = True
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def _extract_contexts(self, images, api_key, window=10) -> list:
62
  """Extracts context from images."""
63
- from utils import query_openai, Page, CONTEXT_SYSTEM_PROMPT
64
- from pqdm.processes import pqdm
 
 
 
 
 
65
  try:
66
  args = [
67
  {
68
  'query': "Give the general context about these pages. Give the context in the same language as the documents.",
69
- 'pages': [Page(image=im) for im in images[max(i-window+1, 0):i+1]],
70
  'api_key': api_key,
71
  'system_prompt': CONTEXT_SYSTEM_PROMPT,
72
  } for i in range(0, len(images), window)
73
  ]
74
- window_contexts = pqdm(args, query_openai, n_jobs=8, argument_type='kwargs')
75
 
76
  # code sequentially ftm with tqdm
77
  # query = "Give the general context about these pages. Give the context in the same language as the documents."
@@ -95,7 +198,6 @@ class VisualRAGTool(Tool):
95
  def _preprocess_file(self, file: str, contextualize: bool = True, api_key: str = None, window: int = 10) -> list:
96
  """Converts a file to images and extracts metadata."""
97
  from pdf2image import convert_from_path
98
- from utils import Metadata, Page
99
 
100
  title = file.split("/")[-1]
101
  images = convert_from_path(file, thread_count=4)
@@ -103,15 +205,15 @@ class VisualRAGTool(Tool):
103
  contexts = self._extract_contexts(images, api_key, window=window)
104
  else:
105
  contexts = [None for _ in range(len(images))]
106
- metadatas = [Metadata(doc_title=title, page_id=i, context=contexts[i]) for i in range(len(images))]
107
 
108
- return [Page(image=img, metadata=metadata) for img, metadata in zip(images, metadatas)]
109
 
110
  def preprocess(self, files: list, contextualize: bool = True, api_key: str = None, window: int = 10) -> list:
111
  """Preprocesses the files and extracts metadata."""
112
  pages = [page for file in files for page in self._preprocess_file(file, contextualize=contextualize, api_key=api_key, window=window)]
113
 
114
- print(f"Example metadata:\n{pages[0].metadata.context}")
115
 
116
  return pages
117
 
@@ -183,7 +285,7 @@ class VisualRAGTool(Tool):
183
  top_k_idx = scores.topk(k).indices.tolist()
184
 
185
  print("Top Scores:")
186
- [print(f'Page {self.pages[idx].metadata.page_id}: {scores[idx]}') for idx in top_k_idx]
187
 
188
  # Get the top k results
189
  results = [self.pages[idx] for idx in top_k_idx]
@@ -192,8 +294,25 @@ class VisualRAGTool(Tool):
192
 
193
  def generate_answer(self, query: str, docs: list, api_key: str = None):
194
  """Generates an answer based on the query and the retrieved documents."""
195
- from utils import query_openai, RAG_SYSTEM_PROMPT
196
- result = query_openai(query, docs, api_key or self.api_key, system_prompt=RAG_SYSTEM_PROMPT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  return result
198
 
199
  def search(self, query: str, k: int = 1, api_key: str = None) -> tuple:
 
29
 
30
  model_name: str = "vidore/colqwen2-v1.0"
31
  api_key: str = os.getenv("OPENAI_KEY")
32
+
33
class Page:
    """A single document page: an image plus an optional metadata dict.

    Metadata keys used elsewhere in the tool: 'doc_title', 'page_id',
    'context'. The image is expected to be a PIL.Image.Image (anything
    exposing the same interface works); PIL is deliberately NOT imported
    here — the previous class-body imports executed at class-creation
    time and leaked Optional/Dict/Any/Image as class attributes.
    """

    def __init__(self, image, metadata=None):
        # image: the page bitmap; metadata: Optional[Dict[str, Any]].
        self.image = image
        self.metadata = metadata

    @property
    def caption(self):
        """Human-readable caption built from metadata, or None when absent."""
        if self.metadata is None:
            return None
        return f"Document: {self.metadata.get('doc_title')}, Context: {self.metadata.get('context')}"

    def __hash__(self):
        # Hash delegates to the wrapped image object so pages wrapping the
        # same image object hash identically.
        return hash(self.image)
52
 
53
  def __init__(self, *args, **kwargs):
54
  self.is_initialized = False
 
78
 
79
  self.is_initialized = True
80
 
81
+ def _encode_image_to_base64(self, image):
82
+ """Encodes a PIL image to a base64 string."""
83
+ from io import BytesIO
84
+ import base64
85
+
86
+ buffered = BytesIO()
87
+ image.save(buffered, format="JPEG")
88
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
89
+
90
+ def _build_query(self, query: str, pages: list) -> list:
91
+ """Builds the query for OpenAI based on the pages and the query."""
92
+ messages = []
93
+ messages.append({"type": "text", "text": "PDF pages:\n"})
94
+ for page in pages:
95
+ capt = page.caption
96
+ if capt is not None:
97
+ messages.append({
98
+ "type": "text",
99
+ "text": capt
100
+ })
101
+ messages.append({
102
+ "type": "image_url",
103
+ "image_url": {
104
+ "url": f"data:image/jpeg;base64,{self._encode_image_to_base64(page.image)}"
105
+ },
106
+ })
107
+ messages.append({"type": "text", "text": f"Query:\n{query}"})
108
+
109
+ return messages
110
+
111
def query_openai(self, query, pages, api_key=None, system_prompt=None, model="gpt-4o-mini"):
    """Calls OpenAI's GPT-4o-mini with the query and image data.

    Args:
        query: The user question to answer.
        pages: Page objects whose images (and captions) are sent as context.
        api_key: OpenAI API key; falls back to ``self.api_key`` when omitted.
        system_prompt: Overrides the default system prompt when provided.
        model: OpenAI chat model identifier.

    Returns:
        A smolagents ChatMessage on success, otherwise a plain string: a
        help message when no usable key is given, or an error message when
        the API call fails (best-effort — never raises to the caller).
    """
    system_prompt = system_prompt or \
        """You are a smart assistant designed to answer questions about a PDF document.
You are given relevant information in the form of PDF pages preceded by their metadata: document title, page identifier, surrounding context.
Use them to construct a short response to the question, and cite your sources in the following format: (document, page number).
If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
Give detailed and extensive answers, only containing info in the pages you are given.
You can answer using information contained in plots and figures if necessary.
Answer in the same language as the query."""

    api_key = api_key or self.api_key

    if api_key and api_key.startswith("sk"):
        try:
            # Imported lazily inside the guarded branch so the no-key path
            # works even when openai/smolagents are not installed.
            from openai import OpenAI

            client = OpenAI(api_key=api_key.strip())

            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": self._build_query(query, pages)},
                ],
                max_tokens=500,
            )

            from smolagents import ChatMessage

            message = ChatMessage.from_dict(
                response.choices[0].message.model_dump(include={"role", "content", "tool_calls"})
            )
            # Keep the raw API response attached for downstream inspection.
            message.raw = response

            return message

        except Exception:
            # Best-effort: surface a user-readable failure instead of raising.
            return "OpenAI API connection failure. Verify the provided key is correct (sk-***)."

    return "Enter your OpenAI API key to get a custom response"
158
+
159
  def _extract_contexts(self, images, api_key, window=10) -> list:
160
  """Extracts context from images."""
161
+ from pqdm.threads import pqdm
162
+
163
+ CONTEXT_SYSTEM_PROMPT = \
164
+ """You are a smart assistant designed to extract context of PDF pages.
165
+ Give concise answers, only containing info in the pages you are given.
166
+ You can answer using information contained in plots and figures if necessary."""
167
+
168
  try:
169
  args = [
170
  {
171
  'query': "Give the general context about these pages. Give the context in the same language as the documents.",
172
+ 'pages': [self.Page(image=im) for im in images[max(i-window+1, 0):i+1]],
173
  'api_key': api_key,
174
  'system_prompt': CONTEXT_SYSTEM_PROMPT,
175
  } for i in range(0, len(images), window)
176
  ]
177
+ window_contexts = pqdm(args, self.query_openai, n_jobs=8, argument_type='kwargs')
178
 
179
  # code sequentially ftm with tqdm
180
  # query = "Give the general context about these pages. Give the context in the same language as the documents."
 
198
  def _preprocess_file(self, file: str, contextualize: bool = True, api_key: str = None, window: int = 10) -> list:
199
  """Converts a file to images and extracts metadata."""
200
  from pdf2image import convert_from_path
 
201
 
202
  title = file.split("/")[-1]
203
  images = convert_from_path(file, thread_count=4)
 
205
  contexts = self._extract_contexts(images, api_key, window=window)
206
  else:
207
  contexts = [None for _ in range(len(images))]
208
+ metadatas = [{'doc_title': title, 'page_id': i, 'context': contexts[i]} for i in range(len(images))]
209
 
210
+ return [self.Page(image=img, metadata=metadata) for img, metadata in zip(images, metadatas)]
211
 
212
def preprocess(self, files: list, contextualize: bool = True, api_key: str = None, window: int = 10) -> list:
    """Preprocesses the files and extracts metadata.

    Args:
        files: Paths of the PDF files to convert into pages.
        contextualize: Whether to extract surrounding context for each page.
        api_key: OpenAI API key forwarded to context extraction.
        window: Number of pages grouped per context-extraction request.

    Returns:
        A flat list of Page objects across all files (possibly empty).
    """
    pages = [page for file in files for page in self._preprocess_file(file, contextualize=contextualize, api_key=api_key, window=window)]

    # Guard: indexing pages[0] on an empty corpus raised IndexError before.
    if pages:
        print(f"Example metadata:\n{pages[0].metadata.get('context')}")

    return pages
219
 
 
285
  top_k_idx = scores.topk(k).indices.tolist()
286
 
287
  print("Top Scores:")
288
+ [print(f'Page {self.pages[idx].metadata.get('page_id')}: {scores[idx]}') for idx in top_k_idx]
289
 
290
  # Get the top k results
291
  results = [self.pages[idx] for idx in top_k_idx]
 
294
 
295
def generate_answer(self, query: str, docs: list, api_key: str = None):
    """Produce a cited answer to *query* grounded in the retrieved *docs* pages."""
    RAG_SYSTEM_PROMPT = \
        """ You are a smart assistant designed to answer questions about a PDF document.

You are given relevant information in the form of PDF pages preceded by their metadata: document title, page identifier, surrounding context.
Use them to construct a response to the question, and cite your sources.
Use the following citation format:
"Some information from a first document [1, p.Page Number]. Some information from the same first document but at a different page [1, p.Page Number]. Some more information from another document [2, p.Page Number].
...
Sources:
[1] Document Title
[2] Another Document Title"

You can answer using information contained in plots and figures if necessary.
If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
Give detailed answers, only containing info in the pages you are given.
Answer in the same language as the query."""

    effective_key = api_key or self.api_key
    return self.query_openai(query, docs, effective_key, system_prompt=RAG_SYSTEM_PROMPT)
317
 
318
  def search(self, query: str, k: int = 1, api_key: str = None) -> tuple:
utils.py DELETED
@@ -1,124 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import List, Optional, Tuple
3
-
4
- import base64
5
- from io import BytesIO
6
- from PIL import Image
7
-
8
-
9
- from smolagents import ChatMessage
10
-
11
- def encode_image_to_base64(image):
12
- """Encodes a PIL image to a base64 string."""
13
- buffered = BytesIO()
14
- image.save(buffered, format="JPEG")
15
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
16
-
17
- DEFAULT_SYSTEM_PROMPT = \
18
- """You are a smart assistant designed to answer questions about a PDF document.
19
- You are given relevant information in the form of PDF pages preceded by their metadata: document title, page identifier, surrounding context.
20
- Use them to construct a short response to the question, and cite your sources in the following format: (document, page number).
21
- If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
22
- Give detailed and extensive answers, only containing info in the pages you are given.
23
- You can answer using information contained in plots and figures if necessary.
24
- Answer in the same language as the query."""
25
-
26
- def _build_query(query, pages):
27
- messages = []
28
- messages.append({"type": "text", "text": "PDF pages:\n"})
29
- for page in pages:
30
- capt = page.caption
31
- if capt is not None:
32
- messages.append({
33
- "type": "text",
34
- "text": capt
35
- })
36
- messages.append({
37
- "type": "image_url",
38
- "image_url": {
39
- "url": f"data:image/jpeg;base64,{encode_image_to_base64(page.image)}"
40
- },
41
- })
42
- messages.append({"type": "text", "text": f"Query:\n{query}"})
43
-
44
- return messages
45
-
46
- def query_openai(query, pages, api_key=None, system_prompt=DEFAULT_SYSTEM_PROMPT, model="gpt-4o-mini") -> ChatMessage:
47
- """Calls OpenAI's GPT-4o-mini with the query and image data."""
48
- if api_key and api_key.startswith("sk"):
49
- try:
50
- from openai import OpenAI
51
-
52
- client = OpenAI(api_key=api_key.strip())
53
-
54
- response = client.chat.completions.create(
55
- model=model,
56
- messages=[
57
- {
58
- "role": "system",
59
- "content": system_prompt
60
- },
61
- {
62
- "role": "user",
63
- "content": _build_query(query, pages)
64
- }
65
- ],
66
- max_tokens=500,
67
- )
68
-
69
- message = ChatMessage.from_dict(
70
- response.choices[0].message.model_dump(include={"role", "content", "tool_calls"})
71
- )
72
- message.raw = response
73
-
74
- return message
75
-
76
- except Exception as e:
77
- return "OpenAI API connection failure. Verify the provided key is correct (sk-***)."
78
-
79
- return "Enter your OpenAI API key to get a custom response"
80
-
81
- CONTEXT_SYSTEM_PROMPT = \
82
- """You are a smart assistant designed to extract context of PDF pages.
83
- Give concise answers, only containing info in the pages you are given.
84
- You can answer using information contained in plots and figures if necessary."""
85
-
86
- RAG_SYSTEM_PROMPT = \
87
- """ You are a smart assistant designed to answer questions about a PDF document.
88
-
89
- You are given relevant information in the form of PDF pages preceded by their metadata: document title, page identifier, surrounding context.
90
- Use them to construct a response to the question, and cite your sources.
91
- Use the following citation format:
92
- "Some information from a first document [1, p.Page Number]. Some information from the same first document but at a different page [1, p.Page Number]. Some more information from another document [2, p.Page Number].
93
- ...
94
- Sources:
95
- [1] Document Title
96
- [2] Another Document Title"
97
-
98
- You can answer using information contained in plots and figures if necessary.
99
- If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
100
- Give detailed answers, only containing info in the pages you are given.
101
- Answer in the same language as the query."""
102
-
103
- @dataclass
104
- class Metadata:
105
- doc_title: str
106
- page_id: int
107
- context: Optional[str] = None
108
-
109
- def __str__(self):
110
- return f"Document: {self.doc_title}, Page ID: {self.page_id}, Context: {self.context}"
111
-
112
- @dataclass
113
- class Page:
114
- image: Image.Image
115
- metadata: Optional[Metadata] = None
116
-
117
- @property
118
- def caption(self):
119
- if self.metadata is None:
120
- return None
121
- return f"Document: {self.metadata.doc_title}, Context: {self.metadata.context}"
122
-
123
- def __hash__(self):
124
- return hash(self.image)