from dataclasses import dataclass
from typing import List, Optional, Tuple

import base64
from io import BytesIO

from PIL import Image
from smolagents import ChatMessage


def encode_image_to_base64(image):
    """Encodes a PIL image to a base64 string (JPEG)."""
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


DEFAULT_SYSTEM_PROMPT = \
"""You are a smart assistant designed to answer questions about a PDF document.
You are given relevant information in the form of PDF pages preceded by their metadata: document title, page identifier, surrounding context.
Use them to construct a short response to the question, and cite your sources in the following format: (document, page number).
If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
Give detailed and extensive answers, only containing info in the pages you are given.
You can answer using information contained in plots and figures if necessary.
Answer in the same language as the query."""


def _build_query(query, pages):
    """Builds the multimodal user content: page captions, page images, then the query."""
    messages = []
    messages.append({"type": "text", "text": "PDF pages:\n"})
    for page in pages:
        capt = page.caption
        if capt is not None:
            messages.append({"type": "text", "text": capt})
        messages.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encode_image_to_base64(page.image)}"
                },
            }
        )
    messages.append({"type": "text", "text": f"Query:\n{query}"})
    return messages


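# For reference, the list returned by _build_query follows the OpenAI vision
# content format, roughly (placeholder values shown):
# [
#     {"type": "text", "text": "PDF pages:\n"},
#     {"type": "text", "text": "Document: <title>, Context: <context>"},
#     {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<...>"}},
#     {"type": "text", "text": "Query:\n<question>"},
# ]

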
def query_openai(query, pages, api_key=None, system_prompt=DEFAULT_SYSTEM_PROMPT, model="gpt-4o-mini") -> ChatMessage:
    """Calls the OpenAI chat completions API (gpt-4o-mini by default) with the query and page images."""
    if api_key and api_key.startswith("sk"):
        try:
            from openai import OpenAI

            client = OpenAI(api_key=api_key.strip())
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": _build_query(query, pages)},
                ],
                max_tokens=500,
            )
            message = ChatMessage.from_dict(
                response.choices[0].message.model_dump(include={"role", "content", "tool_calls"})
            )
            message.raw = response
            return message
        except Exception:
            return "OpenAI API connection failure. Verify the provided key is correct (sk-***)."
    return "Enter your OpenAI API key to get a custom response"


CONTEXT_SYSTEM_PROMPT = \
"""You are a smart assistant designed to extract context of PDF pages.
Give concise answers, only containing info in the pages you are given.
You can answer using information contained in plots and figures if necessary."""


RAG_SYSTEM_PROMPT = \
"""You are a smart assistant designed to answer questions about a PDF document.
You are given relevant information in the form of PDF pages preceded by their metadata: document title, page identifier, surrounding context.
Use them to construct a response to the question, and cite your sources.
Use the following citation format:
"Some information from a first document [1, p.Page Number]. Some information from the same first document but at a different page [1, p.Page Number]. Some more information from another document [2, p.Page Number].
...
Sources:
[1] Document Title
[2] Another Document Title"
You can answer using information contained in plots and figures if necessary.
If it is not possible to answer using the provided pages, do not attempt to provide an answer and simply say the answer is not present within the documents.
Give detailed answers, only containing info in the pages you are given.
Answer in the same language as the query."""


@dataclass
class Metadata:
    doc_title: str
    page_id: int
    context: Optional[str] = None

    def __str__(self):
        return f"Document: {self.doc_title}, Page ID: {self.page_id}, Context: {self.context}"


@dataclass
class Page:
    image: Image.Image
    metadata: Optional[Metadata] = None

    @property
    def caption(self):
        # Accessed as an attribute in _build_query, hence the property.
        if self.metadata is None:
            return None
        return f"Document: {self.metadata.doc_title}, Context: {self.metadata.context}"

    def __hash__(self):
        # Hash the pixel data rather than the Image object itself, which defines
        # __eq__ but no __hash__ and is therefore not hashable.
        return hash(self.image.tobytes())
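

# Minimal usage sketch (illustrative only): the page image, document title, and API
# key below are placeholders; in the real app the pages come from a retrieval step.
if __name__ == "__main__":
    # A blank image stands in for a rendered PDF page (e.g. one produced by pdf2image).
    fake_page_image = Image.new("RGB", (800, 1000), "white")
    pages = [
        Page(
            image=fake_page_image,
            metadata=Metadata(doc_title="Example Report", page_id=1, context="Introduction"),
        )
    ]
    answer = query_openai(
        "What is the main conclusion of the document?",
        pages,
        api_key="sk-...",  # replace with a real OpenAI API key
    )
    print(answer)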