Commit · 30857ba
Parent(s): 23ca6b0

Added source code
Files changed:
- Dockerfile            +5   -20
- src/app.py            +370 -0
- src/config.py         +2   -0
- src/predict_output.py +559 -0
- src/requirements.txt  +175 -0
Dockerfile
CHANGED
@@ -1,21 +1,6 @@
-FROM python:3.
-
-
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    software-properties-common \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-
-COPY requirements.txt ./
-COPY src/ ./src/
-
-RUN pip3 install -r requirements.txt
-
+FROM python:3.10
+WORKDIR /src
+COPY . /src
+RUN pip install -r requirements.txt
 EXPOSE 8501
-
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+CMD ["streamlit", "run", "main/demo/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
src/app.py
ADDED
@@ -0,0 +1,370 @@
import streamlit as st
import time
import os
import shutil
import pymupdf
import json

st.set_page_config(
    page_title="MGVG Grounding Demo",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="logo.png"
)

# --- Simple Authentication ---
import streamlit as st
import time

# Define your valid credentials
VALID_USERS = {
    "iitb": "iitb123",
    "badri": "badri123"
}

def login():
    # Set a professional background for the whole app
    st.markdown(
        '''
        <style>
        body, .stApp {
            background: linear-gradient(120deg, #e0eafc 0%, #cfdef3 100%) !important;
        }
        .login-box {
            background: #fff;
            padding: 2.5em 2em 2em 2em;
            border-radius: 16px;
            box-shadow: 0 4px 24px rgba(80, 120, 200, 0.12);
            min-width: 320px;
            max-width: 90vw;
            margin: auto;
        }
        </style>
        ''', unsafe_allow_html=True
    )
    # Center the login box using columns
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        # st.markdown('<div class="login-box">', unsafe_allow_html=True)
        # image at center
        st.image("logo.png", width=800, use_container_width=False)
        st.markdown('<h2 style="text-align:center; color:#2b6cb0; margin-bottom:1.5em;">🔒 Please log in to access the app</h2>', unsafe_allow_html=True)
        username = st.text_input("Username", key="login_username")
        password = st.text_input("Password", type="password", key="login_password")
        login_btn = st.button("Login")
        if login_btn:
            if username in VALID_USERS and VALID_USERS[username] == password:
                st.session_state["authenticated"] = True
                st.success("Login successful!")
                st.session_state["show_continue"] = True
            else:
                st.error("Invalid username or password")
        if st.session_state.get("show_continue", False):
            if st.button("Continue to App"):
                st.session_state["show_continue"] = False
                st.experimental_rerun() if hasattr(st, "experimental_rerun") else None
        st.markdown('</div>', unsafe_allow_html=True)

if "authenticated" not in st.session_state:
    st.session_state["authenticated"] = False

if not st.session_state["authenticated"]:
    login()
    st.stop()
# --- End Authentication ---

# st.image("logo.png", width=250)

from PIL import Image, ImageDraw
import io
# from st_audiorec import st_audiorec

from surya.layout import LayoutPredictor
from doctr.models import ocr_predictor
from transformers import pipeline

@st.cache_resource
def get_layout_predictor():
    return LayoutPredictor()

@st.cache_resource
def get_ocr_model():
    return ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

@st.cache_resource
def get_llm_model(device):
    return pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device=device)

from predict_output import predict_output


layout_predictor = get_layout_predictor()
model = get_ocr_model()
pipe = get_llm_model("cuda")

print("Models loaded")

# --- Placeholder function for demo ---
def get_corresponding_bboxes(image, question):
    # Returns dummy bounding boxes and answer for demo
    # Each bbox: (x1, y1, x2, y2)
    w, h = image.size
    block_bboxes = [(w//8, h//8, w//2, h//2)]
    line_bboxes = [(w//4, h//4, w//2, h//3)]
    word_bboxes = [(w//3, h//3, w//2, h//2)]
    point_bboxes = [(w//2, h//2, w//2+5, h//2+5)]
    answer = "This is a demo answer."
    return block_bboxes, line_bboxes, word_bboxes, point_bboxes, answer

# --- Helper to draw bboxes ---
def draw_bboxes(image, bboxes, color):
    img = image.copy()
    # width proportional to the image size
    width = int(img.width/100)
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        draw.rectangle(bbox, outline=color, width=width)
    return img

def draw_points(image, bboxes, color):
    img = image.copy()
    width = int(img.width)
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        # x1, y1, x2, y2 = bbox
        cx, cy = bbox[0], bbox[1]
        # r being relative to the image size
        r = int(img.width/100)
        draw.ellipse((cx-r, cy-r, cx+r, cy+r), outline=color, width=width, fill=color)
    return img

# model_type = st.sidebar.checkbox("Use LLM Model", value=False)
# model_type = "llm" if model_type else "inhouse"

st.markdown("""
<style>
.main {
    background: linear-gradient(135deg, #f8fafc 0%, #e0e7ef 100%);
}
.block-container {
    padding-top: 2rem;
    padding-bottom: 2rem;
}
.stButton>button {
    background-color: #4F8BF9;
    color: white;
    border-radius: 8px;
    font-size: 1.1rem;
    padding: 0.5em 2em;
}
.stTextInput>div>input {
    border-radius: 8px;
    border: 1px solid #4F8BF9;
}
.stFileUploader>div>div {
    border-radius: 8px;
    border: 2px dashed #4F8BF9;
}
.stAudio>audio {
    width: 100% !important;
}
</style>
""", unsafe_allow_html=True)

col_logo, col_title = st.columns([1, 8])
with col_logo:
    st.image("logo.png", width=180)
with col_title:
    st.markdown("<h1 style='margin-bottom: 0;'>MGVG - Multi-Granular Visual Grounding</h1>", unsafe_allow_html=True)

# List of quotes (HTML formatted)
QUOTES = [
    '''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
    "प्रत्यक्षं किं प्रमाणं?" <span style="font-size:0.9em; color:#444;">(<i>What better proof is there than direct perception?)</i></span>
    </div>''',
    '''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
    <i>"Truth is not told—it is seen."</i>
    </div>'''
]

# Initialize session state for quote index and last update time
if "quote_index" not in st.session_state:
    st.session_state.quote_index = 0
    st.session_state.last_quote_time = time.time()

# Check if 5 seconds have passed
if time.time() - st.session_state.last_quote_time > 5:
    st.session_state.quote_index = (st.session_state.quote_index + 1) % len(QUOTES)
    st.session_state.last_quote_time = time.time()
    # Rerun the app to update the quote
    if hasattr(st, "experimental_rerun"):
        st.experimental_rerun()

# Display the current quote
st.markdown(QUOTES[st.session_state.quote_index], unsafe_allow_html=True)

col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("1. Upload Image or pdf document")
    image = "Not Uploaded"
    uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg", "pdf"])
    if uploaded_file:
        current_dir = os.getcwd()
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")
        # delete the temp_output_folder
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)

        document_type = "image"
        if uploaded_file.type == "application/pdf":

            # save the uploaded file to a temp file
            temp_file_path = os.path.join(current_dir, "temp_file.pdf")

            # delete the temp_file_path
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            if not os.path.exists(temp_output_folder):
                os.makedirs(temp_output_folder)
            # output_file = simple_counter_generator("page", ".jpg")
            # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)

            pages = 0
            doc = pymupdf.open(temp_file_path)  # open document
            for page in doc:  # iterate through the pages
                pages += 1
                pix = page.get_pixmap()  # render page to an image
                pix.save(f"{temp_output_folder}/{page.number}.png")

            if(pages == 1):
                document_type = "image"
                document_path = os.path.join(temp_output_folder, "0.png")
                uploaded_file = os.path.join(temp_output_folder, "0.png")
                image = Image.open(uploaded_file).convert("RGB")
            else:
                document_type = "pdf"
                # image = Image.open(uploaded_file).convert("RGB")

        if document_type == "image":
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_container_width=True)
            # Save uploaded image to a temp file for predict_output
            temp_file_path = "sample.png"
            image.save(temp_file_path)
        else:
            document_type = "pdf"
            document_path = uploaded_file.name
            image = "Uploaded PDF"
            # st.image(uploaded_file, caption="Uploaded PDF", use_container_width=True)
    else:
        image = "Not Uploaded"
        temp_output_folder = None
        st.image("https://placehold.co/400x300?text=Upload+Image", caption="Uploaded Image", use_container_width=True)

    st.subheader("2. Ask a question")
    question = st.text_input("Type your question here")

    # Add radio button for model selection
    model_type = st.radio(
        "Select Model Type:",
        options=["MGVG", "IndoDocs"],
        index=1,
        horizontal=True
    )

    run_demo = st.button("Run Grounding Demo", use_container_width=True)

# --- Output placeholders ---
with col2:
    st.subheader("3. Visual Grounding Outputs")
    if image != "Not Uploaded" and (question):
        print(image)
        print(question)
    if run_demo and image != "Not Uploaded" and (question):
        # Use text input only
        q = question
        answer, block_bboxes, line_bboxes, word_bboxes, point_bboxes, current_page = predict_output(
            temp_file_path, q, pipe, layout_predictor, model, model_type, document_type
        )

        # print(block_bboxes)
        # print(line_bboxes)
        # print(word_bboxes)
        # print(point_bboxes)
        print(answer)

        if(current_page != -1):
            image = Image.open(os.path.join(temp_output_folder, f"{current_page}.png")).convert("RGB")
            print("--------------------------------")
            print(image)

        block_img = draw_bboxes(image, block_bboxes, color="#4F8BF9")
        line_img = draw_bboxes(image, line_bboxes, color="#F97B4F")
        word_img = draw_bboxes(image, word_bboxes, color="#4FF9B2")
        point_img = draw_points(image, point_bboxes, color="#FFFF00")
        imgs = [block_img, line_img, word_img, point_img]
        labels = ["Block Level", "Line Level", "Word Level", "Point Level"]
        cols = st.columns(4)
        for i, (img, label) in enumerate(zip(imgs, labels)):
            with cols[i]:
                st.image(img, caption=label, use_container_width=True)
        answer_lines = answer.splitlines()
        st.markdown("""
        <div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
        <h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
        <p style='font-size: 1.2em; color: #222;'>""" + "<br>".join(answer_lines) + """</p>
        </div>
        """, unsafe_allow_html=True)

        # --- Centered Save Results Button ---
        result_data = {
            "question": q,
            "answer": answer,
            "block_bboxes": block_bboxes,
            "line_bboxes": line_bboxes,
            "word_bboxes": word_bboxes,
            "point_bboxes": point_bboxes,
            "current_page": current_page
        }
        json_str = json.dumps(result_data, indent=2)
        col_left, col_center, col_right = st.columns([2, 3, 2])
        with col_center:
            st.download_button(
                label="Save Results as JSON",
                data=json_str,
                file_name="grounding_results.json",
                mime="application/json"
            )
    else:
        st.markdown("""
        <div style='display: flex; gap: 2em; flex-wrap: wrap;'>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Block+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Block Level</p>
            </div>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Line+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Line Level</p>
            </div>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Word+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Word Level</p>
            </div>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Point+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Point Level</p>
            </div>
        </div>
        <br>
        <div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
        <h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
        <p style='font-size: 1.2em; color: #222;'>[Answer will appear here]</p>
        </div>
        """, unsafe_allow_html=True)
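
For reference, a minimal standalone sketch of the drawing pattern that draw_bboxes and draw_points implement above, using only Pillow. The page size and coordinates below are invented illustration values, not model outputs:

# Sketch: the same PIL calls used by draw_bboxes / draw_points in src/app.py,
# applied to a synthetic page. Assumes only Pillow is installed.
from PIL import Image, ImageDraw

page = Image.new("RGB", (800, 1000), "white")
draw = ImageDraw.Draw(page)

# Block-level box, as draw_bboxes renders it: outline width scales
# with the image width (img.width / 100).
bbox = (100, 125, 400, 500)  # (x1, y1, x2, y2), made up for the example
draw.rectangle(bbox, outline="#4F8BF9", width=page.width // 100)

# Point-level marker, as draw_points renders it: a filled circle of
# radius ~1% of the image width centred on (x, y).
cx, cy, r = 400, 500, page.width // 100
draw.ellipse((cx - r, cy - r, cx + r, cy + r), fill="#FFFF00")

page.save("annotated_sample.png")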
src/config.py
ADDED
@@ -0,0 +1,2 @@
MODEL1 = "MGVG"
MODEL2 = "IndoDocs"
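
Nothing in this commit imports config.py yet; a hypothetical wiring of these constants into the model selector in src/app.py could look like the sketch below (illustrative only, not part of the commit):

# Hypothetical sketch: replacing the hard-coded strings in app.py's
# st.radio call with the config.py constants.
import streamlit as st

from config import MODEL1, MODEL2  # "MGVG", "IndoDocs"

model_type = st.radio(
    "Select Model Type:",
    options=[MODEL1, MODEL2],
    index=1,
    horizontal=True,
)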
src/predict_output.py
ADDED
@@ -0,0 +1,559 @@
import os
import shutil
from fuzzywuzzy import fuzz
from tqdm import tqdm
from PIL import Image
import requests
# from surya.layout import LayoutPredictor

from doctr.io import DocumentFile
from pdf2image import convert_from_path
import pymupdf
# from doctr.models import ocr_predictor
import numpy as np
from time import time

pipe = None
layout_predictor = None

MAX_BLOCK_MATCHES = 2
MAX_LINE_MATCHES = 5
CUT_OFF_THRESHOLD = 60
QUESTION_WEIGHT = 0.2
ANSWER_WEIGHT = 0.8
LEVEL = "line"

jpg_options = {
    "quality": 100,
    "progressive": True,
    "optimize": False
}

stop_words = {'what', 'is', 'the', 'this', 'that', 'these', 'those', 'which', 'how', 'why', 'where', 'when', 'who', 'will', 'be', 'and', 'or', 'in', 'at', 'to', 'for', 'of', 'with', 'by'}

def longest_consecutive_range(indices):
    if not indices:
        return []

    indices = sorted(set(indices))
    longest = []
    current = [indices[0]]

    for i in range(1, len(indices)):
        if indices[i] == indices[i - 1] + 1:
            current.append(indices[i])
        else:
            if len(current) > len(longest):
                longest = current
            current = [indices[i]]

    if len(current) > len(longest):
        longest = current

    return longest


def get_word_level_matches(answer_text, top_k_matches):
    bboxes = []
    for match in top_k_matches:
        indices = []
        for index, word in enumerate(match['words']):
            if word['text'].lower() in answer_text.lower():
                # bboxes.append(word['bbox'])
                indices.append(index)
        longest_indices = longest_consecutive_range(indices)
        for index in longest_indices:
            bboxes.append(match['words'][index]['bbox'])
    return bboxes


def get_matched_regions(question_text, target_text, predictions, level):

    question_terms = [word.lower() for word in question_text.split() if word.lower() not in stop_words]
    matched_regions = []
    for region in predictions:
        region_text = region['text']
        region_copy = region.copy()

        if target_text.lower() in region_text.lower():
            region_copy['match_score'] = 100
            region_copy['match_details'] = {
                'exact_match': True,
                'answer_score': 100,
                'question_score': 100
            }
            matched_regions.append(region_copy)
            continue

        partial_score = fuzz.partial_ratio(target_text.lower(), region_text.lower())
        token_score = fuzz.token_set_ratio(target_text.lower(), region_text.lower())

        # Calculate length factor (preference for longer matches that contain meaningful content)
        target_len = len(target_text)
        region_len = len(region_text)
        length_factor = min(1.0, region_len / min(50, target_len))  # Cap at 1.0, adapt based on target length

        # Combine scores for answer with weights
        # Higher weight to token matching for longer texts, higher weight to partial matching for shorter texts
        if region_len > 10:
            answer_score = (partial_score * 0.3) + (token_score * 0.5) + (length_factor * 100 * 0.2)
        else:
            # For very short texts, reduce their overall score unless they're exact matches
            answer_score = (partial_score * 0.3) + (token_score * 0.4) + (length_factor * 100 * 0.3)
            if region_len < 5 and partial_score < 100:
                answer_score *= 0.5  # Penalize very short inexact matches

        # penalize shorter region_texts
        if region_len < 5:
            answer_score *= 0.5

        # Calculate fuzzy match scores for question terms using both methods
        partial_question_scores = [fuzz.partial_ratio(term, region_text.lower()) for term in question_terms]
        token_question_scores = [fuzz.token_set_ratio(term, region_text.lower()) for term in question_terms]

        # Get best scores for question terms
        best_partial_question = max(partial_question_scores) if partial_question_scores else 0
        best_token_question = max(token_question_scores) if token_question_scores else 0

        # Combine question scores
        question_score = (best_partial_question * 0.4) + (best_token_question * 0.6)

        # Combine scores (giving more weight to answer matches)
        combined_score = (answer_score * ANSWER_WEIGHT) + (question_score * QUESTION_WEIGHT)

        # print(combined_score)

        if combined_score >= CUT_OFF_THRESHOLD:
            region_copy['match_score'] = combined_score
            region_copy['match_details'] = {
                'exact_match': False,
                'answer_score': answer_score,
                'question_score': question_score,
                'answer_weight': ANSWER_WEIGHT,
                'question_weight': QUESTION_WEIGHT
            }
            matched_regions.append(region_copy)


    matched_regions.sort(key=lambda x: x['match_score'], reverse=True)

    # If no matches, reduce threshold by 20 and get the topmost single output
    if not matched_regions:
        new_threshold = max(CUT_OFF_THRESHOLD - 20, 0)  # Prevent negative threshold
        matched_regions = [region for region in matched_regions if region['match_score'] >= new_threshold]
        matched_regions.sort(key=lambda x: x['match_score'], reverse=True)
        if matched_regions:
            matched_regions = [matched_regions[0]]  # Only keep the topmost single output

    if level == "block":
        top_matches = matched_regions[:MAX_BLOCK_MATCHES]
    elif level == "line":
        top_matches = matched_regions[:MAX_LINE_MATCHES]
    return top_matches


def get_processed_text_for_llm(block_predictions, gap):
    final_text = ""
    for block_data in block_predictions:
        final_text += block_data['text'] + gap
    return final_text


def get_page_number(block_bboxes):
    pages = {}
    for block in block_bboxes:
        if block['page'] not in pages:
            pages[block['page']] = 1
        else:
            pages[block['page']] += 1

    print(pages)
    max_page = max(pages, key=pages.get)
    return max_page


def predict_output(document_path, question, pipe, layout_predictor, model, model_type, document_type="image"):

    predicted_answer = None
    block_box_predictions = None
    line_box_predictions = None
    word_box_predictions = None
    point_box_predictions = None


    curr_time = time()
    line_predictions, pages_count = get_line_predictions(document_path, model, document_type)
    line_time = time()
    print(f"Done with line predictions in {line_time - curr_time} seconds")
    curr_time = time()
    if(document_type == "pdf" and pages_count < 3):
        block_predictions = get_block_predictions(document_path, layout_predictor, model, document_type)
        gap = '\n\n\n'
    else:
        block_predictions = line_predictions
        gap = '\n'
    block_time = time()
    print(f"Done with block predictions in {block_time - line_time} seconds")
    # exit()

    # print(line_predictions)
    # print(block_predictions)


    curr_time = time()
    if model_type == "MGVG" or document_type == "pdf":
        processed_text_for_llm = get_processed_text_for_llm(block_predictions, gap=gap)
        # print("Processed Text for LLM: ", processed_text_for_llm)
        predicted_answer = generate_llm_answer(question, processed_text_for_llm, pipe)

    elif model_type == "IndoDocs":
        predicted_answer = generate_via_inhouse_model_answer(question, document_path)
    llm_time = time()
    print(f"Done with LLM in {llm_time - curr_time} seconds")

    print("LLM Answer: ", predicted_answer)


    total_algo_time = time()

    # print(predicted_answer)
    curr_time = time()

    line_matches = get_matched_regions(question, predicted_answer, line_predictions, "line")


    block_bboxes = get_matched_regions(question, predicted_answer, block_predictions, "block")
    match_time = time()
    print(f"Done with match in {match_time - curr_time} seconds")


    if document_type == "pdf":
        current_page = get_page_number(block_bboxes)
    else:
        current_page = -1

    if(current_page != -1):
        predicted_answer = "Answer predicted from page: " + str(current_page+1) + "\n" + predicted_answer

    block_box_predictions = []
    for match in block_bboxes:
        block_box_predictions.append(match['bbox'])

    line_box_predictions = []
    for match in line_matches:
        # print(match['page'], match['bbox'])
        if current_page == -1 or match['page'] == current_page:
            line_box_predictions.append(match['bbox'])

    # for line in line_box_predictions:
    #     print(line)

    curr_time = time()
    word_box_predictions = get_word_level_matches(predicted_answer, top_k_matches=line_matches)
    word_time = time()
    print(f"Done with word in {word_time - curr_time} seconds")

    curr_time = time()
    point_box_predictions = get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions)
    point_time = time()
    print(f"Done with point in {point_time - curr_time} seconds")

    print(f"Total algo time: {time() - total_algo_time} seconds")


    # print(block_box_predictions)
    # print(line_box_predictions)
    # print(word_box_predictions)
    # print(point_box_predictions)
    # print(predicted_answer)


    return predicted_answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions, current_page


def calculate_midpoint_of_bboxes(bboxes):

    if not bboxes:
        return None

    # Convert to numpy array for easier manipulation
    bboxes = np.array(bboxes)

    # Find the extreme points of all bboxes combined
    min_x = np.min(bboxes[:, 0])
    min_y = np.min(bboxes[:, 1])
    max_x = np.max(bboxes[:, 2])
    max_y = np.max(bboxes[:, 3])

    # Calculate midpoint
    midpoint_x = (min_x + max_x) / 2
    midpoint_y = (min_y + max_y) / 2

    return round(midpoint_x, 2), round(midpoint_y, 2)


def get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions):

    point_box_predictions = []

    if len(block_box_predictions) == 1:
        try:
            x, y = calculate_midpoint_of_bboxes(block_box_predictions)
            point_box_predictions = [[x, y]]
            # print(x, y)
        except:
            try:
                x, y = calculate_midpoint_of_bboxes(line_box_predictions)
                point_box_predictions = [[x, y]]
            except:
                point_box_predictions = []
    else:
        points = []
        for block_bbox in block_box_predictions:
            try:
                x, y = calculate_midpoint_of_bboxes(block_bbox)
                points.append([x, y])
            except:
                continue
        point_box_predictions = points

    return point_box_predictions


def generate_via_inhouse_model_answer(question, image_path, api_key="VISION-TEAM", max_tokens=512, temperature=0.7, endpoint="http://103.207.148.38:9000/api/v1/chat/upload"):
    headers = {
        "x-api-key": api_key  # or whatever the Swagger UI says
    }

    files = {
        "image": open(image_path, "rb")
    }

    data = {
        "text": question,
        "max_tokens": str(max_tokens),
        "temperature": str(temperature)
    }

    try:
        response = requests.post(endpoint, headers=headers, files=files, data=data)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}

    return result['response']['choices'][0]['message']['content']

def generate_llm_answer(question, context, pipe):

    prompt = f"""You are given a question and context. Your task is to find and return the best possible answer to the question using only the context as it is.
Do not generate summaries, paraphrased content, or any additional explanations including any preamble and postamble.
Return only the exact phrase or sentence fragment from the context that answers the question.
If the answer is not found in the context, return: Answer not found in context.

Question: {question}
Context: {context}
Answer:
"""

    messages = [{"role": "user", "content": prompt}]
    result = pipe(messages, max_new_tokens=512, do_sample=True, temperature=0.7)
    ans = result[0]["generated_text"][1]['content']
    return ans


def get_line_predictions(document_path, model, document_type):

    current_dir = os.getcwd()
    if document_type == "pdf":
        output_file = simple_counter_generator("page", ".jpg")
        current_dir = os.getcwd()
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")

        # delete the temp_output_folder
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)

        if not os.path.exists(temp_output_folder):
            os.makedirs(temp_output_folder)
        # output_file = simple_counter_generator("page", ".jpg")
        # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)

        doc = pymupdf.open(document_path)  # open document
        for page in doc:  # iterate through the pages
            pix = page.get_pixmap()  # render page to an image
            pix.save(f"{temp_output_folder}/{page.number}.png")

        images_path = sorted(os.listdir(temp_output_folder))
    else:
        images_path = [os.path.join(current_dir, document_path)]
        print(images_path)

    block_predictions = []
    # print(document_path)
    # if document_type == "pdf":
    #     doc = DocumentFile.from_pdf(document_path)
    # else:
    #     doc = DocumentFile.from_images(document_path)
    # result = model(doc)

    line_predictions = []

    pages_count = -1
    for image_path in images_path:
        pages_count += 1

        if(len(images_path) > 1):
            doc = DocumentFile.from_images(os.path.join(temp_output_folder, image_path))
        else:
            doc = DocumentFile.from_images(image_path)


        result = model(doc)
        for page in result.pages:
            dim = tuple(reversed(page.dimensions))
            for block in page.blocks:
                for line in block.lines:
                    output = {}
                    geo = line.geometry
                    a = list(a*b for a, b in zip(geo[0], dim))
                    b = list(a*b for a, b in zip(geo[1], dim))
                    x1 = round(a[0], 2).astype(float)
                    y1 = round(a[1], 2).astype(float)
                    x2 = round(b[0], 2).astype(float)
                    y2 = round(b[1], 2).astype(float)
                    line_bbox = [x1, y1, x2, y2]

                    sent = []
                    words_data = []
                    for word in line.words:
                        word_data = {}
                        sent.append(word.value)
                        geo = word.geometry
                        a = list(a*b for a, b in zip(geo[0], dim))
                        b = list(a*b for a, b in zip(geo[1], dim))
                        x1 = round(a[0], 2).astype(float)
                        y1 = round(a[1], 2).astype(float)
                        x2 = round(b[0], 2).astype(float)
                        y2 = round(b[1], 2).astype(float)
                        bbox = [x1, y1, x2, y2]

                        word_data['bbox'] = bbox
                        word_data['text'] = word.value
                        words_data.append(word_data)
                    output['bbox'] = line_bbox
                    output['text'] = " ".join(sent)
                    output['words'] = words_data
                    output['page'] = pages_count
                    line_predictions.append(output)

    return line_predictions, pages_count


def get_block_predictions(document_path, layout_predictor, model, document_type):
    current_dir = os.getcwd()
    if document_type == "pdf":
        output_file = simple_counter_generator("page", ".jpg")
        current_dir = os.getcwd()
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")

        # delete the temp_output_folder
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)

        if not os.path.exists(temp_output_folder):
            os.makedirs(temp_output_folder)
        # output_file = simple_counter_generator("page", ".jpg")
        # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)

        doc = pymupdf.open(document_path)  # open document
        for page in doc:  # iterate through the pages
            pix = page.get_pixmap()  # render page to an image
            pix.save(f"{temp_output_folder}/{page.number}.png")

        images_path = sorted(os.listdir(temp_output_folder))
    else:
        images_path = [os.path.join(current_dir, document_path)]
    # print(images_path)

    block_predictions = []


    page_count = -1
    for image_path in images_path:
        page_count += 1

        if(len(images_path) > 1):
            image = Image.open(os.path.join(temp_output_folder, image_path))
        else:
            image = Image.open(os.path.join(current_dir, document_path))

        # print(image_path)
        # print(image)

        layout_predictions = layout_predictor([image])

        for block in layout_predictions[0].bboxes:
            output = {}
            bbox = [int(x) for x in block.bbox]


            cropped_image = image.crop(bbox)

            cropped_image.save(f'temp.png')
            doc = DocumentFile.from_images('temp.png')
            result = model(doc)

            text = []
            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        for word in line.words:
                            text.append(word.value)


            output['bbox'] = bbox
            output['text'] = " ".join(text)
            output['page'] = page_count
            block_predictions.append(output)

    return block_predictions

def simple_counter_generator(prefix="", suffix=""):
    while True:
        yield 'p'



# from doctr.models import ocr_predictor
# model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)


# # from transformers import pipeline
# # def load_llm_model(device):
# #     pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device=device)
# #     return pipe

# # pipe = load_llm_model("cuda")
# pipe = None

# # from surya.layout import LayoutPredictor
# # layout_predictor = LayoutPredictor()
# layout_predictor = None

# document_path = "sample.pdf"
# question = "What is the subject of the circular?"

# answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions = predict_output(document_path, question, pipe, layout_predictor, model, "Inhouse", document_type="pdf")

# print(answer)
# print(block_box_predictions)
# print(line_box_predictions)
# print(word_box_predictions)
# print(point_box_predictions)
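
As a quick illustration of the fuzzy scoring inside get_matched_regions, here is a minimal sketch using the same fuzzywuzzy calls and the same 0.8/0.2 answer/question weighting; the question terms, answer, and region strings are invented for the example:

# Sketch of the non-exact-match scoring path in get_matched_regions
# (an exact substring match short-circuits to a score of 100 instead).
# Strings are invented; weights and threshold mirror predict_output.py.
from fuzzywuzzy import fuzz

ANSWER_WEIGHT, QUESTION_WEIGHT, CUT_OFF_THRESHOLD = 0.8, 0.2, 60

question_terms = ["subject", "circular"]  # stop words already removed
answer = "rates of interest revised"
region = "Sub: Revision of Interest Rates on Small Savings Schemes"

# Answer-side score: partial + token-set ratios plus a length factor
# (region is longer than 10 chars, so the 0.3/0.5/0.2 branch applies).
partial_score = fuzz.partial_ratio(answer.lower(), region.lower())
token_score = fuzz.token_set_ratio(answer.lower(), region.lower())
length_factor = min(1.0, len(region) / min(50, len(answer)))
answer_score = partial_score * 0.3 + token_score * 0.5 + length_factor * 100 * 0.2

# Question-side score: best term-level ratios, blended 0.4/0.6.
best_partial = max(fuzz.partial_ratio(t, region.lower()) for t in question_terms)
best_token = max(fuzz.token_set_ratio(t, region.lower()) for t in question_terms)
question_score = best_partial * 0.4 + best_token * 0.6

combined = answer_score * ANSWER_WEIGHT + question_score * QUESTION_WEIGHT
print(combined, combined >= CUT_OFF_THRESHOLD)  # region kept if above threshold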
src/requirements.txt
ADDED
@@ -0,0 +1,175 @@
accelerate==1.6.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
albucore==0.0.23
albumentations==2.0.5
altair==5.5.0
annotated-types==0.7.0
anthropic==0.46.0
anyascii==0.3.2
anyio==4.9.0
asttokens==3.0.0
async-timeout==5.0.1
attrs==25.3.0
av==14.3.0
beautifulsoup4==4.13.4
blinker==1.9.0
cachetools==5.5.2
certifi==2025.1.31
cfgv==3.4.0
charset-normalizer==3.4.1
click==8.1.8
comm==0.2.2
contourpy==1.3.1
cycler==0.12.1
datasets==3.5.0
debugpy==1.8.14
decorator==5.2.1
defusedxml==0.7.1
dill==0.3.8
distlib==0.3.9
distro==1.9.0
doclayout_yolo==0.0.3
exceptiongroup==1.2.2
executing==2.2.0
filelock==3.18.0
filetype==1.2.0
fonttools==4.57.0
frozenlist==1.6.0
fsspec==2024.12.0
ftfy==6.3.1
fuzzywuzzy==0.18.0
gitdb==4.0.12
GitPython==3.1.44
google-auth==2.39.0
google-genai==1.11.0
h11==0.14.0
h5py==3.13.0
httpcore==1.0.8
httpx==0.28.1
huggingface-hub==0.30.2
identify==2.6.10
idna==3.10
ipykernel==6.29.5
ipython==8.35.0
jedi==0.19.2
Jinja2==3.1.6
jiter==0.9.0
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
jupyter_client==8.6.3
jupyter_core==5.7.2
kiwisolver==1.4.8
langdetect==1.0.9
markdown2==2.5.3
markdownify==0.13.1
marker-pdf==1.6.2
MarkupSafe==3.0.2
matplotlib==3.10.1
matplotlib-inline==0.1.7
mpmath==1.3.0
multidict==6.4.3
multiprocess==0.70.16
narwhals==1.39.1
nest-asyncio==1.6.0
networkx==3.4.2
nodeenv==1.9.1
numpy==2.2.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
openai==1.75.0
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
packaging==24.2
pandas==2.2.3
parso==0.8.4
pdf2image==1.17.0
pdftext==0.6.2
pexpect==4.9.0
pillow==10.4.0
platformdirs==4.3.7
pre_commit==4.2.0
prompt_toolkit==3.0.50
propcache==0.3.1
protobuf==6.31.0
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
py-cpuinfo==9.0.0
pyarrow==19.0.1
pyasn1==0.6.1
pyasn1_modules==0.4.2
pyclipper==1.3.0.post6
pydantic==2.11.3
pydantic-settings==2.8.1
pydantic_core==2.33.1
pydeck==0.9.1
Pygments==2.19.1
PyMuPDF==1.25.5
pyparsing==3.2.3
pypdfium2==4.30.0
pytesseract==0.3.13
python-dateutil==2.9.0.post0
python-doctr==0.11.0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
pyzmq==26.4.0
qwen-vl-utils==0.0.10
RapidFuzz==3.13.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rpds-py==0.25.0
rsa==4.9.1
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.2
seaborn==0.13.2
sentence-transformers==4.1.0
shapely==2.1.0
simsimd==6.2.1
six==1.17.0
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
stack-data==0.6.3
streamlit==1.45.1
stringzilla==3.12.3
surya-ocr==0.13.1
sympy==1.13.1
tenacity==9.1.2
thop==0.1.1.post2209072238
threadpoolctl==3.6.0
tokenizers==0.21.1
toml==0.10.2
torch==2.6.0
torchvision==0.21.0
tornado==6.4.2
tqdm==4.67.1
traitlets==5.14.3
transformers==4.51.2
triton==3.2.0
typing-inspection==0.4.0
typing_extensions==4.13.2
tzdata==2025.2
urllib3==2.4.0
virtualenv==20.30.0
watchdog==6.0.0
wcwidth==0.2.13
websockets==15.0.1
xxhash==3.5.0
yarl==1.20.0