badrivishalk committed on
Commit 30857ba · Parent: 23ca6b0

Added source code

Files changed (5)
  1. Dockerfile +5 -20
  2. src/app.py +370 -0
  3. src/config.py +2 -0
  4. src/predict_output.py +559 -0
  5. src/requirements.txt +175 -0
Dockerfile CHANGED
@@ -1,21 +1,6 @@
- FROM python:3.9-slim
-
- WORKDIR /app
-
- RUN apt-get update && apt-get install -y \
-     build-essential \
-     curl \
-     software-properties-common \
-     git \
-     && rm -rf /var/lib/apt/lists/*
-
- COPY requirements.txt ./
- COPY src/ ./src/
-
- RUN pip3 install -r requirements.txt
-
+ FROM python:3.10
+ WORKDIR /src
+ COPY . /src
+ RUN pip install -r requirements.txt
  EXPOSE 8501
-
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ CMD ["streamlit", "run", "main/demo/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
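Two things worth flagging in this rewrite. First, the new CMD points at main/demo/app.py while this commit adds src/app.py, so one of the two paths is presumably stale. Second, the HEALTHCHECK is dropped, and the python:3.10 base image does not ship curl, so the old curl-based probe would fail anyway. If a probe is still wanted, a stdlib-only Python stand-in could back a `HEALTHCHECK CMD python healthcheck.py` line; this is a sketch under that assumption, not part of the commit:

```python
# healthcheck.py -- hypothetical probe, not part of this commit.
# Exits 0 when Streamlit's internal health endpoint answers with 200,
# 1 otherwise, so it can back: HEALTHCHECK CMD python healthcheck.py
import sys
import urllib.request

try:
    with urllib.request.urlopen("http://localhost:8501/_stcore/health", timeout=5) as resp:
        sys.exit(0 if resp.status == 200 else 1)
except Exception:
    sys.exit(1)
```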
 
 
 
src/app.py ADDED
@@ -0,0 +1,370 @@
+ import streamlit as st
+ import time
+ import os
+ import shutil
+ import pymupdf
+ import json
+
+ st.set_page_config(
+     page_title="MGVG Grounding Demo",
+     layout="wide",
+     initial_sidebar_state="expanded",
+     page_icon="logo.png"
+ )
+
+ # --- Simple Authentication ---
+
+ # Valid credentials for the demo
+ VALID_USERS = {
+     "iitb": "iitb123",
+     "badri": "badri123"
+ }
+
+ def login():
+     # Set a professional background for the whole app
+     st.markdown(
+         '''
+         <style>
+         body, .stApp {
+             background: linear-gradient(120deg, #e0eafc 0%, #cfdef3 100%) !important;
+         }
+         .login-box {
+             background: #fff;
+             padding: 2.5em 2em 2em 2em;
+             border-radius: 16px;
+             box-shadow: 0 4px 24px rgba(80, 120, 200, 0.12);
+             min-width: 320px;
+             max-width: 90vw;
+             margin: auto;
+         }
+         </style>
+         ''', unsafe_allow_html=True
+     )
+     # Center the login box using columns
+     col1, col2, col3 = st.columns([1, 2, 1])
+     with col2:
+         # st.markdown('<div class="login-box">', unsafe_allow_html=True)
+         # Logo centered above the form
+         st.image("logo.png", width=800, use_container_width=False)
+         st.markdown('<h2 style="text-align:center; color:#2b6cb0; margin-bottom:1.5em;">🔒 Please log in to access the app</h2>', unsafe_allow_html=True)
+         username = st.text_input("Username", key="login_username")
+         password = st.text_input("Password", type="password", key="login_password")
+         login_btn = st.button("Login")
+         if login_btn:
+             if username in VALID_USERS and VALID_USERS[username] == password:
+                 st.session_state["authenticated"] = True
+                 st.success("Login successful!")
+                 st.session_state["show_continue"] = True
+             else:
+                 st.error("Invalid username or password")
+         if st.session_state.get("show_continue", False):
+             if st.button("Continue to App"):
+                 st.session_state["show_continue"] = False
+                 if hasattr(st, "experimental_rerun"):
+                     st.experimental_rerun()
+         st.markdown('</div>', unsafe_allow_html=True)
+
+ if "authenticated" not in st.session_state:
+     st.session_state["authenticated"] = False
+
+ if not st.session_state["authenticated"]:
+     login()
+     st.stop()
+ # --- End Authentication ---
+
+ # st.image("logo.png", width=250)
+
+ from PIL import Image, ImageDraw
+ import io
+ # from st_audiorec import st_audiorec
+
+ from surya.layout import LayoutPredictor
+ from doctr.models import ocr_predictor
+ from transformers import pipeline
+
+ @st.cache_resource
+ def get_layout_predictor():
+     return LayoutPredictor()
+
+ @st.cache_resource
+ def get_ocr_model():
+     return ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
+
+ @st.cache_resource
+ def get_llm_model(device):
+     return pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device=device)
+
+ from predict_output import predict_output
+
+ layout_predictor = get_layout_predictor()
+ model = get_ocr_model()
+ pipe = get_llm_model("cuda")
+
+ print("Models loaded")
+
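`get_llm_model` is called with a hardcoded "cuda", so the app will not start on a CPU-only machine. A device fallback would be a one-line change; a sketch, not part of the commit (torch is importable here, since the pinned requirements install it):

```python
# Hypothetical fallback, not in this commit: prefer the GPU but
# still start on CPU-only hosts.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = get_llm_model(device)
```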
+ # --- Placeholder function for demo ---
+ def get_corresponding_bboxes(image, question):
+     # Returns dummy bounding boxes and a canned answer for the demo.
+     # Each bbox: (x1, y1, x2, y2)
+     w, h = image.size
+     block_bboxes = [(w//8, h//8, w//2, h//2)]
+     line_bboxes = [(w//4, h//4, w//2, h//3)]
+     word_bboxes = [(w//3, h//3, w//2, h//2)]
+     point_bboxes = [(w//2, h//2, w//2+5, h//2+5)]
+     answer = "This is a demo answer."
+     return block_bboxes, line_bboxes, word_bboxes, point_bboxes, answer
+
+ # --- Helpers to draw bboxes and points ---
+ def draw_bboxes(image, bboxes, color):
+     img = image.copy()
+     # Outline width proportional to the image size
+     width = int(img.width / 100)
+     draw = ImageDraw.Draw(img)
+     for bbox in bboxes:
+         draw.rectangle(bbox, outline=color, width=width)
+     return img
+
+ def draw_points(image, bboxes, color):
+     img = image.copy()
+     draw = ImageDraw.Draw(img)
+     for bbox in bboxes:
+         cx, cy = bbox[0], bbox[1]
+         # Radius relative to the image size; the ellipse is filled,
+         # so no outline width is needed
+         r = int(img.width / 100)
+         draw.ellipse((cx - r, cy - r, cx + r, cy + r), outline=color, fill=color)
+     return img
+
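For reference, the drawing helpers operate on plain PIL images, so they can be exercised outside Streamlit; a minimal, self-contained example (canvas size and box values are arbitrary):

```python
from PIL import Image

# Blank 400x300 canvas: one block-level box plus its center point.
canvas = Image.new("RGB", (400, 300), "white")
boxed = draw_bboxes(canvas, [(50, 40, 200, 120)], color="#4F8BF9")
dotted = draw_points(boxed, [(125, 80)], color="#FFFF00")
dotted.save("preview.png")
```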
+ # model_type = st.sidebar.checkbox("Use LLM Model", value=False)
+ # model_type = "llm" if model_type else "inhouse"
+
+ st.markdown("""
+ <style>
+ .main {
+     background: linear-gradient(135deg, #f8fafc 0%, #e0e7ef 100%);
+ }
+ .block-container {
+     padding-top: 2rem;
+     padding-bottom: 2rem;
+ }
+ .stButton>button {
+     background-color: #4F8BF9;
+     color: white;
+     border-radius: 8px;
+     font-size: 1.1rem;
+     padding: 0.5em 2em;
+ }
+ .stTextInput>div>input {
+     border-radius: 8px;
+     border: 1px solid #4F8BF9;
+ }
+ .stFileUploader>div>div {
+     border-radius: 8px;
+     border: 2px dashed #4F8BF9;
+ }
+ .stAudio>audio {
+     width: 100% !important;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ col_logo, col_title = st.columns([1, 8])
+ with col_logo:
+     st.image("logo.png", width=180)
+ with col_title:
+     st.markdown("<h1 style='margin-bottom: 0;'>MGVG - Multi-Granular Visual Grounding</h1>", unsafe_allow_html=True)
+
+ # List of quotes (HTML formatted)
+ QUOTES = [
+     '''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
+     "प्रत्यक्षं किं प्रमाणं?" <span style="font-size:0.9em; color:#444;">(<i>What better proof is there than direct perception?</i>)</span>
+     </div>''',
+     '''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
+     <i>"Truth is not told—it is seen."</i>
+     </div>'''
+ ]
+
+ # Initialize session state for the quote index and last update time
+ if "quote_index" not in st.session_state:
+     st.session_state.quote_index = 0
+     st.session_state.last_quote_time = time.time()
+
+ # Rotate to the next quote every 5 seconds
+ if time.time() - st.session_state.last_quote_time > 5:
+     st.session_state.quote_index = (st.session_state.quote_index + 1) % len(QUOTES)
+     st.session_state.last_quote_time = time.time()
+     # Rerun the app to update the quote
+     if hasattr(st, "experimental_rerun"):
+         st.experimental_rerun()
+
+ # Display the current quote
+ st.markdown(QUOTES[st.session_state.quote_index], unsafe_allow_html=True)
+
+ col1, col2 = st.columns([1, 2])
+
+ with col1:
+     st.subheader("1. Upload an image or PDF document")
+     image = "Not Uploaded"
+     uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg", "pdf"])
+     if uploaded_file:
+         current_dir = os.getcwd()
+         temp_output_folder = os.path.join(current_dir, "temp_output_folder/")
+         # Delete any stale temp_output_folder
+         if os.path.exists(temp_output_folder):
+             shutil.rmtree(temp_output_folder)
+
+         document_type = "image"
+         if uploaded_file.type == "application/pdf":
+             # Save the uploaded file to a temp file
+             temp_file_path = os.path.join(current_dir, "temp_file.pdf")
+             if os.path.exists(temp_file_path):
+                 os.remove(temp_file_path)
+
+             with open(temp_file_path, "wb") as f:
+                 f.write(uploaded_file.getbuffer())
+
+             if not os.path.exists(temp_output_folder):
+                 os.makedirs(temp_output_folder)
+             # output_file = simple_counter_generator("page", ".jpg")
+             # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)
+
+             pages = 0
+             doc = pymupdf.open(temp_file_path)  # open the document
+             for page in doc:  # iterate through the pages
+                 pages += 1
+                 pix = page.get_pixmap()  # render the page to an image
+                 pix.save(f"{temp_output_folder}/{page.number}.png")
+
+             if pages == 1:
+                 document_type = "image"
+                 document_path = os.path.join(temp_output_folder, "0.png")
+                 uploaded_file = os.path.join(temp_output_folder, "0.png")
+                 image = Image.open(uploaded_file).convert("RGB")
+             else:
+                 document_type = "pdf"
+                 # image = Image.open(uploaded_file).convert("RGB")
+
+         if document_type == "image":
+             image = Image.open(uploaded_file).convert("RGB")
+             st.image(image, caption="Uploaded Image", use_container_width=True)
+             # Save the uploaded image to a temp file for predict_output
+             temp_file_path = "sample.png"
+             image.save(temp_file_path)
+         else:
+             document_type = "pdf"
+             document_path = uploaded_file.name
+             image = "Uploaded PDF"
+             # st.image(uploaded_file, caption="Uploaded PDF", use_container_width=True)
+     else:
+         image = "Not Uploaded"
+         temp_output_folder = None
+         st.image("https://placehold.co/400x300?text=Upload+Image", caption="Uploaded Image", use_container_width=True)
+
+     st.subheader("2. Ask a question")
+     question = st.text_input("Type your question here")
+
+     # Radio button for model selection
+     model_type = st.radio(
+         "Select Model Type:",
+         options=["MGVG", "IndoDocs"],
+         index=1,
+         horizontal=True
+     )
+
+     run_demo = st.button("Run Grounding Demo", use_container_width=True)
+
+ # --- Output placeholders ---
+ with col2:
+     st.subheader("3. Visual Grounding Outputs")
+     if image != "Not Uploaded" and question:
+         print(image)
+         print(question)
+     if run_demo and image != "Not Uploaded" and question:
+         # Use the text input only
+         q = question
+         answer, block_bboxes, line_bboxes, word_bboxes, point_bboxes, current_page = predict_output(
+             temp_file_path, q, pipe, layout_predictor, model, model_type, document_type
+         )
+
+         print(answer)
+
+         if current_page != -1:
+             # For PDFs, reload the page that the answer was grounded on
+             image = Image.open(os.path.join(temp_output_folder, f"{current_page}.png")).convert("RGB")
+             print("--------------------------------")
+             print(image)
+
+         block_img = draw_bboxes(image, block_bboxes, color="#4F8BF9")
+         line_img = draw_bboxes(image, line_bboxes, color="#F97B4F")
+         word_img = draw_bboxes(image, word_bboxes, color="#4FF9B2")
+         point_img = draw_points(image, point_bboxes, color="#FFFF00")
+         imgs = [block_img, line_img, word_img, point_img]
+         labels = ["Block Level", "Line Level", "Word Level", "Point Level"]
+         cols = st.columns(4)
+         for i, (img, label) in enumerate(zip(imgs, labels)):
+             with cols[i]:
+                 st.image(img, caption=label, use_container_width=True)
+         answer_lines = answer.splitlines()
+         st.markdown("""
+         <div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
+             <h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
+             <p style='font-size: 1.2em; color: #222;'>""" + "<br>".join(answer_lines) + """</p>
+         </div>
+         """, unsafe_allow_html=True)
+
+         # --- Centered Save Results button ---
+         result_data = {
+             "question": q,
+             "answer": answer,
+             "block_bboxes": block_bboxes,
+             "line_bboxes": line_bboxes,
+             "word_bboxes": word_bboxes,
+             "point_bboxes": point_bboxes,
+             "current_page": current_page
+         }
+         json_str = json.dumps(result_data, indent=2)
+         col_left, col_center, col_right = st.columns([2, 3, 2])
+         with col_center:
+             st.download_button(
+                 label="Save Results as JSON",
+                 data=json_str,
+                 file_name="grounding_results.json",
+                 mime="application/json"
+             )
+     else:
+         st.markdown("""
+         <div style='display: flex; gap: 2em; flex-wrap: wrap;'>
+             <div style='flex: 1; min-width: 220px;'>
+                 <img src='https://placehold.co/220x180?text=Block+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
+                 <p style='text-align:center; font-weight:600;'>Block Level</p>
+             </div>
+             <div style='flex: 1; min-width: 220px;'>
+                 <img src='https://placehold.co/220x180?text=Line+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
+                 <p style='text-align:center; font-weight:600;'>Line Level</p>
+             </div>
+             <div style='flex: 1; min-width: 220px;'>
+                 <img src='https://placehold.co/220x180?text=Word+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
+                 <p style='text-align:center; font-weight:600;'>Word Level</p>
+             </div>
+             <div style='flex: 1; min-width: 220px;'>
+                 <img src='https://placehold.co/220x180?text=Point+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
+                 <p style='text-align:center; font-weight:600;'>Point Level</p>
+             </div>
+         </div>
+         <br>
+         <div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
+             <h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
+             <p style='font-size: 1.2em; color: #222;'>[Answer will appear here]</p>
+         </div>
+         """, unsafe_allow_html=True)
src/config.py ADDED
@@ -0,0 +1,2 @@
+ MODEL1 = "MGVG"
+ MODEL2 = "IndoDocs"
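These two constants duplicate the strings that app.py hardcodes in its model-type radio button; nothing in this commit imports config.py. Wiring it in would look like the sketch below (an assumption about intent, not code from the commit):

```python
# Hypothetical use in src/app.py, replacing the hardcoded options list.
from config import MODEL1, MODEL2

model_type = st.radio(
    "Select Model Type:",
    options=[MODEL1, MODEL2],
    index=1,
    horizontal=True
)
```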
src/predict_output.py ADDED
@@ -0,0 +1,559 @@
+ import os
+ import shutil
+ from fuzzywuzzy import fuzz
+ from tqdm import tqdm
+ from PIL import Image
+ import requests
+ # from surya.layout import LayoutPredictor
+
+ from doctr.io import DocumentFile
+ from pdf2image import convert_from_path
+ import pymupdf
+ # from doctr.models import ocr_predictor
+ import numpy as np
+ from time import time
+
+ pipe = None
+ layout_predictor = None
+
+ MAX_BLOCK_MATCHES = 2
+ MAX_LINE_MATCHES = 5
+ CUT_OFF_THRESHOLD = 60
+ QUESTION_WEIGHT = 0.2
+ ANSWER_WEIGHT = 0.8
+ LEVEL = "line"
+
+ jpg_options = {
+     "quality": 100,
+     "progressive": True,
+     "optimize": False
+ }
+
+ stop_words = {'what', 'is', 'the', 'this', 'that', 'these', 'those', 'which', 'how', 'why', 'where', 'when', 'who', 'will', 'be', 'and', 'or', 'in', 'at', 'to', 'for', 'of', 'with', 'by'}
+
+ def longest_consecutive_range(indices):
+     # Return the longest run of consecutive integers in `indices`.
+     if not indices:
+         return []
+
+     indices = sorted(set(indices))
+     longest = []
+     current = [indices[0]]
+
+     for i in range(1, len(indices)):
+         if indices[i] == indices[i - 1] + 1:
+             current.append(indices[i])
+         else:
+             if len(current) > len(longest):
+                 longest = current
+             current = [indices[i]]
+
+     if len(current) > len(longest):
+         longest = current
+
+     return longest
+
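Illustration of the helper above: it keeps only the longest run of consecutive indices, so isolated single-word hits are discarded when word boxes are selected.

```python
# [2, 9, 3, 4, 10] -> runs [2, 3, 4] and [9, 10]; the longer run wins.
print(longest_consecutive_range([2, 9, 3, 4, 10]))  # [2, 3, 4]
```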
+ def get_word_level_matches(answer_text, top_k_matches):
+     # Collect word boxes whose text occurs in the answer, keeping only
+     # the longest consecutive run of hits per matched line.
+     bboxes = []
+     for match in top_k_matches:
+         indices = []
+         for index, word in enumerate(match['words']):
+             if word['text'].lower() in answer_text.lower():
+                 indices.append(index)
+         longest_indices = longest_consecutive_range(indices)
+         for index in longest_indices:
+             bboxes.append(match['words'][index]['bbox'])
+     return bboxes
+
+ def get_matched_regions(question_text, target_text, predictions, level):
+     question_terms = [word.lower() for word in question_text.split() if word.lower() not in stop_words]
+     matched_regions = []
+     scored_regions = []  # every region with its score, for the fallback below
+     for region in predictions:
+         region_text = region['text']
+         region_copy = region.copy()
+
+         if target_text.lower() in region_text.lower():
+             region_copy['match_score'] = 100
+             region_copy['match_details'] = {
+                 'exact_match': True,
+                 'answer_score': 100,
+                 'question_score': 100
+             }
+             matched_regions.append(region_copy)
+             scored_regions.append(region_copy)
+             continue
+
+         partial_score = fuzz.partial_ratio(target_text.lower(), region_text.lower())
+         token_score = fuzz.token_set_ratio(target_text.lower(), region_text.lower())
+
+         # Length factor (prefer longer matches that contain meaningful content)
+         target_len = len(target_text)
+         region_len = len(region_text)
+         length_factor = min(1.0, region_len / min(50, target_len))  # cap at 1.0, adapt to the target length
+
+         # Combine answer scores: more weight on token matching for longer
+         # texts, more on partial matching for shorter ones
+         if region_len > 10:
+             answer_score = (partial_score * 0.3) + (token_score * 0.5) + (length_factor * 100 * 0.2)
+         else:
+             answer_score = (partial_score * 0.3) + (token_score * 0.4) + (length_factor * 100 * 0.3)
+             if region_len < 5 and partial_score < 100:
+                 answer_score *= 0.5  # penalize very short inexact matches
+
+         # Penalize shorter region texts
+         if region_len < 5:
+             answer_score *= 0.5
+
+         # Fuzzy match scores for the question terms, using both methods
+         partial_question_scores = [fuzz.partial_ratio(term, region_text.lower()) for term in question_terms]
+         token_question_scores = [fuzz.token_set_ratio(term, region_text.lower()) for term in question_terms]
+
+         best_partial_question = max(partial_question_scores) if partial_question_scores else 0
+         best_token_question = max(token_question_scores) if token_question_scores else 0
+
+         question_score = (best_partial_question * 0.4) + (best_token_question * 0.6)
+
+         # Combine scores (giving more weight to answer matches)
+         combined_score = (answer_score * ANSWER_WEIGHT) + (question_score * QUESTION_WEIGHT)
+
+         region_copy['match_score'] = combined_score
+         region_copy['match_details'] = {
+             'exact_match': False,
+             'answer_score': answer_score,
+             'question_score': question_score,
+             'answer_weight': ANSWER_WEIGHT,
+             'question_weight': QUESTION_WEIGHT
+         }
+         scored_regions.append(region_copy)
+         if combined_score >= CUT_OFF_THRESHOLD:
+             matched_regions.append(region_copy)
+
+     matched_regions.sort(key=lambda x: x['match_score'], reverse=True)
+
+     # If nothing cleared the threshold, relax it by 20 and keep only the
+     # single top match (drawn from scored_regions; filtering the empty
+     # matched_regions list here, as before, could never recover anything)
+     if not matched_regions:
+         new_threshold = max(CUT_OFF_THRESHOLD - 20, 0)  # prevent a negative threshold
+         matched_regions = [region for region in scored_regions if region['match_score'] >= new_threshold]
+         matched_regions.sort(key=lambda x: x['match_score'], reverse=True)
+         if matched_regions:
+             matched_regions = [matched_regions[0]]
+
+     if level == "block":
+         top_matches = matched_regions[:MAX_BLOCK_MATCHES]
+     elif level == "line":
+         top_matches = matched_regions[:MAX_LINE_MATCHES]
+     else:
+         top_matches = matched_regions
+     return top_matches
+
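The scoring above leans on two fuzzywuzzy measures with different strengths: `partial_ratio` rewards contiguous substring overlap, while `token_set_ratio` rewards shared vocabulary regardless of order. A toy illustration (strings invented for this example; exact scores can vary with the fuzzywuzzy version):

```python
from fuzzywuzzy import fuzz

target = "net profit of rs. 42 crore"
region = "The company reported a net profit of Rs. 42 crore in FY23."

# Both scores are 0-100; here the target is a verbatim substring of the
# region once lowercased, so partial_ratio saturates at 100.
print(fuzz.partial_ratio(target.lower(), region.lower()))
print(fuzz.token_set_ratio(target.lower(), region.lower()))
```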
+ def get_processed_text_for_llm(block_predictions, gap):
+     # Concatenate the recognized text of every block, separated by `gap`.
+     final_text = ""
+     for block_data in block_predictions:
+         final_text += block_data['text'] + gap
+     return final_text
+
+
+ def get_page_number(block_bboxes):
+     # Majority vote: the page that owns the most matched blocks wins.
+     pages = {}
+     for block in block_bboxes:
+         if block['page'] not in pages:
+             pages[block['page']] = 1
+         else:
+             pages[block['page']] += 1
+
+     print(pages)
+     max_page = max(pages, key=pages.get)
+     return max_page
+
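For example, the page vote behaves like this:

```python
# Three matched blocks, two of them on page 2 -> page 2 wins the vote.
blocks = [{'page': 0}, {'page': 2}, {'page': 2}]
print(get_page_number(blocks))  # 2
```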
+ def predict_output(document_path, question, pipe, layout_predictor, model, model_type, document_type="image"):
+     predicted_answer = None
+     block_box_predictions = None
+     line_box_predictions = None
+     word_box_predictions = None
+     point_box_predictions = None
+
+     curr_time = time()
+     line_predictions, pages_count = get_line_predictions(document_path, model, document_type)
+     line_time = time()
+     print(f"Done with line predictions in {line_time - curr_time} seconds")
+     curr_time = time()
+     # For short PDFs run full layout analysis; otherwise reuse the line
+     # predictions as block predictions to save time
+     if document_type == "pdf" and pages_count < 3:
+         block_predictions = get_block_predictions(document_path, layout_predictor, model, document_type)
+         gap = '\n\n\n'
+     else:
+         block_predictions = line_predictions
+         gap = '\n'
+     block_time = time()
+     print(f"Done with block predictions in {block_time - line_time} seconds")
+
+     curr_time = time()
+     if model_type == "MGVG" or document_type == "pdf":
+         processed_text_for_llm = get_processed_text_for_llm(block_predictions, gap=gap)
+         predicted_answer = generate_llm_answer(question, processed_text_for_llm, pipe)
+     elif model_type == "IndoDocs":
+         predicted_answer = generate_via_inhouse_model_answer(question, document_path)
+     llm_time = time()
+     print(f"Done with LLM in {llm_time - curr_time} seconds")
+
+     print("LLM Answer: ", predicted_answer)
+
+     total_algo_time = time()
+
+     curr_time = time()
+     line_matches = get_matched_regions(question, predicted_answer, line_predictions, "line")
+     block_bboxes = get_matched_regions(question, predicted_answer, block_predictions, "block")
+     match_time = time()
+     print(f"Done with match in {match_time - curr_time} seconds")
+
+     if document_type == "pdf":
+         current_page = get_page_number(block_bboxes)
+     else:
+         current_page = -1
+
+     if current_page != -1:
+         predicted_answer = "Answer predicted from page: " + str(current_page + 1) + "\n" + predicted_answer
+
+     block_box_predictions = []
+     for match in block_bboxes:
+         block_box_predictions.append(match['bbox'])
+
+     line_box_predictions = []
+     for match in line_matches:
+         if current_page == -1 or match['page'] == current_page:
+             line_box_predictions.append(match['bbox'])
+
+     curr_time = time()
+     word_box_predictions = get_word_level_matches(predicted_answer, top_k_matches=line_matches)
+     word_time = time()
+     print(f"Done with word in {word_time - curr_time} seconds")
+
+     curr_time = time()
+     point_box_predictions = get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions)
+     point_time = time()
+     print(f"Done with point in {point_time - curr_time} seconds")
+
+     print(f"Total algo time: {time() - total_algo_time} seconds")
+
+     return predicted_answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions, current_page
+
+ def calculate_midpoint_of_bboxes(bboxes):
+     # Midpoint of the combined extent of all boxes.
+     if not bboxes:
+         return None
+
+     # Convert to numpy array for easier manipulation
+     bboxes = np.array(bboxes)
+
+     # Extreme points of all bboxes combined
+     min_x = np.min(bboxes[:, 0])
+     min_y = np.min(bboxes[:, 1])
+     max_x = np.max(bboxes[:, 2])
+     max_y = np.max(bboxes[:, 3])
+
+     # Calculate the midpoint
+     midpoint_x = (min_x + max_x) / 2
+     midpoint_y = (min_y + max_y) / 2
+
+     return round(midpoint_x, 2), round(midpoint_y, 2)
+
+
+ def get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions):
+     point_box_predictions = []
+
+     if len(block_box_predictions) == 1:
+         try:
+             x, y = calculate_midpoint_of_bboxes(block_box_predictions)
+             point_box_predictions = [[x, y]]
+         except Exception:
+             try:
+                 x, y = calculate_midpoint_of_bboxes(line_box_predictions)
+                 point_box_predictions = [[x, y]]
+             except Exception:
+                 point_box_predictions = []
+     else:
+         points = []
+         for block_bbox in block_box_predictions:
+             try:
+                 # Wrap the single flat bbox in a list; calculate_midpoint_of_bboxes
+                 # expects a list of boxes (a bare bbox would raise on indexing)
+                 x, y = calculate_midpoint_of_bboxes([block_bbox])
+                 points.append([x, y])
+             except Exception:
+                 continue
+         point_box_predictions = points
+
+     return point_box_predictions
+
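`calculate_midpoint_of_bboxes` reduces a set of boxes to the center of their combined extent, which is what the point-level output visualizes; for example:

```python
# Union of the two boxes spans x: 0..30, y: 0..10, so the
# midpoint is x=15.0, y=5.0.
print(calculate_midpoint_of_bboxes([[0, 0, 10, 10], [20, 0, 30, 10]]))
```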
+ def generate_via_inhouse_model_answer(question, image_path, api_key="VISION-TEAM", max_tokens=512, temperature=0.7, endpoint="http://103.207.148.38:9000/api/v1/chat/upload"):
+     headers = {
+         "x-api-key": api_key  # or whatever the Swagger UI says
+     }
+
+     data = {
+         "text": question,
+         "max_tokens": str(max_tokens),
+         "temperature": str(temperature)
+     }
+
+     try:
+         # Open the image in a context manager so the handle is closed
+         with open(image_path, "rb") as image_file:
+             files = {"image": image_file}
+             response = requests.post(endpoint, headers=headers, files=files, data=data)
+         response.raise_for_status()
+         result = response.json()
+     except requests.exceptions.RequestException as e:
+         return {"error": str(e)}
+
+     return result['response']['choices'][0]['message']['content']
+
+ def generate_llm_answer(question, context, pipe):
+     prompt = f"""You are given a question and context. Your task is to find and return the best possible answer to the question using only the context as it is.
+ Do not generate summaries, paraphrased content, or any additional explanations, including any preamble or postamble.
+ Return only the exact phrase or sentence fragment from the context that answers the question.
+ If the answer is not found in the context, return: Answer not found in context.
+
+ Question: {question}
+ Context: {context}
+ Answer:
+ """
+
+     messages = [{"role": "user", "content": prompt}]
+     result = pipe(messages, max_new_tokens=512, do_sample=True, temperature=0.7)
+     # With chat-style input the pipeline echoes the message list; the
+     # assistant reply is the second message
+     ans = result[0]["generated_text"][1]['content']
+     return ans
+
+ def get_line_predictions(document_path, model, document_type):
+     current_dir = os.getcwd()
+     if document_type == "pdf":
+         output_file = simple_counter_generator("page", ".jpg")
+         temp_output_folder = os.path.join(current_dir, "temp_output_folder/")
+
+         # Delete any stale temp_output_folder, then recreate it
+         if os.path.exists(temp_output_folder):
+             shutil.rmtree(temp_output_folder)
+         if not os.path.exists(temp_output_folder):
+             os.makedirs(temp_output_folder)
+         # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)
+
+         doc = pymupdf.open(document_path)  # open the document
+         for page in doc:  # iterate through the pages
+             pix = page.get_pixmap()  # render the page to an image
+             pix.save(f"{temp_output_folder}/{page.number}.png")
+
+         images_path = sorted(os.listdir(temp_output_folder))
+     else:
+         images_path = [os.path.join(current_dir, document_path)]
+     print(images_path)
+
+     line_predictions = []
+
+     pages_count = -1
+     for image_path in images_path:
+         pages_count += 1
+
+         if len(images_path) > 1:
+             doc = DocumentFile.from_images(os.path.join(temp_output_folder, image_path))
+         else:
+             doc = DocumentFile.from_images(image_path)
+
+         result = model(doc)
+         for page in result.pages:
+             # doctr geometries are relative; scale by (width, height)
+             dim = tuple(reversed(page.dimensions))
+             for block in page.blocks:
+                 for line in block.lines:
+                     output = {}
+                     geo = line.geometry
+                     a = [p * d for p, d in zip(geo[0], dim)]
+                     b = [p * d for p, d in zip(geo[1], dim)]
+                     x1 = float(round(a[0], 2))
+                     y1 = float(round(a[1], 2))
+                     x2 = float(round(b[0], 2))
+                     y2 = float(round(b[1], 2))
+                     line_bbox = [x1, y1, x2, y2]
+
+                     sent = []
+                     words_data = []
+                     for word in line.words:
+                         word_data = {}
+                         sent.append(word.value)
+                         geo = word.geometry
+                         a = [p * d for p, d in zip(geo[0], dim)]
+                         b = [p * d for p, d in zip(geo[1], dim)]
+                         x1 = float(round(a[0], 2))
+                         y1 = float(round(a[1], 2))
+                         x2 = float(round(b[0], 2))
+                         y2 = float(round(b[1], 2))
+                         bbox = [x1, y1, x2, y2]
+
+                         word_data['bbox'] = bbox
+                         word_data['text'] = word.value
+                         words_data.append(word_data)
+                     output['bbox'] = line_bbox
+                     output['text'] = " ".join(sent)
+                     output['words'] = words_data
+                     output['page'] = pages_count
+                     line_predictions.append(output)
+
+     return line_predictions, pages_count
+
+ def get_block_predictions(document_path, layout_predictor, model, document_type):
+     current_dir = os.getcwd()
+     if document_type == "pdf":
+         output_file = simple_counter_generator("page", ".jpg")
+         temp_output_folder = os.path.join(current_dir, "temp_output_folder/")
+
+         # Delete any stale temp_output_folder, then recreate it
+         if os.path.exists(temp_output_folder):
+             shutil.rmtree(temp_output_folder)
+         if not os.path.exists(temp_output_folder):
+             os.makedirs(temp_output_folder)
+         # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)
+
+         doc = pymupdf.open(document_path)  # open the document
+         for page in doc:  # iterate through the pages
+             pix = page.get_pixmap()  # render the page to an image
+             pix.save(f"{temp_output_folder}/{page.number}.png")
+
+         images_path = sorted(os.listdir(temp_output_folder))
+     else:
+         images_path = [os.path.join(current_dir, document_path)]
+
+     block_predictions = []
+
+     page_count = -1
+     for image_path in images_path:
+         page_count += 1
+
+         if len(images_path) > 1:
+             image = Image.open(os.path.join(temp_output_folder, image_path))
+         else:
+             image = Image.open(os.path.join(current_dir, document_path))
+
+         layout_predictions = layout_predictor([image])
+
+         # `layout_block` avoids shadowing the doctr `block` loop below
+         for layout_block in layout_predictions[0].bboxes:
+             output = {}
+             bbox = [int(x) for x in layout_block.bbox]
+
+             # OCR the cropped layout region with doctr
+             cropped_image = image.crop(bbox)
+             cropped_image.save('temp.png')
+             doc = DocumentFile.from_images('temp.png')
+             result = model(doc)
+
+             text = []
+             for page in result.pages:
+                 for block in page.blocks:
+                     for line in block.lines:
+                         for word in line.words:
+                             text.append(word.value)
+
+             output['bbox'] = bbox
+             output['text'] = " ".join(text)
+             output['page'] = page_count
+             block_predictions.append(output)
+
+     return block_predictions
+
+ def simple_counter_generator(prefix="", suffix=""):
+     # Note: currently ignores its arguments and yields a constant name;
+     # only referenced by the commented-out convert_from_path calls above.
+     while True:
+         yield 'p'
+
+
+ # from doctr.models import ocr_predictor
+ # model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
+
+ # from transformers import pipeline
+ # def load_llm_model(device):
+ #     pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device=device)
+ #     return pipe
+
+ # pipe = load_llm_model("cuda")
+ # pipe = None
+
+ # from surya.layout import LayoutPredictor
+ # layout_predictor = LayoutPredictor()
+ # layout_predictor = None
+
+ # document_path = "sample.pdf"
+ # question = "What is the subject of the circular?"
+
+ # answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions = predict_output(document_path, question, pipe, layout_predictor, model, "Inhouse", document_type="pdf")
+
+ # print(answer)
+ # print(block_box_predictions)
+ # print(line_box_predictions)
+ # print(word_box_predictions)
+ # print(point_box_predictions)
src/requirements.txt ADDED
@@ -0,0 +1,175 @@
+ accelerate==1.6.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.11.18
+ aiosignal==1.3.2
+ albucore==0.0.23
+ albumentations==2.0.5
+ altair==5.5.0
+ annotated-types==0.7.0
+ anthropic==0.46.0
+ anyascii==0.3.2
+ anyio==4.9.0
+ asttokens==3.0.0
+ async-timeout==5.0.1
+ attrs==25.3.0
+ av==14.3.0
+ beautifulsoup4==4.13.4
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.1.31
+ cfgv==3.4.0
+ charset-normalizer==3.4.1
+ click==8.1.8
+ comm==0.2.2
+ contourpy==1.3.1
+ cycler==0.12.1
+ datasets==3.5.0
+ debugpy==1.8.14
+ decorator==5.2.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ distlib==0.3.9
+ distro==1.9.0
+ doclayout_yolo==0.0.3
+ exceptiongroup==1.2.2
+ executing==2.2.0
+ filelock==3.18.0
+ filetype==1.2.0
+ fonttools==4.57.0
+ frozenlist==1.6.0
+ fsspec==2024.12.0
+ ftfy==6.3.1
+ fuzzywuzzy==0.18.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ google-auth==2.39.0
+ google-genai==1.11.0
+ h11==0.14.0
+ h5py==3.13.0
+ httpcore==1.0.8
+ httpx==0.28.1
+ huggingface-hub==0.30.2
+ identify==2.6.10
+ idna==3.10
+ ipykernel==6.29.5
+ ipython==8.35.0
+ jedi==0.19.2
+ Jinja2==3.1.6
+ jiter==0.9.0
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2025.4.1
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ kiwisolver==1.4.8
+ langdetect==1.0.9
+ markdown2==2.5.3
+ markdownify==0.13.1
+ marker-pdf==1.6.2
+ MarkupSafe==3.0.2
+ matplotlib==3.10.1
+ matplotlib-inline==0.1.7
+ mpmath==1.3.0
+ multidict==6.4.3
+ multiprocess==0.70.16
+ narwhals==1.39.1
+ nest-asyncio==1.6.0
+ networkx==3.4.2
+ nodeenv==1.9.1
+ numpy==2.2.4
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ openai==1.75.0
+ opencv-python==4.11.0.86
+ opencv-python-headless==4.11.0.86
+ packaging==24.2
+ pandas==2.2.3
+ parso==0.8.4
+ pdf2image==1.17.0
+ pdftext==0.6.2
+ pexpect==4.9.0
+ pillow==10.4.0
+ platformdirs==4.3.7
+ pre_commit==4.2.0
+ prompt_toolkit==3.0.50
+ propcache==0.3.1
+ protobuf==6.31.0
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ py-cpuinfo==9.0.0
+ pyarrow==19.0.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pyclipper==1.3.0.post6
+ pydantic==2.11.3
+ pydantic-settings==2.8.1
+ pydantic_core==2.33.1
+ pydeck==0.9.1
+ Pygments==2.19.1
+ PyMuPDF==1.25.5
+ pyparsing==3.2.3
+ pypdfium2==4.30.0
+ pytesseract==0.3.13
+ python-dateutil==2.9.0.post0
+ python-doctr==0.11.0
+ python-dotenv==1.1.0
+ pytz==2025.2
+ PyYAML==6.0.2
+ pyzmq==26.4.0
+ qwen-vl-utils==0.0.10
+ RapidFuzz==3.13.0
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ rpds-py==0.25.0
+ rsa==4.9.1
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.2
+ seaborn==0.13.2
+ sentence-transformers==4.1.0
+ shapely==2.1.0
+ simsimd==6.2.1
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soupsieve==2.7
+ stack-data==0.6.3
+ streamlit==1.45.1
+ stringzilla==3.12.3
+ surya-ocr==0.13.1
+ sympy==1.13.1
+ tenacity==9.1.2
+ thop==0.1.1.post2209072238
+ threadpoolctl==3.6.0
+ tokenizers==0.21.1
+ toml==0.10.2
+ torch==2.6.0
+ torchvision==0.21.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.51.2
+ triton==3.2.0
+ typing-inspection==0.4.0
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ urllib3==2.4.0
+ virtualenv==20.30.0
+ watchdog==6.0.0
+ wcwidth==0.2.13
+ websockets==15.0.1
+ xxhash==3.5.0
+ yarl==1.20.0