prithivMLmods committed on
Commit
ac626f9
·
verified ·
1 Parent(s): 39360bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -103
app.py CHANGED
@@ -10,6 +10,9 @@ from io import BytesIO
10
  from PIL import Image
11
  from gradio_pdf import PDF
12
  from loguru import logger
 
 
 
13
  from datetime import datetime
14
  from pathlib import Path
15
  import torch
@@ -24,17 +27,6 @@ import tempfile
24
  pdf_suffixes = [".pdf"]
25
  image_suffixes = [".png", ".jpeg", ".jpg"]
26
 
27
- # LaTeX delimiter configurations
28
- latex_delimiters_type_a = [
29
- {'left': '$$', 'right': '$$', 'display': True},
30
- {'left': '$', 'right': '$', 'display': False},
31
- ]
32
- latex_delimiters_type_b = [
33
- {'left': '\\(', 'right': '\\)', 'display': False},
34
- {'left': '\\[', 'right': '\\]', 'display': True},
35
- ]
36
- latex_delimiters_type_all = latex_delimiters_type_a + latex_delimiters_type_b
37
-
38
  # --- Model and Processor Initialization ---
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
40
  MODEL_ID = "Logics-MLLM/Logics-Parsing"
@@ -131,7 +123,9 @@ def to_pdf(file_path: str) -> str or None:
131
 
132
  pdf_bytes = read_fn(file_path)
133
  unique_filename = f'{safe_stem(file_path)}.pdf'
134
- tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
 
 
135
 
136
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
137
  tmp_pdf_file.write(pdf_bytes)
@@ -139,76 +133,86 @@ def to_pdf(file_path: str) -> str or None:
139
  return tmp_file_path
140
 
141
 
142
- def convert_pdf_to_images_fitz(pdf_path: str, dpi: int = 200) -> list:
143
- """
144
- Converts a PDF file to a list of PIL Images using PyMuPDF (fitz).
145
- This function replaces the need for the external Poppler dependency.
146
- """
147
- images = []
148
- logger.info(f"Converting PDF '{pdf_path}' to images with PyMuPDF.")
149
- try:
150
- pdf_document = fitz.open(pdf_path)
151
- # Calculate zoom factor based on desired DPI
152
- zoom = dpi / 72.0
153
- mat = fitz.Matrix(zoom, zoom)
154
-
155
- for page_num in range(len(pdf_document)):
156
- page = pdf_document.load_page(page_num)
157
- pix = page.get_pixmap(matrix=mat)
158
- img_data = pix.tobytes("png") # Use PNG for better quality
159
- image = Image.open(BytesIO(img_data))
160
- images.append(image)
161
- pdf_document.close()
162
- logger.info(f"Successfully converted {len(images)} pages.")
163
- except Exception as e:
164
- logger.error(f"Failed to convert PDF using PyMuPDF: {e}")
165
- raise
166
- return images
167
-
168
-
169
  async def pdf_parse(file_path: str, request: gr.Request):
170
  """
171
  Main parsing function that orchestrates the PDF processing pipeline.
 
172
  """
173
  if file_path is None:
174
  logger.warning("file_path is None")
175
  return (
176
- "<p>Please upload a PDF file</p>",
177
- "",
178
- "<p>No input file</p>",
179
- None,
180
- None,
181
- "Error: No file provided"
182
  )
183
  logger.info(f'Processing file: {file_path}')
184
 
185
- # Ensure file is in PDF format
186
  tmp_pdf_path = to_pdf(file_path)
187
  if tmp_pdf_path is None:
188
  return (
189
- "<p>Failed to process file</p>",
190
- "",
191
- "<p>Processing error</p>",
192
- None,
193
- None,
194
- "Error: Failed to process file"
195
  )
196
 
197
  start_time = time.time()
198
  try:
199
- # ** FIX: Use PyMuPDF (fitz) instead of pdf2image to avoid Poppler dependency **
200
- pages = convert_pdf_to_images_fitz(tmp_pdf_path, dpi=200)
201
-
202
  html_parts = []
203
- for i, page in enumerate(pages):
204
- logger.info(f"Parsing page {i+1}/{len(pages)}")
205
- html = parse_page(page)
206
- html_parts.append(f'<div class="page-{i+1}">{html}</div>')
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  full_html = '\n'.join(html_parts)
209
  parsing_time = time.time() - start_time
210
 
211
- # Convert generated HTML to Markdown
212
  mmd = html2text.html2text(full_html)
213
  mmd_html = markdown.markdown(mmd)
214
  qwen_html = full_html
@@ -218,20 +222,17 @@ async def pdf_parse(file_path: str, request: gr.Request):
218
  f.write(mmd)
219
  md_path = f.name
220
 
221
- input_path = tmp_pdf_path
222
  cost_time = f'Parsing time: {parsing_time:.2f}s, Total time: {parsing_time:.2f}s'
223
 
224
- return mmd_html, mmd, qwen_html, md_path, input_path, cost_time
225
 
226
  except Exception as e:
227
  logger.error(f"Parsing failed: {e}")
 
 
228
  return (
229
- "<p>Parsing failed. Please try again.</p>",
230
- "",
231
- f"<p>Error: {str(e)}</p>",
232
- None,
233
- None,
234
- f"Error: {str(e)}"
235
  )
236
 
237
 
@@ -241,36 +242,13 @@ def main(ctx, **kwargs):
241
  """
242
  Sets up and launches the Gradio user interface.
243
  """
244
- with gr.Blocks(head='''
245
- <meta name="data-spm" content="label" />
246
- <meta name="aplus-core" content="aplus.js" />
247
- <meta name="aplus-ifr-pv" content="1"/>
248
- <meta name="aplus-iframe-ignore-i" content="on" />
249
- <script>
250
- window.APLUS_CONFIG = {
251
- pid: 'aidata',
252
- };
253
- (function (w, d, s, q) {
254
- w[q] = w[q] || [];
255
- var f = d.getElementsByTagName(s)[0],
256
- j = d.createElement(s);
257
- j.async = true;
258
- j.id = 'beacon-aplus';
259
- var userIdParam = '';
260
- j.setAttribute(
261
- 'exparams',
262
- 'userid=' +
263
- userIdParam +
264
- '&aplus&sidx=aplusSidex&ckx=aplusCkx'
265
- );
266
- j.src = '//g.alicdn.com/alilog/mlog/aplus_v2.js';
267
- j.crossorigin = 'anonymous';
268
- f.parentNode.insertBefore(j, f);
269
- })(window, document, 'script', 'aplus_queue');
270
- </script>
271
- ''') as demo:
272
  gr.Markdown("# 📄 Logics-Parsing Document Analysis")
273
- gr.Markdown("Upload a PDF or image file to parse its content into structured Markdown and HTML formats.")
274
  with gr.Row():
275
  with gr.Column(variant='panel', scale=5):
276
  with gr.Row():
@@ -287,10 +265,7 @@ def main(ctx, **kwargs):
287
  example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
288
  if example_files:
289
  with gr.Accordion('Examples:', open=True):
290
- gr.Examples(
291
- examples=example_files,
292
- inputs=input_file
293
- )
294
 
295
  with gr.Column(variant='panel', scale=5):
296
  output_file = gr.File(label='Download Markdown Result', interactive=False)
@@ -303,9 +278,7 @@ def main(ctx, **kwargs):
303
  with gr.Tab('Generated HTML'):
304
  raw_html = gr.TextArea(lines=45, show_copy_button=True, label="Generated HTML")
305
 
306
- # Define component list for clearing
307
  components_to_clear = [input_file, pdf_show, mmd, raw_html, output_file, mmd_html, cost_time]
308
-
309
  clear_bu.add(components_to_clear)
310
 
311
  input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show, show_progress="full")
 
10
  from PIL import Image
11
  from gradio_pdf import PDF
12
  from loguru import logger
13
+ import sys # Added for logging configuration
14
+ import base64 # Added for image encoding
15
+ from bs4 import BeautifulSoup # Added for HTML manipulation
16
  from datetime import datetime
17
  from pathlib import Path
18
  import torch
 
27
  pdf_suffixes = [".pdf"]
28
  image_suffixes = [".png", ".jpeg", ".jpg"]
29
 
 
 
 
 
 
 
 
 
 
 
 
30
  # --- Model and Processor Initialization ---
31
  device = "cuda" if torch.cuda.is_available() else "cpu"
32
  MODEL_ID = "Logics-MLLM/Logics-Parsing"
 
123
 
124
  pdf_bytes = read_fn(file_path)
125
  unique_filename = f'{safe_stem(file_path)}.pdf'
126
+ # Use Gradio's temp directory for temporary files
127
+ tmp_dir = tempfile.gettempdir()
128
+ tmp_file_path = os.path.join(tmp_dir, unique_filename)
129
 
130
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
131
  tmp_pdf_file.write(pdf_bytes)
 
133
  return tmp_file_path
134
 
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  async def pdf_parse(file_path: str, request: gr.Request):
137
  """
138
  Main parsing function that orchestrates the PDF processing pipeline.
139
+ It now extracts images directly and injects them into the final HTML.
140
  """
141
  if file_path is None:
142
  logger.warning("file_path is None")
143
  return (
144
+ "<p>Please upload a PDF file</p>", "", "<p>No input file</p>",
145
+ None, None, "Error: No file provided"
 
 
 
 
146
  )
147
  logger.info(f'Processing file: {file_path}')
148
 
 
149
  tmp_pdf_path = to_pdf(file_path)
150
  if tmp_pdf_path is None:
151
  return (
152
+ "<p>Failed to process file</p>", "", "<p>Processing error</p>",
153
+ None, None, "Error: Failed to process file"
 
 
 
 
154
  )
155
 
156
  start_time = time.time()
157
  try:
158
+ pdf_document = fitz.open(tmp_pdf_path)
 
 
159
  html_parts = []
 
 
 
 
160
 
161
+ # Process each page
162
+ for page_num in range(len(pdf_document)):
163
+ page = pdf_document.load_page(page_num)
164
+ logger.info(f"Processing Page {page_num + 1}/{len(pdf_document)}")
165
+
166
+ # --- 1. Extract images directly from the PDF page using PyMuPDF ---
167
+ page_images_base64 = []
168
+ img_list = page.get_images(full=True)
169
+ for img_index, img in enumerate(img_list):
170
+ xref = img[0]
171
+ base_image = pdf_document.extract_image(xref)
172
+ image_bytes = base_image["image"]
173
+ image_ext = base_image["ext"]
174
+ base64_string = f"data:image/{image_ext};base64,{base64.b64encode(image_bytes).decode()}"
175
+ page_images_base64.append(base64_string)
176
+
177
+ logger.info(f" > Found {len(page_images_base64)} images on page {page_num + 1}.")
178
+
179
+ # --- 2. Render the page to an image for the VL-Model ---
180
+ zoom = 200 / 72.0 # Corresponds to 200 DPI
181
+ mat = fitz.Matrix(zoom, zoom)
182
+ pix = page.get_pixmap(matrix=mat)
183
+ page_image = Image.open(BytesIO(pix.tobytes("png")))
184
+
185
+ # --- 3. Get the structured HTML from the model ---
186
+ logger.info(f" > Parsing page layout with Logics-Parsing model...")
187
+ html_content = parse_page(page_image)
188
+
189
+ # --- 4. Inject extracted images back into the HTML ---
190
+ if page_images_base64:
191
+ logger.info(f" > Injecting {len(page_images_base64)} extracted images into generated HTML...")
192
+ soup = BeautifulSoup(html_content, 'html.parser')
193
+ figures = soup.find_all('figure')
194
+
195
+ # If model identified same number of figures, inject images into them
196
+ if len(figures) == len(page_images_base64):
197
+ for fig, b64_img in zip(figures, page_images_base64):
198
+ img_tag = soup.new_tag('img', src=b64_img, style="max-width:100%; height:auto;")
199
+ fig.append(img_tag)
200
+ else: # Otherwise, append all images at the end of the page content as a fallback
201
+ logger.warning(f" > Mismatch: Model found {len(figures)} figures, but {len(page_images_base64)} images were extracted. Appending images to the end.")
202
+ for b64_img in page_images_base64:
203
+ img_tag = soup.new_tag('img', src=b64_img, style="max-width:100%; height:auto;")
204
+ p_tag = soup.new_tag('p')
205
+ p_tag.append(img_tag)
206
+ soup.append(p_tag)
207
+ html_content = str(soup)
208
+
209
+ html_parts.append(f'<div class="page-{page_num+1}">{html_content}</div>')
210
+
211
+ pdf_document.close()
212
  full_html = '\n'.join(html_parts)
213
  parsing_time = time.time() - start_time
214
 
215
+ # Convert final rich HTML to Markdown
216
  mmd = html2text.html2text(full_html)
217
  mmd_html = markdown.markdown(mmd)
218
  qwen_html = full_html
 
222
  f.write(mmd)
223
  md_path = f.name
224
 
 
225
  cost_time = f'Parsing time: {parsing_time:.2f}s, Total time: {parsing_time:.2f}s'
226
 
227
+ return mmd_html, mmd, qwen_html, md_path, tmp_pdf_path, cost_time
228
 
229
  except Exception as e:
230
  logger.error(f"Parsing failed: {e}")
231
+ import traceback
232
+ traceback.print_exc()
233
  return (
234
+ "<p>Parsing failed. Please try again.</p>", "", f"<p>Error: {str(e)}</p>",
235
+ None, None, f"Error: {str(e)}"
 
 
 
 
236
  )
237
 
238
 
 
242
  """
243
  Sets up and launches the Gradio user interface.
244
  """
245
+ # **FIX: Configure Loguru for better visibility in deployment environments**
246
+ logger.remove() # Remove default handler
247
+ logger.add(sys.stdout, level="INFO")
248
+
249
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  gr.Markdown("# 📄 Logics-Parsing Document Analysis")
251
+ gr.Markdown("Upload a PDF or image file to parse its content into structured Markdown and HTML formats, now with improved image extraction.")
252
  with gr.Row():
253
  with gr.Column(variant='panel', scale=5):
254
  with gr.Row():
 
265
  example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
266
  if example_files:
267
  with gr.Accordion('Examples:', open=True):
268
+ gr.Examples(examples=example_files, inputs=input_file)
 
 
 
269
 
270
  with gr.Column(variant='panel', scale=5):
271
  output_file = gr.File(label='Download Markdown Result', interactive=False)
 
278
  with gr.Tab('Generated HTML'):
279
  raw_html = gr.TextArea(lines=45, show_copy_button=True, label="Generated HTML")
280
 
 
281
  components_to_clear = [input_file, pdf_show, mmd, raw_html, output_file, mmd_html, cost_time]
 
282
  clear_bu.add(components_to_clear)
283
 
284
  input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show, show_progress="full")