prithivMLmods committed on
Commit
6966b5a
·
verified ·
1 Parent(s): a327584

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -57
app.py CHANGED
@@ -9,11 +9,13 @@ import re
9
  import time
10
  from threading import Thread
11
  import uuid
 
12
 
13
  import gradio as gr
14
  import requests
15
  import torch
16
  from PIL import Image
 
17
 
18
  from transformers import (
19
  Qwen2_5_VLForConditionalGeneration,
@@ -28,7 +30,6 @@ from reportlab.lib.units import inch
28
 
29
  # --- Constants and Model Setup ---
30
  MAX_INPUT_TOKEN_LENGTH = 4096
31
- # Note: The following line correctly falls back to CPU if CUDA is not available.
32
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
33
 
34
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
@@ -80,43 +81,36 @@ model_i = Qwen2_5_VLForConditionalGeneration.from_pretrained(
80
  ).to(device).eval()
81
 
82
 
83
- # --- PDF Generation Utility Function ---
84
- def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str) -> str:
85
  """
86
- Generates a PDF document with the input image and extracted text.
 
87
  """
88
- if image is None or not text_content:
89
  raise gr.Error("Cannot generate PDF. Image or text content is missing.")
90
 
91
- filename = f"/tmp/output_{uuid.uuid4()}.pdf"
 
 
92
  doc = SimpleDocTemplate(
93
- filename,
94
  pagesize=A4,
95
- rightMargin=inch,
96
- leftMargin=inch,
97
- topMargin=inch,
98
- bottomMargin=inch
99
  )
100
  styles = getSampleStyleSheet()
101
  style_normal = styles["Normal"]
102
  style_normal.fontSize = int(font_size)
103
  style_normal.leading = int(font_size) * line_spacing
104
- style_normal.alignment = {
105
- "Left": 0,
106
- "Center": 1,
107
- "Right": 2,
108
- "Justified": 4
109
- }[alignment]
110
 
111
  story = []
112
 
113
- # Handle Image
114
- # Convert PIL image to a format reportlab can use without saving to disk
115
  img_buffer = BytesIO()
116
  image.save(img_buffer, format='PNG')
117
  img_buffer.seek(0)
118
 
119
- # Image size settings
120
  page_width, _ = A4
121
  available_width = page_width - 2 * inch
122
  image_widths = {
@@ -124,13 +118,12 @@ def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spa
124
  "Medium": available_width * 0.6,
125
  "Large": available_width * 0.9,
126
  }
127
- img = RLImage(img_buffer, width=image_widths[image_size], height=image.height * (image_widths[image_size]/image.width))
 
128
  story.append(img)
129
  story.append(Spacer(1, 12))
130
 
131
- # Handle Text - Replace markdown with spaces for PDF
132
- # A simple replacement for basic markdown, for more complex cases a proper parser would be needed
133
- cleaned_text = text_content.replace("# ", "").replace("## ", "").replace("*", "")
134
  text_paragraphs = cleaned_text.split('\n')
135
 
136
  for para in text_paragraphs:
@@ -138,7 +131,23 @@ def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spa
138
  story.append(Paragraph(para, style_normal))
139
 
140
  doc.build(story)
141
- return filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
 
144
  # --- Core Application Logic ---
@@ -151,10 +160,8 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
151
  yield "Please upload an image.", "Please upload an image."
152
  return
153
 
154
- # 1. Set prompt for OCR
155
  text_prompt = ocr_prompt
156
 
157
- # 2. Select model and processor
158
  if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
159
  elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
160
  elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
@@ -164,7 +171,6 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
164
  yield "Invalid model selected.", "Invalid model selected."
165
  return
166
 
167
- # 3. Prepare model inputs and streamer
168
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
169
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
170
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
@@ -174,7 +180,6 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
174
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
175
  thread.start()
176
 
177
- # 4. Stream raw output to the UI in real-time
178
  buffer = ""
179
  for new_text in streamer:
180
  buffer += new_text
@@ -182,7 +187,6 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
182
  time.sleep(0.01)
183
  yield buffer , "⏳ Processing..."
184
 
185
- # 5. Yield the final result for both raw and formatted outputs
186
  yield buffer, buffer
187
 
188
 
@@ -193,6 +197,7 @@ def create_gradio_interface():
193
  .main-container { max-width: 1400px; margin: 0 auto; }
194
  .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
195
  .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
 
196
  """
197
  with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
198
  gr.HTML("""
@@ -208,14 +213,8 @@ def create_gradio_interface():
208
  # Left Column (Inputs)
209
  with gr.Column(scale=1):
210
  model_choice = gr.Dropdown(
211
- choices=["Camel-Doc-OCR-080125",
212
- "MonkeyOCR-Recognition",
213
- "olmOCR-7B-0725",
214
- "Nanonets-OCR-s",
215
- "Megalodon-OCR-Sync-0713"
216
- ],
217
- label="Select Model",
218
- value="Nanonets-OCR-s"
219
  )
220
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
221
  with gr.Accordion("Advanced Settings", open=False):
@@ -226,7 +225,6 @@ def create_gradio_interface():
226
  alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Left", label="Text Alignment")
227
  image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
228
 
229
-
230
  process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
231
  clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
232
 
@@ -238,44 +236,37 @@ def create_gradio_interface():
238
  with gr.Row():
239
  examples = gr.Examples(
240
  examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
241
- inputs=image_input,
242
- label="Examples"
243
- )
244
  gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
245
 
246
  with gr.Tab("📰 README.md"):
247
  markdown_output = gr.Markdown(label="Formatted Markdown")
248
 
249
  with gr.Tab("📋 PDF Preview"):
250
- pdf_output_file = gr.File(label="Generated PDF Document", interactive=False)
251
- generate_pdf_btn = gr.Button("📄 Generate PDF", variant="primary")
252
-
253
 
254
  # Event Handlers
255
  def clear_all_outputs():
256
- return None, "Raw output will appear here.", "Formatted results will appear here.", None
257
 
258
  process_btn.click(
259
  fn=process_document_stream,
260
- inputs=[model_choice,
261
- image_input,
262
- max_new_tokens],
263
- outputs=[raw_output_stream,
264
- markdown_output]
265
  )
266
 
267
  generate_pdf_btn.click(
268
- fn=generate_pdf,
269
  inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
270
- outputs=[pdf_output_file]
271
  )
272
 
273
  clear_btn.click(
274
  clear_all_outputs,
275
- outputs=[image_input,
276
- raw_output_stream,
277
- markdown_output,
278
- pdf_output_file]
279
  )
280
  return demo
281
 
 
9
  import time
10
  from threading import Thread
11
  import uuid
12
+ import tempfile
13
 
14
  import gradio as gr
15
  import requests
16
  import torch
17
  from PIL import Image
18
+ import fitz
19
 
20
  from transformers import (
21
  Qwen2_5_VLForConditionalGeneration,
 
30
 
31
  # --- Constants and Model Setup ---
32
  MAX_INPUT_TOKEN_LENGTH = 4096
 
33
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
 
35
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 
81
  ).to(device).eval()
82
 
83
 
84
+ # --- PDF Generation and Preview Utility Function ---
85
+ def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
86
  """
87
+ Generates a PDF, saves it, and then creates image previews of its pages.
88
+ Returns the path to the PDF and a list of paths to the preview images.
89
  """
90
+ if image is None or not text_content or not text_content.strip():
91
  raise gr.Error("Cannot generate PDF. Image or text content is missing.")
92
 
93
+ # --- 1. Generate the PDF ---
94
+ temp_dir = tempfile.gettempdir()
95
+ pdf_filename = os.path.join(temp_dir, f"output_{uuid.uuid4()}.pdf")
96
  doc = SimpleDocTemplate(
97
+ pdf_filename,
98
  pagesize=A4,
99
+ rightMargin=inch, leftMargin=inch,
100
+ topMargin=inch, bottomMargin=inch
 
 
101
  )
102
  styles = getSampleStyleSheet()
103
  style_normal = styles["Normal"]
104
  style_normal.fontSize = int(font_size)
105
  style_normal.leading = int(font_size) * line_spacing
106
+ style_normal.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]
 
 
 
 
 
107
 
108
  story = []
109
 
 
 
110
  img_buffer = BytesIO()
111
  image.save(img_buffer, format='PNG')
112
  img_buffer.seek(0)
113
 
 
114
  page_width, _ = A4
115
  available_width = page_width - 2 * inch
116
  image_widths = {
 
118
  "Medium": available_width * 0.6,
119
  "Large": available_width * 0.9,
120
  }
121
+ img_width = image_widths[image_size]
122
+ img = RLImage(img_buffer, width=img_width, height=image.height * (img_width / image.width))
123
  story.append(img)
124
  story.append(Spacer(1, 12))
125
 
126
+ cleaned_text = re.sub(r'#+\s*', '', text_content).replace("*", "")
 
 
127
  text_paragraphs = cleaned_text.split('\n')
128
 
129
  for para in text_paragraphs:
 
131
  story.append(Paragraph(para, style_normal))
132
 
133
  doc.build(story)
134
+
135
+ # --- 2. Render PDF pages as images for preview ---
136
+ preview_images = []
137
+ try:
138
+ pdf_doc = fitz.open(pdf_filename)
139
+ for page_num in range(len(pdf_doc)):
140
+ page = pdf_doc.load_page(page_num)
141
+ pix = page.get_pixmap(dpi=150) # Render at 150 DPI for good quality
142
+ preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
143
+ pix.save(preview_img_path)
144
+ preview_images.append(preview_img_path)
145
+ pdf_doc.close()
146
+ except Exception as e:
147
+ print(f"Error generating PDF preview: {e}")
148
+ # Continue without preview if rendering fails
149
+
150
+ return pdf_filename, preview_images
151
 
152
 
153
  # --- Core Application Logic ---
 
160
  yield "Please upload an image.", "Please upload an image."
161
  return
162
 
 
163
  text_prompt = ocr_prompt
164
 
 
165
  if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
166
  elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
167
  elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
 
171
  yield "Invalid model selected.", "Invalid model selected."
172
  return
173
 
 
174
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
175
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
176
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
 
180
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
181
  thread.start()
182
 
 
183
  buffer = ""
184
  for new_text in streamer:
185
  buffer += new_text
 
187
  time.sleep(0.01)
188
  yield buffer , "⏳ Processing..."
189
 
 
190
  yield buffer, buffer
191
 
192
 
 
197
  .main-container { max-width: 1400px; margin: 0 auto; }
198
  .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
199
  .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
200
+ #gallery { min-height: 400px; }
201
  """
202
  with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
203
  gr.HTML("""
 
213
  # Left Column (Inputs)
214
  with gr.Column(scale=1):
215
  model_choice = gr.Dropdown(
216
+ choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
217
+ label="Select Model", value="Nanonets-OCR-s"
 
 
 
 
 
 
218
  )
219
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
220
  with gr.Accordion("Advanced Settings", open=False):
 
225
  alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Left", label="Text Alignment")
226
  image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
227
 
 
228
  process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
229
  clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
230
 
 
236
  with gr.Row():
237
  examples = gr.Examples(
238
  examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
239
+ inputs=image_input, label="Examples"
240
+ )
 
241
  gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
242
 
243
  with gr.Tab("📰 README.md"):
244
  markdown_output = gr.Markdown(label="Formatted Markdown")
245
 
246
  with gr.Tab("📋 PDF Preview"):
247
+ generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
248
+ pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
249
+ pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
250
 
251
  # Event Handlers
252
  def clear_all_outputs():
253
+ return None, "Raw output will appear here.", "Formatted results will appear here.", None, None
254
 
255
  process_btn.click(
256
  fn=process_document_stream,
257
+ inputs=[model_choice, image_input, max_new_tokens],
258
+ outputs=[raw_output_stream, markdown_output]
 
 
 
259
  )
260
 
261
  generate_pdf_btn.click(
262
+ fn=generate_and_preview_pdf,
263
  inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
264
+ outputs=[pdf_output_file, pdf_preview_gallery]
265
  )
266
 
267
  clear_btn.click(
268
  clear_all_outputs,
269
+ outputs=[image_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
 
 
 
270
  )
271
  return demo
272