prithivMLmods committed on
Commit
a327584
·
verified ·
1 Parent(s): b051d42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -80
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import spaces
2
  import json
 
3
  import os
4
  import traceback
5
  from io import BytesIO
@@ -7,6 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple
7
  import re
8
  import time
9
  from threading import Thread
 
10
 
11
  import gradio as gr
12
  import requests
@@ -18,15 +20,15 @@ from transformers import (
18
  AutoProcessor,
19
  TextIteratorStreamer,
20
  )
 
21
  from reportlab.lib.pagesizes import A4
22
  from reportlab.lib.styles import getSampleStyleSheet
23
- from reportlab.lib import colors
24
  from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
25
  from reportlab.lib.units import inch
26
- import uuid
27
 
28
  # --- Constants and Model Setup ---
29
  MAX_INPUT_TOKEN_LENGTH = 4096
 
30
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
 
32
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
@@ -40,6 +42,9 @@ if torch.cuda.is_available():
40
 
41
  print("Using device:", device)
42
 
 
 
 
43
  # --- Model Loading ---
44
  MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-080125"
45
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -74,13 +79,16 @@ model_i = Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
  MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16
75
  ).to(device).eval()
76
 
77
- # --- Prompts ---
78
- ocr_prompt = "Perform precise OCR on the image. Extract all text content, maintaining the original structure, paragraphs, and tables as formatted markdown."
79
 
80
- # --- PDF Generation Functions ---
81
- def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):
82
- """Generates a PDF document."""
83
- filename = f"output_{uuid.uuid4()}.pdf"
 
 
 
 
 
84
  doc = SimpleDocTemplate(
85
  filename,
86
  pagesize=A4,
@@ -90,9 +98,10 @@ def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, ima
90
  bottomMargin=inch
91
  )
92
  styles = getSampleStyleSheet()
93
- styles["Normal"].fontSize = int(font_size)
94
- styles["Normal"].leading = int(font_size) * line_spacing
95
- styles["Normal"].alignment = {
 
96
  "Left": 0,
97
  "Center": 1,
98
  "Right": 2,
@@ -101,49 +110,61 @@ def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, ima
101
 
102
  story = []
103
 
104
- # Add image with size adjustment
105
- image_sizes = {
106
- "Small": (200, 200),
107
- "Medium": (400, 400),
108
- "Large": (600, 600)
 
 
 
 
 
 
 
 
109
  }
110
- img = RLImage(media_path, width=image_sizes[image_size][0], height=image_sizes[image_size][1])
111
  story.append(img)
112
  story.append(Spacer(1, 12))
113
 
114
- # Add plain text output
115
- text = Paragraph(plain_text, styles["Normal"])
116
- story.append(text)
 
 
 
 
 
117
 
118
  doc.build(story)
119
  return filename
120
 
 
121
  # --- Core Application Logic ---
122
  @spaces.GPU
123
- def process_document_stream(model_name: str, image: Image.Image, max_new_tokens: int, font_size: str, line_spacing: float, alignment: str, image_size: str):
124
  """
125
- Main generator function for OCR task, also generating PDF for preview.
126
  """
127
  if image is None:
128
- yield "Please upload an image.", "Please upload an image.", None
129
  return
130
 
131
- # Select model and processor
 
 
 
132
  if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
133
  elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
134
  elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
135
  elif model_name == "MonkeyOCR-Recognition": processor, model = processor_g, model_g
136
  elif model_name == "olmOCR-7B-0725": processor, model = processor_i, model_i
137
  else:
138
- yield "Invalid model selected.", "Invalid model selected.", None
139
  return
140
 
141
- # Save image temporarily for PDF generation
142
- temp_image_path = f"temp_{uuid.uuid4()}.png"
143
- image.save(temp_image_path)
144
-
145
- # Prepare model inputs and streamer
146
- text_prompt = ocr_prompt
147
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
148
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
149
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
@@ -153,23 +174,17 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
153
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
154
  thread.start()
155
 
156
- # Stream raw output to the UI in real-time
157
  buffer = ""
158
  for new_text in streamer:
159
  buffer += new_text
160
  buffer = buffer.replace("<|im_end|>", "")
161
  time.sleep(0.01)
162
- # Generate PDF with current buffer
163
- pdf_file = generate_pdf(temp_image_path, buffer, font_size, line_spacing, alignment, image_size)
164
- yield buffer, buffer, pdf_file
165
 
166
- # Final PDF with complete output
167
- pdf_file = generate_pdf(temp_image_path, buffer, font_size, line_spacing, alignment, image_size)
168
- yield buffer, buffer, pdf_file
169
 
170
- # Clean up temporary image file
171
- if os.path.exists(temp_image_path):
172
- os.remove(temp_image_path)
173
 
174
  # --- Gradio UI Definition ---
175
  def create_gradio_interface():
@@ -178,15 +193,13 @@ def create_gradio_interface():
178
  .main-container { max-width: 1400px; margin: 0 auto; }
179
  .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
180
  .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
181
- .download-btn { background-color: #35a6d6 !important; color: white !important; }
182
- .download-btn:hover { background-color: #22bcff !important; }
183
  """
184
  with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
185
  gr.HTML("""
186
  <div class="title" style="text-align: center">
187
  <h1>Tiny VLMs Lab🧪</h1>
188
  <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
189
- Advanced Vision-Language Model for Image Content Extraction and PDF Generation
190
  </p>
191
  </div>
192
  """)
@@ -195,39 +208,24 @@ def create_gradio_interface():
195
  # Left Column (Inputs)
196
  with gr.Column(scale=1):
197
  model_choice = gr.Dropdown(
198
- choices=[
199
- "Camel-Doc-OCR-080125",
200
- "MonkeyOCR-Recognition",
201
- "olmOCR-7B-0725",
202
- "Nanonets-OCR-s",
203
- "Megalodon-OCR-Sync-0713"
204
- ],
205
  label="Select Model",
206
  value="Nanonets-OCR-s"
207
  )
208
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
209
  with gr.Accordion("Advanced Settings", open=False):
210
  max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
211
- font_size = gr.Dropdown(
212
- choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"],
213
- value="16",
214
- label="Font Size"
215
- )
216
- line_spacing = gr.Dropdown(
217
- choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],
218
- value=1.5,
219
- label="Line Spacing"
220
- )
221
- alignment = gr.Dropdown(
222
- choices=["Left", "Center", "Right", "Justified"],
223
- value="Justified",
224
- label="Text Alignment"
225
- )
226
- image_size = gr.Dropdown(
227
- choices=["Small", "Medium", "Large"],
228
- value="Medium",
229
- label="Image Size"
230
- )
231
 
232
  process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
233
  clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
@@ -236,19 +234,22 @@ def create_gradio_interface():
236
  with gr.Column(scale=2):
237
  with gr.Tabs() as tabs:
238
  with gr.Tab("📝 Extracted Content"):
239
- raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=13, show_copy_button=True)
240
  with gr.Row():
241
  examples = gr.Examples(
242
  examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
243
  inputs=image_input,
244
  label="Examples"
245
- )
246
  gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
 
247
  with gr.Tab("📰 README.md"):
248
- with gr.Accordion("(Formatted Result)", open=True):
249
- markdown_output = gr.Markdown(label="Formatted Markdown")
250
  with gr.Tab("📋 PDF Preview"):
251
- pdf_output = gr.File(label="Download PDF", interactive=True)
 
 
252
 
253
  # Event Handlers
254
  def clear_all_outputs():
@@ -256,12 +257,25 @@ def create_gradio_interface():
256
 
257
  process_btn.click(
258
  fn=process_document_stream,
259
- inputs=[model_choice, image_input, max_new_tokens, font_size, line_spacing, alignment, image_size],
260
- outputs=[raw_output_stream, markdown_output, pdf_output]
 
 
 
261
  )
 
 
 
 
 
 
 
262
  clear_btn.click(
263
- fn=clear_all_outputs,
264
- outputs=[image_input, raw_output_stream, markdown_output, pdf_output]
 
 
 
265
  )
266
  return demo
267
 
 
1
  import spaces
2
  import json
3
+ import math
4
  import os
5
  import traceback
6
  from io import BytesIO
 
8
  import re
9
  import time
10
  from threading import Thread
11
+ import uuid
12
 
13
  import gradio as gr
14
  import requests
 
20
  AutoProcessor,
21
  TextIteratorStreamer,
22
  )
23
+
24
  from reportlab.lib.pagesizes import A4
25
  from reportlab.lib.styles import getSampleStyleSheet
 
26
  from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
27
  from reportlab.lib.units import inch
 
28
 
29
  # --- Constants and Model Setup ---
30
  MAX_INPUT_TOKEN_LENGTH = 4096
31
+ # Note: The following line correctly falls back to CPU if CUDA is not available.
32
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
33
 
34
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 
42
 
43
  print("Using device:", device)
44
 
45
+ # --- Prompts for Different Tasks ---
46
+ ocr_prompt = "Perform precise OCR on the image. Extract all text content, maintaining the original structure, paragraphs, and tables as formatted markdown."
47
+
48
  # --- Model Loading ---
49
  MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-080125"
50
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 
79
  MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16
80
  ).to(device).eval()
81
 
 
 
82
 
83
+ # --- PDF Generation Utility Function ---
84
+ def generate_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str) -> str:
85
+ """
86
+ Generates a PDF document with the input image and extracted text.
87
+ """
88
+ if image is None or not text_content:
89
+ raise gr.Error("Cannot generate PDF. Image or text content is missing.")
90
+
91
+ filename = f"/tmp/output_{uuid.uuid4()}.pdf"
92
  doc = SimpleDocTemplate(
93
  filename,
94
  pagesize=A4,
 
98
  bottomMargin=inch
99
  )
100
  styles = getSampleStyleSheet()
101
+ style_normal = styles["Normal"]
102
+ style_normal.fontSize = int(font_size)
103
+ style_normal.leading = int(font_size) * line_spacing
104
+ style_normal.alignment = {
105
  "Left": 0,
106
  "Center": 1,
107
  "Right": 2,
 
110
 
111
  story = []
112
 
113
+ # Handle Image
114
+ # Convert PIL image to a format reportlab can use without saving to disk
115
+ img_buffer = BytesIO()
116
+ image.save(img_buffer, format='PNG')
117
+ img_buffer.seek(0)
118
+
119
+ # Image size settings
120
+ page_width, _ = A4
121
+ available_width = page_width - 2 * inch
122
+ image_widths = {
123
+ "Small": available_width * 0.3,
124
+ "Medium": available_width * 0.6,
125
+ "Large": available_width * 0.9,
126
  }
127
+ img = RLImage(img_buffer, width=image_widths[image_size], height=image.height * (image_widths[image_size]/image.width))
128
  story.append(img)
129
  story.append(Spacer(1, 12))
130
 
131
+ # Handle Text - Strip basic markdown markers ("# ", "## ", "*") for PDF
132
+ # This is a simple replacement for basic markdown; more complex cases would need a proper parser.
133
+ cleaned_text = text_content.replace("# ", "").replace("## ", "").replace("*", "")
134
+ text_paragraphs = cleaned_text.split('\n')
135
+
136
+ for para in text_paragraphs:
137
+ if para.strip():
138
+ story.append(Paragraph(para, style_normal))
139
 
140
  doc.build(story)
141
  return filename
142
 
143
+
144
  # --- Core Application Logic ---
145
  @spaces.GPU
146
+ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens: int):
147
  """
148
+ Main generator function that handles OCR tasks.
149
  """
150
  if image is None:
151
+ yield "Please upload an image.", "Please upload an image."
152
  return
153
 
154
+ # 1. Set prompt for OCR
155
+ text_prompt = ocr_prompt
156
+
157
+ # 2. Select model and processor
158
  if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
159
  elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
160
  elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
161
  elif model_name == "MonkeyOCR-Recognition": processor, model = processor_g, model_g
162
  elif model_name == "olmOCR-7B-0725": processor, model = processor_i, model_i
163
  else:
164
+ yield "Invalid model selected.", "Invalid model selected."
165
  return
166
 
167
+ # 3. Prepare model inputs and streamer
 
 
 
 
 
168
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": text_prompt}]}]
169
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
170
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
 
174
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
175
  thread.start()
176
 
177
+ # 4. Stream raw output to the UI in real-time
178
  buffer = ""
179
  for new_text in streamer:
180
  buffer += new_text
181
  buffer = buffer.replace("<|im_end|>", "")
182
  time.sleep(0.01)
183
+ yield buffer , "⏳ Processing..."
 
 
184
 
185
+ # 5. Yield the final result for both raw and formatted outputs
186
+ yield buffer, buffer
 
187
 
 
 
 
188
 
189
  # --- Gradio UI Definition ---
190
  def create_gradio_interface():
 
193
  .main-container { max-width: 1400px; margin: 0 auto; }
194
  .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
195
  .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
 
 
196
  """
197
  with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
198
  gr.HTML("""
199
  <div class="title" style="text-align: center">
200
  <h1>Tiny VLMs Lab🧪</h1>
201
  <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
202
+ Advanced Vision-Language Model for Image Content and Layout Extraction
203
  </p>
204
  </div>
205
  """)
 
208
  # Left Column (Inputs)
209
  with gr.Column(scale=1):
210
  model_choice = gr.Dropdown(
211
+ choices=["Camel-Doc-OCR-080125",
212
+ "MonkeyOCR-Recognition",
213
+ "olmOCR-7B-0725",
214
+ "Nanonets-OCR-s",
215
+ "Megalodon-OCR-Sync-0713"
216
+ ],
 
217
  label="Select Model",
218
  value="Nanonets-OCR-s"
219
  )
220
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
221
  with gr.Accordion("Advanced Settings", open=False):
222
  max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
223
+ gr.Markdown("### PDF Export Settings")
224
+ font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
225
+ line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
226
+ alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Left", label="Text Alignment")
227
+ image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
228
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
231
  clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
 
234
  with gr.Column(scale=2):
235
  with gr.Tabs() as tabs:
236
  with gr.Tab("📝 Extracted Content"):
237
+ raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
238
  with gr.Row():
239
  examples = gr.Examples(
240
  examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
241
  inputs=image_input,
242
  label="Examples"
243
+ )
244
  gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
245
+
246
  with gr.Tab("📰 README.md"):
247
+ markdown_output = gr.Markdown(label="Formatted Markdown")
248
+
249
  with gr.Tab("📋 PDF Preview"):
250
+ pdf_output_file = gr.File(label="Generated PDF Document", interactive=False)
251
+ generate_pdf_btn = gr.Button("📄 Generate PDF", variant="primary")
252
+
253
 
254
  # Event Handlers
255
  def clear_all_outputs():
 
257
 
258
  process_btn.click(
259
  fn=process_document_stream,
260
+ inputs=[model_choice,
261
+ image_input,
262
+ max_new_tokens],
263
+ outputs=[raw_output_stream,
264
+ markdown_output]
265
  )
266
+
267
+ generate_pdf_btn.click(
268
+ fn=generate_pdf,
269
+ inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
270
+ outputs=[pdf_output_file]
271
+ )
272
+
273
  clear_btn.click(
274
+ clear_all_outputs,
275
+ outputs=[image_input,
276
+ raw_output_stream,
277
+ markdown_output,
278
+ pdf_output_file]
279
  )
280
  return demo
281