prithivMLmods committed on
Commit
ed20180
·
verified ·
1 Parent(s): a902eab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -31
app.py CHANGED
@@ -135,46 +135,63 @@ def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: i
135
  pdf_doc = fitz.open(pdf_filename)
136
  for page_num in range(len(pdf_doc)):
137
  page = pdf_doc.load_page(page_num)
138
- pix = page.get_pixmap(dpi=150) # Render at 150 DPI for good quality
139
  preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
140
  pix.save(preview_img_path)
141
  preview_images.append(preview_img_path)
142
  pdf_doc.close()
143
  except Exception as e:
144
  print(f"Error generating PDF preview: {e}")
145
- # Continue without preview if rendering fails
146
 
147
  return pdf_filename, preview_images
148
 
 
149
  # --- Core Application Logic ---
150
  @spaces.GPU
151
- def process_document_stream(model_name: str, image: Image.Image, max_new_tokens: int):
 
 
 
 
 
 
 
 
 
152
  """
153
- Main generator function that handles OCR tasks.
154
  """
155
  if image is None:
156
- yield "Please upload an image.", "Please upload an image."
 
 
 
157
  return
158
 
159
- if model_name == "Camel-Doc-OCR-080125":
160
- processor, model = processor_m, model_m
161
- elif model_name == "Megalodon-OCR-Sync-0713":
162
- processor, model = processor_t, model_t
163
- elif model_name == "Nanonets-OCR-s":
164
- processor, model = processor_c, model_c
165
- elif model_name == "MonkeyOCR-Recognition":
166
- processor, model = processor_g, model_g
167
- elif model_name == "olmOCR-7B-0725":
168
- processor, model = processor_i, model_i
169
  else:
170
- yield "Invalid model selected.", "Invalid model selected."
171
  return
172
 
173
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
174
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
175
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
176
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
177
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
 
 
 
 
 
 
 
 
 
178
 
179
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
180
  thread.start()
@@ -184,7 +201,7 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
184
  buffer += new_text
185
  buffer = buffer.replace("<|im_end|>", "")
186
  time.sleep(0.01)
187
- yield buffer , "⏳ Processing..."
188
 
189
  yield buffer, buffer
190
 
@@ -215,12 +232,16 @@ def create_gradio_interface():
215
  choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
216
  label="Select Model", value="Nanonets-OCR-s"
217
  )
218
-
219
- prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
220
-
221
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
 
222
  with gr.Accordion("Advanced Settings", open=False):
223
  max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
 
 
 
 
 
224
  gr.Markdown("### PDF Export Settings")
225
  font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
226
  line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
@@ -237,18 +258,14 @@ def create_gradio_interface():
237
  raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
238
  with gr.Row():
239
  examples = gr.Examples(
240
- examples=["examples/1.png",
241
- "examples/2.png",
242
- "examples/3.png",
243
- "examples/4.png",
244
- "examples/5.png"],
245
  inputs=image_input, label="Examples"
246
  )
247
  gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
248
 
249
  with gr.Tab("📰 README.md"):
250
  with gr.Accordion("(Result.md)", open=True):
251
- markdown_output = gr.Markdown(label="Formatted Markdown")
252
 
253
  with gr.Tab("📋 PDF Preview"):
254
  generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
@@ -257,23 +274,23 @@ def create_gradio_interface():
257
 
258
  # Event Handlers
259
  def clear_all_outputs():
260
- return None, "Raw output will appear here.", "Formatted results will appear here.", None, None
261
 
262
  process_btn.click(
263
  fn=process_document_stream,
264
- inputs=[model_choice, image_input, max_new_tokens],
265
  outputs=[raw_output_stream, markdown_output]
266
  )
267
 
268
  generate_pdf_btn.click(
269
  fn=generate_and_preview_pdf,
270
- inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
271
  outputs=[pdf_output_file, pdf_preview_gallery]
272
  )
273
 
274
  clear_btn.click(
275
  clear_all_outputs,
276
- outputs=[image_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
277
  )
278
  return demo
279
 
 
135
  pdf_doc = fitz.open(pdf_filename)
136
  for page_num in range(len(pdf_doc)):
137
  page = pdf_doc.load_page(page_num)
138
+ pix = page.get_pixmap(dpi=150)
139
  preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
140
  pix.save(preview_img_path)
141
  preview_images.append(preview_img_path)
142
  pdf_doc.close()
143
  except Exception as e:
144
  print(f"Error generating PDF preview: {e}")
 
145
 
146
  return pdf_filename, preview_images
147
 
148
+
149
  # --- Core Application Logic ---
150
  @spaces.GPU
151
+ def process_document_stream(
152
+ model_name: str,
153
+ image: Image.Image,
154
+ prompt_input: str,
155
+ max_new_tokens: int,
156
+ temperature: float,
157
+ top_p: float,
158
+ top_k: int,
159
+ repetition_penalty: float
160
+ ):
161
  """
162
+ Main generator function that handles model inference tasks with advanced generation parameters.
163
  """
164
  if image is None:
165
+ yield "Please upload an image.", ""
166
+ return
167
+ if not prompt_input or not prompt_input.strip():
168
+ yield "Please enter a prompt.", ""
169
  return
170
 
171
+ if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
172
+ elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
173
+ elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
174
+ elif model_name == "MonkeyOCR-Recognition": processor, model = processor_g, model_g
175
+ elif model_name == "olmOCR-7B-0725": processor, model = processor_i, model_i
 
 
 
 
 
176
  else:
177
+ yield "Invalid model selected.", ""
178
  return
179
 
180
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
181
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
182
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
183
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
184
+
185
+ generation_kwargs = {
186
+ **inputs,
187
+ "streamer": streamer,
188
+ "max_new_tokens": max_new_tokens,
189
+ "temperature": temperature,
190
+ "top_p": top_p,
191
+ "top_k": top_k,
192
+ "repetition_penalty": repetition_penalty,
193
+ "do_sample": True
194
+ }
195
 
196
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
197
  thread.start()
 
201
  buffer += new_text
202
  buffer = buffer.replace("<|im_end|>", "")
203
  time.sleep(0.01)
204
+ yield buffer , buffer
205
 
206
  yield buffer, buffer
207
 
 
232
  choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
233
  label="Select Model", value="Nanonets-OCR-s"
234
  )
235
+ prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", lines=3)
 
 
236
  image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
237
+
238
  with gr.Accordion("Advanced Settings", open=False):
239
  max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
240
+ temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
241
+ top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
242
+ top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
243
+ repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
244
+
245
  gr.Markdown("### PDF Export Settings")
246
  font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
247
  line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
 
258
  raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
259
  with gr.Row():
260
  examples = gr.Examples(
261
+ examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
 
 
 
 
262
  inputs=image_input, label="Examples"
263
  )
264
  gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
265
 
266
  with gr.Tab("📰 README.md"):
267
  with gr.Accordion("(Result.md)", open=True):
268
+ markdown_output = gr.Markdown()
269
 
270
  with gr.Tab("📋 PDF Preview"):
271
  generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
 
274
 
275
  # Event Handlers
276
  def clear_all_outputs():
277
+ return None, "", "Raw output will appear here.", "", None, None
278
 
279
  process_btn.click(
280
  fn=process_document_stream,
281
+ inputs=[model_choice, image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
282
  outputs=[raw_output_stream, markdown_output]
283
  )
284
 
285
  generate_pdf_btn.click(
286
  fn=generate_and_preview_pdf,
287
+ inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],
288
  outputs=[pdf_output_file, pdf_preview_gallery]
289
  )
290
 
291
  clear_btn.click(
292
  clear_all_outputs,
293
+ outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
294
  )
295
  return demo
296