Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -135,46 +135,63 @@ def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: i
|
|
135 |
pdf_doc = fitz.open(pdf_filename)
|
136 |
for page_num in range(len(pdf_doc)):
|
137 |
page = pdf_doc.load_page(page_num)
|
138 |
-
pix = page.get_pixmap(dpi=150)
|
139 |
preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
|
140 |
pix.save(preview_img_path)
|
141 |
preview_images.append(preview_img_path)
|
142 |
pdf_doc.close()
|
143 |
except Exception as e:
|
144 |
print(f"Error generating PDF preview: {e}")
|
145 |
-
# Continue without preview if rendering fails
|
146 |
|
147 |
return pdf_filename, preview_images
|
148 |
|
|
|
149 |
# --- Core Application Logic ---
|
150 |
@spaces.GPU
|
151 |
-
def process_document_stream(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
"""
|
153 |
-
Main generator function that handles
|
154 |
"""
|
155 |
if image is None:
|
156 |
-
yield "Please upload an image.", "
|
|
|
|
|
|
|
157 |
return
|
158 |
|
159 |
-
if model_name == "Camel-Doc-OCR-080125":
|
160 |
-
|
161 |
-
elif model_name == "
|
162 |
-
|
163 |
-
elif model_name == "
|
164 |
-
processor, model = processor_c, model_c
|
165 |
-
elif model_name == "MonkeyOCR-Recognition":
|
166 |
-
processor, model = processor_g, model_g
|
167 |
-
elif model_name == "olmOCR-7B-0725":
|
168 |
-
processor, model = processor_i, model_i
|
169 |
else:
|
170 |
-
yield "Invalid model selected.", "
|
171 |
return
|
172 |
|
173 |
messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
|
174 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
175 |
inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
|
176 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
|
179 |
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
180 |
thread.start()
|
@@ -184,7 +201,7 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
|
|
184 |
buffer += new_text
|
185 |
buffer = buffer.replace("<|im_end|>", "")
|
186 |
time.sleep(0.01)
|
187 |
-
yield buffer ,
|
188 |
|
189 |
yield buffer, buffer
|
190 |
|
@@ -215,12 +232,16 @@ def create_gradio_interface():
|
|
215 |
choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
|
216 |
label="Select Model", value="Nanonets-OCR-s"
|
217 |
)
|
218 |
-
|
219 |
-
prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
|
220 |
-
|
221 |
image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
|
|
|
222 |
with gr.Accordion("Advanced Settings", open=False):
|
223 |
max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
|
|
|
|
|
|
|
|
|
|
|
224 |
gr.Markdown("### PDF Export Settings")
|
225 |
font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
|
226 |
line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
|
@@ -237,18 +258,14 @@ def create_gradio_interface():
|
|
237 |
raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
|
238 |
with gr.Row():
|
239 |
examples = gr.Examples(
|
240 |
-
examples=["examples/1.png",
|
241 |
-
"examples/2.png",
|
242 |
-
"examples/3.png",
|
243 |
-
"examples/4.png",
|
244 |
-
"examples/5.png"],
|
245 |
inputs=image_input, label="Examples"
|
246 |
)
|
247 |
gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
|
248 |
|
249 |
with gr.Tab("📰 README.md"):
|
250 |
with gr.Accordion("(Result.md)", open=True):
|
251 |
-
markdown_output = gr.Markdown(
|
252 |
|
253 |
with gr.Tab("📋 PDF Preview"):
|
254 |
generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
|
@@ -257,23 +274,23 @@ def create_gradio_interface():
|
|
257 |
|
258 |
# Event Handlers
|
259 |
def clear_all_outputs():
|
260 |
-
return None, "Raw output will appear here.", "
|
261 |
|
262 |
process_btn.click(
|
263 |
fn=process_document_stream,
|
264 |
-
inputs=[model_choice, image_input, max_new_tokens],
|
265 |
outputs=[raw_output_stream, markdown_output]
|
266 |
)
|
267 |
|
268 |
generate_pdf_btn.click(
|
269 |
fn=generate_and_preview_pdf,
|
270 |
-
inputs=[image_input,
|
271 |
outputs=[pdf_output_file, pdf_preview_gallery]
|
272 |
)
|
273 |
|
274 |
clear_btn.click(
|
275 |
clear_all_outputs,
|
276 |
-
outputs=[image_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
|
277 |
)
|
278 |
return demo
|
279 |
|
|
|
135 |
pdf_doc = fitz.open(pdf_filename)
|
136 |
for page_num in range(len(pdf_doc)):
|
137 |
page = pdf_doc.load_page(page_num)
|
138 |
+
pix = page.get_pixmap(dpi=150)
|
139 |
preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
|
140 |
pix.save(preview_img_path)
|
141 |
preview_images.append(preview_img_path)
|
142 |
pdf_doc.close()
|
143 |
except Exception as e:
|
144 |
print(f"Error generating PDF preview: {e}")
|
|
|
145 |
|
146 |
return pdf_filename, preview_images
|
147 |
|
148 |
+
|
149 |
# --- Core Application Logic ---
|
150 |
@spaces.GPU
|
151 |
+
def process_document_stream(
|
152 |
+
model_name: str,
|
153 |
+
image: Image.Image,
|
154 |
+
prompt_input: str,
|
155 |
+
max_new_tokens: int,
|
156 |
+
temperature: float,
|
157 |
+
top_p: float,
|
158 |
+
top_k: int,
|
159 |
+
repetition_penalty: float
|
160 |
+
):
|
161 |
"""
|
162 |
+
Main generator function that handles model inference tasks with advanced generation parameters.
|
163 |
"""
|
164 |
if image is None:
|
165 |
+
yield "Please upload an image.", ""
|
166 |
+
return
|
167 |
+
if not prompt_input or not prompt_input.strip():
|
168 |
+
yield "Please enter a prompt.", ""
|
169 |
return
|
170 |
|
171 |
+
if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
|
172 |
+
elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
|
173 |
+
elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
|
174 |
+
elif model_name == "MonkeyOCR-Recognition": processor, model = processor_g, model_g
|
175 |
+
elif model_name == "olmOCR-7B-0725": processor, model = processor_i, model_i
|
|
|
|
|
|
|
|
|
|
|
176 |
else:
|
177 |
+
yield "Invalid model selected.", ""
|
178 |
return
|
179 |
|
180 |
messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
|
181 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
182 |
inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
|
183 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
184 |
+
|
185 |
+
generation_kwargs = {
|
186 |
+
**inputs,
|
187 |
+
"streamer": streamer,
|
188 |
+
"max_new_tokens": max_new_tokens,
|
189 |
+
"temperature": temperature,
|
190 |
+
"top_p": top_p,
|
191 |
+
"top_k": top_k,
|
192 |
+
"repetition_penalty": repetition_penalty,
|
193 |
+
"do_sample": True
|
194 |
+
}
|
195 |
|
196 |
thread = Thread(target=model.generate, kwargs=generation_kwargs)
|
197 |
thread.start()
|
|
|
201 |
buffer += new_text
|
202 |
buffer = buffer.replace("<|im_end|>", "")
|
203 |
time.sleep(0.01)
|
204 |
+
yield buffer , buffer
|
205 |
|
206 |
yield buffer, buffer
|
207 |
|
|
|
232 |
choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
|
233 |
label="Select Model", value="Nanonets-OCR-s"
|
234 |
)
|
235 |
+
prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", lines=3)
|
|
|
|
|
236 |
image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
|
237 |
+
|
238 |
with gr.Accordion("Advanced Settings", open=False):
|
239 |
max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
|
240 |
+
temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
|
241 |
+
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
|
242 |
+
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
|
243 |
+
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
244 |
+
|
245 |
gr.Markdown("### PDF Export Settings")
|
246 |
font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
|
247 |
line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
|
|
|
258 |
raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
|
259 |
with gr.Row():
|
260 |
examples = gr.Examples(
|
261 |
+
examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
|
|
|
|
|
|
|
|
|
262 |
inputs=image_input, label="Examples"
|
263 |
)
|
264 |
gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
|
265 |
|
266 |
with gr.Tab("📰 README.md"):
|
267 |
with gr.Accordion("(Result.md)", open=True):
|
268 |
+
markdown_output = gr.Markdown()
|
269 |
|
270 |
with gr.Tab("📋 PDF Preview"):
|
271 |
generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
|
|
|
274 |
|
275 |
# Event Handlers
|
276 |
def clear_all_outputs():
|
277 |
+
return None, "", "Raw output will appear here.", "", None, None
|
278 |
|
279 |
process_btn.click(
|
280 |
fn=process_document_stream,
|
281 |
+
inputs=[model_choice, image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
282 |
outputs=[raw_output_stream, markdown_output]
|
283 |
)
|
284 |
|
285 |
generate_pdf_btn.click(
|
286 |
fn=generate_and_preview_pdf,
|
287 |
+
inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],
|
288 |
outputs=[pdf_output_file, pdf_preview_gallery]
|
289 |
)
|
290 |
|
291 |
clear_btn.click(
|
292 |
clear_all_outputs,
|
293 |
+
outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
|
294 |
)
|
295 |
return demo
|
296 |
|