Spaces:

prithivMLmods
/

Tiny-VLMs-Lab

Running on Zero

App Files Community

prithivMLmods committed 6 days ago

Commit

ed20180

verified ·

1 Parent(s): a902eab

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -31

app.py CHANGED Viewed

@@ -135,46 +135,63 @@ def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: i
         pdf_doc = fitz.open(pdf_filename)
         for page_num in range(len(pdf_doc)):
             page = pdf_doc.load_page(page_num)
-            pix = page.get_pixmap(dpi=150) # Render at 150 DPI for good quality
             preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
             pix.save(preview_img_path)
             preview_images.append(preview_img_path)
         pdf_doc.close()
     except Exception as e:
         print(f"Error generating PDF preview: {e}")
-        # Continue without preview if rendering fails
     return pdf_filename, preview_images
 # --- Core Application Logic ---
 @spaces.GPU
-def process_document_stream(model_name: str, image: Image.Image, max_new_tokens: int):
     """
-    Main generator function that handles OCR tasks.
     """
     if image is None:
-        yield "Please upload an image.", "Please upload an image."
         return
-    if model_name == "Camel-Doc-OCR-080125":
-        processor, model = processor_m, model_m
-    elif model_name == "Megalodon-OCR-Sync-0713":
-        processor, model = processor_t, model_t
-    elif model_name == "Nanonets-OCR-s":
-        processor, model = processor_c, model_c
-    elif model_name == "MonkeyOCR-Recognition":
-        processor, model = processor_g, model_g
-    elif model_name == "olmOCR-7B-0725":
-        processor, model = processor_i, model_i
     else:
-        yield "Invalid model selected.", "Invalid model selected."
         return
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
@@ -184,7 +201,7 @@ def process_document_stream(model_name: str, image: Image.Image, max_new_tokens:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer , "⏳ Processing..."
     yield buffer, buffer
@@ -215,12 +232,16 @@ def create_gradio_interface():
                     choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
                     label="Select Model", value="Nanonets-OCR-s"
                 )
-                prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
                     gr.Markdown("### PDF Export Settings")
                     font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
                     line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
@@ -237,18 +258,14 @@ def create_gradio_interface():
                         raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
                         with gr.Row():
                             examples = gr.Examples(
-                                examples=["examples/1.png",
-                                          "examples/2.png",
-                                          "examples/3.png",
-                                          "examples/4.png",
-                                          "examples/5.png"],
                                 inputs=image_input, label="Examples"
                             )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
                     with gr.Tab("📰 README.md"):
                         with gr.Accordion("(Result.md)", open=True):
-                            markdown_output = gr.Markdown(label="Formatted Markdown")
                     with gr.Tab("📋 PDF Preview"):
                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
@@ -257,23 +274,23 @@ def create_gradio_interface():
         # Event Handlers
         def clear_all_outputs():
-            return None, "Raw output will appear here.", "Formatted results will appear here.", None, None
         process_btn.click(
             fn=process_document_stream,
-            inputs=[model_choice, image_input, max_new_tokens],
             outputs=[raw_output_stream, markdown_output]
         )
         generate_pdf_btn.click(
             fn=generate_and_preview_pdf,
-            inputs=[image_input, markdown_output, font_size, line_spacing, alignment, image_size],
             outputs=[pdf_output_file, pdf_preview_gallery]
         )
         clear_btn.click(
             clear_all_outputs,
-            outputs=[image_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
         )
     return demo

         pdf_doc = fitz.open(pdf_filename)
         for page_num in range(len(pdf_doc)):
             page = pdf_doc.load_page(page_num)
+            pix = page.get_pixmap(dpi=150)
             preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
             pix.save(preview_img_path)
             preview_images.append(preview_img_path)
         pdf_doc.close()
     except Exception as e:
         print(f"Error generating PDF preview: {e}")
     return pdf_filename, preview_images
 # --- Core Application Logic ---
 @spaces.GPU
+def process_document_stream(
+    model_name: str,
+    image: Image.Image,
+    prompt_input: str,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    repetition_penalty: float
+):
     """
+    Main generator function that handles model inference tasks with advanced generation parameters.
     """
     if image is None:
+        yield "Please upload an image.", ""
+        return
+    if not prompt_input or not prompt_input.strip():
+        yield "Please enter a prompt.", ""
         return
+    if model_name == "Camel-Doc-OCR-080125": processor, model = processor_m, model_m
+    elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
+    elif model_name == "Nanonets-OCR-s": processor, model = processor_c, model_c
+    elif model_name == "MonkeyOCR-Recognition": processor, model = processor_g, model_g
+    elif model_name == "olmOCR-7B-0725": processor, model = processor_i, model_i
     else:
+        yield "Invalid model selected.", ""
         return
     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
+        "do_sample": True
+    }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
+        yield buffer , buffer
     yield buffer, buffer
                     choices=["Camel-Doc-OCR-080125", "MonkeyOCR-Recognition", "olmOCR-7B-0725", "Nanonets-OCR-s", "Megalodon-OCR-Sync-0713"],
                     label="Select Model", value="Nanonets-OCR-s"
                 )
+                prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", lines=3)
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
+                    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                    top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                    top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                    repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
                     gr.Markdown("### PDF Export Settings")
                     font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
                     line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
                         raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
                         with gr.Row():
                             examples = gr.Examples(
+                                examples=["examples/1.png", "examples/2.png", "examples/3.png", "examples/4.png", "examples/5.png"],
                                 inputs=image_input, label="Examples"
                             )
                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/OCR-Comparator/discussions)")
                     with gr.Tab("📰 README.md"):
                         with gr.Accordion("(Result.md)", open=True):
+                            markdown_output = gr.Markdown()
                     with gr.Tab("📋 PDF Preview"):
                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
         # Event Handlers
         def clear_all_outputs():
+            return None, "", "Raw output will appear here.", "", None, None
         process_btn.click(
             fn=process_document_stream,
+            inputs=[model_choice, image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
             outputs=[raw_output_stream, markdown_output]
         )
         generate_pdf_btn.click(
             fn=generate_and_preview_pdf,
+            inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],
             outputs=[pdf_output_file, pdf_preview_gallery]
         )
         clear_btn.click(
             clear_all_outputs,
+            outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
         )
     return demo