Spaces:

prithivMLmods
/

Tiny-VLMs-Lab

Running on Zero

App Files Files Community

prithivMLmods committed 17 days ago

Commit

5ecf0f4

verified ·

1 Parent(s): 1774f71

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -24

app.py CHANGED Viewed

@@ -17,6 +17,8 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
 js_func = """
@@ -77,6 +79,17 @@ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
 ).to(device).eval()
 # --- Utility Functions ---
 def layoutjson2md(layout_data: List[Dict]) -> str:
     """Converts the structured JSON from Layout Analysis into formatted Markdown."""
@@ -121,6 +134,28 @@ def process_document_stream(model_name: str, task_choice: str, image: Image.Imag
     # 1. Select prompt based on user's task choice
     text_prompt = ocr_prompt if task_choice == "Content Extraction" else layout_prompt
     # 2. Select model and processor
     if model_name == "Camel-Doc-OCR-062825": processor, model = processor_m, model_m
     elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
@@ -136,10 +171,10 @@ def process_document_stream(model_name: str, task_choice: str, image: Image.Imag
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     # 4. Stream raw output to the UI in real-time
     buffer = ""
     for new_text in streamer:
@@ -157,11 +192,11 @@ def process_document_stream(model_name: str, task_choice: str, image: Image.Imag
             json_match = re.search(r'```json\s*([\s\S]+?)\s*```', buffer)
             if not json_match:
                 raise json.JSONDecodeError("JSON object not found in output.", buffer, 0)
             json_str = json_match.group(1)
             layout_data = json.loads(json_str)
             markdown_content = layoutjson2md(layout_data)
             yield buffer, markdown_content, layout_data
         except Exception as e:
             error_md = f"❌ **Error:** Failed to parse Layout JSON.\n\n**Details:**\n`{str(e)}`"
@@ -173,7 +208,7 @@ def create_gradio_interface():
     """Builds and returns the Gradio web interface."""
     css = """
     .main-container { max-width: 1400px; margin: 0 auto; }
-    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css, js=js_func) as demo:
@@ -185,15 +220,16 @@ def create_gradio_interface():
             </p>
         </div>
         """)
         with gr.Row():
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
-                    choices=["Camel-Doc-OCR-062825",
-                             "MonkeyOCR-Recognition",
-                             "Nanonets-OCR-s",
-                             "Megalodon-OCR-Sync-0713"],
                     label="Select Model", value="Nanonets-OCR-s"
                 )
                 task_choice = gr.Dropdown(
@@ -203,7 +239,7 @@ def create_gradio_interface():
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
@@ -217,37 +253,37 @@ def create_gradio_interface():
                                 examples=["examples/example_img2.png", "examples/example_img1.png"],
                                 inputs=image_input,
                                 label="Examples"
-                    )
-                    with gr.Tab("📰 README.md"):
                         with gr.Accordion("(Formatted Result)", open=True):
                             markdown_output = gr.Markdown(label="Formatted Markdown")
                     with gr.Tab("📋 Layout Analysis Results"):
                         json_output = gr.JSON(label="Structured Layout Data (JSON)")
         # Event Handlers
         def clear_all_outputs():
             return None, "Raw output will appear here.", "Formatted results will appear here.", None
         process_btn.click(
             fn=process_document_stream,
-            inputs=[model_choice,
-                    task_choice,
-                    image_input,
                     max_new_tokens],
-            outputs=[raw_output_stream,
-                     markdown_output,
                      json_output]
         )
         clear_btn.click(
             clear_all_outputs,
-            outputs=[image_input,
-                     raw_output_stream,
-                     markdown_output,
                      json_output]
         )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
-    demo.queue().launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)

     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
+    AutoModel,
+    AutoTokenizer
 )
 js_func = """
     MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
 ).to(device).eval()
+# --- New Model ---
+MODEL_ID_V4 = 'openbmb/MiniCPM-V-4'
+model_v4 = AutoModel.from_pretrained(
+    MODEL_ID_V4,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    attn_implementation='sdpa'  # Use 'flash_attention_2' if available and supported
+).eval().to(device)
+tokenizer_v4 = AutoTokenizer.from_pretrained(MODEL_ID_V4, trust_remote_code=True)
 # --- Utility Functions ---
 def layoutjson2md(layout_data: List[Dict]) -> str:
     """Converts the structured JSON from Layout Analysis into formatted Markdown."""
     # 1. Select prompt based on user's task choice
     text_prompt = ocr_prompt if task_choice == "Content Extraction" else layout_prompt
+    # --- New Model Handling ---
+    if model_name == "openbmb/MiniCPM-V-4":
+        if task_choice == "Layout Analysis(.json)":
+            yield "This model is not optimized for Layout Analysis.", "Task not supported for this model.", None
+            return
+        question = "What is in this image?"
+        msgs = [{'role': 'user', 'content': [image, question]}]
+        # Since this model's .chat method isn't a generator, we call it directly (blocking)
+        # and yield the final result. A more advanced implementation could stream it.
+        try:
+            answer = model_v4.chat(
+                image=image.convert('RGB'),
+                msgs=msgs,
+                tokenizer=tokenizer_v4
+            )
+            yield answer, answer, None
+        except Exception as e:
+            yield f"Error: {str(e)}", "An error occurred.", None
+        return
     # 2. Select model and processor
     if model_name == "Camel-Doc-OCR-062825": processor, model = processor_m, model_m
     elif model_name == "Megalodon-OCR-Sync-0713": processor, model = processor_t, model_t
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     # 4. Stream raw output to the UI in real-time
     buffer = ""
     for new_text in streamer:
             json_match = re.search(r'```json\s*([\s\S]+?)\s*```', buffer)
             if not json_match:
                 raise json.JSONDecodeError("JSON object not found in output.", buffer, 0)
             json_str = json_match.group(1)
             layout_data = json.loads(json_str)
             markdown_content = layoutjson2md(layout_data)
             yield buffer, markdown_content, layout_data
         except Exception as e:
             error_md = f"❌ **Error:** Failed to parse Layout JSON.\n\n**Details:**\n`{str(e)}`"
     """Builds and returns the Gradio web interface."""
     css = """
     .main-container { max-width: 1400px; margin: 0 auto; }
+    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css, js=js_func) as demo:
             </p>
         </div>
         """)
         with gr.Row():
             # Left Column (Inputs)
             with gr.Column(scale=1):
                 model_choice = gr.Dropdown(
+                    choices=["Camel-Doc-OCR-062825",
+                             "MonkeyOCR-Recognition",
+                             "Nanonets-OCR-s",
+                             "Megalodon-OCR-Sync-0713",
+                             "openbmb/MiniCPM-V-4"],
                     label="Select Model", value="Nanonets-OCR-s"
                 )
                 task_choice = gr.Dropdown(
                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                 with gr.Accordion("Advanced Settings", open=False):
                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=4096, step=256, label="Max New Tokens")
                 process_btn = gr.Button("🚀 Process Document", variant="primary", elem_classes=["process-button"], size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
                                 examples=["examples/example_img2.png", "examples/example_img1.png"],
                                 inputs=image_input,
                                 label="Examples"
+                    )
+                    with gr.Tab("📰 README.md"):
                         with gr.Accordion("(Formatted Result)", open=True):
                             markdown_output = gr.Markdown(label="Formatted Markdown")
                     with gr.Tab("📋 Layout Analysis Results"):
                         json_output = gr.JSON(label="Structured Layout Data (JSON)")
         # Event Handlers
         def clear_all_outputs():
             return None, "Raw output will appear here.", "Formatted results will appear here.", None
         process_btn.click(
             fn=process_document_stream,
+            inputs=[model_choice,
+                    task_choice,
+                    image_input,
                     max_new_tokens],
+            outputs=[raw_output_stream,
+                     markdown_output,
                      json_output]
         )
         clear_btn.click(
             clear_all_outputs,
+            outputs=[image_input,
+                     raw_output_stream,
+                     markdown_output,
                      json_output]
         )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
+    demo.queue().launch(share=True, ssr_mode=False, show_error=True)