Multimodal-OCR

Runtime error

App Files Community

prithivMLmods committed on May 5

Commit

a8067dc

verified ·

1 Parent(s): f22b5b6

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -10

app.py CHANGED Viewed

@@ -15,9 +15,11 @@ from transformers import (
 from transformers import Qwen2_5_VLForConditionalGeneration
 # Helper Functions
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
     """
     return f'''
 <div style="display: flex; align-items: center;">
@@ -34,6 +36,7 @@ def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_colo
 </style>
     '''
 def downsample_video(video_path):
     """
     Downsamples a video file by extracting 25 evenly spaced frames.
@@ -78,7 +81,7 @@ rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 # Main Inference Function
 @spaces.GPU
 def model_inference(input_dict, history, use_rolmocr=False):
-    text = input_dict["text"].strip()
     files = input_dict.get("files", [])
     if not text and not files:
@@ -133,25 +136,25 @@ def model_inference(input_dict, history, use_rolmocr=False):
     thread.start()
     buffer = ""
     yield progress_bar_html(f"Processing with {model_name}")
-    # Stream tokens
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
         yield buffer
-    # Ensure generation finished
     thread.join()
-    # Write final response to file
     try:
         with open("response.txt", "w", encoding="utf-8") as f:
-            f.write(buffer.strip())
     except Exception as e:
-        # If writing fails, you can log or yield an error message
-        yield f"Warning: could not write response to file: {e}"
 # Gradio Interface
 examples = [
@@ -160,9 +163,10 @@ examples = [
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR `RolmOCR and Default Qwen2VL OCR`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
@@ -176,5 +180,4 @@ demo = gr.ChatInterface(
     additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=False, info="Check to use RolmOCR, uncheck to use Qwen2VL OCR")],
 )
-if __name__ == "__main__":
-    demo.launch(debug=True)

 from transformers import Qwen2_5_VLForConditionalGeneration
 # Helper Functions
 def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
     """
     Returns an HTML snippet for a thin animated progress bar with a label.
+    Colors can be customized; default colors are used for Qwen2VL/Aya‑Vision.
     """
     return f'''
 <div style="display: flex; align-items: center;">
 </style>
     '''
 def downsample_video(video_path):
     """
     Downsamples a video file by extracting 25 evenly spaced frames.
 # Main Inference Function
 @spaces.GPU
 def model_inference(input_dict, history, use_rolmocr=False):
+    text = input_dict.get("text", "").strip()
     files = input_dict.get("files", [])
     if not text and not files:
     thread.start()
     buffer = ""
+    # Send initial progress bar
     yield progress_bar_html(f"Processing with {model_name}")
+    # Stream generation
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
         yield buffer
+    # Ensure generation is complete
     thread.join()
+    # Save the full response to response.txt
     try:
         with open("response.txt", "w", encoding="utf-8") as f:
+            f.write(buffer)
     except Exception as e:
+        yield f"Error saving response: {e}"
 # Gradio Interface
 examples = [
     [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
 ]
 demo = gr.ChatInterface(
     fn=model_inference,
+    description="# **Multimodal OCR `@RolmOCR and Default Qwen2VL OCR`**",
     examples=examples,
     textbox=gr.MultimodalTextbox(
         label="Query Input",
     additional_inputs=[gr.Checkbox(label="Use RolmOCR", value=False, info="Check to use RolmOCR, uncheck to use Qwen2VL OCR")],
 )
+demo.launch(debug=True)