Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Feb 4

Commit

ebca0ae

verified ·

1 Parent(s): c8414ba

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -21

app.py CHANGED Viewed

@@ -1,13 +1,16 @@
 import gradio as gr
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
 import time
 import torch
-import spaces
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -15,31 +18,65 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
     files = input_dict["files"]
-    # Load images if provided
-    if len(files) > 1:
-        images = [load_image(image) for image in files if image.endswith(('png', 'jpg', 'jpeg'))]
-        videos = [video for video in files if video.endswith(('mp4', 'avi', 'mov'))]
-    elif len(files) == 1:
-        if files[0].endswith(('png', 'jpg', 'jpeg')):
-            images = [load_image(files[0])]
-            videos = []
         else:
-            images = []
-            videos = [files[0]]
-    else:
-        images = []
-        videos = []
     # Validate input
-    if text == "" and not images and not videos:
         gr.Error("Please input a query and optionally image(s) or video(s).")
         return
-    if text == "" and (images or videos):
         gr.Error("Please input a text query along with the image(s) or video(s).")
         return
@@ -48,8 +85,7 @@ def model_inference(input_dict, history):
         {
             "role": "user",
             "content": [
-                *[{"type": "image", "image": image} for image in images],
-                *[{"type": "video", "video": video} for video in videos],
                 {"type": "text", "text": text},
             ],
         }
@@ -59,8 +95,8 @@ def model_inference(input_dict, history):
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt],
-        images=images if images else None,
-        videos=videos if videos else None,
         return_tensors="pt",
         padding=True,
     ).to("cuda")

 import gradio as gr
+import spaces
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
 import time
 import torch
+from PIL import Image
+import uuid
+import io
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.float16
 ).to("cuda").eval()
+# Supported media extensions
+# image_extensions is the mapping returned by Pillow; model_inference iterates
+# its .items() and uses the keys as filename suffixes for str.endswith().
+image_extensions = Image.registered_extensions()
+# video_extensions is a plain tuple of suffixes (no leading dot) passed directly
+# to str.endswith(); it is only consulted after the image check has failed.
+# NOTE(review): "wav" is normally an audio format, and "gif" is presumably
+# already matched by Pillow's registered extensions — confirm both entries
+# are intended.
+video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
+def identify_and_save_blob(blob_path):
+    """Copy a blob of unknown type to a new file named after its detected media type.
+
+    The blob is read fully into memory. If Pillow can verify the bytes as an
+    image, the blob is classified as "image" and the copy gets a ".png"
+    suffix; otherwise it is assumed to be a video and the copy gets a ".mp4"
+    suffix. The bytes are written unchanged, so the suffix is only a label,
+    not a format conversion.
+
+    Args:
+        blob_path: Path of the file whose media type is unknown.
+
+    Returns:
+        A ``(filename, media_type)`` tuple. ``filename`` is the newly written
+        copy (a bare name, so it is created in the current working directory)
+        and ``media_type`` is either ``"image"`` or ``"video"``.
+
+    Raises:
+        ValueError: If ``blob_path`` does not exist, or if reading,
+            identifying, or writing the blob fails for any other reason. The
+            original exception is attached as ``__cause__``.
+    """
+    try:
+        # Release the source handle as soon as the bytes are in memory.
+        with open(blob_path, "rb") as file:
+            blob_content = file.read()
+
+        # Try to identify if it's an image
+        try:
+            Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
+        except (IOError, SyntaxError):
+            # If it's not a valid image, assume it's a video
+            extension = ".mp4"  # Default to MP4 for saving
+            media_type = "video"
+        else:
+            extension = ".png"  # Default to PNG for saving
+            media_type = "image"
+
+        # Create a unique filename
+        filename = f"temp_{uuid.uuid4()}_media{extension}"
+        with open(filename, "wb") as f:
+            f.write(blob_content)
+        return filename, media_type
+    except FileNotFoundError as e:
+        # Chain the cause so the underlying traceback is not lost.
+        raise ValueError(f"The file {blob_path} was not found.") from e
+    except Exception as e:
+        raise ValueError(f"An error occurred while processing the file: {e}") from e
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
     files = input_dict["files"]
+    # Process media files (images or videos)
+    media_paths = []
+    media_types = []
+    for file in files:
+        if file.endswith(tuple([i for i, f in image_extensions.items()])):
+            media_type = "image"
+        elif file.endswith(video_extensions):
+            media_type = "video"
         else:
+            try:
+                file, media_type = identify_and_save_blob(file)
+            except Exception as e:
+                gr.Error(f"Unsupported media type: {e}")
+                return
+        media_paths.append(file)
+        media_types.append(media_type)
     # Validate input
+    if text == "" and not media_paths:
         gr.Error("Please input a query and optionally image(s) or video(s).")
         return
+    if text == "" and media_paths:
         gr.Error("Please input a text query along with the image(s) or video(s).")
         return
         {
             "role": "user",
             "content": [
+                *[{"type": media_type, media_type: media_path} for media_path, media_type in zip(media_paths, media_types)],
                 {"type": "text", "text": text},
             ],
         }
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt],
+        images=[load_image(path) for path, media_type in zip(media_paths, media_types) if media_type == "image"],
+        videos=[path for path, media_type in zip(media_paths, media_types) if media_type == "video"],
         return_tensors="pt",
         padding=True,
     ).to("cuda")