	Update app.py
app.py CHANGED
```diff
@@ -14,9 +14,6 @@ from PIL import Image
 import requests
 from io import BytesIO
 
-# -------------------------
-# Qwen2-VL Model for OCR-based tasks
-# -------------------------
 QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -25,9 +22,6 @@ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# -------------------------
-# Aya-Vision Model for image-text tasks (@aya-vision)
-# -------------------------
 AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
 aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
 aya_model = AutoModelForImageTextToText.from_pretrained(
@@ -137,26 +131,22 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
 
-# -------------------------
-# Example inputs for the combined interface
-# -------------------------
 examples = [
+    [{"text": "@aya-vision Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
     [{"text": "@aya-vision Extract JSON from the image", "files": ["example_images/document.jpg"]}],
-    [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
-    [{"text": "Describe the photo", "files": ["examples/3.png"]}],
-    [{"text": "Summarize the full image in detail", "files": ["examples/2.jpg"]}],
-    [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
-    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
-    [{"text": "
-    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
-    [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
-    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+    [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
+    [{"text": "@aya-vision Describe the photo", "files": ["examples/3.png"]}],
+    [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
+    [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
+    [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
+    [{"text": "@aya-vision Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
+    [{"text": "@aya-vision Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
+    [{"text": "@aya-vision Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
 ]
 
-# Build the Gradio ChatInterface.
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR with 
+    description="# **Multimodal OCR with `@aya-vision` Feature**",
     examples=examples,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
     stop_btn="Stop Generation",
```
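The hunk header shows the examples feed `def model_inference(input_dict, history)`, whose body lies outside the diff. Every updated example now opens with the `@aya-vision` command token, which implies the function dispatches on that prefix: prefixed queries go to `aya_model`, everything else falls through to the Qwen2-VL OCR model. Below is a minimal routing sketch under that assumption; the helper name `parse_route` is hypothetical, not code from the Space.

```python
AYA_PREFIX = "@aya-vision"  # command token used by the updated examples

def parse_route(text: str) -> tuple[str, str]:
    """Pick a backend from a leading command token (illustrative only --
    the Space's actual model_inference body is not shown in this diff)."""
    stripped = text.strip()
    if stripped.lower().startswith(AYA_PREFIX):
        # Prefixed query: drop the token and send the rest to Aya-Vision.
        return "aya", stripped[len(AYA_PREFIX):].lstrip()
    # Default path: Qwen2-VL handles unprefixed (OCR-style) queries.
    return "qwen", stripped

# parse_route("@aya-vision Describe the photo") -> ("aya", "Describe the photo")
```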
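For the Aya-Vision branch, generation would follow the processor chat-template pattern used with `AutoModelForImageTextToText`; the message layout below mirrors the `CohereForAI/aya-vision-8b` model card, while `max_new_tokens` and the variable names `image`/`query` are illustrative.

```python
# Sketch of a single Aya-Vision call, assuming `aya_processor` and `aya_model`
# are the objects loaded in the diff and `image` is a PIL image from the chat box.
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": query},
    ],
}]
inputs = aya_processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(aya_model.device)
output_ids = aya_model.generate(**inputs, max_new_tokens=300)
# Decode only the newly generated tokens, not the echoed prompt.
reply = aya_processor.tokenizer.decode(
    output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
)
```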