	Update app.py
app.py CHANGED
```diff
@@ -14,9 +14,6 @@ from PIL import Image
 import requests
 from io import BytesIO
 
-# -------------------------
-# Qwen2-VL Model for OCR-based tasks
-# -------------------------
 QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
 qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -25,9 +22,6 @@ qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# -------------------------
-# Aya-Vision Model for image-text tasks (@aya-vision)
-# -------------------------
 AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
 aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID)
 aya_model = AutoModelForImageTextToText.from_pretrained(
@@ -137,26 +131,22 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
 
-# -------------------------
-# Example inputs for the combined interface
-# -------------------------
 examples = [
+    [{"text": "@aya-vision Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
     [{"text": "@aya-vision Extract JSON from the image", "files": ["example_images/document.jpg"]}],
-    [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
-    [{"text": "Describe the photo", "files": ["examples/3.png"]}],
-    [{"text": "Summarize the full image in detail", "files": ["examples/2.jpg"]}],
-    [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
-    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
-    [{"text": "
-    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
-    [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
-    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+    [{"text": "@aya-vision Summarize the letter", "files": ["examples/1.png"]}],
+    [{"text": "@aya-vision Describe the photo", "files": ["examples/3.png"]}],
+    [{"text": "@aya-vision Summarize the full image in detail", "files": ["examples/2.jpg"]}],
+    [{"text": "@aya-vision Describe this image.", "files": ["example_images/campeones.jpg"]}],
+    [{"text": "@aya-vision What is this UI about?", "files": ["example_images/s2w_example.png"]}],
+    [{"text": "@aya-vision Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
+    [{"text": "@aya-vision Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
+    [{"text": "@aya-vision Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
 ]
 
-# Build the Gradio ChatInterface.
 demo = gr.ChatInterface(
     fn=model_inference,
-    description="# **Multimodal OCR with 
+    description="# **Multimodal OCR with `@aya-vision` Feature**",
     examples=examples,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
     stop_btn="Stop Generation",
```
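The hunk header shows the examples feed `def model_inference(input_dict, history)`, whose body lies outside the diff. Every updated example now opens with the `@aya-vision` command token, which implies the function dispatches on that prefix: prefixed queries go to `aya_model`, everything else falls through to the Qwen2-VL OCR model. Below is a minimal routing sketch under that assumption; the helper name `parse_route` is hypothetical, not code from the Space.

```python
AYA_PREFIX = "@aya-vision"  # command token used by the updated examples

def parse_route(text: str) -> tuple[str, str]:
    """Pick a backend from a leading command token (illustrative only --
    the Space's actual model_inference body is not shown in this diff)."""
    stripped = text.strip()
    if stripped.lower().startswith(AYA_PREFIX):
        # Prefixed query: drop the token and send the rest to Aya-Vision.
        return "aya", stripped[len(AYA_PREFIX):].lstrip()
    # Default path: Qwen2-VL handles unprefixed (OCR-style) queries.
    return "qwen", stripped

# parse_route("@aya-vision Describe the photo") -> ("aya", "Describe the photo")
```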
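For the Aya-Vision branch, generation would follow the processor chat-template pattern used with `AutoModelForImageTextToText`; the message layout below mirrors the `CohereForAI/aya-vision-8b` model card, while `max_new_tokens` and the variable names `image`/`query` are illustrative.

```python
# Sketch of a single Aya-Vision call, assuming `aya_processor` and `aya_model`
# are the objects loaded in the diff and `image` is a PIL image from the chat box.
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": query},
    ],
}]
inputs = aya_processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(aya_model.device)
output_ids = aya_model.generate(**inputs, max_new_tokens=300)
# Decode only the newly generated tokens, not the echoed prompt.
reply = aya_processor.tokenizer.decode(
    output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
)
```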