breadlicker45 committed
Commit b9c7982 · verified · 1 Parent(s): c580f5e

Update app.py

Files changed (1): app.py (+26 -14)
app.py CHANGED
@@ -3,10 +3,13 @@ from transformers import (
     PaliGemmaProcessor,
     PaliGemmaForConditionalGeneration,
 )
-from PIL import Image
+from transformers.image_utils import load_image
 import torch
 import os
 import spaces  # Import the spaces module
+import requests
+from io import BytesIO
+from PIL import Image
 
 
 def load_model():
@@ -24,39 +27,48 @@ def load_model():
     processor = PaliGemmaProcessor.from_pretrained(model_id, use_auth_token=token)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = PaliGemmaForConditionalGeneration.from_pretrained(
-        model_id, use_auth_token=token, torch_dtype=torch.bfloat16
-    ).to(device)
+        model_id, torch_dtype=torch.bfloat16, use_auth_token=token
+    ).to(device).eval()
 
     return processor, model
 
 
 @spaces.GPU  # Decorate the function that uses the GPU
-def process_image_and_text(image, text_input):
+def process_image_and_text(image_pil, text_input):
     """Extract text from image using PaliGemma2."""
     processor, model = load_model()
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    # Preprocess the image and text
-    inputs = processor(text=text_input, images=image, return_tensors="pt").to(
+
+    # Load the image using load_image
+    # Convert PIL image to bytes
+    buffered = BytesIO()
+    image_pil.save(buffered, format="JPEG")
+    image_bytes = buffered.getvalue()
+    image = load_image(image_bytes)
+
+    # Use the provided text input
+    model_inputs = processor(text=text_input, images=image, return_tensors="pt").to(
         device, dtype=torch.bfloat16
     )
+    input_len = model_inputs["input_ids"].shape[-1]
 
-    # Generate predictions
-    with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=100)
-        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    with torch.inference_mode():
+        generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
+        generation = generation[0][input_len:]
+        decoded = processor.decode(generation, skip_special_tokens=True)
 
-    return text
+    return decoded
 
 
 if __name__ == "__main__":
     iface = gr.Interface(
         fn=process_image_and_text,
         inputs=[
-            gr.Image(type="pil", label="Upload an image containing text"),
+            gr.Image(type="pil", label="Upload an image"),
             gr.Textbox(label="Enter Text Prompt"),
         ],
-        outputs=gr.Textbox(label="Extracted/Generated Text"),
-        title="Text Reading/Generation with PaliGemma2",
+        outputs=gr.Textbox(label="Generated Text"),
+        title="PaliGemma2 Image and Text to Text",
         description="Upload an image and enter a text prompt. The model will generate text based on both.",
     )
     iface.launch()
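
Note on the new image path: the updated process_image_and_text serializes the Gradio-supplied PIL image to in-memory JPEG bytes and hands those bytes to transformers' load_image. In recent transformers releases, load_image documents URL, local-path, base64-string, and PIL.Image inputs, so whether raw bytes are accepted may depend on the installed version; since the input is already a PIL image, passing image_pil (after image_pil.convert("RGB")) straight to the processor would sidestep both the round-trip and the lossy JPEG re-encode.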
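
For reference, a self-contained sketch of the generation pattern this commit adopts: encode the image and prompt together, record the prompt length, generate greedily, then decode only the tokens after input_len so the prompt is not echoed back. The model id and HF_TOKEN lookup below are assumptions for illustration (the Space's actual model_id and token handling sit earlier in app.py, outside this diff), and token= stands in for the deprecated use_auth_token= used in the commit.

import os

import torch
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

model_id = "google/paligemma2-3b-pt-224"  # assumed for the sketch; not visible in this diff
token = os.environ.get("HF_TOKEN")  # assumed auth setup

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = PaliGemmaProcessor.from_pretrained(model_id, token=token)
model = (
    PaliGemmaForConditionalGeneration.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, token=token
    )
    .to(device)
    .eval()  # inference only, matching the commit
)

image = Image.open("example.jpg").convert("RGB")  # any local test image
prompt = "caption en"  # a typical PaliGemma task prompt

model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(
    device, dtype=torch.bfloat16
)
input_len = model_inputs["input_ids"].shape[-1]  # prompt length in tokens

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)

# Decode only the newly generated tokens, dropping the echoed prompt.
decoded = processor.decode(generation[0][input_len:], skip_special_tokens=True)
print(decoded)

The input_len slice is what replaces the old batch_decode call, which returned the prompt and the completion together; do_sample=False pins generate to greedy decoding, so the Space's output is deterministic for a given image and prompt.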