breadlicker45 committed
Commit 33262af · verified · 1 Parent(s): 8bc78e2

Update app.py

Files changed (1):
  1. app.py +16 -11
app.py CHANGED
@@ -21,9 +21,9 @@ def load_model():
         "google/paligemma2-28b-pt-896", use_auth_token=token
     )
     model = AutoModelForImageTextToText.from_pretrained(
-        "google/paligemma2-28b-pt-896", use_auth_token=token, torch_dtype=torch.float16
+        "google/paligemma2-28b-pt-896", use_auth_token=token, torch_dtype=torch.bfloat16
     )
-
+
     # Move model to GPU if available
     if torch.cuda.is_available():
         model = model.to("cuda")
@@ -32,16 +32,18 @@ def load_model():
 
 
 @spaces.GPU  # Decorate the function that uses the GPU
-def process_image(image):
+def process_image_and_text(image, text_input):
     """Extract text from image using PaliGemma2."""
     processor, model = load_model()
 
-    # Preprocess the image
-    inputs = processor(images=image, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu", dtype=torch.float16)
+    # Preprocess the image and text
+    inputs = processor(text=text_input, images=image, return_tensors="pt").to(
+        "cuda" if torch.cuda.is_available() else "cpu", dtype=torch.bfloat16
+    )
 
     # Generate predictions
     with torch.no_grad():
-        generated_ids = model.generate(**inputs)
+        generated_ids = model.generate(**inputs, max_new_tokens=100)
     text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
     return text
@@ -49,10 +51,13 @@ def process_image(image):
 
 if __name__ == "__main__":
     iface = gr.Interface(
-        fn=process_image,
-        inputs=gr.Image(type="pil", label="Upload an image containing text"),
-        outputs=gr.Textbox(label="Extracted Text"),
-        title="Text Reading from Images using PaliGemma2",
-        description="Upload an image containing text and the model will extract the text.",
+        fn=process_image_and_text,
+        inputs=[
+            gr.Image(type="pil", label="Upload an image containing text"),
+            gr.Textbox(label="Enter Text Prompt"),
+        ],
+        outputs=gr.Textbox(label="Extracted/Generated Text"),
+        title="Text Reading/Generation with PaliGemma2",
+        description="Upload an image and enter a text prompt. The model will generate text based on both.",
     )
     iface.launch()
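
For reference, a minimal sketch of app.py as it stands after this commit, assembled from the hunks above. Only the lines shown in the diff are taken from the commit; the imports, the Hugging Face token handling, and the start and end of load_model() sit outside the hunks and are assumptions rather than the exact committed code.

# Sketch of the updated app.py (post-commit state).
# Lines outside the diff hunks (imports, token handling, start/end of load_model)
# are assumptions, not the exact committed code.
import os

import gradio as gr
import spaces
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor


def load_model():
    token = os.environ.get("HF_TOKEN")  # assumed: token comes from a Space secret
    processor = AutoProcessor.from_pretrained(
        "google/paligemma2-28b-pt-896", use_auth_token=token
    )
    model = AutoModelForImageTextToText.from_pretrained(
        "google/paligemma2-28b-pt-896", use_auth_token=token, torch_dtype=torch.bfloat16
    )

    # Move model to GPU if available
    if torch.cuda.is_available():
        model = model.to("cuda")
    return processor, model


@spaces.GPU  # Decorate the function that uses the GPU
def process_image_and_text(image, text_input):
    """Extract text from image using PaliGemma2."""
    processor, model = load_model()

    # Preprocess the image and text
    inputs = processor(text=text_input, images=image, return_tensors="pt").to(
        "cuda" if torch.cuda.is_available() else "cpu", dtype=torch.bfloat16
    )

    # Generate predictions
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=100)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return text


if __name__ == "__main__":
    iface = gr.Interface(
        fn=process_image_and_text,
        inputs=[
            gr.Image(type="pil", label="Upload an image containing text"),
            gr.Textbox(label="Enter Text Prompt"),
        ],
        outputs=gr.Textbox(label="Extracted/Generated Text"),
        title="Text Reading/Generation with PaliGemma2",
        description="Upload an image and enter a text prompt. The model will generate text based on both.",
    )
    iface.launch()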