Spaces:

AI-ANK
/

PalmKosmosVision

Sleeping

App Files Files Community

AI-ANK committed on Nov 4, 2023

Commit

9149fd8

1 Parent(s): f21d103

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -15

app.py CHANGED Viewed

@@ -10,6 +10,8 @@ from llama_index import ServiceContext, VectorStoreIndex, Document
 from llama_index.memory import ChatMemoryBuffer
 import os
 import datetime
 # Set up the title of the application
 st.title("Image Captioning and Chat")
@@ -28,13 +30,14 @@ model, processor = get_vision_model()
 # Function to get image caption via Kosmos2.
 @st.cache_data
 def get_image_caption(image_data):
     model, processor = get_vision_model()
-    #model = AutoModelForVision2Seq.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
-    #processor = AutoProcessor.from_pretrained("ydshieh/kosmos-2-patch14-224", trust_remote_code=True)
     prompt = "<grounding>An image of"
-    inputs = processor(text=prompt, images=image_data, return_tensors="pt")
     generated_ids = model.generate(
         pixel_values=inputs["pixel_values"],
@@ -48,18 +51,7 @@ def get_image_caption(image_data):
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     text_description, entities = processor.post_process_generation(generated_text)
-#Using replicate API
-#    input_data = {
-#        "image": image_data,
-#        "description_type": "Brief"
-#    }
-#    output = replicate.run(
-#        "lucataco/kosmos-2:3e7b211c29c092f4bcc8853922cc986baa52efe255876b80cac2c2fbb4aff805",
-#        input=input_data
-#    )
-#    # Split the output string on the newline character and take the first item
-#    text_description = output.split('\n\n')[0]
     return text_description
 # Function to create the chat engine.

 from llama_index.memory import ChatMemoryBuffer
 import os
 import datetime
+from PIL import Image
+import io
 # Set up the title of the application
 st.title("Image Captioning and Chat")
 # Function to get image caption via Kosmos2.
 @st.cache_data
 def get_image_caption(image_data):
+    # Convert BytesIO to PIL Image
+    image = Image.open(io.BytesIO(image_data))
     model, processor = get_vision_model()
     prompt = "<grounding>An image of"
+    # Pass the PIL image to the processor
+    inputs = processor(text=prompt, images=image, return_tensors="pt")
     generated_ids = model.generate(
         pixel_values=inputs["pixel_values"],
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     text_description, entities = processor.post_process_generation(generated_text)
     return text_description
 # Function to create the chat engine.