# app.py — moondream2 batch visual-question-answering demo (Gradio / Hugging Face Space)
import spaces
import torch
import re
import gradio as gr
from PIL import Image
import io
from transformers import AutoTokenizer, AutoModelForCausalLM
# Pick the compute target once at import time: half precision on GPU for
# speed/memory, full float32 on CPU (fp16 matmuls are poorly supported there).
_has_cuda = torch.cuda.is_available()
device = "cuda" if _has_cuda else "cpu"
dtype = torch.float16 if _has_cuda else torch.float32

model_id = "vikhyatk/moondream2"
revision = "2024-04-02"  # pinned so remote-code updates can't change behavior

# Load the tokenizer and the moondream2 vision-language model at the pinned
# revision, then move the model to the chosen device/precision.
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision
).to(device=device, dtype=dtype)
moondream.eval()  # inference only — disable dropout etc.
def _to_pil(entry):
    """Normalize one gallery entry to a ``PIL.Image``.

    Gradio's Gallery passes entries as ``(image, caption)`` tuples (or bare
    images); with ``type="pil"`` the image is already a ``PIL.Image``, but it
    may also arrive as raw bytes or a file path depending on Gradio version.
    """
    img = entry[0] if isinstance(entry, (tuple, list)) else entry
    if isinstance(img, Image.Image):
        return img
    if isinstance(img, (bytes, bytearray)):
        return Image.open(io.BytesIO(img))
    # Fall back to path / file-like object.
    return Image.open(img)


@spaces.GPU(duration=10)
def answer_questions(images, prompt_text):
    """Answer a batch of comma-separated prompts against a batch of images.

    Parameters
    ----------
    images : list
        Gallery value; see ``_to_pil`` for the accepted entry shapes.
    prompt_text : str
        Prompts separated by commas, e.g. "Describe this image, What is shown?".

    Returns
    -------
    list[str]
        One response string per prompt/image pair.
    """
    prompts = [p.strip() for p in prompt_text.split(',')]  # Splitting and cleaning prompts
    # BUG FIX: the original did Image.open(io.BytesIO(img[0])), which assumes
    # raw bytes — with type="pil" the gallery already yields PIL images and
    # io.BytesIO(PIL.Image) raises TypeError. Normalize every entry instead.
    image_objects = [_to_pil(entry) for entry in images]
    image_embeds = [moondream.encode_image(img) for img in image_objects]
    answers = moondream.batch_answer(
        images=image_embeds,
        prompts=prompts,
        tokenizer=tokenizer,
    )
    # BUG FIX: batch_answer may yield a plain string per item; join only real
    # sequences — "\n".join("abc") would insert a newline between every char.
    return [
        "\n".join(ans) if isinstance(ans, (list, tuple)) else str(ans)
        for ans in answers
    ]
# Assemble the Gradio UI: an image gallery plus a comma-separated prompt box
# feed answer_questions; responses land in a text area. `demo` stays the
# module-level app object that Spaces expects.
with gr.Blocks() as demo:
    gr.Markdown("# πŸŒ” moondream2\nA tiny vision language model. [GitHub](https://github.com/vikhyatk/moondream)")
    with gr.Row():
        gallery_in = gr.Gallery(label="Upload Images", type="pil")
        prompt_in = gr.Textbox(
            label="Input Prompts",
            placeholder="Enter prompts separated by commas. Ex: Describe this image, What is in this image?",
            lines=2,
        )
    submit_btn = gr.Button("Submit")
    responses_out = gr.TextArea(label="Responses", lines=4)
    submit_btn.click(answer_questions, [gallery_in, prompt_in], responses_out)

demo.queue().launch()