|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
import gradio as gr |
|
import spaces |
|
from PIL import Image |
|
import hashlib |
|
import base64 |
|
|
|
def load_md2():
    """Load the moondream2 vision-language model onto the CPU.

    Uses the pinned 2025-01-09 revision with remote code enabled so the
    custom model class shipped with the checkpoint can be executed.

    Returns:
        The instantiated ``AutoModelForCausalLM`` model, resident on CPU.
    """
    return AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2",
        device_map="cpu",
        trust_remote_code=True,
        revision="2025-01-09",
    )
|
|
|
# Load the model once at import time and share it across all requests.
# NOTE(review): the original had a bare `global md2` statement here; at
# module scope that is a no-op (module-level names are already global),
# so it has been removed. Behavior is unchanged.
md2 = load_md2()
|
|
|
@spaces.GPU()
def moondream2(question, image, history=None):
    """Run a moondream2 query (or caption) on *image* and extend the history.

    Args:
        question: Prompt text. When ``None`` or empty, the model produces a
            caption of the image instead of answering a question.
        image: PIL image to analyze.
        history: Optional list of previous result dicts; it is copied, not
            mutated in place.

    Returns:
        A pair ``(results, results)`` — the same list twice, matching the two
        Gradio outputs (the visible JSON panel and the hidden history state).
    """
    global md2
    model = md2
    model.cuda()  # move the shared model onto the GPU for this call
    try:
        # Fingerprint a fixed-size downscale of the image so identical
        # inputs produce identical hashes regardless of original size.
        bts = image.resize((224, 224), Image.NEAREST).tobytes()
        hsh = hashlib.sha256(bts).hexdigest()
        b64 = base64.b64encode(bts).decode("utf-8")

        has_question = question is not None and question != ""
        res = model.query(image, question) if has_question else model.caption(image)
    finally:
        # Always return the model to CPU, even if inference raises,
        # so a failed request does not leave the model pinned on GPU.
        model.cpu()

    # BUG FIX: the original indexed the *question string* with
    # question["answer"] / question["caption"], which raises TypeError.
    # The intent — answer when a prompt was given, caption otherwise —
    # is implemented directly here.
    ress = list(history) if history is not None else []
    ress.append({
        "answer": res if has_question else None,
        "caption": res if not has_question else None,
        "sha256": hsh,
        "image_b64": b64,
    })
    return ress, ress
|
|
|
def gui():
    """Build the Gradio front-end for the moondream2 demo and launch it."""
    with gr.Blocks() as demo:
        with gr.Row():
            image_in = gr.Image(label="input", type="pil", elem_id="imgs")
        with gr.Row():
            prompt_in = gr.Textbox(label="prompt")
        with gr.Row():
            run_btn = gr.Button("Run")
        with gr.Row():
            output_json = gr.JSON(label="output")
        # Hidden row: carries the running result history between clicks.
        with gr.Row(visible=False):
            history_state = gr.JSON(label="history")
        run_btn.click(
            moondream2,
            inputs=[prompt_in, image_in, history_state],
            outputs=[output_json, history_state],
        )
    demo.launch(share=False)
|
|
|
# Script entry point: build and launch the Gradio UI when run directly.
if __name__ == "__main__":
    gui()