|
import gradio as gr |
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
import json |
|
import torch |
|
import requests |
|
import time |
|
import random |
|
from PIL import Image |
|
from typing import Union |
|
|
|
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Report the selected device once at start-up.
if device == "cpu":
    print("Using CPU")
else:
    print(f"Using {device}")
|
|
|
# (model, tokenizer) pairs that have already been loaded, keyed by
# (model_id, revision), so a checkpoint is loaded at most once per process
# instead of once per MoonDream() construction.
_LOADED_MODELS = {}


def _load_model(model_id="vikhyatk/moondream2", revision="2024-05-08"):
    """Return the ``(model, tokenizer)`` pair for a moondream checkpoint.

    The pair is created on first use and reused on later calls with the
    same arguments.

    Args:
        model_id: Hub identifier passed to ``from_pretrained``.
        revision: Checkpoint revision passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is loaded with
        ``device_map`` set to the module-level ``device``.
    """
    key = (model_id, revision)
    if key not in _LOADED_MODELS:
        # NOTE: trust_remote_code=True lets the checkpoint repository supply
        # Python code that is executed locally; keep the revision pinned.
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, trust_remote_code=True, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device,
            trust_remote_code=True,
            revision=revision,
        )
        _LOADED_MODELS[key] = (model, tokenizer)
    return _LOADED_MODELS[key]
|
|
|
class MoonDream:
    """Callable wrapper that asks a moondream model one question per image."""

    def __init__(self, model=None, tokenizer=None):
        """Store the model/tokenizer pair, loading the default when needed.

        Args:
            model: Object exposing ``encode_image``, ``answer_question`` and
                ``to``. When omitted, the pair from ``_load_model()`` is used.
            tokenizer: Tokenizer handed to ``answer_question``. When omitted,
                the pair from ``_load_model()`` is used.
        """
        # Compare against None explicitly: a supplied object may define
        # __len__/__bool__, and a truthiness test could then discard it.
        if model is None or tokenizer is None:
            # The two are only meaningful as a pair, so replace both.
            model, tokenizer = _load_model()
        self.model, self.tokenizer = model, tokenizer
        self.device = device
        self.model.to(self.device)

    def __call__(self, question, imgs):
        """Yield the model's answer to ``question`` for each image in turn.

        Args:
            question: Prompt passed to ``answer_question`` for every image.
            imgs: Iterable of images accepted by ``encode_image``.

        Yields:
            One ``answer_question`` result per image, in input order.
        """
        for img in imgs:
            image_embeds = self.model.encode_image(img)
            yield self.model.answer_question(
                question=question,
                image_embeds=image_embeds,
                tokenizer=self.tokenizer,
            )
|
|
|
def _respond_one(question, img):
    """Stream the answer to ``question`` for a single image.

    ``MoonDream.__call__`` is a generator, so its results have to be
    iterated; concatenating the generator object itself to a string raises
    ``TypeError``.

    Args:
        question: Prompt to ask about the image.
        img: The image to describe.

    Yields:
        The text accumulated so far after each answer.

    Returns:
        The final accumulated text (available as ``StopIteration.value``).
    """
    txt = ""
    for answer in MoonDream()(question, [img]):
        txt += answer
        yield txt
    return txt
|
|
|
def respond_batch(question, **imgs):
    """Ask ``question`` about every image in each keyword-supplied group.

    Args:
        question: Prompt asked about every image.
        **imgs: Each keyword value is an iterable of images; the keyword
            names themselves are not used.

    Yields:
        Each answer in order, followed by a six-newline separator string
        once a group's answers have all been produced.
    """
    describer = MoonDream()
    for group in imgs.values():
        yield from describer(question, group)
        # NOTE(review): separator is emitted once per group — confirm this
        # matches the intended placement.
        yield "\n\n\n\n\n\n"
|
|
|
# Smoke test run at start-up: describe three solid-colour swatches and print
# the answers until the batch separator shows up.
red, green, blue = (
    Image.new("RGB", (192, 192), rgb)
    for rgb in ((255, 0, 0), (0, 255, 0), (0, 0, 255))
)

res = respond_batch(
    "What color is this? Elaborate upon what emotion registers most strongly with you upon viewing. ",
    imgs=[red, green, blue],
)

for r in res:
    print(r)
    # The six-newline string marks the end of the batch's answers.
    if "\n\n\n\n\n\n" in r:
        break
|
|
|
def dual_images(img1: "Union[Image.Image, None]"):
    """Stream a plain-English description of one image.

    Args:
        img1: The image to describe, or ``None`` when no image was supplied
            (presumably what the UI passes for an empty image input —
            verify against the Gradio version in use).

    Yields:
        The description text accumulated so far; a single empty string when
        ``img1`` is ``None``.
    """
    if img1 is None:
        # Nothing to describe: clear the output instead of handing None to
        # the model's image encoder.
        yield ""
        return

    md = MoonDream()
    txt = ""
    for r in md("Describe the image in plain english ", [img1]):
        txt += r
        yield txt
|
|
|
import os |
|
|
|
# Read the Together API key from a local file and expose it through the
# environment, where the completion helpers look it up.
with open("together_key.txt", mode="r") as key_file:
    os.environ["TOGETHER_KEY"] = key_file.read().strip()
print("Set together key")
|
|
|
def merge_descriptions_to_prompt(mi, d1, d2):
    """Ask the Together completion model to merge two descriptions into one.

    Args:
        mi: Merge-specific instructions inserted into the prompt.
        d1: First description.
        d2: Second description.

    Returns:
        The completion text up to (not including) the first code fence.
    """
    from together import Together

    client = Together(api_key=os.getenv("TOGETHER_KEY"))
    prompt = f"""Describe what would result if the following two descriptions were describing one thing.
### Description 1:
```text
{d1}
```
### Description 2:
```text
{d2}
```
Merge-Specific Instructions:
```text
{mi}
```
Ensure you end your output with ```\\n
---
Complete Description:
```text"""
    completion = client.completions.create(
        prompt=prompt,
        model="meta-llama/Meta-Llama-3-70B",
        stop=["```"],
        max_tokens=1024,
    )
    # Keep only what precedes the first fence, in case one slipped through.
    merged, _, _ = completion.choices[0].text.partition("```")
    return merged
|
|
|
def xform_image_description(img, inst):
    """Describe an image, then have the completion model elaborate on it.

    Args:
        img: Image passed to ``dual_images`` for the initial description.
        inst: Extra instructions inserted into the prompt.

    Returns:
        The completion text up to (not including) the first code fence.
    """
    from together import Together

    # dual_images is a generator that yields the text accumulated so far, so
    # the last item is the complete description. Interpolating the generator
    # object itself would put its repr into the prompt.
    desc = ""
    for desc in dual_images(img):
        pass

    tog = Together(api_key=os.getenv("TOGETHER_KEY"))
    prompt = f"""Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text"""
    res = tog.completions.create(
        prompt=prompt,
        model="meta-llama/Meta-Llama-3-70B",
        stop=["```"],
        max_tokens=1024,
    )
    # The completion text is not prefixed with the prompt (no echo is
    # requested), so it must not be sliced by len(prompt); this matches
    # merge_descriptions_to_prompt.
    return res.choices[0].text.split("```")[0]
|
|
|
# Gradio UI: a single-image description section, a two-image section, and a
# button that merges the two descriptions into one.
# NOTE(review): the Row/Column nesting below is reconstructed from statement
# order — confirm it matches the intended layout.
with gr.Blocks() as demo:
    with gr.Row(visible=True):
        with gr.Column():
            # Single image -> description.
            with gr.Row():
                img = gr.Image(label="images", type='pil')
            with gr.Row():
                btn = gr.Button("submit")
            with gr.Row():
                otpt = gr.Textbox(label="output", lines=3, interactive=True)
            # Two images, each described into its own textbox.
            with gr.Row():
                with gr.Column():
                    im1 = gr.Image(label="image 1", type='pil')
                with gr.Column():
                    im2 = gr.Image(label="image 2", type='pil')
            with gr.Row():
                btn2 = gr.Button("submit batch")
            with gr.Row():
                with gr.Column():
                    otp2 = gr.Textbox(label="individual batch output (left)", interactive=True)
                with gr.Column():
                    otp3 = gr.Textbox(label="individual batch output (right)", interactive=True)
            # Merge the two descriptions according to free-form instructions.
            with gr.Row():
                minst = gr.Textbox(label="Merge Instructions")
            with gr.Row():
                btn_scd = gr.Button("Merge Descriptions to Single Combined Description")
            with gr.Row():
                otp4 = gr.Textbox(label="batch output ( combined )", interactive=True, lines=4)
    # "submit batch" has two handlers: one per image/textbox pair.
    btn2.click(dual_images, inputs=[im1], outputs=[otp2])
    btn2.click(dual_images, inputs=[im2], outputs=[otp3])
    btn.click(dual_images, inputs=[img], outputs=[otpt])
    # Inputs map positionally to (mi, d1, d2).
    btn_scd.click(merge_descriptions_to_prompt, inputs=[minst, otp2, otp3], outputs=[otp4])

# Start the app with the debug and share flags enabled.
demo.launch(debug=True, share=True)