FoodIdentifier / app.py
rdezwart's picture
Try changing interactive state another different way
1d43352
raw
history blame
4.34 kB
from threading import Thread
import gradio as gr
import torch
from PIL import Image
from transformers import PreTrainedModel # for type hint
from transformers import TextIteratorStreamer, AutoModelForCausalLM, AutoTokenizer # Moondream
from transformers import YolosImageProcessor, YolosForObjectDetection # YOLOS-small-300
# --- Moondream --- #
# Moondream is not usable through the HuggingFace pipeline system, so the
# tokenizer and the model are loaded by hand. Both loads use the same pinned
# revision of the repository.
moondream_id = "vikhyatk/moondream2"
moondream_revision = "2024-04-02"

moondream_tokenizer = AutoTokenizer.from_pretrained(
    moondream_id,
    revision=moondream_revision,
)

# trust_remote_code=True is passed because the model is loaded through
# AutoModelForCausalLM with code that comes from the model repository.
moondream_model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
    moondream_id,
    trust_remote_code=True,
    revision=moondream_revision,
)

# Inference only: switch the model to evaluation mode.
moondream_model.eval()
# --- YOLOS --- #
# Object detector used by detect_objects(): the image processor prepares the
# model inputs and post-processes the raw outputs, the model does the detection.
yolos_id = "hustvl/yolos-small-300"

yolos_processor: YolosImageProcessor = YolosImageProcessor.from_pretrained(
    yolos_id
)
yolos_model: YolosForObjectDetection = YolosForObjectDetection.from_pretrained(
    yolos_id
)
def answer_question(img, prompt):
    """
    Submits an image and prompt to the Moondream model and streams the answer.

    Generation runs on a background thread that feeds a
    ``TextIteratorStreamer``; this generator re-yields the accumulated text
    each time the streamer produces a new chunk.

    :param img: image to ask about, handed to ``moondream_model.encode_image``
        (``None`` when no image has been provided in the UI)
    :param prompt: question text forwarded to the model
    :return: yields the output buffer string, stripped of surrounding whitespace
    :raises gr.Error: if ``img`` is ``None``
    """
    if img is None:
        # An empty image input arrives as None; report it to the user instead
        # of failing inside encode_image.
        raise gr.Error("Please provide an image before submitting.")

    image_embeds = moondream_model.encode_image(img)
    streamer = TextIteratorStreamer(moondream_tokenizer, skip_special_tokens=True)

    # The model call blocks until generation is done, so it runs on its own
    # thread while this generator consumes the streamer.
    thread = Thread(
        target=moondream_model.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": moondream_tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()

    # The streamer has been fully consumed; wait for the worker to exit so it
    # is not left dangling after the request completes.
    thread.join()
def detect_objects(img: Image.Image):
    """
    Runs YOLOS object detection on an image and crops out each detection.

    :param img: PIL image to analyse (``None`` when no image has been uploaded)
    :return: list of ``(cropped image, caption)`` tuples, one per detection
        kept by the 0.7 score threshold; the caption is ``"<label> (<score>)"``
    :raises gr.Error: if ``img`` is ``None``
    """
    if img is None:
        # An empty image input arrives as None; report it to the user instead
        # of failing inside the image processor.
        raise gr.Error("Please upload an image before submitting.")

    inputs = yolos_processor(img, return_tensors="pt")

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = yolos_model(**inputs)

    # PIL reports size as (width, height); the reversed tuple gives the
    # (height, width) order passed to post-processing.
    target_sizes = torch.tensor([tuple(reversed(img.size))])
    results = yolos_processor.post_process_object_detection(
        outputs, threshold=0.7, target_sizes=target_sizes
    )[0]

    box_images = []
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(i, 2) for i in box.tolist()]
        # Resolve the label name and rounded score once; both the log line and
        # the gallery caption use them.
        name = yolos_model.config.id2label[label.item()]
        confidence = round(score.item(), 3)

        print(f"Detected {name} with confidence {confidence} at location {box}")

        box_images.append((
            img.crop((box[0], box[1], box[2], box[3])),
            f"{name} ({confidence})",
        ))

    return box_images
def gallery_selected(evt: gr.SelectData):
    """
    Listener for the gallery selection event.

    Logs the event's index, value and selected flag, then hands back the
    values for the two output components.

    :param evt: selection event data supplied by Gradio
    :return: tuple of (index of the currently selected image, a ``gr.Button``
        whose ``interactive`` flag is set to ``evt.selected``)
    """
    index = evt.index
    is_selected = evt.selected

    print(f"Index: {index}, Value: {evt.value}, Selected: {is_selected}")

    button_state = gr.Button(interactive=is_selected)
    return index, button_state
if __name__ == "__main__":
    # Build the two-tab UI: object detection (YOLOS) and image Q&A (Moondream).
    with gr.Blocks() as app:
        # The Markdown source is kept unindented so the literal carries no
        # leading spaces.
        gr.Markdown(
            """
# Food Identifier
Final project for IAT 481 at Simon Fraser University, Spring 2024.
"""
        )

        with gr.Tab("Object Detection"):
            with gr.Row():
                with gr.Column():
                    yolos_input = gr.Image(type="pil")
                    yolos_submit = gr.Button("Submit")
                yolos_output = gr.Gallery(label="Detected Objects", object_fit="scale-down", columns=3, scale=1,
                                          show_share_button=False, selected_index=None)

            with gr.Row():
                yolos_selected = gr.TextArea(label="Selected Image Index")
                # Starts disabled; gallery_selected() sets its interactive
                # state from the selection event.
                # NOTE(review): no click handler is attached to this button
                # in this block.
                proceed_button = gr.Button("To Moondream", interactive=False)

        with gr.Tab("Inference"):
            with gr.Row():
                moon_prompt = gr.Textbox(label="Input", value="Describe this image.")
                moon_submit = gr.Button("Submit")

            with gr.Row():
                moon_img = gr.Image(label="Image", type="pil")
                moon_output = gr.TextArea(label="Output")

        # --- YOLOS --- #
        yolos_submit.click(detect_objects, [yolos_input], yolos_output)
        # gallery_selected() takes gr.SelectData, which is carried by the
        # gallery's select event, so it is registered on .select rather than
        # on .change.
        yolos_output.select(gallery_selected, None, [yolos_selected, proceed_button])

        # --- Moondream --- #
        moon_submit.click(answer_question, [moon_img, moon_prompt], moon_output)

    app.queue().launch()