# SeeForMe-Live / webcam_gradio_demo.py
import argparse
import time
from threading import Thread
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import spaces
from moondream.hf import LATEST_REVISION, detect_device
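
# Usage: python webcam_gradio_demo.py [--cpu]
# The --cpu flag forces CPU inference with float32; otherwise detect_device()
# picks the best available accelerator and a matching dtype.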
parser = argparse.ArgumentParser()
parser.add_argument("--cpu", action="store_true")
args = parser.parse_args()
if args.cpu:
    device = torch.device("cpu")
    dtype = torch.float32
else:
    device, dtype = detect_device()
    if device != torch.device("cpu"):
        print("Using device:", device)
        print("Using dtype:", dtype)
        print("If you run into issues, pass the `--cpu` flag to this script.")
        print()
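
# Load the moondream2 vision-language model and tokenizer from the Hub,
# pinned to LATEST_REVISION, and move the weights to the chosen device/dtype.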
model_id = "vikhyatk/moondream2"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=LATEST_REVISION)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=LATEST_REVISION
).to(device=device, dtype=dtype)
moondream.eval()
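

# Stream an answer for one (image, prompt) pair. On Hugging Face ZeroGPU
# hardware, @spaces.GPU requests a GPU slot for roughly the given duration
# in seconds. moondream.answer_question runs in a background thread so
# partial text can be yielded from the TextIteratorStreamer as tokens arrive.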
@spaces.GPU(duration=10)
def answer_question(img, prompt):
    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
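

# Build the Gradio UI: a prompt textbox, a streaming image input, and a
# Markdown panel that continuously shows the model's latest description.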
with gr.Blocks() as demo:
    gr.Markdown("# See For Me")
    gr.HTML(
        """
        <style type="text/css">
            .md_output p {
                padding-top: 1rem;
                font-size: 1.2rem !important;
            }
        </style>
        """
    )
    with gr.Row():
        prompt = gr.Textbox(
            label="Prompt",
            value="What's going on? Respond with a single sentence.",
            interactive=True,
        )
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image", streaming=True)
        output = gr.Markdown(elem_classes=["md_output"])
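
    # Module-level state updated by the change handlers below; the live loop
    # reads these to decide which frame and prompt to caption next.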
    latest_img = None
    latest_prompt = prompt.value
    @img.change(inputs=[img])
    def img_change(img):
        global latest_img
        latest_img = img

    @prompt.change(inputs=[prompt])
    def prompt_change(prompt):
        global latest_prompt
        latest_prompt = prompt
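
    # On page load, start an endless loop that captions the most recent frame
    # with the current prompt and streams the text into the output panel.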
    @demo.load(outputs=[output])
    def live_video():
        while True:
            if latest_img is None:
                time.sleep(0.1)
            else:
                for text in answer_question(latest_img, latest_prompt):
                    if len(text) > 0:
                        yield text
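

# queue() enables streaming of generator outputs to the browser;
# share=True additionally creates a public link when run locally.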
demo.queue().launch(debug=True, share=True)