import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Load the Qwen-VL model and tokenizer.
# trust_remote_code is required: Qwen-VL ships custom code, including the
# from_list_format() and draw_bbox_on_latest_picture() helpers used below.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-VL", device_map="cuda", trust_remote_code=True
).eval()

def generate_predictions(image_input, text_input):
    # Save the image locally: the Qwen-VL tokenizer expects an image *path*
    # in its list format. Convert to RGB so JPEG saving works for RGBA inputs.
    user_image_path = "/tmp/user_input_test_image.jpg"
    image_input.convert("RGB").save(user_image_path)
    image_input = Image.open(user_image_path)

    # Build the multimodal query (image + text) in Qwen-VL's list format.
    query = tokenizer.from_list_format([
        {'image': user_image_path},
        {'text': text_input},
    ])
    inputs = tokenizer(query, return_tensors='pt')
    inputs = inputs.to(model.device)

    # Generate the response. Special tokens are kept because the <ref>/<box>
    # markers are needed to draw bounding boxes.
    pred = model.generate(**inputs)
    response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

    # Draw bounding boxes if the response contains any; the helper returns
    # None when there are no boxes, so fall back to the original image.
    image_with_boxes = tokenizer.draw_bbox_on_latest_picture(response)
    if image_with_boxes is None:
        image_with_boxes = image_input
    return image_with_boxes, response

# Create the Gradio interface. type="pil" ensures the handler receives a
# PIL.Image (the default numpy array has no .save() method).
iface = gr.Interface(
    fn=generate_predictions,
    inputs=[gr.Image(type="pil"), gr.Textbox()],
    outputs=[gr.Image(), gr.Textbox()],
)
iface.launch()
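
# --- Optional usage sketch (an assumption, not part of the original demo) ---
# Once iface.launch() is running, the endpoint can also be queried from a
# separate process with the gradio_client package. This sketch assumes a
# local server on the default port and a placeholder image file "demo.jpg";
# on recent gradio versions the image argument must be wrapped with
# gradio_client.handle_file().
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   boxed_image, response = client.predict(
#       handle_file("demo.jpg"),
#       "Generate the caption in English with grounding:",
#       api_name="/predict",
#   )
#   print(response)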