import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Load the Qwen-VL model and tokenizer.
# trust_remote_code is required: Qwen-VL ships custom code, including the
# from_list_format() and draw_bbox_on_latest_picture() helpers used below.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-VL", device_map="cuda", trust_remote_code=True
).eval()

def generate_predictions(image_input, text_input):
    # Save the image locally: the Qwen-VL tokenizer expects an image *path*
    # in its list format. Convert to RGB so JPEG saving works for RGBA inputs.
    user_image_path = "/tmp/user_input_test_image.jpg"
    image_input.convert("RGB").save(user_image_path)
    image_input = Image.open(user_image_path)

    # Build the multimodal query (image + text) in Qwen-VL's list format.
    query = tokenizer.from_list_format([
        {'image': user_image_path},
        {'text': text_input},
    ])
    inputs = tokenizer(query, return_tensors='pt')
    inputs = inputs.to(model.device)

    # Generate the response. Special tokens are kept because the <ref>/<box>
    # markers are needed to draw bounding boxes.
    pred = model.generate(**inputs)
    response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

    # Draw bounding boxes if the response contains any; the helper returns
    # None when there are no boxes, so fall back to the original image.
    image_with_boxes = tokenizer.draw_bbox_on_latest_picture(response)
    if image_with_boxes is None:
        image_with_boxes = image_input
    return image_with_boxes, response

# Create the Gradio interface. type="pil" ensures the handler receives a
# PIL.Image (the default numpy array has no .save() method).
iface = gr.Interface(
    fn=generate_predictions,
    inputs=[gr.Image(type="pil"), gr.Textbox()],
    outputs=[gr.Image(), gr.Textbox()],
)
iface.launch()
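
# --- Optional usage sketch (an assumption, not part of the original demo) ---
# Once iface.launch() is running, the endpoint can also be queried from a
# separate process with the gradio_client package. This sketch assumes a
# local server on the default port and a placeholder image file "demo.jpg";
# on recent gradio versions the image argument must be wrapped with
# gradio_client.handle_file().
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   boxed_image, response = client.predict(
#       handle_file("demo.jpg"),
#       "Generate the caption in English with grounding:",
#       api_name="/predict",
#   )
#   print(response)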