import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
from datetime import datetime
import numpy as np
import os

# Load the model in half-precision on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "./Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("./Qwen2-VL-7B-Instruct")

# Load the sentence-embedding model once at startup instead of on every request
embedding_model = SentenceTransformer("./all-MiniLM-L6-v2")


def array_to_image_path(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")

    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))

    # Generate a unique filename using a timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"

    # Save the image
    img.save(filename)

    # Return the full path of the saved image
    return os.path.abspath(filename)


def generate_embeddings(text):
    # Encode the text and convert to a plain list so it is JSON-serializable
    embeddings = embedding_model.encode(text)
    return embeddings.tolist()


def describe_image(image):
    # Save the uploaded array to disk and reload it as a PIL image for the processor
    image_path = array_to_image_path(image)
    pil_image = Image.open(image_path)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Make a very detailed description of the image."},
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Expected prompt: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Make a very detailed description of the image.<|im_end|>\n<|im_start|>assistant\n'

    inputs = processor(
        text=[text_prompt], images=[pil_image], padding=True, return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Inference: generation of the output
    output_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    # Remove the temporary image file
    os.remove(image_path)

    # Return the detailed description and its sentence embedding
    return output_text, generate_embeddings(output_text)


# Create a Gradio interface
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(),
    outputs=[gr.Textbox(label="Description"), gr.JSON(label="Embeddings")],
    title="Image Description with Qwen Model",
    description="Upload an image to get a detailed description using the Qwen2-VL-7B-Instruct model.",
)

# Launch the app
# iface.launch(share=True)
iface.launch()