## Due to a small bug when installing exllamav2 from the dev branch directly, we need to set up the CUDA paths first
import cuda_bug
cuda_bug.install_cuda_toolkit_requirements()
##

import base64
import os
import sys

import gradio as gr
import requests
import spaces
import torch
from gradio.data_classes import FileData
from huggingface_hub import snapshot_download
from pathlib import Path
from PIL import Image

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
)

default_bpw = "4.0bpw"
available_models = [
    "2.5bpw",
    "3.0bpw",
    "3.5bpw",
    "4.0bpw",
    "4.5bpw",
    "5.0bpw",
    "6.0bpw",
    "8.0bpw",
]

# Download every quant ahead of time so switching models only triggers a (re)load, not a download
dirs = {}
for model in available_models:
    dirs[model] = snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)


@spaces.GPU(duration=45)
def run_inference(message, history, model_picked):
    local_dir = dirs[model_picked]
    print(message)
    print(history)

    # Load the model only once the GPU is available
    config = ExLlamaV2Config(local_dir)
    config.max_seq_len = 16384

    vision_model = ExLlamaV2VisionTower(config)
    vision_model.load(progress=True)

    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy=True, max_seq_len=16384)
    model.load_autosplit(cache, progress=True)
    tokenizer = ExLlamaV2Tokenizer(config)

    generator = ExLlamaV2DynamicGenerator(
        model=model,
        cache=cache,
        tokenizer=tokenizer,
    )

    # Build the prompt in Mistral instruct format. Each image is represented by a
    # unique {{IMAGE_n}} alias, which is bound to its embedding via text_alias.
    prompt = ""
    image_prompt = ""
    images_embeddings = []
    for couple in history:
        if type(couple[0]) is tuple:
            # History entry containing uploaded images: embed each one and keep its alias
            # so it can be referenced in the following text turn.
            images_embeddings += [
                vision_model.get_image_embeddings(
                    model=model,
                    tokenizer=tokenizer,
                    image=img,
                    text_alias=alias,
                )
                for (alias, img) in [
                    ("{{IMAGE_" + str(len(images_embeddings) + i + 1) + "}}", Image.open(path))
                    for i, path in enumerate(couple[0])
                ]
            ]
            image_prompt = ""
            for i in range(len(couple[0])):
                image_prompt += "{{IMAGE_" + str(len(images_embeddings) - len(couple[0]) + i + 1) + "}}"
        elif couple[0]:
            # Plain text turn: prepend any pending image aliases, then append the model's reply
            prompt += "[INST]" + image_prompt + couple[0] + "[/INST]"
            prompt += couple[1]

    if type(message) is dict:
        # Current multimodal message: embed its images and reference them before the text
        images_embeddings += [
            vision_model.get_image_embeddings(
                model=model,
                tokenizer=tokenizer,
                image=img,
                text_alias=alias,
            )
            for (alias, img) in [
                (
                    "{{IMAGE_" + str(len(images_embeddings) + i + 1) + "}}",
                    Image.open(path["path"] if type(path) is dict else path),
                )
                for i, path in enumerate(message["files"])
            ]
        ]
        image_prompt = ""
        for i in range(len(message["files"])):
            image_prompt += "{{IMAGE_" + str(len(images_embeddings) - len(message["files"]) + i + 1) + "}}"
        prompt += "[INST]" + image_prompt + message["text"] + "[/INST]"
    else:
        prompt += "[INST]" + image_prompt + message + "[/INST]"

    print(prompt)

    # Generate the response, streaming partial output back to the UI
    for out in generator.generate(
        prompt=prompt,
        max_new_tokens=1024,
        temperature=0.15,
        add_bos=True,
        encode_special_tokens=True,
        decode_special_tokens=True,
        stop_conditions=[tokenizer.eos_token_id],
        gen_settings=ExLlamaV2Sampler.Settings.greedy(),
        embeddings=images_embeddings,
        stream=True,
    ):
        # Strip everything up to the last [/INST] so only the reply is shown
        if "[/INST]" in out:
            result = out.split("[/INST]")[-1]
        else:
            result = out
        print(result)
        yield result


description = """
A demo chat interface with Pixtral 12B EXL2 Quants, deployed using **ExllamaV2**!

The model is loaded once the GPU is available. By default, this space loads Pixtral at 4.0bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). Other quantization options are available.

The current version of ExllamaV2 running is the dev branch, not the master branch: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev).

The model at **4bpw and 16k context size fits in less than 12GB of VRAM**!

The current settings are:
- Context Size: 16k tokens
- Max Output: 1024 tokens
- Temperature: 0.15

You can select other quants and experiment!

Thanks, turboderp!
"""

examples = [
    [
        {
            "text": "What are the similarities and differences between these two experiments?",
            "files": ["test_image_1.jpg", "test_image_2.jpg"],
        },
    ]
]

drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)

demo = gr.ChatInterface(
    fn=run_inference,
    examples=examples,
    title="Pixtral 12B EXL2",
    multimodal=True,
    description=description,
    additional_inputs=drop,
)

demo.queue().launch()
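
# Local usage note (a sketch, not part of the original space): assuming a CUDA GPU and the
# dependencies imported above, the app runs as a standard Gradio script, e.g.:
#
#   pip install gradio spaces pillow requests huggingface_hub
#   pip install git+https://github.com/turboderp/exllamav2@dev
#   python app.py
#
# The file name `app.py` and the exact install commands are assumptions; the `cuda_bug`
# module imported at the top is a helper local to this space and is not installed via pip.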