import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

# Install the pinned llama.cpp bindings (from the cu124 wheel index) and the
# agent wrapper at start-up. Each command is run through the shell and its
# exit status is not checked.
for _install_command in (
    'pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124',
    'pip install llama-cpp-agent==0.2.8',
):
    subprocess.run(_install_command, shell=True)

# Fetch the GGUF model file into ./models, the directory respond() loads from.
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q6_K.gguf",
    local_dir="./models",
)

@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the model's reply to ``message`` for the Gradio chat UI.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
        system_message: System prompt given to the agent.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Yields:
        The reply accumulated so far; each yielded string extends the
        previous one by the newest streamed chunk.
    """
    # Imported here rather than at module level because both packages are
    # pip-installed by this script at start-up.
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent import MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    llm = Llama(
        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_gpu_layers=33,
    )
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True,
    )

    # Honour the values chosen in the UI instead of hard-coded defaults.
    settings = provider.get_provider_default_settings()
    settings.max_tokens = max_tokens
    settings.temperature = temperature
    settings.top_p = top_p
    settings.stream = True

    # Each history entry is one exchange: element 0 is the user's text and
    # element 1 the assistant's reply. Emit one message per speaker and skip
    # empty/None halves (e.g. a turn that has no reply yet).
    messages = BasicChatHistory()
    for user_text, assistant_text in history:
        if user_text:
            messages.add_message({'role': Roles.user, 'content': user_text})
        if assistant_text:
            messages.add_message({'role': Roles.assistant, 'content': assistant_text})

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
    )

    # Yield the full text so far on every chunk, not just the new chunk.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs

# Extra controls shown with the chat box. They are listed in the same order as
# respond()'s parameters after (message, history).
_extra_controls = [
    gr.Textbox(label="System message", value="You are a helpful assistant."),
    gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        minimum=0.1,
        maximum=1.0,
        step=0.05,
        value=0.95,
    ),
]

demo = gr.ChatInterface(fn=respond, additional_inputs=_extra_controls)

# Launch the chat interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()