import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import torch

print(f"Is CUDA available: {torch.cuda.is_available()}")  # True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# print(f"CMAKE_ARGS={os.environ['CMAKE_ARGS']}")
# print(f"FORCE_CMAKE={os.environ['FORCE_CMAKE']}")
print(f"Llama={Llama.__name__}")

models_path = "models/"
if not os.path.exists(models_path):
    os.makedirs(models_path)

downloaded_model_path = hf_hub_download(
    repo_id="miqudev/miqu-1-70b",
    filename="miqu-1-70b.q4_k_m.gguf",
    local_dir=models_path,
)
print(f"Downloaded path: {downloaded_model_path}")

print("Initializing model...")
# Note: temp, n_predict and n_keep are generation-time options, not Llama()
# constructor arguments; temperature and max_tokens are set per call below.
llm = Llama(
    model_path=downloaded_model_path,
    n_ctx=4096,        # context window size
    n_threads=10,      # CPU threads for prompt processing
    n_gpu_layers=100,  # offload all layers to the GPU
    n_batch=512,       # prompt-processing batch size
)
print("Model loaded.")


def mix_query(query, history):
    # Miqu follows the Mistral [INST] ... [/INST] prompt format.
    output = llm(
        f"[INST] {query} [/INST]",
        max_tokens=1024,
        temperature=0.7,
        stop=["</s>"],
        echo=False,
    )
    text = output["choices"][0]["text"]
    print(text)
    return text


demo = gr.ChatInterface(
    fn=mix_query,
    examples=["Explain the Fermi paradox"],
    title="TARS",
    theme="soft",
)
demo.launch(share=True, server_name="0.0.0.0")
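

# Optional sketch (an assumption, not part of the original script): mix_query
# ignores the chat history that Gradio passes in, so every turn is answered in
# isolation. A drop-in alternative that folds prior turns into the
# Mistral-style prompt could look like the function below, assuming the
# default ChatInterface history format of (user, assistant) pairs. To use it,
# define it above the ChatInterface and pass fn=mix_query_with_history.
def mix_query_with_history(query, history):
    prompt = ""
    for user_turn, assistant_turn in history:
        prompt += f"[INST] {user_turn} [/INST] {assistant_turn}</s>"
    prompt += f"[INST] {query} [/INST]"
    output = llm(
        prompt,
        max_tokens=1024,
        temperature=0.7,
        stop=["</s>"],
        echo=False,
    )
    return output["choices"][0]["text"]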