import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import torch

print(f"Is CUDA available: {torch.cuda.is_available()}")  # True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# print(f"CMAKE_ARGS={os.environ['CMAKE_ARGS']}")
# print(f"FORCE_CMAKE={os.environ['FORCE_CMAKE']}")
print(f"Llama={Llama.__name__}")

models_path = "models/"
if not os.path.exists(models_path):
    os.makedirs(models_path)

downloaded_model_path = hf_hub_download(
    repo_id="miqudev/miqu-1-70b",
    filename="miqu-1-70b.q4_k_m.gguf",
    local_dir=models_path,
)
print(f"Downloaded path: {downloaded_model_path}")

print("Initializing model...")
# Note: temp, n_predict and n_keep are generation-time options, not Llama()
# constructor arguments; temperature and max_tokens are set per call below.
llm = Llama(
    model_path=downloaded_model_path,
    n_ctx=4096,        # context window size
    n_threads=10,      # CPU threads for prompt processing
    n_gpu_layers=100,  # offload all layers to the GPU
    n_batch=512,       # prompt-processing batch size
)
print("Model loaded.")


def mix_query(query, history):
    # Miqu follows the Mistral [INST] ... [/INST] prompt format.
    output = llm(
        f"[INST] {query} [/INST]",
        max_tokens=1024,
        temperature=0.7,
        stop=["</s>"],
        echo=False,
    )
    text = output["choices"][0]["text"]
    print(text)
    return text


demo = gr.ChatInterface(
    fn=mix_query,
    examples=["Explain the Fermi paradox"],
    title="TARS",
    theme="soft",
)
demo.launch(share=True, server_name="0.0.0.0")
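

# Optional sketch (an assumption, not part of the original script): mix_query
# ignores the chat history that Gradio passes in, so every turn is answered in
# isolation. A drop-in alternative that folds prior turns into the
# Mistral-style prompt could look like the function below, assuming the
# default ChatInterface history format of (user, assistant) pairs. To use it,
# define it above the ChatInterface and pass fn=mix_query_with_history.
def mix_query_with_history(query, history):
    prompt = ""
    for user_turn, assistant_turn in history:
        prompt += f"[INST] {user_turn} [/INST] {assistant_turn}</s>"
    prompt += f"[INST] {query} [/INST]"
    output = llm(
        prompt,
        max_tokens=1024,
        temperature=0.7,
        stop=["</s>"],
        echo=False,
    )
    return output["choices"][0]["text"]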