import gradio as gr
from llama_cpp import Llama

# Download the GGUF weights from the Hugging Face Hub and load the model.
llm = Llama.from_pretrained(
    repo_id="bartowski/Phi-3.5-mini-instruct-GGUF",
    filename="Phi-3.5-mini-instruct-Q4_K_M.gguf",
    numa=True,              # NUMA-aware memory placement
    # flash_attn=True,      # uncomment to enable flash attention
    # n_gpu_layers=-1,      # uncomment to offload all layers to the GPU
    n_batch=1024,           # prompt-processing batch size
    n_ctx=4095,             # context window size
)


def respond(prompt: str):
    # Stream the chat completion, accumulating the text and yielding the
    # partial response so Gradio updates the output box as tokens arrive.
    stream = llm.create_chat_completion(
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    response = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response


demo = gr.Interface(
    fn=respond,
    inputs=[gr.TextArea("What is the capital of France?")],
    outputs=[gr.TextArea()],
)
demo.launch(server_name="0.0.0.0", server_port=7860)
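
# Usage note (a minimal sketch; the filename app.py is an assumption, not
# from the source):
#   pip install llama-cpp-python gradio
#   python app.py
# The app listens on all interfaces, so open http://localhost:7860 (or the
# host's address) in a browser to chat with the model.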