import gradio as gr
from llama_cpp import Llama

# Download the quantized Phi-3.5-mini GGUF from the Hugging Face Hub
# and load it with llama.cpp.
llm = Llama.from_pretrained(
    repo_id="bartowski/Phi-3.5-mini-instruct-GGUF",
    filename="Phi-3.5-mini-instruct-Q4_K_M.gguf",
    numa=True,          # NUMA-aware memory allocation
    use_mmap=False,     # load the full model into RAM instead of memory-mapping it
    use_mlock=True,     # lock model memory so it cannot be swapped out
    seed=-1,            # -1 picks a random seed on each run
    # flash_attn=True,  # uncomment to enable flash attention (GPU builds)
    # n_gpu_layers=-1,  # uncomment to offload all layers to the GPU
    n_batch=1024,       # prompt-processing batch size
    n_ctx=4095,         # context window in tokens
)

def respond(prompt: str):
    """Stream the chat completion back to the UI as it is generated."""
    stream = llm.create_chat_completion(
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    response = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            # Yielding the accumulated text lets Gradio update the output live.
            yield response

demo = gr.Interface(
    fn=respond,
    inputs=[gr.TextArea("What is the capital of France?")],
    outputs=[gr.TextArea()],
)
demo.launch(server_name="0.0.0.0", server_port=7860)
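
# A minimal usage sketch: querying the running app from another process
# with the gradio_client package (assumed installed; the localhost URL is
# an assumption and depends on where the app is actually served).
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   # predict() returns the final value yielded by the streaming endpoint;
#   # "/predict" is the default api_name for a gr.Interface.
#   result = client.predict("What is the capital of France?", api_name="/predict")
#   print(result)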