File size: 816 Bytes
01a9f90
 
 
a9e9978
01a9f90
a9e9978
 
01a9f90
8434ae1
 
 
a9e9978
 
 
 
01a9f90
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import gradio as gr
from llama_cpp import Llama


# Download (if needed) and load Phi-3.5-mini, 4-bit quantized GGUF, from the
# Hugging Face Hub via llama-cpp-python.
# NOTE(review): use_mlock=True pins the weights in RAM — needs a sufficient
# memlock ulimit on the host; confirm for the deployment environment.
llm = Llama.from_pretrained(
    repo_id="bartowski/Phi-3.5-mini-instruct-GGUF",
    filename="Phi-3.5-mini-instruct-Q4_K_M.gguf",
    numa=True,        # NUMA-aware memory placement
    use_mmap=False,   # read the file fully instead of memory-mapping it
    use_mlock=True,   # lock weights in RAM so they are never swapped out
    seed=-1,          # -1 = random seed (llama.cpp convention)
    # Optional GPU settings, disabled for CPU-only deployment:
    # flash_attn=True,
    # n_gpu_layers=-1,
    n_batch=1024,     # prompt-processing batch size
    n_ctx=4096,       # context window; was 4095 — off-by-one typo for 4096
)

def respond(prompt: str):
    """Stream a chat completion for *prompt*.

    Yields the cumulative response text after each new token — Gradio's
    streaming convention expects the full output-so-far, not the delta.
    """
    chunks = llm.create_chat_completion(
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )

    accumulated = ""
    for piece in chunks:
        delta = piece["choices"][0]["delta"]
        if "content" in delta:
            accumulated += delta["content"]
            yield accumulated


# Gradio UI: one text area in (pre-filled with an example prompt — the string
# is presumably the TextArea's default value; confirm against the gradio API)
# and one text area out. Because `respond` is a generator, the output box
# streams as tokens arrive.
demo = gr.Interface(fn=respond, inputs=[gr.TextArea("What is the capital of France?")], outputs=[gr.TextArea()])
# Bind to 0.0.0.0 so the server is reachable from outside the host
# (e.g. when running inside a container); fixed port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)