import gradio as gr
from llama_cpp import Llama

# Download the quantized Phi-3.5-mini GGUF from the Hugging Face Hub
# and load it with llama.cpp.
llm = Llama.from_pretrained(
    repo_id="bartowski/Phi-3.5-mini-instruct-GGUF",
    filename="Phi-3.5-mini-instruct-Q4_K_M.gguf",
    numa=True,          # NUMA-aware memory allocation
    use_mmap=False,     # load the full model into RAM instead of memory-mapping it
    use_mlock=True,     # lock model memory so it cannot be swapped out
    seed=-1,            # -1 picks a random seed on each run
    # flash_attn=True,  # uncomment to enable flash attention (GPU builds)
    # n_gpu_layers=-1,  # uncomment to offload all layers to the GPU
    n_batch=1024,       # prompt-processing batch size
    n_ctx=4095,         # context window in tokens
)

def respond(prompt: str):
    """Stream the chat completion back to the UI as it is generated."""
    stream = llm.create_chat_completion(
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    response = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            # Yielding the accumulated text lets Gradio update the output live.
            yield response

demo = gr.Interface(
    fn=respond,
    inputs=[gr.TextArea("What is the capital of France?")],
    outputs=[gr.TextArea()],
)
demo.launch(server_name="0.0.0.0", server_port=7860)
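
# A minimal usage sketch: querying the running app from another process
# with the gradio_client package (assumed installed; the localhost URL is
# an assumption and depends on where the app is actually served).
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   # predict() returns the final value yielded by the streaming endpoint;
#   # "/predict" is the default api_name for a gr.Interface.
#   result = client.predict("What is the capital of France?", api_name="/predict")
#   print(result)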