# Inference
import gradio as gr
from huggingface_hub import InferenceClient
model = "meta-llama/Llama-3.2-3B-Instruct"
client = InferenceClient(model)
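# Note: meta-llama models are gated on the Hub, so Inference API calls need an
# access token; with no token argument, InferenceClient falls back to the
# HF_TOKEN environment variable or the locally cached login.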
def fn(
    prompt,
    history,
    system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # Build the conversation: system prompt first, then the prior turns that
    # gr.ChatInterface(type="messages") passes in as history, then the new
    # user message.
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(history)
    messages.append({"role": "user", "content": prompt})
    # Request a streaming chat completion from the Inference API.
    stream = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    # Accumulate streamed tokens and yield the partial text so the UI
    # updates while the model is still generating.
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)
app = gr.ChatInterface(
    fn=fn,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title="Meta Llama",
    description=model,
)
if __name__ == "__main__":
    app.launch()