# Inference
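# Gradio chat app that streams responses for a Llama model via the Hugging Face Inference API.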
import gradio as gr
from huggingface_hub import InferenceClient
model = "meta-llama/Llama-3.2-3B-Instruct"
client = InferenceClient(model)
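# Note (assumption): Llama 3.2 is a gated model on the Hub, so serverless inference may require
# an access token, e.g. InferenceClient(model, token=...) or the HF_TOKEN environment variable / Space secret.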
def fn(
    prompt,
    history,
    system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # Build the message list: system prompt first, then prior turns, then the new user message.
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": prompt})
    stream = client.chat.completions.create(
        model = model,
        messages = messages,
        max_tokens = max_tokens,
        temperature = temperature,
        top_p = top_p,
        stream = True,
    )
    # Accumulate streamed tokens and yield the partial response so the UI updates as text arrives.
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)
app = gr.ChatInterface(
    fn = fn,
    type = "messages",
    additional_inputs = [
        gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title = "Meta Llama",
    description = model,
)
if __name__ == "__main__":
    app.launch()