# Inference
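# Minimal Gradio chat demo that streams completions from
# meta-llama/Llama-3.2-3B-Instruct through huggingface_hub's InferenceClient.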
import gradio as gr
from huggingface_hub import InferenceClient
model = "meta-llama/Llama-3.2-3B-Instruct"
client = InferenceClient(model)
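# Note: requests go to the hosted (serverless) Inference API for this model.
# meta-llama checkpoints are gated on the Hub, so a token with access
# (e.g. via the HF_TOKEN environment variable) may be needed depending on your setup.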
def fn(
    prompt,
    history: list,
    # system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # With type="messages", `history` is already a list of OpenAI-style
    # {"role": ..., "content": ...} dicts, so append the new user turn directly.
    history.append({"role": "user", "content": prompt})
    # To use a system prompt, prepend it to the conversation, e.g.:
    # history.insert(0, {"role": "system", "content": system_prompt})
    stream = client.chat.completions.create(
        model=model,
        messages=history,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    # Accumulate the streamed chunks and yield the partial text so the
    # chat window updates as tokens arrive.
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)
app = gr.ChatInterface(
    fn=fn,
    type="messages",
    additional_inputs=[
        # gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title="Meta Llama",
    description=model,
)
if __name__ == "__main__":
    app.launch()
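# When run directly (e.g. `python app.py`), launch() starts a local server;
# on Hugging Face Spaces this file is executed automatically by the Gradio SDK.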