# Inference
import gradio as gr
from huggingface_hub import InferenceClient
model = "meta-llama/Llama-3.2-3B-Instruct"
client = InferenceClient(model)
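# Note: meta-llama models are gated on the Hub, so Inference API calls need an
# access token; with no token argument, InferenceClient falls back to the
# HF_TOKEN environment variable or the locally cached login.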
def fn(
    prompt,
    history,
    system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # Build the conversation: system prompt first, then the prior turns that
    # gr.ChatInterface(type="messages") passes in as history, then the new
    # user message.
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(history)
    messages.append({"role": "user", "content": prompt})
    # Request a streaming chat completion from the Inference API.
    stream = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    # Accumulate streamed tokens and yield the partial text so the UI
    # updates while the model is still generating.
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)
app = gr.ChatInterface(
    fn=fn,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title="Meta Llama",
    description=model,
)
if __name__ == "__main__":
    app.launch()