# Inference
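# Gradio chat app that streams responses for a Llama model via the Hugging Face Inference API.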
import gradio as gr
from huggingface_hub import InferenceClient
model = "meta-llama/Llama-3.2-3B-Instruct"
client = InferenceClient(model)
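# Note (assumption): Llama 3.2 is a gated model on the Hub, so serverless inference may require
# an access token, e.g. InferenceClient(model, token=...) or the HF_TOKEN environment variable / Space secret.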
def fn(
    prompt,
    history,
    system_prompt,
    max_tokens,
    temperature,
    top_p,
):
    # Build the message list: system prompt first, then prior turns, then the new user message.
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": prompt})
    stream = client.chat.completions.create(
        model = model,
        messages = messages,
        max_tokens = max_tokens,
        temperature = temperature,
        top_p = top_p,
        stream = True,
    )
    # Accumulate streamed tokens and yield the partial response so the UI updates as text arrives.
    chunks = []
    for chunk in stream:
        chunks.append(chunk.choices[0].delta.content or "")
        yield "".join(chunks)
app = gr.ChatInterface(
    fn = fn,
    type = "messages",
    additional_inputs = [
        gr.Textbox(value="You are a helpful assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
    ],
    title = "Meta Llama",
    description = model,
)
if __name__ == "__main__":
    app.launch()