import gradio as gr
from huggingface_hub import InferenceClient
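
# Gradio chat UI that streams responses from the Hugging Face Inference API.
# A user token is optional: without one, a public chat-capable model is used;
# with one, gated or instruct models such as Llama 3.1 become available.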

def respond(message, history, token, model, system_message, max_tokens, temperature, top_p):
    """
    Stream a chat response from the Hugging Face Inference API.

    Yields the partially generated text so Gradio can render it incrementally.
    """
    # Normalize optional inputs
    token = (token or "").strip()
    model = (model or "").strip()

    # Default model selection: without a token, fall back to a public model that
    # supports the chat-completion task (plain text-generation models such as
    # gpt2 do not have a chat template); with a token, default to a gated instruct model.
    if not token:
        model = model or "HuggingFaceH4/zephyr-7b-beta"
        client_kwargs = {"model": model}
    else:
        model = model or "meta-llama/Llama-3.1-8B-Instruct"
        client_kwargs = {"model": model, "token": token}

    try:
        client = InferenceClient(**client_kwargs)
    except Exception as e:
        yield f"Error initializing client: {e}"
        return

    # Build the message list for the API. ChatInterface may supply history as
    # (user, assistant) tuples or, with type="messages", as role/content dicts.
    messages = [{"role": "system", "content": system_message}]
    for item in history:
        if isinstance(item, dict):
            messages.append({"role": item["role"], "content": item["content"]})
        else:
            user_msg, assistant_msg = item
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Stream the completion, accumulating the text so the chat box shows it growing
    response = ""
    try:
        for chunk in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            if chunk.choices and chunk.choices[0].delta.content:
                response += chunk.choices[0].delta.content
                yield response
    except Exception as e:
        yield f"API Error: {str(e)}"

# Input components
token_input = gr.Textbox(
    type="password",
    label="HF API Token (leave empty for public models)",
    placeholder="hf_XXXXXXXXXXXX",
)
model_input = gr.Dropdown(
    label="Model Name",
    choices=[
        "gpt2",
        "HuggingFaceH4/zephyr-7b-beta",
        "meta-llama/Llama-3.1-8B-Instruct",
    ],
    value="HuggingFaceH4/zephyr-7b-beta",
    allow_custom_value=True,  # allow typing any Hub model id not in the list
)

# Chat interface: additional_inputs are rendered below the conversation and
# passed to respond() after (message, history), in the order listed here.
demo = gr.ChatInterface(
    fn=respond,
    title="HF Model Chat Interface",
    description="Enter a token for private or gated models, or use public models without one.",
    additional_inputs=[
        token_input,
        model_input,
        gr.Textbox(value="You are a helpful AI.", label="System Message"),
        gr.Slider(1, 2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

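# launch() serves the app locally (default http://127.0.0.1:7860);
# pass share=True for a temporary public link.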
if __name__ == "__main__":
    demo.launch()