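"""Gradio chat app that streams text-generation responses from the Hugging Face
Inference API through the OpenAI-compatible client.

Offers a set of featured models plus a custom Hugging Face model path, with the
usual sampling controls (temperature, top-p, frequency penalty, seed).
"""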
import gradio as gr
from openai import OpenAI
import os
import time
# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")
# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
base_url="https://api-inference.huggingface.co/v1/",
api_key=ACCESS_TOKEN,
)
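# Note: the OpenAI SDK only requires an OpenAI-compatible base_url, so the same
# client code works against other compatible endpoints by swapping base_url and api_key.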
print("OpenAI client initialized.")
def respond(
message,
history: list[tuple[str, str]],
system_message,
max_tokens,
temperature,
top_p,
frequency_penalty,
seed,
model_filter,
model,
custom_model
):
"""
This function handles the chatbot response. It takes in:
- message: the user's new message
- history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
- system_message: the system prompt
- max_tokens: the maximum number of tokens to generate in the response
- temperature: sampling temperature
- top_p: top-p (nucleus) sampling
- frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
- model_filter: search term to filter available models
- model: the selected model from the radio choices
- custom_model: manually entered HF model path
"""
print(f"Received message: {message}")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
print(f"Model Filter: {model_filter}, Selected Model: {model}, Custom Model: {custom_model}")
# Convert seed to None if -1 (meaning random)
if seed == -1:
seed = None
# Construct the messages array required by the API
messages = [{"role": "system", "content": system_message}]
# Add conversation history to the context
for val in history:
user_part = val[0]
assistant_part = val[1]
if user_part:
messages.append({"role": "user", "content": user_part})
print(f"Added user message to context: {user_part}")
if assistant_part:
messages.append({"role": "assistant", "content": assistant_part})
print(f"Added assistant message to context: {assistant_part}")
# Append the latest user message
messages.append({"role": "user", "content": message})
    # Determine which model to use: a non-empty custom path overrides the featured selection
if custom_model.strip() != "":
api_model = custom_model.strip()
    else:
        # Map the featured-model labels to their Hugging Face model IDs
        featured_model_ids = {
            "Llama-3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct",
            "Mistral-7B-Instruct-v0.2": "mistralai/Mistral-7B-Instruct-v0.2",
            "OpenHermes-2.5-Mistral-7B": "teknium/OpenHermes-2.5-Mistral-7B",
            "Phi-2": "microsoft/Phi-2",
        }
        api_model = featured_model_ids.get(model, "meta-llama/Llama-3.3-70B-Instruct")
print(f"Using model: {api_model}")
# Start with an empty string to build the response as tokens stream in
response = ""
print(f"Sending request to OpenAI API, using model {api_model}.")
    # Make the streaming request to the HF Inference API via the OpenAI-compatible client
for message_chunk in client.chat.completions.create(
model=api_model,
max_tokens=max_tokens,
stream=True, # Stream the response
temperature=temperature,
top_p=top_p,
frequency_penalty=frequency_penalty,
seed=seed,
messages=messages,
):
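        # Some backends emit keep-alive chunks with an empty `choices` list; skipping
        # them here is a defensive assumption rather than documented API behavior.
        if not message_chunk.choices:
            continue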
# Extract the token text from the response chunk
token_text = message_chunk.choices[0].delta.content
print(f"Received token: {token_text}")
# Check if token_text is None before appending
if token_text is not None:
response += token_text
yield response
print("Completed response generation.")
# Featured models offered in the radio selector (filterable via the search box)
models_list = [
"Llama-3-70B-Instruct",
"Mistral-7B-Instruct-v0.2",
"OpenHermes-2.5-Mistral-7B",
"Phi-2",
]
# Create a Chatbot component with a specified height
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")
# Create the Gradio ChatInterface
demo = gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(value="", label="System message"),
gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
gr.Slider(
minimum=-2.0,
maximum=2.0,
value=0.0,
step=0.1,
label="Frequency Penalty"
),
gr.Slider(
minimum=-1,
maximum=65535,
value=-1,
step=1,
label="Seed (-1 for random)"
),
gr.Textbox(label="Filter Featured Models", placeholder="Search...", lines=1),
gr.Radio(label="Select a Featured Model", choices=models_list, value="Llama-3-70B-Instruct"),
gr.Textbox(label="Custom Model", placeholder="Enter Hugging Face model path", lines=1),
],
additional_inputs_accordion=gr.Accordion("Advanced Parameters", open=False),
fill_height=True,
chatbot=chatbot,
theme="Nymbo/Nymbo_Theme",
)
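# The order of additional_inputs above is significant: respond() receives them
# positionally, and the event wiring further below refers to them by index
# (6 = filter textbox, 7 = featured-model radio).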
# Add the "Information" tab to the demo
with gr.Tab("Information", parent=demo):
with gr.Accordion("Featured Models", open=True):
gr.HTML(
"""
<table style="width:100%; text-align:center; margin:auto;">
<tr>
<th>Model Name</th>
<th>Provider</th>
<th>Notes</th>
</tr>
<tr>
<td>Llama-3-70B-Instruct</td>
<td>Meta</td>
<td>Powerful large language model.</td>
</tr>
<tr>
<td>Mistral-7B-Instruct-v0.2</td>
<td>Mistral AI</td>
<td>Efficient and versatile model.</td>
</tr>
<tr>
<td>OpenHermes-2.5-Mistral-7B</td>
<td>Teknium</td>
<td>Community-driven, fine-tuned model.</td>
</tr>
<tr>
<td>Phi-2</td>
<td>Microsoft</td>
<td>Compact yet powerful model.</td>
</tr>
</table>
"""
)
with gr.Accordion("Parameters Overview", open=False):
gr.Markdown(
"""
## System Message
###### The system message sets the behavior and persona of the chatbot. It's a way to provide context and instructions to the AI. For example, you can tell it to act as a helpful assistant, a storyteller, or any other role.
## Max New Tokens
###### This setting limits the length of the response generated by the AI. A higher number allows for longer, more detailed responses, while a lower number keeps the responses concise.
## Temperature
###### Temperature controls the randomness of the AI's output. A higher temperature makes the responses more creative and varied, while a lower temperature makes them more predictable and focused.
## Top-P (Nucleus Sampling)
###### Top-P sampling is a way to control the diversity of the AI's responses. It sets a threshold for the cumulative probability of the most likely next words. The AI then randomly selects from the words whose probabilities add up to this threshold. A lower Top-P value means less diversity.
## Frequency Penalty
###### Frequency penalty discourages the AI from repeating the same words or phrases too often in its responses. A higher penalty means the AI is less likely to repeat itself.
## Seed
###### The seed is a starting point for the random number generator that influences the AI's responses. If you set a specific seed, you'll get the same response every time you use that seed with the same prompt and settings. If you set it to -1, the AI will generate a new seed each time, leading to different responses.
## Featured Models
###### This section lists pre-selected models that are known to perform well. You can filter the list by typing in the search box.
## Custom Model
###### If you want to use a model that's not in the featured list, you can enter its Hugging Face model path here.
### Feel free to experiment with these settings to see how they affect the AI's responses. Happy chatting!
"""
)
# Filter the featured-model list as the user types in the search box
def filter_models(search_term):
    filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
    if not filtered_models:
        filtered_models = ["No matching models"]  # Feedback instead of an empty list
    return gr.update(choices=filtered_models)

# Wire the search box (additional_inputs[6]) to the model radio (additional_inputs[7]);
# event listeners must be registered inside the Blocks context
with demo:
    demo.additional_inputs[6].change(
        filter_models,
        inputs=demo.additional_inputs[6],
        outputs=demo.additional_inputs[7],
    )
print("Gradio interface initialized.")
if __name__ == "__main__":
print("Launching the demo application.")
    demo.queue().launch()