import gradio as gr
from huggingface_hub import InferenceClient
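
# Hugging Face Inference client pointed at the hosted Superthoughts-lite model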
client = InferenceClient("Pinkstack/Superthoughts-lite-v1")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the conversation in the OpenAI-style message format
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    # Stream the completion and yield the growing response
    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final chunk may carry no content
            response += token
        yield response

def format_response(response):
    # Replace <think>...</think> with a collapsible section
    response = response.replace("<think>", '<details><summary>Show thoughts</summary><div class="thoughts">')
    response = response.replace("</think>", "</div></details>")
    return response
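
# CSS for the collapsible "Show thoughts" section produced by format_response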
css = """
.thoughts {
    border: 1px solid #ccc;
    padding: 10px;
    background-color: #000000;
    border-radius: 5px;
}
details summary {
    cursor: pointer;
    padding: 5px;
    background-color: #000000;
    border-radius: 5px;
    font-weight: bold;
}
details summary::-webkit-details-marker {
    display: none;
}
details summary:after {
    content: " ▶";
}
details[open] summary:after {
    content: " ▼";
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("## Chat with Superthoughts lite! (1.7B)")
    gr.Markdown("**Warning:** The first response may take a few moments; after the first message, replies should arrive at a decent speed.")

    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    system_message = gr.Textbox(value="You must always include <think> ... </think> <output> </output> tokens.", label="System message")
    max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
    temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")

    def user(user_message, history):
        # Append the new user turn to the history and clear the textbox
        return "", history + [[user_message, None]]

    def bot(history, system_message, max_tokens, temperature, top_p):
        # Consume the stream for the latest user turn, then render the
        # <think> block as a collapsible section before updating the chat
        user_message, _ = history[-1]
        response = ""
        for partial_response in respond(user_message, history[:-1], system_message, max_tokens, temperature, top_p):
            response = partial_response
        history[-1][1] = format_response(response)
        return history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, [chatbot, system_message, max_tokens, temperature, top_p], chatbot
    )

demo.launch()