File size: 4,439 Bytes
be82a8a
4971496
e0d2fc3
be82a8a
4971496
be82a8a
cdfe590
e0d2fc3
cdfe590
e0d2fc3
 
 
 
 
4971496
e0d2fc3
 
 
 
 
 
 
 
 
4971496
e0d2fc3
 
4971496
e0d2fc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdfe590
e0d2fc3
 
 
cdfe590
 
be82a8a
e0d2fc3
4971496
cdfe590
4971496
cdfe590
e0d2fc3
4971496
e0d2fc3
4971496
cdfe590
4971496
 
e0d2fc3
cdfe590
 
e0d2fc3
cdfe590
 
 
 
 
 
 
 
 
4971496
 
be82a8a
e0d2fc3
cdfe590
e0d2fc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdfe590
e0d2fc3
 
cdfe590
 
e0d2fc3
 
cdfe590
e0d2fc3
 
cdfe590
e0d2fc3
 
cdfe590
e0d2fc3
 
 
 
 
 
 
 
 
 
cdfe590
be82a8a
e0d2fc3
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import gradio as gr
from huggingface_hub import InferenceClient
from typing import Iterator

# Module-level inference client bound to the Superthoughts-lite model repo;
# respond() reuses this single instance for every streamed chat completion.
client = InferenceClient("Pinkstack/Superthoughts-lite-v1")

# Display-only HTML that format_response() substitutes for the model's think
# tags. Kept here so prior turns can be converted back before re-prompting.
_THINK_OPEN_HTML = '<details><summary>Show thinking 🧠</summary><div class="thoughts">'
_THINK_CLOSE_HTML = "</div></details>"


def _restore_think_tags(text: str) -> str:
    """Turn the collapsible-HTML wrappers back into raw <think> tags."""
    return text.replace(_THINK_OPEN_HTML, "<think>").replace(_THINK_CLOSE_HTML, "</think>")


def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` as progressively longer strings.

    Args:
        message: The new user message to answer.
        history: Earlier ``(user, assistant)`` turns, oldest first. Falsy
            entries on either side of a turn are skipped.
        system_message: Content of the leading system message.
        max_tokens: Generation cap forwarded to the inference client.
        temperature: Sampling temperature forwarded to the inference client.
        top_p: Nucleus-sampling threshold forwarded to the inference client.

    Yields:
        The accumulated reply so far, passed through ``format_response``,
        once per received token. If anything raises while streaming, a single
        ``"Error: ..."`` string is yielded instead and the generator ends.
    """
    messages = [{"role": "system", "content": system_message}]

    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            # The chat UI stores replies *after* format_response() has swapped
            # the think tags for HTML. Send the model its original tags rather
            # than UI markup (assumes the UI hands the stored text back as-is).
            messages.append(
                {"role": "assistant", "content": _restore_think_tags(assistant_msg)}
            )

    messages.append({"role": "user", "content": message})

    response = ""

    # Broad catch is deliberate: this is the UI boundary, and any failure
    # should surface in the chat window rather than crash the event handler.
    try:
        for chunk in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            # A stream chunk without choices carries no text; indexing it
            # would raise IndexError and discard the reply built so far.
            if not chunk.choices:
                continue
            token = chunk.choices[0].delta.content
            if token is not None:
                response += token
                yield format_response(response)
    except Exception as e:
        yield f"Error: {e}"

def format_response(response: str) -> str:
    """Swap the model's think tags for a collapsible HTML <details> section.

    Every ``<think>`` becomes the opening of a ``<details>`` element with a
    "Show thinking" summary and a ``.thoughts`` container; every ``</think>``
    closes both. All other text is returned untouched.
    """
    substitutions = (
        ("<think>", '<details><summary>Show thinking 🧠</summary><div class="thoughts">'),
        ("</think>", "</div></details>"),
    )
    formatted = response
    for tag, html in substitutions:
        formatted = formatted.replace(tag, html)
    return formatted

# Custom stylesheet handed to gr.Blocks(css=...). It styles the `.thoughts`
# container and the <details>/<summary> toggle that format_response() emits,
# replacing the default disclosure marker with ▶ / ▼ suffixes.
css = """
.thoughts {
    border: 1px solid #ccc;
    padding: 10px;
    background-color: #f8f9fa;
    border-radius: 5px;
    margin: 5px 0;
}
details summary {
    cursor: pointer;
    padding: 5px;
    background-color: #eee;
    border-radius: 5px;
    font-weight: bold;
    margin: 5px 0;
}
details summary::-webkit-details-marker {
    display: none;
}
details summary:after {
    content: " ▶";
}
details[open] summary:after {
    content: " ▼";
}
"""

# Create Gradio interface: header text, chat window, input box, a collapsed
# settings panel, and the event wiring that connects them to respond().
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Chat with Superthoughts lite! (1.7B)")
    gr.Markdown("**Warning:** The first output from the AI may take a few moments. After the first message, it should work at a decent speed, keep in mind that this chat is only meant for testing and experimenting.")
    
    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(label="Your message", placeholder="Type your message here...")
    
    # Generation parameters; their current values are passed to bot() on
    # every submit (see the inputs list of the .then() call below).
    with gr.Accordion("Advanced Settings", open=False):
        system_message = gr.Textbox(
            value="You must act in a conversational matter and always include <think> ... </think> <output> </output> tokens.",
            label="System message"
        )
        max_tokens = gr.Slider(
            minimum=1,
            maximum=4096,
            value=512,
            step=1,
            label="Max new tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        )

    def user(user_message: str, history: list) -> tuple[str, list]:
        """Append the submitted message to the chat history as a pending turn.

        Returns an empty string (written back to the textbox, clearing it) and
        a new history list whose last entry is ``[user_message, None]``; the
        ``None`` slot is filled in by ``bot``. The input list is not mutated.
        """
        return "", history + [[user_message, None]]

    def bot(history: list, system_message: str, max_tokens: int, temperature: float, top_p: float) -> Iterator[list]:
        """Stream the assistant's reply into the last history entry.

        Reads the pending user message from ``history[-1]``, then yields the
        same ``history`` list (mutated in place) after each partial response
        produced by ``respond``.
        """
        user_message, _ = history[-1]
        history[-1][1] = ""  # Initialize bot's response
        
        # history[:-1] excludes the pending turn: respond() appends the
        # current user message itself.
        for partial_response in respond(user_message, history[:-1], system_message, max_tokens, temperature, top_p):
            history[-1][1] = partial_response
            yield history

    # Set up chat message handling: on submit, user() records the message and
    # clears the textbox, then bot() streams the reply into the chatbot.
    # NOTE(review): queue=False presumably runs user() outside the request
    # queue so the message shows up immediately - confirm against Gradio docs.
    msg.submit(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot,
        [chatbot, system_message, max_tokens, temperature, top_p],
        chatbot
    )

    # Add a clear button: the handler takes no inputs and writes None to the
    # chatbot component.
    clear = gr.Button("Clear Conversation")
    clear.click(lambda: None, None, chatbot, queue=False)

# Start the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    # Enable the request queue, then serve; queue() returns the same Blocks
    # object, so launch() can be chained onto it.
    demo.queue().launch(share=True)