File size: 4,149 Bytes
a474012
665b7ce
4f7e40d
a474012
665b7ce
a474012
9faf370
a474012
9917b41
a474012
9917b41
 
 
 
a474012
 
665b7ce
a474012
665b7ce
 
a474012
 
 
665b7ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9917b41
665b7ce
 
 
 
 
a474012
 
 
665b7ce
 
9917b41
a474012
 
 
 
 
 
9917b41
a474012
 
 
9917b41
a474012
 
9917b41
a474012
 
 
 
 
 
 
 
 
9917b41
a474012
 
 
9917b41
a474012
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9917b41
a474012
 
9917b41
 
 
 
a474012
 
 
 
9917b41
a474012
 
 
 
 
 
 
 
 
 
 
 
9917b41
a474012
 
 
665b7ce
9917b41
 
665b7ce
a474012
665b7ce
e5039e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import time
import threading
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch

# Load the local causal-LM checkpoint once at import time.
# NOTE(review): this downloads/loads weights on first run — expect a slow
# startup and network access; confirm that is acceptable for deployment.
model_id = "lambdaindie/lambda-1v-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # fp16 only pays off on GPU; keep full fp32 precision on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()  # inference mode: disables dropout/batch-norm updates

# Global CSS: dark background, JetBrains Mono everywhere, and a pulsing
# ".markdown-think" style for "thinking" blocks.
css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap');
* {
    font-family: 'JetBrains Mono', monospace !important;
}
html, body, .gradio-container {
    background-color: #111 !important;
    color: #e0e0e0 !important;
}
textarea, input, button, select {
    background-color: transparent !important;
    color: #e0e0e0 !important;
    border: 1px solid #444 !important;
}
.markdown-think {
    background-color: #1e1e1e;
    border-left: 4px solid #555;
    padding: 10px;
    margin-bottom: 8px;
    font-style: italic;
    white-space: pre-wrap;
    animation: pulse 1.5s infinite ease-in-out;
}
@keyframes pulse {
    0% { opacity: 0.6; }
    50% { opacity: 1.0; }
    100% { opacity: 0.6; }
}
"""

# Gradio theme mirroring the custom CSS: gray/dark palette, monospace font.
theme = gr.themes.Base(
    primary_hue="gray",
    font=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"]
).set(
    body_background_fill="#111",
    body_text_color="#e0e0e0",
    button_primary_background_fill="#333",
    button_primary_text_color="#e0e0e0",
    input_background_fill="#222",
    input_border_color="#444",
    block_title_text_color="#fff"
)

# Cooperative cancellation flag, polled by the token-streaming loop.
stop_signal = False


def stop_stream():
    """Ask the in-flight generation stream to halt at the next token."""
    global stop_signal
    stop_signal = True

def respond(history, system_message, max_tokens, temperature, top_p):
    """Stream an assistant reply for the current chat history.

    Builds a plain-text prompt from the optional system message plus the
    ``{"role": ..., "content": ...}`` message list, runs ``model.generate``
    in a worker thread, and yields partial transcripts as tokens arrive.

    Yields:
        ``(messages, messages)`` 2-tuples — one value for each of the two
        outputs (``chatbot``, ``state``) wired to this handler, so the
        display and the session state stay in sync.  The original code
        yielded a single list, which Gradio mis-unpacked across the two
        outputs and left the assistant turn out of ``state``.
    """
    global stop_signal
    stop_signal = False

    # Prompt format: optional system preamble, then "User:"/"Assistant:"
    # turns, ending with an open "Assistant:" for the model to complete.
    parts = []
    if system_message:
        parts.append(system_message + "\n\n")
    for msg in history:
        if msg["role"] == "user":
            parts.append(f"User: {msg['content']}\n")
        elif msg["role"] == "assistant":
            parts.append(f"Assistant: {msg['content']}\n")
    parts.append("Assistant:")
    prompt = "".join(parts)  # join once instead of quadratic string +=

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # generate() blocks, so run it in a worker thread and consume the
    # streamer from this generator.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    start = time.time()

    for token in streamer:
        if stop_signal:
            # NOTE: this only stops the UI stream; model.generate keeps
            # running in the background until it finishes on its own.
            break
        output += token
        partial = history + [{"role": "assistant", "content": output}]
        yield partial, partial

    elapsed = time.time() - start
    final = history + [
        {"role": "assistant", "content": output},
        # Role must be "assistant": Chatbot(type="messages") only accepts
        # user/assistant roles, so the original "system" entry would error.
        {"role": "assistant", "content": f"Pensou por {elapsed:.1f} segundos"},
    ]
    yield final, final

# Interface: dark-themed chat UI with a stop button and sampling controls.
with gr.Blocks(css=css, theme=theme) as app:
    # Chat display uses openai-style {"role", "content"} message dicts.
    chatbot = gr.Chatbot(label="λ", type="messages")
    # Per-session conversation history, same message-dict format.
    state = gr.State([])

    with gr.Row():
        msg = gr.Textbox(label="Mensagem")
        send_btn = gr.Button("Enviar")
        stop_btn = gr.Button("Parar")

    with gr.Accordion("Configurações Avançadas", open=False):
        system_message = gr.Textbox(label="System Message", value="")
        max_tokens = gr.Slider(64, 2048, value=256, step=1, label="Max Tokens")
        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")

    def handle_user_msg(user_msg, chat_history):
        """Append the user's message to history and clear the textbox."""
        if user_msg:
            chat_history = chat_history + [{"role": "user", "content": user_msg}]
        return "", chat_history

    # Two-step event: record the user turn, then stream the model reply.
    # NOTE(review): this handler is wired to two outputs (chatbot, state) —
    # verify that respond's yields match that arity and update both.
    send_btn.click(fn=handle_user_msg, inputs=[msg, state], outputs=[msg, state])\
        .then(fn=respond, inputs=[state, system_message, max_tokens, temperature, top_p], outputs=[chatbot, state])

    # Only flips the module-level flag; takes effect at the next token.
    stop_btn.click(fn=stop_stream, inputs=[], outputs=[])

# share=True exposes a temporary public gradio.live URL.
app.launch(share=True)