import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import threading
import torch

# Automatically detect the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model_name = "lambdaindie/lambda-1v-1B"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Mutable flag shared between the response generator and the stop button
stop_flag = {"stop": False}

# Response function: streams the model output back to the chat
def respond(prompt, history):
    stop_flag["stop"] = False

    full_prompt = f"\nThink a bit step-by-step before answering.  \nQuestion: {prompt} \nAnswer:"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Start the generation thread
    generation_thread = threading.Thread(
        target=model.generate,
        kwargs={
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "max_new_tokens": 512,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        }
    )
    generation_thread.start()

    reasoning = ""
    for new_text in streamer:
        if stop_flag["stop"]:
            # A plain return inside a generator never reaches Gradio;
            # yield the current state before ending the stream.
            yield "", history or []
            return
        reasoning += new_text
        yield "", (history or []) + [(prompt, f"<div class='final-answer'>{reasoning}</div>")]

# Callback to stop the streaming loop
def stop_generation():
    stop_flag["stop"] = True
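
# Note: setting the flag only breaks the streaming loop in respond(); the
# model.generate() thread keeps running until it finishes its tokens. A minimal
# sketch (an assumption, not part of the original app) of halting generation
# itself with a transformers StoppingCriteria that reads the same flag:
#
#     from transformers import StoppingCriteria, StoppingCriteriaList
#
#     class StopOnFlag(StoppingCriteria):
#         def __call__(self, input_ids, scores, **kwargs):
#             return stop_flag["stop"]
#
#     # then pass stopping_criteria=StoppingCriteriaList([StopOnFlag()]) to model.generate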

# Gradio interface
with gr.Blocks(css="""
    #chatbot, .gr-markdown, .gr-button, .gr-textbox {
        font-family: 'JetBrains Mono', monospace !important;
        font-size: 11px !important;
    }
    .final-answer {
        background-color: #1e1e1e;
        color: #ffffff;
        padding: 10px;
        border-left: 4px solid #4caf50;
        font-family: 'JetBrains Mono', monospace !important;
        white-space: pre-wrap;
        font-size: 11px !important;
    }
""") as demo:
    gr.Markdown('<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap" rel="stylesheet">')
    gr.Markdown("## λambdAI — Reasoning Chat")

    chatbot = gr.Chatbot(elem_id="chatbot")
    with gr.Row():
        txt = gr.Textbox(placeholder="Type your question...", show_label=False)
        send_btn = gr.Button("Send")
        stop_btn = gr.Button("Stop")

    send_btn.click(respond, [txt, chatbot], [txt, chatbot])
    txt.submit(respond, [txt, chatbot], [txt, chatbot])
    stop_btn.click(stop_generation, None, None)

demo.launch(share=True)