File size: 7,747 Bytes
3738ef6
13880c3
3738ef6
51a7d9e
13880c3
51a7d9e
edb9e8a
13880c3
 
 
 
c8e2710
 
 
13880c3
51a7d9e
13880c3
c8e2710
e339ee0
c8e2710
 
 
 
 
 
 
e2a3fe7
c8e2710
071e665
13880c3
c8e2710
 
 
 
51a7d9e
c8e2710
 
 
 
 
 
 
51a7d9e
1e18916
c8e2710
 
 
 
 
13880c3
c8e2710
 
 
 
e339ee0
ebc31d1
 
 
 
e339ee0
 
c8e2710
 
d8a8bf1
e339ee0
13880c3
c8e2710
 
 
 
13880c3
3738ef6
13880c3
659ca36
c8e2710
 
 
 
 
1e18916
 
5328f67
0b72fd3
13880c3
3738ef6
 
8b5b0c4
c8e2710
 
 
 
3738ef6
c8e2710
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3738ef6
 
c8e2710
13880c3
c8e2710
 
3738ef6
c8e2710
 
 
 
 
 
 
 
bc05e4d
c8e2710
 
 
 
 
 
 
 
 
 
 
 
c44cbfe
13880c3
c8e2710
13880c3
 
c8e2710
 
13880c3
 
0b72fd3
c8e2710
 
 
 
 
 
0b72fd3
c8e2710
 
 
 
0b72fd3
 
c8e2710
 
 
 
 
 
 
 
 
 
0b72fd3
c8e2710
0b72fd3
 
c8e2710
 
0b72fd3
c8e2710
 
 
 
 
 
 
0b72fd3
c8e2710
 
 
 
 
 
0b72fd3
c8e2710
 
 
 
 
 
 
13880c3
3738ef6
51a7d9e
13880c3
c8e2710
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import os
import re
import time
import torch
import spaces
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)

# Configuration Constants
# Hugging Face Hub identifier of the checkpoint loaded by initialize_model().
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

# Default system prompt; it pre-fills the "System Instructions" text area in the
# UI and can be edited there. Note that format_response() only highlights the
# [Understand], [Plan] and [Conclude] headers listed below.
DEFAULT_SYSTEM_PROMPT = """You are an Expert Reasoning Assistant. Follow these steps:
[Understand]: Analyze key elements and clarify objectives
[Plan]: Outline step-by-step methodology
[Reason]: Execute plan with detailed analysis
[Verify]: Check logic and evidence
[Conclude]: Present structured conclusion

Use these section headers and maintain technical accuracy with clear explanations."""

# UI Configuration
# HTML header rendered at the top of the page via gr.HTML(TITLE).
TITLE = """
<h1 align="center" style="color: #2d3436; margin-bottom: 0">🧠 AI Reasoning Assistant</h1>
<p align="center" style="color: #636e72; margin-top: 0">DeepSeek-R1-Distill-Qwen-14B</p>
"""
# Custom stylesheet passed to gr.Blocks(css=CSS). The "*-tag" classes are the
# ones format_response() attaches to section headers; "control-panel" is the
# elem_classes value used for the right-hand settings column.
CSS = """
.gr-chatbot { min-height: 500px !important; border-radius: 15px !important; }
.message-wrap pre { background: #f8f9fa !important; padding: 15px !important; }
.thinking-tag { color: #2ecc71; font-weight: 600; }
.plan-tag { color: #e67e22; font-weight: 600; }
.conclude-tag { color: #3498db; font-weight: 600; }
.control-panel { background: #f8f9fa !important; padding: 20px !important; }
footer { visibility: hidden !important; }
"""

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that ends generation once a stop token is emitted.

    Only the first sequence in the batch is inspected, matching how this app
    calls ``generate`` with a single prompt.

    Args:
        stop_ids: Iterable of token ids that should end generation. When
            omitted, ``DEFAULT_STOP_IDS`` is used so that ``StopOnTokens()``
            behaves exactly as it did when the ids were hard-coded.
    """

    # NOTE(review): id 0 is kept purely for backward compatibility. Whether it
    # is an end-of-sequence token depends on the tokenizer in use -- confirm it
    # against MODEL_ID's vocabulary, or pass explicit ``stop_ids`` instead.
    DEFAULT_STOP_IDS = (0,)

    def __init__(self, stop_ids=None):
        super().__init__()
        chosen = self.DEFAULT_STOP_IDS if stop_ids is None else stop_ids
        # Build the lookup set once instead of on every generation step.
        self.stop_ids = frozenset(int(token_id) for token_id in chosen)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the newest token of the first sequence is a stop id."""
        if input_ids.shape[-1] == 0:
            # Nothing generated yet; indexing [-1] would raise.
            return False
        return int(input_ids[0, -1]) in self.stop_ids

def initialize_model():
    """Load the 4-bit quantized causal LM and its tokenizer for ``MODEL_ID``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        raise RuntimeError("CUDA is required for this application")

    # NF4 4-bit weights with double quantization; compute runs in bfloat16.
    bnb_settings = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    }
    quant_cfg = BitsAndBytesConfig(**bnb_settings)

    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Reuse the end-of-sequence token as the padding token.
    tok.pad_token = tok.eos_token

    llm = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quant_cfg,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    return llm, tok

def format_response(text):
    """Wrap recognised section headers in styled ``<strong>`` elements.

    Every occurrence of ``[Understand]``, ``[Plan]`` and ``[Conclude]`` is
    replaced by the same header inside a ``<strong>`` tag carrying its CSS
    class, with a newline on either side. All other text is returned as is.
    """
    # (header, CSS class) pairs, substituted in this order.
    header_styles = (
        ("[Understand]", "thinking-tag"),
        ("[Plan]", "plan-tag"),
        ("[Conclude]", "conclude-tag"),
    )
    result = text
    for header, css_class in header_styles:
        markup = f'\n<strong class="{css_class}">{header}</strong>\n'
        result = result.replace(header, markup)
    return result

@spaces.GPU(duration=120)
def chat_response(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.3,
    max_new_tokens: int = 2048,
    top_p: float = 0.9,
    top_k: int = 50,
    penalty: float = 1.2,
):
    """Stream a formatted model reply to ``message``.

    Builds a chat-template prompt from the system prompt, the earlier turns and
    the new message, runs ``model.generate`` on a background thread, and yields
    the reply accumulated so far (passed through ``format_response``) each time
    the streamer delivers new text. Relies on the module-level ``model`` and
    ``tokenizer`` globals.

    Args:
        message: The new user message.
        history: Earlier turns as ``(user, assistant)`` pairs; ``None`` or
            empty means no prior conversation. ``None`` entries inside a pair
            are skipped.
        system_prompt: Content of the leading system message.
        temperature: Sampling temperature. A value of 0 or less switches to
            greedy decoding.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        top_p: Nucleus-sampling threshold, used only when sampling.
        top_k: Top-k cutoff, used only when sampling (coerced to ``int``).
        penalty: Repetition penalty passed to ``generate``.

    Yields:
        str: The formatted partial reply. While none of the highlighted section
        headers has appeared yet, a trailing cursor marker is appended. The
        final item is the complete reply without the cursor. If anything
        fails, a single error message string is yielded instead.
    """
    section_tags = ("[Understand]", "[Plan]", "[Conclude]")
    try:
        conversation = [{"role": "system", "content": system_prompt}]
        for user, assistant in history or []:
            # A turn may be incomplete (e.g. no reply yet); skip missing parts
            # rather than handing None to the chat template.
            if user is not None:
                conversation.append({"role": "user", "content": user})
            if assistant is not None:
                conversation.append({"role": "assistant", "content": assistant})
        conversation.append({"role": "user", "content": message})

        input_ids = tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        streamer = TextIteratorStreamer(
            tokenizer,
            timeout=30,
            skip_prompt=True,
            skip_special_tokens=True
        )

        # Decide sampling explicitly: otherwise it depends on the checkpoint's
        # generation config, and a temperature of 0 is not a valid sampling
        # temperature.
        do_sample = temperature > 0
        generate_kwargs = dict(
            input_ids=input_ids,
            # Single unpadded prompt, so every position is a real token.
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=int(max_new_tokens),
            do_sample=do_sample,
            repetition_penalty=penalty,
            streamer=streamer,
            stopping_criteria=StoppingCriteriaList([StopOnTokens()])
        )
        if do_sample:
            generate_kwargs.update(
                temperature=temperature,
                top_p=top_p,
                top_k=int(top_k),
            )

        worker = Thread(target=model.generate, kwargs=generate_kwargs)
        worker.start()

        reply = ""
        for new_text in streamer:
            reply += new_text

            # Show the typing cursor only until a highlighted header appears.
            if any(tag in reply for tag in section_tags):
                yield format_response(reply)
            else:
                yield format_response(reply + " ▌")

        # The streamer is exhausted, so generation is finishing; reap the thread.
        worker.join()

        # Final formatting pass
        yield format_response(reply)

    except Exception as e:
        # Some exceptions (e.g. a streamer timeout) carry no message text.
        detail = str(e) or type(e).__name__
        yield f"⚠️ Error generating response: {detail}"

def create_examples():
    """Return the sample prompts for the examples panel, one prompt per row."""
    prompts = (
        "Explain quantum entanglement in simple terms",
        "Design a study plan for learning machine learning",
        "Compare blockchain and traditional databases",
        "How would you optimize AWS costs for a startup?",
        "Explain the ethical implications of CRISPR technology",
    )
    # Each row is a one-element list because the examples feed a single input.
    return [[prompt] for prompt in prompts]

def main():
    """Load the model and build the Gradio Blocks interface.

    Stores the loaded model and tokenizer in the module-level ``model`` and
    ``tokenizer`` globals (read by ``chat_response``), lays out the chat column
    and the settings column, and wires the submit and clear events.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        queueing and launching it.
    """
    global model, tokenizer
    model, tokenizer = initialize_model()

    def respond(message, history, system_prompt, temperature, max_tokens, top_p, penalty):
        """Adapt chat_response's text stream to the (textbox, chatbot) outputs."""
        turns = list(history or [])
        if not message or not message.strip():
            # Nothing to send: clear the box and leave the conversation as is.
            yield "", turns
            return

        # Show the user's message right away, before the first token arrives.
        yield "", turns + [[message, None]]

        # Keyword arguments keep each slider bound to the right parameter;
        # the UI has no top-k control, so chat_response's default applies.
        stream = chat_response(
            message,
            turns,
            system_prompt,
            temperature=temperature,
            max_new_tokens=int(max_tokens),
            top_p=top_p,
            penalty=penalty,
        )
        for partial in stream:
            yield "", turns + [[message, partial]]

    with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
        gr.HTML(TITLE)

        with gr.Row():
            with gr.Column(scale=3):
                # Rendered in place so it appears in this column and can be
                # used as an event input/output.
                chatbot = gr.Chatbot(
                    elem_id="chatbot",
                    bubble_full_width=False,
                    show_copy_button=True
                )
                msg = gr.Textbox(
                    placeholder="Enter your question...",
                    label="Ask the Expert",
                    container=False
                )
                with gr.Row():
                    submit_btn = gr.Button("Send", variant="primary")
                    clear_btn = gr.Button("Clear", variant="secondary")

            with gr.Column(scale=1, elem_classes="control-panel"):
                gr.Examples(
                    examples=create_examples(),
                    inputs=msg,
                    label="Example Queries",
                    examples_per_page=5
                )

                with gr.Accordion("⚙️ Generation Parameters", open=False):
                    system_prompt = gr.TextArea(
                        value=DEFAULT_SYSTEM_PROMPT,
                        label="System Instructions",
                        lines=5
                    )
                    temperature = gr.Slider(0, 2, value=0.7, label="Creativity")
                    max_tokens = gr.Slider(128, 4096, value=2048, step=128, label="Max Tokens")
                    top_p = gr.Slider(0, 1, value=0.9, step=0.05, label="Focus (Top-p)")
                    penalty = gr.Slider(1, 2, value=1.2, step=0.1, label="Repetition Control")

        # Event handling: both triggers share one handler and one I/O spec.
        # `respond` already clears the textbox through its first output.
        chat_inputs = [msg, chatbot, system_prompt, temperature, max_tokens, top_p, penalty]
        chat_outputs = [msg, chatbot]

        msg.submit(respond, chat_inputs, chat_outputs, show_progress="hidden")
        submit_btn.click(respond, chat_inputs, chat_outputs, show_progress="hidden")

        clear_btn.click(lambda: None, None, chatbot, queue=False)

    return demo

# Script entry point: build the app (this also loads the model), enable the
# request queue with max_size=20, and start the server.
if __name__ == "__main__":
    demo = main()
    demo.queue(max_size=20).launch()