import torch
import spaces
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
)

# Configuration Constants
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
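# Note: DeepSeek's R1 reasoning distillation of Qwen-14B. With the NF4 4-bit
# quantization configured in initialize_model(), the weights occupy roughly
# 7-8 GB, which is what makes a 14B model practical on a ZeroGPU slice.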
# Enhanced System Prompt
DEFAULT_SYSTEM_PROMPT = """You are an Expert Reasoning Assistant. Follow these steps:
[Understand]: Analyze key elements and clarify objectives
[Plan]: Outline step-by-step methodology
[Reason]: Execute plan with detailed analysis
[Verify]: Check logic and evidence
[Conclude]: Present structured conclusion
Use these section headers and maintain technical accuracy with clear explanations."""
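# format_response() below keys off these exact bracketed headers, so edits to
# the system prompt should keep the [Understand]/[Plan]/[Reason]/[Verify]/
# [Conclude] markers intact (or update both places together).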
# UI Configuration
TITLE = """
<h1 align="center" style="color: #2d3436; margin-bottom: 0">🧠 AI Reasoning Assistant</h1>
<p align="center" style="color: #636e72; margin-top: 0">DeepSeek-R1-Distill-Qwen-14B</p>
"""
CSS = """
.gr-chatbot { min-height: 500px !important; border-radius: 15px !important; }
.message-wrap pre { background: #f8f9fa !important; padding: 15px !important; }
.thinking-tag { color: #2ecc71; font-weight: 600; }
.plan-tag { color: #e67e22; font-weight: 600; }
.reason-tag { color: #9b59b6; font-weight: 600; } /* added so all five prompt sections are styled; colors are assumptions */
.verify-tag { color: #e74c3c; font-weight: 600; } /* added so all five prompt sections are styled; colors are assumptions */
.conclude-tag { color: #3498db; font-weight: 600; }
.control-panel { background: #f8f9fa !important; padding: 20px !important; }
footer { visibility: hidden !important; }
"""
class StopOnTokens(StoppingCriteria):
    """Stop generation once the last generated token matches a stop ID."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [0]  # Placeholder; ID 0 may be an ordinary token for this tokenizer
        return input_ids[0][-1].item() in stop_ids
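# A more typical configuration would stop on the tokenizer's EOS token rather
# than the hard-coded ID 0 above, e.g. (a sketch, assuming the global
# tokenizer set up in initialize_model()):
#
#     stop_ids = [tokenizer.eos_token_id]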
def initialize_model():
    """Initialize the 4-bit quantized model and tokenizer with safety checks."""
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required for this application")

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    return model, tokenizer
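# Minimal local smoke test (a sketch; assumes a CUDA machine with ~10 GB of
# free VRAM and is not executed by the Space itself):
#
#     model, tokenizer = initialize_model()
#     ids = tokenizer("Hello", return_tensors="pt").input_ids.to(model.device)
#     print(tokenizer.decode(model.generate(ids, max_new_tokens=16)[0]))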
def format_response(text):
    """Wrap reasoning-section headers in styled tags for the chat display."""
    for tag, css_class in [
        ("[Understand]", "thinking-tag"),
        ("[Plan]", "plan-tag"),
        ("[Reason]", "reason-tag"),
        ("[Verify]", "verify-tag"),
        ("[Conclude]", "conclude-tag"),
    ]:
        text = text.replace(tag, f'\n<strong class="{css_class}">{tag}</strong>\n')
    return text
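# Example transformation:
#   format_response("[Plan] Outline steps")
#   -> '\n<strong class="plan-tag">[Plan]</strong>\n Outline steps'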
@spaces.GPU(duration=120)
def chat_response(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.3,
    max_new_tokens: int = 2048,
    top_p: float = 0.9,
    penalty: float = 1.2,
    top_k: int = 50,
):
    """Streaming chat handler; yields the updated chat history as tokens arrive.

    `top_k` is last so the seven Gradio inputs (message, history, system
    prompt, temperature, max tokens, top-p, penalty) map onto the first seven
    parameters positionally; top_k keeps its default since the UI exposes no
    slider for it.
    """
    try:
        conversation = [{"role": "system", "content": system_prompt}]
        for user, assistant in history:
            conversation.extend([
                {"role": "user", "content": user},
                {"role": "assistant", "content": assistant},
            ])
        conversation.append({"role": "user", "content": message})

        input_ids = tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(model.device)

        streamer = TextIteratorStreamer(
            tokenizer,
            timeout=30,
            skip_prompt=True,
            skip_special_tokens=True,
        )

        generate_kwargs = dict(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=penalty,
            streamer=streamer,
            stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
        )

        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        buffer = []
        for new_text in streamer:
            buffer.append(new_text)
            partial_result = "".join(buffer)
            # Show a typing cursor until the first section header appears
            if any(tag in partial_result for tag in ["[Understand]", "[Plan]", "[Conclude]"]):
                yield history + [(message, format_response(partial_result))]
            else:
                yield history + [(message, format_response(partial_result + " ▌"))]
        thread.join()

        # Final formatting pass without the cursor
        yield history + [(message, format_response("".join(buffer)))]
    except Exception as e:
        yield history + [(message, f"⚠️ Error generating response: {e}")]
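# Design note: model.generate() runs in a worker thread while the
# TextIteratorStreamer yields decoded text on the caller's thread; the
# timeout=30 above makes the streamer raise if no new token arrives within
# 30 seconds, so a stalled generation surfaces as the ⚠️ error message in
# the chat instead of hanging the UI.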
def create_examples():
    """Example prompts covering diverse reasoning use cases."""
    return [
        ["Explain quantum entanglement in simple terms"],
        ["Design a study plan for learning machine learning"],
        ["Compare blockchain and traditional databases"],
        ["How would you optimize AWS costs for a startup?"],
        ["Explain the ethical implications of CRISPR technology"],
    ]
def main():
    """Build the Gradio Blocks UI and wire up events."""
    global model, tokenizer
    model, tokenizer = initialize_model()

    with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
        gr.HTML(TITLE)
        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(
                    elem_id="chatbot",
                    bubble_full_width=False,
                    show_copy_button=True,
                )
                msg = gr.Textbox(
                    placeholder="Enter your question...",
                    label="Ask the Expert",
                    container=False,
                )
                with gr.Row():
                    submit_btn = gr.Button("Send", variant="primary")
                    clear_btn = gr.Button("Clear", variant="secondary")
            with gr.Column(scale=1, elem_classes="control-panel"):
                gr.Examples(
                    examples=create_examples(),
                    inputs=msg,
                    label="Example Queries",
                    examples_per_page=5,
                )
                with gr.Accordion("⚙️ Generation Parameters", open=False):
                    system_prompt = gr.TextArea(
                        value=DEFAULT_SYSTEM_PROMPT,
                        label="System Instructions",
                        lines=5,
                    )
                    temperature = gr.Slider(0, 2, value=0.7, label="Creativity")
                    max_tokens = gr.Slider(128, 4096, value=2048, step=128, label="Max Tokens")
                    top_p = gr.Slider(0, 1, value=0.9, step=0.05, label="Focus (Top-p)")
                    penalty = gr.Slider(1, 2, value=1.2, step=0.1, label="Repetition Control")

        # Event handling: the streaming handler yields full chat histories, so
        # the chatbot is the sole output; the textbox is cleared afterwards.
        inputs = [msg, chatbot, system_prompt, temperature, max_tokens, top_p, penalty]
        msg.submit(
            chat_response, inputs, chatbot, show_progress="hidden"
        ).then(lambda: "", None, msg)
        submit_btn.click(
            chat_response, inputs, chatbot, show_progress="hidden"
        ).then(lambda: "", None, msg)
        clear_btn.click(lambda: None, None, chatbot, queue=False)
    return demo
if __name__ == "__main__":
    demo = main()
    demo.queue(max_size=20).launch()
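# Deployment note: @spaces.GPU(duration=120) requests a ZeroGPU slice of up to
# 120 seconds per call, and queue(max_size=20) caps pending requests; both
# values are tuning knobs rather than hard requirements.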