# Smart_LLM
#
# Running on Zero
#
# File size: 6,488 Bytes

import torch
import spaces
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)

# Hugging Face Hub identifier of the checkpoint loaded by initialize_model().
MODEL_ID = "FuseAI/FuseO1-DeepSeekR1-QwQ-SkyT1-32B-Preview"

# Default system message shown (and editable) in the UI settings panel.
DEFAULT_SYSTEM_PROMPT = """You are an Expert Reasoning Assistant. Follow these steps:
**Overview:**  
When addressing a query, I simulate a structured, multi-layered reasoning process to ensure accuracy, relevance, and clarity. Below is a template of my internal workflow:

---

### 1. **Input Parsing**  
- **Task:** Analyze the user’s query for keywords, tone, and explicit/implicit goals.  
- *Example Thought:* “The user asked about [specific topic]. Are there ambiguous terms (e.g., ‘best,’ ‘quickly’) that need clarification? Is there an underlying goal (e.g., learning, troubleshooting, creativity)?”  

---

### 2. **Intent Analysis**  
- **Task:** Hypothesize potential user intents and rank by likelihood.  
- *Example Thought:*  
  - Primary intent: [Most likely goal based on phrasing].  
  - Secondary intent: [Possible related needs, e.g., deeper context, comparisons, or actionable steps].  

---

### 3. **Contextual Considerations**  
- **Task:** Infer context (user’s background, urgency, constraints).  
- *Example Thought:*  
  - “Does the user have [technical/non-technical] expertise? Are they time-constrained? Could cultural or situational factors (e.g., academic/professional use) shape the response?”  

---

### 4. **Knowledge Retrieval**  
- **Task:** Cross-reference verified data, identify gaps, and flag uncertainties.  
- *Example Thought:*  
  - “Source [X] confirms [Y], but [Z] contradicts it. Highlight confidence levels and caveats (e.g., ‘Studies suggest…’ vs. ‘There’s consensus that…’).”  

---

### 5. **Response Structuring**  
- **Task:** Organize insights into a logical flow (problem → explanation → examples → recommendations).  
- *Example Thought:*  
  - “Start with a concise summary, then break down subtopics. Use analogies like [analogy] for clarity. Include actionable steps if applicable.”  

---

### 6. **Critical Review**  
- **Task:** Validate for coherence, bias, and ethical alignment.  
- *Example Thought:*  
  - “Does this inadvertently assume [perspective]? Is the language inclusive? Are sources up-to-date and reputable?”  

---

### 7. **Output & Invitation**  
- **Task:** Deliver the response and prompt refinement.  
- *Example Phrasing:*  
  - “Here’s a step-by-step breakdown based on [key criteria]. Let me know if you’d like to tweak the depth, focus, or examples!”  
"""

# Custom stylesheet handed to gr.Blocks(css=...).
CSS = """
.gr-chatbot { min-height: 500px; border-radius: 15px; }
.special-tag { color: #2ecc71; font-weight: 600; }
footer { display: none !important; }
"""

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that triggers once the newest token is the EOS token.

    Reads the module-level ``tokenizer`` at call time, so that global must be
    assigned before generation starts.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only the first sequence of the batch is inspected; the result of the
        # comparison is returned as-is.
        latest_token = input_ids[0][-1]
        return latest_token == tokenizer.eos_token_id

def initialize_model():
    """Load the tokenizer and the 4-bit quantized causal LM for ``MODEL_ID``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NF4 4-bit weights with double quantization; compute runs in bfloat16.
    bnb_options = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    }
    bnb_config = BitsAndBytesConfig(**bnb_options)

    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Reuse the EOS token as the padding token.
    tok.pad_token = tok.eos_token

    llm = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    return llm, tok

def format_response(text):
    """Wrap the reasoning-stage markers in ``text`` with highlighted HTML.

    Every occurrence of ``[Understand]``, ``[Plan]`` and ``[Conclude]`` is
    replaced by a ``<strong class="special-tag">`` element surrounded by
    newlines; all other text is returned unchanged.
    """
    for tag in ("[Understand]", "[Plan]", "[Conclude]"):
        text = text.replace(tag, f'\n<strong class="special-tag">{tag}</strong>\n')
    return text

@spaces.GPU
def generate_response(message, chat_history, system_prompt, temperature, max_tokens):
    """Stream the assistant's reply to ``message`` as successive chat histories.

    Generation runs on a background thread while this generator consumes the
    streamer and yields an updated history after every decoded chunk.

    Args:
        message: The user's new question.
        chat_history: Earlier ``(user, assistant)`` turns from the chatbot
            component. ``None`` (the value the "Clear History" button writes)
            is treated as an empty conversation.
        system_prompt: Text sent as the system message.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_tokens: Upper bound on the number of newly generated tokens.

    Yields:
        The history extended with the current turn. Intermediate updates end
        with a "▌" cursor; the final update omits it.
    """
    # The chatbot value can be None after it has been cleared.
    chat_history = chat_history or []

    # Create conversation history for model
    conversation = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in chat_history:
        conversation.extend([
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": bot_msg}
        ])
    conversation.append({"role": "user", "content": message})

    # Tokenize input
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Setup streaming. skip_prompt=True keeps the echoed prompt (system text
    # and earlier turns) out of the streamed reply.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # A temperature of 0 is not valid for sampling, so fall back to greedy
    # decoding in that case and only forward the temperature when sampling.
    do_sample = temperature > 0
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        # Slider values may arrive as floats; generate() needs an integer.
        max_new_tokens=int(max_tokens),
        do_sample=do_sample,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )
    if do_sample:
        generate_kwargs["temperature"] = temperature

    # Start generation thread
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    # Initialize response buffer
    partial_message = ""
    new_history = chat_history + [(message, "")]

    # Stream response
    for new_token in streamer:
        partial_message += new_token
        formatted = format_response(partial_message)
        new_history[-1] = (message, formatted + "▌")
        yield new_history

    # Final update without cursor
    new_history[-1] = (message, format_response(partial_message))
    yield new_history

# Load the model once at import time; generate_response and StopOnTokens read
# these module-level globals.
model, tokenizer = initialize_model()

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    <h1 align="center">🧠 AI Reasoning Assistant</h1>
    <p align="center">Ask me Hard questions</p>
    """)
    
    chatbot = gr.Chatbot(label="Conversation", elem_id="chatbot")
    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")
    
    # Generation settings forwarded to generate_response on every submit.
    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
        temperature = gr.Slider(0, 1, value=0.7, label="Creativity")
        max_tokens = gr.Slider(128, 4096, value=2048, label="Max Response Length")

    clear = gr.Button("Clear History")
    
    # Submitting the textbox streams the updated history into the chatbot.
    msg.submit(
        generate_response,
        [msg, chatbot, system_prompt, temperature, max_tokens],
        [chatbot],
        show_progress=True
    )
    # Clearing writes None into the chatbot component.
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue().launch()