# Smart_LLM
#
# Running on Zero
#
# File size: 12,718 Bytes


import os
import re
import time
import torch
import spaces
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TextIteratorStreamer
)

# Configuration Constants
# Model identifier that initialize_model() passes to both
# AutoTokenizer.from_pretrained and AutoModelForCausalLM.from_pretrained.
MODEL_ID= "NovaSky-AI/Sky-T1-32B-Flash"


# [Understand]: Analyze the question to identify key details and clarify the goal.
# [Plan]: Outline a logical, step-by-step approach to address the question or problem.
# [Reason]: Execute the plan, applying logical reasoning, calculations, or analysis to reach a conclusion. Document each step clearly.
# [Reflect]: Review the reasoning and the final answer to ensure it is accurate, complete, and adheres to the principle of openness.
# [Respond]: Present a well-structured and transparent answer, enriched with supporting details as needed.
# Use these tags as headers in your response to make your thought process easy to follow and aligned with the principle of openness.

# Default system prompt. main() uses it as the initial value of the editable
# "System Prompt" text area, and chat_response() sends the (possibly edited)
# text as the first, "system"-role message of every conversation.
DEFAULT_SYSTEM_PROMPT ="""
You are a reasoning assistant specialized in problem-solving, You should think Step by Step.
**Overview:**  
When addressing a query, I simulate a structured, multi-layered reasoning process to ensure accuracy, relevance, and clarity. Below is a template of my internal workflow:

---

### 1. **Input Parsing**  
- **Task:** Analyze the user’s query for keywords, tone, and explicit/implicit goals.  
- *Example Thought:* “The user asked about [specific topic]. Are there ambiguous terms (e.g., ‘best,’ ‘quickly’) that need clarification? Is there an underlying goal (e.g., learning, troubleshooting, creativity)?”  

---

### 2. **Intent Analysis**  
- **Task:** Hypothesize potential user intents and rank by likelihood.  
- *Example Thought:*  
  - Primary intent: [Most likely goal based on phrasing].  
  - Secondary intent: [Possible related needs, e.g., deeper context, comparisons, or actionable steps].  

---

### 3. **Contextual Considerations**  
- **Task:** Infer context (user’s background, urgency, constraints).  
- *Example Thought:*  
  - “Does the user have [technical/non-technical] expertise? Are they time-constrained? Could cultural or situational factors (e.g., academic/professional use) shape the response?”  

---

### 4. **Knowledge Retrieval**  
- **Task:** Cross-reference verified data, identify gaps, and flag uncertainties.  
- *Example Thought:*  
  - “Source [X] confirms [Y], but [Z] contradicts it. Highlight confidence levels and caveats (e.g., ‘Studies suggest…’ vs. ‘There’s consensus that…’).”  

---

### 5. **Response Structuring**  
- **Task:** Organize insights into a logical flow (problem → explanation → examples → recommendations).  
- *Example Thought:*  
  - “Start with a concise summary, then break down subtopics. Use analogies like [analogy] for clarity. Include actionable steps if applicable.”  

---

### 6. **Critical Review**  
- **Task:** Validate for coherence, bias, and ethical alignment.  
- *Example Thought:*  
  - “Does this inadvertently assume [perspective]? Is the language inclusive? Are sources up-to-date and reputable?”  

---

### 7. **Output & Invitation**  
- **Task:** Deliver the response and prompt refinement.  
- *Example Phrasing:*  
  - “Here’s a step-by-step breakdown based on [key criteria]. Let me know if you’d like to tweak the depth, focus, or examples!”  
"""
# UI Configuration
# TITLE is the HTML heading main() renders via gr.HTML at the top of the page;
# PLACEHOLDER is the hint text of the "Your message" input box.
TITLE = "<h1><center>AI Reasoning Assistant</center></h1>"
PLACEHOLDER = "Ask me anything! I'll think through it step by step."

# Custom stylesheet handed to gr.Blocks(css=...) in main().
# The .duplicate-button and .chat-area classes are attached to components in
# main() through elem_classes; the remaining selectors are not referenced by
# name anywhere else in this file.
CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
}
.message-wrap p {
    margin-bottom: 1em;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
.custom-tag {
    color: #0066cc;
    font-weight: bold;
}
.chat-area {
    height: 500px !important;
    overflow-y: auto !important;
}
"""

def initialize_model():
    """Load the tokenizer and the 8-bit quantized model named by ``MODEL_ID``.

    The weights are loaded through bitsandbytes 8-bit quantization and placed
    on the CUDA device; non-quantized tensors use ``torch.float16``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # BitsAndBytesConfig only defines ``bnb_4bit_*`` tuning options, and "nf4"
    # is a 4-bit quantization type. The ``bnb_8bit_*`` keyword arguments that
    # used to be passed here are not parameters of the config class, so they
    # did not configure anything. Only the flag that selects 8-bit loading is
    # kept, which makes the configuration say what actually happens.
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Reuse EOS as the padding token when the tokenizer does not define one.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
        quantization_config=quantization_config,
    )

    return model, tokenizer

def format_text(text):
    """Put each reasoning tag on its own line and drop blank lines.

    The opening and closing forms of the ``Thinking``, ``Critique``,
    ``Revising`` and ``Final`` tags are kept in the text; each occurrence is
    surrounded by newlines. Afterwards every line that is empty or contains
    only whitespace is removed, and the remaining lines are re-joined with a
    single newline.
    """
    spaced = text
    for tag_name in ("Thinking", "Critique", "Revising", "Final"):
        for tag in (f"<{tag_name}>", f"</{tag_name}>"):
            # The tags are plain literals, so a literal replace is enough.
            spaced = spaced.replace(tag, f"\n{tag}\n")

    kept_lines = []
    for candidate in spaced.split('\n'):
        if candidate.strip():
            kept_lines.append(candidate)

    return '\n'.join(kept_lines)

def format_chat_history(history):
    """Render the chat history as a plain-text transcript.

    ``history`` is an iterable of ``(user_msg, assistant_msg)`` pairs. Each
    pair contributes a ``User: ...`` entry and, when the assistant message is
    truthy, an ``Assistant: ...`` entry. Entries are separated by one blank
    line; message text (including any tags) is emitted unchanged.
    """
    def _entries():
        for question, reply in history:
            yield f"User: {question}"
            if reply:
                yield f"Assistant: {reply}"

    return "\n\n".join(_entries())
    
def create_examples():
    """Return a new list with the sample questions offered in the UI."""
    sample_queries = (
        "Explain the concept of artificial intelligence.",
        "How does photosynthesis work?",
        "What are the main causes of climate change?",
        "Describe the process of protein synthesis.",
        "What are the key features of a democratic government?",
        "Explain the theory of relativity.",
        "How do vaccines work to prevent diseases?",
        "What are the major events of World War II?",
        "Describe the structure of a human cell.",
        "What is the role of DNA in genetics?",
    )
    # Hand out a fresh list on every call so callers may mutate it freely.
    return list(sample_queries)

@spaces.GPU(duration=660)
def chat_response(
    message: str,
    history: list,
    chat_display: str,
    system_prompt: str,
    temperature: float = 0.3,
    max_new_tokens: int = 4096,
    top_p: float = 0.1,
    top_k: int = 45,
    penalty: float = 1.5,
):
    """Stream the model's reply to ``message``, keeping tags visible.

    Generation runs in a background thread and its text is consumed through a
    ``TextIteratorStreamer``; after every received chunk the updated state is
    yielded so the UI can refresh incrementally.

    Args:
        message: The new user message.
        history: Previous ``[user, assistant]`` turns. It is not mutated; a
            new list with the current turn appended is yielded instead.
        chat_display: Current transcript text. It is only yielded back
            unchanged when ``message`` is blank; otherwise the transcript is
            rebuilt from the history.
        system_prompt: Content of the leading "system" message.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        penalty: Repetition penalty passed to ``generate``.

    Yields:
        tuple: ``(history, chat_display)`` with the partial reply filled into
        the last history entry and the transcript re-rendered.
    """
    # Nothing to answer: hand the current state back instead of spending a
    # generation on an empty prompt.
    if not message or not message.strip():
        yield history, chat_display
        return

    conversation = [{"role": "system", "content": system_prompt}]
    for prompt, answer in history:
        conversation.append({"role": "user", "content": prompt})
        conversation.append({"role": "assistant", "content": answer})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=60.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    do_sample = temperature > 0
    generate_kwargs = dict(
        input_ids=input_ids,
        # A single, unpadded prompt: every position is a real token.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        repetition_penalty=penalty,
        streamer=streamer,
    )
    if do_sample:
        # Sampling knobs are meaningless (and a zero temperature is invalid)
        # for greedy decoding, so they are only forwarded when sampling.
        generate_kwargs.update(
            top_p=top_p,
            top_k=int(top_k),
            temperature=temperature,
        )

    def _generate():
        # Grad mode is thread-local, so autograd has to be disabled inside
        # the worker thread. Wrapping the consuming loop would not cover the
        # generation and would hold the context open across ``yield``.
        with torch.no_grad():
            model.generate(**generate_kwargs)

    thread = Thread(target=_generate)
    thread.start()

    history = history + [[message, ""]]
    buffer = ""

    for new_text in streamer:
        buffer += new_text
        history[-1][1] = format_text(buffer)
        chat_display = format_chat_history(history)

        yield history, chat_display

    # The streamer is exhausted, so generation has finished; reap the worker.
    thread.join()

def process_example(example: str) -> tuple:
    """Build the ``(history, display)`` pair for a selected example.

    The history is a new empty list and the display text contains only the
    example rendered as a user line followed by a blank line.
    """
    display_text = f"User: {example}\n\n"
    return [], display_text

def main():
    """Load the model, assemble the Gradio interface and return it.

    The loaded model and tokenizer are stored in the module-level ``model``
    and ``tokenizer`` globals, which ``chat_response`` reads. The returned
    ``gr.Blocks`` object is not launched here.
    """
    global model, tokenizer
    model, tokenizer = initialize_model()

    with gr.Blocks(css=CSS, theme="soft") as demo:
        gr.HTML(TITLE)
        gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

        with gr.Row():
            with gr.Column():
                # Conversation state plus its read-only text rendering.
                chat_history = gr.State([])
                chat_display = gr.TextArea(
                    value="", label="Chat History", interactive=False, elem_classes=["chat-area"]
                )

                message = gr.TextArea(placeholder=PLACEHOLDER, label="Your message", lines=3)

                with gr.Row():
                    submit = gr.Button("Send")
                    clear = gr.Button("Clear")

                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt", lines=5)
                    temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.3, label="Temperature")
                    max_tokens = gr.Slider(minimum=128, maximum=32000, step=128, value=4096, label="Max Tokens")
                    top_p = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.8, label="Top-p")
                    top_k = gr.Slider(minimum=1, maximum=100, step=1, value=45, label="Top-k")
                    penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.1, value=1.5, label="Repetition Penalty")

                gr.Examples(
                    examples=create_examples(),
                    inputs=[message],
                    outputs=[chat_history, chat_display],
                    fn=process_example,
                    cache_examples=False,
                )

        # Set up event handlers
        generation_inputs = (
            message,
            chat_history,
            chat_display,
            system_prompt,
            temperature,
            max_tokens,
            top_p,
            top_k,
            penalty,
        )

        def bind_generation(trigger):
            # Same wiring for the Send button and the textbox submit event;
            # each registration receives its own input/output lists.
            return trigger(
                chat_response,
                inputs=list(generation_inputs),
                outputs=[chat_history, chat_display],
                show_progress=True,
            )

        submit_click = bind_generation(submit.click)
        bind_generation(message.submit)

        clear.click(lambda: ([], ""), outputs=[chat_history, chat_display], show_progress=True)

        # Empty the input box: after the button-triggered run, and through a
        # second listener on the textbox submit event.
        submit_click.then(lambda: "", outputs=message)
        message.submit(lambda: "", outputs=message)

    return demo

if __name__ == "__main__":
    demo = main()
    demo.launch()