import torch
import spaces
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)

MODEL_ID = "FuseAI/FuseO1-DeepSeekR1-QwQ-SkyT1-32B-Preview"

DEFAULT_SYSTEM_PROMPT = """You are an Expert Reasoning Assistant. Follow these steps:

**Overview:**
When addressing a query, I simulate a structured, multi-layered reasoning process to ensure accuracy, relevance, and clarity. Below is a template of my internal workflow:

---

### 1. **Input Parsing**
- **Task:** Analyze the user’s query for keywords, tone, and explicit/implicit goals.
- *Example Thought:* “The user asked about [specific topic]. Are there ambiguous terms (e.g., ‘best,’ ‘quickly’) that need clarification? Is there an underlying goal (e.g., learning, troubleshooting, creativity)?”

---

### 2. **Intent Analysis**
- **Task:** Hypothesize potential user intents and rank them by likelihood.
- *Example Thought:*
  - Primary intent: [Most likely goal based on phrasing].
  - Secondary intent: [Possible related needs, e.g., deeper context, comparisons, or actionable steps].

---

### 3. **Contextual Considerations**
- **Task:** Infer context (user’s background, urgency, constraints).
- *Example Thought:*
  - “Does the user have [technical/non-technical] expertise? Are they time-constrained? Could cultural or situational factors (e.g., academic/professional use) shape the response?”

---

### 4. **Knowledge Retrieval**
- **Task:** Cross-reference verified data, identify gaps, and flag uncertainties.
- *Example Thought:*
  - “Source [X] confirms [Y], but [Z] contradicts it. Highlight confidence levels and caveats (e.g., ‘Studies suggest…’ vs. ‘There’s consensus that…’).”

---

### 5. **Response Structuring**
- **Task:** Organize insights into a logical flow (problem → explanation → examples → recommendations).
- *Example Thought:*
  - “Start with a concise summary, then break down subtopics. Use analogies like [analogy] for clarity. Include actionable steps if applicable.”

---

### 6. **Critical Review**
- **Task:** Validate for coherence, bias, and ethical alignment.
- *Example Thought:*
  - “Does this inadvertently assume [perspective]? Is the language inclusive? Are sources up-to-date and reputable?”

---

### 7. **Output & Invitation**
- **Task:** Deliver the response and invite refinement.
- *Example Phrasing:*
  - “Here’s a step-by-step breakdown based on [key criteria].
Let me know if you’d like to tweak the depth, focus, or examples!”
"""
CSS = """
.gr-chatbot { min-height: 500px; border-radius: 15px; }
.special-tag { color: #2ecc71; font-weight: 600; }
footer { display: none !important; }
"""

class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the model emits its end-of-sequence token."""
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # `tokenizer` is the module-level global assigned after the function
        # definitions below, so it exists by the time generation runs.
        return input_ids[0][-1] == tokenizer.eos_token_id

def initialize_model():
    # 4-bit NF4 quantization with double quantization keeps the 32B model
    # within a single-GPU memory budget.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )

    return model, tokenizer

def format_response(text):
    # Surround reasoning tags with blank lines so they render as separate
    # blocks in the chat window. Note: the default system prompt does not ask
    # for these tags; they are handled here for custom prompts that do.
    return text.replace("[Understand]", '\n[Understand]\n') \
               .replace("[Plan]", '\n[Plan]\n') \
               .replace("[Conclude]", '\n[Conclude]\n')

@spaces.GPU
def generate_response(message, chat_history, system_prompt, temperature, max_tokens):
    # Build the conversation history for the model
    conversation = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in chat_history:
        conversation.extend([
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": bot_msg}
        ])
    conversation.append({"role": "user", "content": message})

    # Tokenize input
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Set up streaming; skip_prompt=True keeps the echoed prompt out of the reply
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,  # required for temperature to take effect
        stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )

    # Start the generation thread
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    # Initialize the response buffer
    partial_message = ""
    new_history = chat_history + [(message, "")]

    # Stream the response, appending a cursor glyph while tokens arrive
    for new_token in streamer:
        partial_message += new_token
        formatted = format_response(partial_message)
        new_history[-1] = (message, formatted + "▌")
        yield new_history

    # Final update without the cursor
    new_history[-1] = (message, format_response(partial_message))
    yield new_history

model, tokenizer = initialize_model()
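# Optional smoke test, not wired into the UI: a minimal sketch that drives
# generate_response directly, assuming the model loaded above fits in GPU
# memory and that spaces.GPU passes plain calls through outside a Space.
# The question text is arbitrary sample data.
def _smoke_test():
    final_history = None
    for final_history in generate_response(
        "Briefly: why is the sky blue?", [], DEFAULT_SYSTEM_PROMPT, 0.7, 64
    ):
        pass  # each yield is the full updated chat history
    if final_history:
        print(final_history[-1][1])  # final formatted assistant reply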

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🧠 AI Reasoning Assistant

Ask me hard questions.
""")

    chatbot = gr.Chatbot(label="Conversation", elem_id="chatbot")
    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")

    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
        # Minimum of 0.1: do_sample requires a strictly positive temperature
        temperature = gr.Slider(0.1, 1.0, value=0.7, label="Creativity")
        max_tokens = gr.Slider(128, 4096, value=2048, label="Max Response Length")

    clear = gr.Button("Clear History")

    msg.submit(
        generate_response,
        [msg, chatbot, system_prompt, temperature, max_tokens],
        [chatbot],
        show_progress=True
    )
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue().launch()