"""Gradio chat UI that streams replies from the ``MODEL_ID`` causal LM.

The model is loaded once at import time with 8-bit bitsandbytes quantization;
``generate_response`` runs generation in a background thread and yields the
growing chat history so the UI can render tokens as they arrive.
"""

from threading import Thread

import torch
import spaces
import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
)

MODEL_ID = "Daemontatox/Cogito-R1"
# MODEL_ID="simplescaling/s1-32B"

# Disabled alternative system prompt, kept for reference.
# DEFAULT_SYSTEM_PROMPT = """
# You are an assistant that engages in extremely thorough, self-questioning reasoning. Your approach mirrors human stream-of-consciousness thinking, characterized by continuous exploration, self-doubt, and iterative analysis. THE CURRENT TIME IS {{CURRENT_DATETIME}}
# ## Core Principles
# 1. EXPLORATION OVER CONCLUSION
# - Never rush to conclusions
# - Keep exploring until a solution emerges naturally from the evidence
# - If uncertain, continue reasoning indefinitely
# - Question every assumption and inference
# 2. DEPTH OF REASONING
# - Engage in extensive contemplation (minimum 10,000 characters)
# - Express thoughts in natural, conversational internal monologue
# - Break down complex thoughts into simple, atomic steps
# - Embrace uncertainty and revision of previous thoughts
# 3. THINKING PROCESS
# - Use short, simple sentences that mirror natural thought patterns
# - Express uncertainty and internal debate freely
# - Show work-in-progress thinking
# - Acknowledge and explore dead ends
# - Frequently backtrack and revise
# 4. PERSISTENCE
# - Value thorough exploration over quick resolution
# ## Output Format
# Your responses must follow this exact structure given below. Make sure to always include the final answer.
# ```
#
# [Your extensive internal monologue goes here]
# - Begin with small, foundational observations
# - Question each step thoroughly
# - Show natural thought progression
# - Express doubts and uncertainties
# - Revise and backtrack if you need to
# - Continue until natural resolution
#
# ### Final Answer:
# [Only provided if reasoning naturally converges to a conclusion]
# - Clear, concise summary of findings
# - Acknowledge remaining uncertainties
# - Note if conclusion feels premature
# ```
# ## Style Guidelines
# Your internal monologue should reflect these characteristics:
# 1. Natural Thought Flow
# ```
# "Hmm... let me think about this..."
# "Wait, that doesn't seem right..."
# "Maybe I should approach this differently..."
# "Going back to what I thought earlier..."
# "Just thinking out loud here, but maybe we could try..."
# "I'm not entirely sure about this, but what if I considered..."
# "I need to be careful not to jump to conclusions here..."
# "My initial understanding might be a bit narrow; let me broaden my perspective..."
# "Let me try to detach myself from my previous assumptions for a moment..."
# "Just out of curiosity, let's explore this alternative path for a moment..."
# ```
# 2. Progressive Building
# ```
# "Starting with the basics..."
# "Building on that last point..."
# "This connects to what I noticed earlier..."
# "Let me break this down further..."
# "Assuming my initial premise is sound, the logical next step is..."
# "Let's lay the groundwork with a fundamental understanding of..."
# "Let's add another layer of detail to this understanding..."
# "Drawing a connection between these two seemingly separate points..."
# "Based on the steps we've taken, we can now iterate on the initial concept..."
# "Let me summarize the key insights we've gathered along the way..."
# ```
# ## Key Requirements
# 1. Never skip the extensive contemplation phase
# 2. Show all work and thinking
# 3. Embrace uncertainty and revision
# 4. Use natural, conversational internal monologue
# 5. Don't force conclusions
# 6. Persist through multiple attempts
# 7. Break down complex thoughts
# 8. Revise freely and feel free to backtrack
# Remember: User can not see the contemplation phase.
# Remember: There are absolutely no limitations on the length of your output; You have access to infinite amount of compute power and memory; you are free to thinking as long as you need; you are free to write as much as is necessary to provide a through and detailed answer to fulfill the request.
# Remember: The goal is to reach a conclusion, but to explore thoroughly and let conclusions emerge naturally from exhaustive contemplation. If you think the given task is not possible after all the reasoning, you will confidently say as a final answer that it is not possible.
# """

# Runtime prompt text: wording and spacing are intentionally left untouched.
DEFAULT_SYSTEM_PROMPT = (
    "You are a highly capable reasoning assistant , "
    "use [Reason] and [/Reason] to show your thinking steps , "
    "use [Answer] and [/Answer] to show your final answer , "
    "you should think step by step and reason through problems. "
)

CSS = """
.gr-chatbot {
    min-height: 500px;
    border-radius: 15px;
}
.special-tag {
    color: #2ecc71;
    font-weight: 600;
}
footer {
    display: none !important;
}
"""


class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the last generated token is EOS.

    Reads the module-level ``tokenizer`` at call time, so it must only be
    invoked after ``initialize_model()`` has populated that global.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only the first sequence in the batch is inspected.
        return input_ids[0][-1] == tokenizer.eos_token_id


def initialize_model():
    """Load the tokenizer and the 8-bit quantized model for ``MODEL_ID``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The previous ``bnb_8bit_*`` keyword arguments are not parameters of
    # BitsAndBytesConfig (nf4 / double-quant are 4-bit-only options), so they
    # had no effect on how the model was loaded. Only the 8-bit switch is kept.
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cuda",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    return model, tokenizer


def format_response(text):
    """Put each known bracket tag in ``text`` on its own line.

    Every occurrence of ``[Understand]``, ``[Reason]``, ``[/Reason]``,
    ``[Answer]`` and ``[/Answer]`` is wrapped in newlines.
    """
    return text.replace("[Understand]", '\n[Understand]\n') \
        .replace("[/Reason]", '\n[/Reason]\n') \
        .replace("[/Answer]", '\n[/Answer]\n') \
        .replace("[Reason]", '\n[Reason]\n') \
        .replace("[Answer]", '\n[Answer]\n')


@spaces.GPU(duration=360)
def generate_response(message, chat_history, system_prompt, temperature, max_tokens):
    """Stream a model reply to ``message`` as successive chat histories.

    Args:
        message: The new user message.
        chat_history: Previous ``(user_msg, bot_msg)`` pairs; ``None`` is
            treated as an empty history.
        system_prompt: Text used as the system message.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_tokens: Upper bound on newly generated tokens (cast to ``int``).

    Yields:
        The chat history with the in-progress reply appended; intermediate
        updates end with a cursor character, the final one does not.
    """
    chat_history = chat_history or []

    # Create conversation history for model
    conversation = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in chat_history:
        conversation.extend([
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": bot_msg},
        ])
    conversation.append({"role": "user", "content": message})

    # Tokenize input
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Setup streaming. skip_prompt=True keeps the decoded prompt (system
    # prompt + history) out of the streamed reply.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # Temperature only applies when sampling, and 0 is not a valid sampling
    # temperature, so fall back to greedy decoding in that case.
    do_sample = temperature > 0
    generate_kwargs = dict(
        input_ids=input_ids,
        # Single unpadded sequence: every position is a real token. Passing
        # the mask explicitly matters because pad_token == eos_token.
        attention_mask=torch.ones_like(input_ids),
        streamer=streamer,
        # Slider values may arrive as floats.
        max_new_tokens=int(max_tokens),
        do_sample=do_sample,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
    )
    if do_sample:
        generate_kwargs["temperature"] = temperature

    # Start generation thread
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    # Initialize response buffer
    partial_message = ""
    new_history = chat_history + [(message, "")]

    # Stream response
    for new_token in streamer:
        partial_message += new_token
        formatted = format_response(partial_message)
        new_history[-1] = (message, formatted + "▌")
        yield new_history

    # Final update without cursor
    new_history[-1] = (message, format_response(partial_message))
    yield new_history


model, tokenizer = initialize_model()

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""

🧠 AI Reasoning Assistant

Ask me Hard questions

""")

    chatbot = gr.Chatbot(label="Conversation", elem_id="chatbot")
    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")

    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
        temperature = gr.Slider(0, 1, value=0.8, label="Creativity")
        max_tokens = gr.Slider(128, 8192, 2048, label="Max Response Length")

    clear = gr.Button("Clear History")

    msg.submit(
        generate_response,
        [msg, chatbot, system_prompt, temperature, max_tokens],
        [chatbot],
        show_progress=True,
    )
    # Clearing resets the chatbot component to an empty value.
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue().launch()