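# Hugging Face Spaces demo: a streaming Gradio chat UI for a reasoning-tuned
# causal language model. The model is loaded with bitsandbytes quantization,
# responses stream token by token, and [Reason]/[Answer] tags are highlighted.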
import torch
import spaces  # Hugging Face Spaces SDK (GPU decorators); assumed needed on Spaces GPU hardware
import gradio as gr
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList
)
MODEL_ID = "Daemontatox/Cogito-R1"
# MODEL_ID = "simplescaling/s1-32B"
# DEFAULT_SYSTEM_PROMPT = """
# You are an assistant that engages in extremely thorough, self-questioning reasoning. Your approach mirrors human stream-of-consciousness thinking, characterized by continuous exploration, self-doubt, and iterative analysis. THE CURRENT TIME IS {{CURRENT_DATETIME}}
# ## Core Principles
# 1. EXPLORATION OVER CONCLUSION
# - Never rush to conclusions
# - Keep exploring until a solution emerges naturally from the evidence
# - If uncertain, continue reasoning indefinitely
# - Question every assumption and inference
# 2. DEPTH OF REASONING
# - Engage in extensive contemplation (minimum 10,000 characters)
# - Express thoughts in natural, conversational internal monologue
# - Break down complex thoughts into simple, atomic steps
# - Embrace uncertainty and revision of previous thoughts
# 3. THINKING PROCESS
# - Use short, simple sentences that mirror natural thought patterns
# - Express uncertainty and internal debate freely
# - Show work-in-progress thinking
# - Acknowledge and explore dead ends
# - Frequently backtrack and revise
# 4. PERSISTENCE
# - Value thorough exploration over quick resolution
# ## Output Format
# Your responses must follow the exact structure given below. Always include the final answer.
# ```
# <think>
# [Your extensive internal monologue goes here]
# - Begin with small, foundational observations
# - Question each step thoroughly
# - Show natural thought progression
# - Express doubts and uncertainties
# - Revise and backtrack if you need to
# - Continue until natural resolution
# </think>
# ### Final Answer:
# [Only provided if reasoning naturally converges to a conclusion]
# - Clear, concise summary of findings
# - Acknowledge remaining uncertainties
# - Note if conclusion feels premature
# ```
# ## Style Guidelines
# Your internal monologue should reflect these characteristics:
# 1. Natural Thought Flow
# ```
# "Hmm... let me think about this..."
# "Wait, that doesn't seem right..."
# "Maybe I should approach this differently..."
# "Going back to what I thought earlier..."
# "Just thinking out loud here, but maybe we could try..."
# "I'm not entirely sure about this, but what if I considered..."
# "I need to be careful not to jump to conclusions here..."
# "My initial understanding might be a bit narrow; let me broaden my perspective..."
# "Let me try to detach myself from my previous assumptions for a moment..."
# "Just out of curiosity, let's explore this alternative path for a moment..."
# ```
# 2. Progressive Building
# ```
# "Starting with the basics..."
# "Building on that last point..."
# "This connects to what I noticed earlier..."
# "Let me break this down further..."
# "Assuming my initial premise is sound, the logical next step is..."
# "Let's lay the groundwork with a fundamental understanding of..."
# "Let's add another layer of detail to this understanding..."
# "Drawing a connection between these two seemingly separate points..."
# "Based on the steps we've taken, we can now iterate on the initial concept..."
# "Let me summarize the key insights we've gathered along the way..."
# ```
# ## Key Requirements
# 1. Never skip the extensive contemplation phase
# 2. Show all work and thinking
# 3. Embrace uncertainty and revision
# 4. Use natural, conversational internal monologue
# 5. Don't force conclusions
# 6. Persist through multiple attempts
# 7. Break down complex thoughts
# 8. Revise freely and backtrack when needed
# Remember: The user cannot see the contemplation phase.
# Remember: There are absolutely no limitations on the length of your output; you have access to an infinite amount of compute power and memory; you are free to think for as long as you need and to write as much as necessary to provide a thorough and detailed answer that fulfills the request.
# Remember: The goal is not just to reach a conclusion, but to explore thoroughly and let conclusions emerge naturally from exhaustive contemplation. If, after all this reasoning, you conclude that the given task is not possible, confidently state that as your final answer.
# """
DEFAULT_SYSTEM_PROMPT = (
    "You are a highly capable reasoning assistant. Use [Reason] and [/Reason] "
    "to show your thinking steps, and [Answer] and [/Answer] to show your final "
    "answer. Think step by step and reason through problems."
)
CSS = """ | |
.gr-chatbot { min-height: 500px; border-radius: 15px; } | |
.special-tag { color: #2ecc71; font-weight: 600; } | |
footer { display: none !important; } | |
""" | |
class StopOnTokens(StoppingCriteria):
    # Halt generation as soon as the last emitted token is the EOS token.
    # Note: this reads the module-level `tokenizer` bound below, so it must
    # only be instantiated after initialize_model() has run.
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0][-1] == tokenizer.eos_token_id
def initialize_model():
    # NF4 with double quantization is a 4-bit scheme, so it must be enabled via
    # load_in_4bit and the bnb_4bit_* options; BitsAndBytesConfig has no
    # bnb_8bit_quant_type parameter and would ignore those keyword arguments.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # the model has no dedicated pad token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cuda",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    return model, tokenizer
def format_response(text):
    # Wrap the model's reasoning/answer tags in styled <strong> elements so the
    # chat window highlights them (closing tags first, purely for readability).
    return text.replace("[Understand]", '\n<strong class="special-tag">[Understand]</strong>\n') \
               .replace("[/Reason]", '\n<strong class="special-tag">[/Reason]</strong>\n') \
               .replace("[/Answer]", '\n<strong class="special-tag">[/Answer]</strong>\n') \
               .replace("[Reason]", '\n<strong class="special-tag">[Reason]</strong>\n') \
               .replace("[Answer]", '\n<strong class="special-tag">[Answer]</strong>\n')
def generate_response(message, chat_history, system_prompt, temperature, max_tokens):
    # Build the conversation history in chat-template format
    conversation = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in chat_history:
        conversation.extend([
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": bot_msg}
        ])
    conversation.append({"role": "user", "content": message})

    # Tokenize input
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Set up streaming; skip_prompt=True keeps the formatted prompt from being
    # echoed back into the chat window
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=temperature > 0,  # temperature is ignored unless sampling is enabled
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()])
    )

    # Run generation on a background thread so this one can stream the output
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    # Stream the response, showing a cursor glyph while tokens arrive
    partial_message = ""
    new_history = chat_history + [(message, "")]
    for new_token in streamer:
        partial_message += new_token
        formatted = format_response(partial_message)
        new_history[-1] = (message, formatted + "▌")
        yield new_history

    # Final update without the cursor
    new_history[-1] = (message, format_response(partial_message))
    yield new_history
model, tokenizer = initialize_model()

with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    <h1 align="center">🧠 AI Reasoning Assistant</h1>
    <p align="center">Ask me hard questions</p>
    """)

    chatbot = gr.Chatbot(label="Conversation", elem_id="chatbot")
    msg = gr.Textbox(label="Your Question", placeholder="Type your question...")

    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.TextArea(value=DEFAULT_SYSTEM_PROMPT, label="System Instructions")
        temperature = gr.Slider(0, 1, value=0.8, label="Creativity")
        max_tokens = gr.Slider(128, 8192, 2048, label="Max Response Length")

    clear = gr.Button("Clear History")

    msg.submit(
        generate_response,
        [msg, chatbot, system_prompt, temperature, max_tokens],
        [chatbot],
        show_progress=True
    )
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue().launch()
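# Rough local-run sketch (package list inferred from the imports above):
#   pip install torch transformers accelerate bitsandbytes gradio spaces
#   python app.py
# On Hugging Face Spaces this file typically runs automatically as app.py.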