import re
import threading

import gradio as gr
import spaces
import transformers
from transformers import pipeline

# load the model and tokenizer
model_name = "Qwen/Qwen2-1.5B-Instruct"
if gr.NO_RELOAD:
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device_map="auto",
        torch_dtype="auto",
    )

# the marker used to detect the final answer
ANSWER_MARKER = "**ANSWER**"

# the sentences that start each step of the reasoning
rethink_prepends = [
    "OK, I need to figure out ",
    "I think ",
    "Wait, I think ",
    "Let me check if ",
    "I should also remember that ",
    "Another thing to note is that ",
    "I also recall that ",
    "I think I have a good grasp ",
    "Now, using all the above information, I can answer the question using the original language used for the question:"
    "\n{question}\n"
    f"\n{ANSWER_MARKER}\n",
]

# to fix some problems with math display
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]


def reformat_math(text):
    """Fix MathJax delimiters to use the Gradio syntax (KaTeX).

    This is a workaround to display math formulas in Gradio. For now, I haven't
    found a way to make it work as expected with other latex_delimiters...
    """
    text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
    return text


def user_input(message, history: list):
    """Append the user input to the history and clear the input textbox"""
    return "", history + [
        gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
    ]


def rebuild_messages(history: list):
    """Rebuild the messages from the history to be sent to the model, without
    the intermediate thoughts"""
    messages = []
    for h in history:
        if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
            messages.append(h)
        elif (
            isinstance(h, gr.ChatMessage)
            and h.metadata.get("title")
            and isinstance(h.content, str)
        ):
            messages.append({"role": h.role, "content": h.content})
    return messages


@spaces.GPU
def bot(history: list, max_num_tokens: int, final_num_tokens: int):
    """Make the model answer the question"""

    # to get the tokens as a stream, generated later in a thread
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,  # pyright: ignore
        skip_special_tokens=True,
        skip_prompt=True,
    )

    # to reinsert the question in the reasoning if needed
    question = history[-1]["content"]

    # prepare the assistant message
    history.append(
        gr.ChatMessage(
            role="assistant",
            content=str(""),
            metadata={
                "title": "Thinking",
            },
        )
    )

    # for the moment, the reasoning is displayed in the chat
    messages = rebuild_messages(history)
    for i, prepend in enumerate(rethink_prepends):
        if i > 0:
            messages[-1]["content"] += "\n\n"
        messages[-1]["content"] += prepend.format(question=question)
        num_tokens = int(
            max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
        )
        t = threading.Thread(
            target=pipe,
            args=(messages,),
            kwargs=dict(
                max_new_tokens=num_tokens,
                streamer=streamer,
            ),
        )
        t.start()

        # rebuild the history with the new content
        history[-1].content += prepend.format(question=question)
        if ANSWER_MARKER in prepend:
            # stop thinking, this is the answer now (no metadata for intermediate steps)
            history.append(gr.ChatMessage(role="assistant", content=""))
        for token in streamer:
            history[-1].content += token
            history[-1].content = reformat_math(history[-1].content)
            yield history
        t.join()
    yield history


with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
    with gr.Row(scale=1):
        with gr.Column(scale=5):
            gr.Markdown(f"""
# Force reasoning for any model

This is a simple proof-of-concept to get any LLM to reason ahead of its response.

This interface uses the *{model_name}* model, which is **not** a reasoning model. The
method only forces some "reasoning" steps with prefixes to help the model enhance its
answer.

See the related article here:
[Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
            """)
            chatbot = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
            )
            msg = gr.Textbox(
                submit_btn=True,
                label="",
                show_label=False,
                placeholder="Type your question here.",
                autofocus=True,
            )
        with gr.Column(scale=1):
            gr.Markdown("""## Tweaks""")
            num_tokens = gr.Slider(
                50,
                255,
                100,
                step=1,
                label="Max tokens per reasoning step",
                interactive=True,
            )
            final_num_tokens = gr.Slider(
                50,
                255,
                200,
                step=1,
                label="Max tokens for the final answer",
                interactive=True,
            )
            gr.Markdown("""
Using a smaller number of tokens in the reasoning steps makes the model answer faster,
but it may not be able to go deep enough in its reasoning. A good value is 100.

Using a smaller number of tokens for the final answer makes the model less verbose,
but it may not be able to give a complete answer. A good value is 200 to 255.
            """)
            gr.Markdown("""
This interface can work on a personal computer with 6 GB of VRAM (e.g. an NVIDIA
3050/3060 on a laptop).

Feel free to fork the application and try other instruct models.
            """)

    # when the user submits a message, the bot will answer
    msg.submit(
        user_input,
        [msg, chatbot],  # inputs
        [msg, chatbot],  # outputs
    ).then(
        bot,
        [chatbot, num_tokens, final_num_tokens],  # actually, the "history" input
        chatbot,  # to store the new history from the output
    )

if __name__ == "__main__":
    demo.queue().launch()