import re
import threading

import gradio as gr
import spaces
import transformers
from transformers import pipeline

# load the model and tokenizer
model_name = "Qwen/Qwen2-1.5B-Instruct"
if gr.NO_RELOAD:
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device_map="auto",
        torch_dtype="auto",
    )

# the marker used to detect the final answer
ANSWER_MARKER = "**ANSWER**"

# the sentences that start each step of the reasoning
rethink_prepends = [
    "OK, I need to figure out ",
    "I think ",
    "Wait, I think ",
    "Let me check if ",
    "I should also remember that ",
    "Another thing to note is that ",
    "I also recall that ",
    "I think I have a good grasp ",
    "Now, using all the above information, I can answer the question using the original language used for the question:"
    "\n{question}\n"
    f"\n{ANSWER_MARKER}\n",
]

# to fix some problems with math display
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]


def reformat_math(text):
    """Fix MathJax delimiters to use the Gradio syntax (KaTeX).

    This is a workaround to display math formulas in Gradio. For now, I haven't
    found a way to make it work as expected with other latex_delimiters...
    """
    text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
    return text


def user_input(message, history: list):
    """Append the user input to the history and clear the input textbox"""
    return "", history + [
        gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
    ]


def rebuild_messages(history: list):
    """Rebuild the messages from the history to be sent to the model, without
    the intermediate thoughts"""
    messages = []
    for h in history:
        if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
            messages.append(h)
        elif (
            isinstance(h, gr.ChatMessage)
            and h.metadata.get("title")
            and isinstance(h.content, str)
        ):
            messages.append({"role": h.role, "content": h.content})
    return messages


@spaces.GPU
def bot(history: list, max_num_tokens: int, final_num_tokens: int):
    """Make the model answer the question"""

    # to get the tokens as a stream, generated later in a thread
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,  # pyright: ignore
        skip_special_tokens=True,
        skip_prompt=True,
    )

    # to reinsert the question in the reasoning if needed
    question = history[-1]["content"]

    # prepare the assistant message
    history.append(
        gr.ChatMessage(
            role="assistant",
            content=str(""),
            metadata={
                "title": "Thinking",
            },
        )
    )

    # for the moment, the reasoning is displayed in the chat
    messages = rebuild_messages(history)
    for i, prepend in enumerate(rethink_prepends):
        if i > 0:
            messages[-1]["content"] += "\n\n"
        messages[-1]["content"] += prepend.format(question=question)
        num_tokens = int(
            max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
        )
        t = threading.Thread(
            target=pipe,
            args=(messages,),
            kwargs=dict(
                max_new_tokens=num_tokens,
                streamer=streamer,
            ),
        )
        t.start()

        # rebuild the history with the new content
        history[-1].content += prepend.format(question=question)
        if ANSWER_MARKER in prepend:
            # stop thinking, this is the answer now (no metadata for intermediate steps)
            history.append(gr.ChatMessage(role="assistant", content=""))
        for token in streamer:
            history[-1].content += token
            history[-1].content = reformat_math(history[-1].content)
            yield history
        t.join()
    yield history


with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
    with gr.Row(scale=1):
        with gr.Column(scale=5):
            gr.Markdown(f"""
# Force reasoning for any model

This is a simple proof-of-concept to get any LLM to reason ahead of its response.

This interface uses the *{model_name}* model, which is **not** a reasoning model. The
method only forces some "reasoning" steps with prefixes to help the model enhance its
answer.

See the related article here:
[Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
            """)
            chatbot = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
            )
            msg = gr.Textbox(
                submit_btn=True,
                label="",
                show_label=False,
                placeholder="Type your question here.",
                autofocus=True,
            )
        with gr.Column(scale=1):
            gr.Markdown("""## Tweaks""")
            num_tokens = gr.Slider(
                50,
                255,
                100,
                step=1,
                label="Max tokens per reasoning step",
                interactive=True,
            )
            final_num_tokens = gr.Slider(
                50,
                255,
                200,
                step=1,
                label="Max tokens for the final answer",
                interactive=True,
            )
            gr.Markdown("""
Using a smaller number of tokens in the reasoning steps makes the model answer faster,
but it may not be able to go deep enough in its reasoning. A good value is 100.

Using a smaller number of tokens for the final answer makes the model less verbose,
but it may not be able to give a complete answer. A good value is 200 to 255.
            """)
            gr.Markdown("""
This interface can work on a personal computer with 6 GB of VRAM (e.g. an NVIDIA
3050/3060 on a laptop).

Feel free to fork the application and try other instruct models.
            """)

    # when the user submits a message, the bot will answer
    msg.submit(
        user_input,
        [msg, chatbot],  # inputs
        [msg, chatbot],  # outputs
    ).then(
        bot,
        [chatbot, num_tokens, final_num_tokens],  # actually, the "history" input
        chatbot,  # to store the new history from the output
    )

if __name__ == "__main__":
    demo.queue().launch()