# Nvidia-OpenReasoning
#
# Running
#
# File size: 5,995 Bytes

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from globe import title, description, customtool , presentation1, presentation2, joinus
import spaces

# Checkpoint identifier handed to the `from_pretrained` loaders below.
model_path = "nvidia/Mistral-NeMo-Minitron-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Configuration object of the loaded model; rendered in the UI as Markdown.
config_info = model.config

# One Markdown bullet per configuration entry, with the key made readable
# (underscores become spaces, first letter upper-cased).
model_info_md = "### Model Configuration: Mistral-NeMo-Minitron-8B-Instruct\n\n" + "".join(
    f"- **{key.replace('_', ' ').capitalize()}**: {value}\n"
    for key, value in config_info.to_dict().items()
)

# Text-generation pipeline sharing the model and tokenizer loaded above.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def create_prompt(system_message, user_message, tool_definition="", context=""):
    """Build the raw single-turn prompt string for the model.

    The prompt consists of a ``<extra_id_0>System`` section, an optional
    ``<tool>`` / ``<context>`` part, a ``<extra_id_1>User`` section and a
    trailing ``<extra_id_1>Assistant`` header that the model continues from.

    Args:
        system_message: Text placed in the system section.
        user_message: Text placed in the user section.
        tool_definition: Tool description (e.g. JSON). When non-empty it is
            wrapped in ``<tool>...</tool>`` and followed by a ``<context>``
            block (possibly empty).
        context: Additional context. When given without a tool definition it
            is emitted in a ``<context>`` block on its own instead of being
            dropped.

    Returns:
        The formatted prompt string, ending with ``"<extra_id_1>Assistant\\n"``.
    """
    extra_sections = []
    if tool_definition:
        extra_sections.append(f"<tool>\n{tool_definition}\n</tool>")
    # A tool definition is always accompanied by a context block, even an
    # empty one; a context without a tool is still passed to the model.
    if tool_definition or context:
        extra_sections.append(f"<context>\n{context}\n</context>")

    system_block = f"<extra_id_0>System\n{system_message}\n\n"
    if extra_sections:
        system_block += "\n".join(extra_sections) + "\n\n"

    return f"{system_block}<extra_id_1>User\n{user_message}\n<extra_id_1>Assistant\n"

@spaces.GPU
def generate_response(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    use_pipeline=False,
    tool_definition="",
    context="",
):
    """Generate one assistant reply for ``message``.

    The prompt is built with ``create_prompt`` so that the system message,
    tool definition and context all reach the model, then completed either
    through the module-level ``pipe`` or by calling ``model.generate``
    directly. Both paths sample with the same settings and stop at the
    ``<extra_id_1>`` turn marker.

    Args:
        message: The user's message for this turn.
        history: Chat history. Accepted for interface compatibility; it is not
            used, so every call is answered as a single-turn conversation.
        system_message: System prompt text.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        use_pipeline: Generate through ``pipe`` instead of ``model.generate``.
        tool_definition: Tool description (JSON text). Blank input or ``"{}"``
            is treated as "no tool".
        context: Extra context forwarded to ``create_prompt``.

    Returns:
        The assistant's reply as a string. When a tool was defined and the
        reply contains a ``<toolcall>`` block, a note describing the simulated
        tool call is appended.
    """
    stop_marker = "<extra_id_1>"

    # "{}" is an empty JSON object and declares no tool; normalise it (and
    # blank or None input) so no empty <tool> block is sent to the model.
    tool_definition = (tool_definition or "").strip()
    if tool_definition == "{}":
        tool_definition = ""

    full_prompt = create_prompt(system_message, message, tool_definition, context or "")
    max_new_tokens = int(max_tokens)

    if use_pipeline:
        # A plain string prompt yields a string result; return_full_text=False
        # keeps the prompt out of it.
        outputs = pipe(
            full_prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,  # temperature / top_p only take effect when sampling
            temperature=temperature,
            top_p=top_p,
            stop_strings=[stop_marker],
            return_full_text=False,
        )
        response = outputs[0]["generated_text"]
    else:
        # Tokenising the prompt directly returns a mapping with input_ids and
        # attention_mask; move it to wherever the model currently lives.
        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            output_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                stop_strings=[stop_marker],
                tokenizer=tokenizer,  # generate() needs it to match stop_strings
            )

        # generate() returns prompt + completion; decode only the new tokens.
        response = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    # Defensive: if the prompt is echoed anyway, keep only what follows the
    # last assistant header.
    assistant_response = response.split("<extra_id_1>Assistant\n")[-1]
    # The stop string may still be present in the decoded text (presumably
    # when it is not registered as a special token); cut the reply there.
    assistant_response = assistant_response.split(stop_marker)[0].strip()

    if tool_definition and "<toolcall>" in assistant_response:
        tool_call = assistant_response.split("<toolcall>")[1].split("</toolcall>")[0]
        assistant_response += f"\n\nTool Call: {tool_call}\n\nNote: This is a simulated tool call. In a real scenario, the tool would be executed and its output would be used to generate a final response."

    return assistant_response

# Gradio UI: header rows, an info row (presentation text + model config),
# and the chat area with its advanced settings.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(title)
    with gr.Row():
        gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(presentation1)
        # Both columns belong to the same row so they render side by side.
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown(model_info_md)
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="🤖 Mistral-NeMo", height=400)
            msg = gr.Textbox(label="User Input", placeholder="Ask a question or request a task...")
            with gr.Accordion(label="🧪Advanced Settings", open=False):
                system_message = gr.Textbox(
                    label="System Message",
                    value="You are a helpful AI assistant.",
                    lines=2,
                    placeholder="Set the AI's behavior and context..."
                )
                context = gr.Textbox(
                    label="Context",
                    lines=2,
                    placeholder="Enter additional context information..."
                )
                max_tokens = gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens")
                temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
                use_pipeline = gr.Checkbox(label="Use Pipeline", value=False)
                use_tool = gr.Checkbox(label="Use Function Calling", value=False)
                # Hidden until "Use Function Calling" is ticked (see use_tool.change below).
                with gr.Column(visible=False) as tool_options:
                    tool_definition = gr.Code(
                        label="Tool Definition (JSON)",
                        value="{}",
                        lines=15,
                        language="json"
                    )
            with gr.Row():
                clear = gr.Button("Clear")
                send = gr.Button("Send")

    def user(user_message, history):
        """Append the user's message to the chat and clear the input box.

        Returns the new textbox value (empty) and the history extended with a
        ``[user_message, None]`` pair whose reply slot is filled in by ``bot``.
        """
        # Tolerate a missing history (e.g. right after the chat was cleared).
        return "", (history or []) + [[user_message, None]]

    def bot(history, system_message, max_tokens, temperature, top_p, use_pipeline, use_tool, tool_definition, context):
        """Generate the reply for the last user message and store it in ``history``."""
        user_message = history[-1][0]
        # The editor keeps its content while hidden; only forward the tool
        # definition when function calling is actually enabled.
        active_tool_definition = tool_definition if use_tool else ""
        bot_message = generate_response(
            user_message,
            history,
            system_message,
            max_tokens,
            temperature,
            top_p,
            use_pipeline,
            active_tool_definition,
            context,
        )
        history[-1][1] = bot_message
        return history

    # Inputs shared by both triggers (Enter in the textbox and the Send button).
    bot_inputs = [
        chatbot,
        system_message,
        max_tokens,
        temperature,
        top_p,
        use_pipeline,
        use_tool,
        tool_definition,
        context,
    ]

    # First echo the user message immediately (unqueued), then generate the reply.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, bot_inputs, chatbot
    )
    send.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, bot_inputs, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

    # Show or hide the tool-definition editor together with the checkbox.
    use_tool.change(
        fn=lambda x: gr.update(visible=x),
        inputs=[use_tool],
        outputs=[tool_options]
    )

if __name__ == "__main__":
    # `demo.queue` without parentheses only looks the method up and does
    # nothing; it has to be called to set up request queuing before launch.
    demo.queue()
    demo.launch()