Spaces:

SimpleBerry
/

LLaMA-O1-Supervised-1129-Demo

Running

File size: 5,223 Bytes

import codecs
import os
from typing import Iterator, List, Optional, Tuple, Union

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

class LlamaAssistant:
    """Load a GGUF model via llama_cpp and stream replies to single prompts.

    The model file is fetched with ``hf_hub_download`` using the ``repo_id``
    and ``model_file`` entries of ``model_config``. Each user message is
    wrapped in ``self.template`` before being tokenized.
    """

    # Sampling settings used for any key the caller's "generate_cfg" omits.
    DEFAULT_GENERATE_CFG = {
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.95,
    }

    def __init__(self, model_config: dict):
        """Download the model file and prepare the prompt template.

        Args:
            model_config: May contain ``repo_id``, ``model_file`` and
                ``generate_cfg`` (a dict with ``max_tokens``,
                ``temperature`` and ``top_p``). Missing entries fall back
                to built-in defaults.
        """
        repo_id = model_config.get("repo_id", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF")
        filename = model_config.get("model_file", "llama-o1-supervised-1129-q4_k_m.gguf")
        self.model = Llama(
            model_path=hf_hub_download(repo_id=repo_id, filename=filename)
        )
        self.template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
        # Merge over the defaults so a partial "generate_cfg" cannot cause a
        # KeyError later when `generate` looks up a sampling setting.
        self.generate_cfg = {
            **self.DEFAULT_GENERATE_CFG,
            **(model_config.get("generate_cfg") or {}),
        }

    def _format_prompt(self, message: str) -> str:
        """Return ``message`` inserted into the prompt template."""
        return self.template.format(content=message)

    def generate(
        self,
        message: str,
        history: Optional[List[Tuple[str, str]]] = None,
    ) -> Iterator[str]:
        """Stream the model's reply to ``message``.

        Args:
            message: The user's text; it is placed into the prompt template.
            history: Accepted so the method can be used as a chat callback,
                but not read: every call builds its prompt from ``message``
                alone.

        Yields:
            The reply accumulated so far, once per generated token.
            Generation stops at the model's end-of-sequence token or after
            ``generate_cfg["max_tokens"]`` tokens (no limit if that value
            is ``None``).
        """
        prompt_tokens = self.model.tokenize(
            self._format_prompt(message).encode('utf-8')
        )
        max_tokens = self.generate_cfg.get("max_tokens")
        eos_token = self.model.token_eos()

        # A multi-byte character can be split across tokens, so decode
        # incrementally instead of decoding each token's bytes on its own.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")

        # NOTE(review): `self.model` is shared by every call; confirm whether
        # llama_cpp tolerates overlapping `generate` calls before serving
        # requests concurrently.
        token_stream = self.model.generate(
            prompt_tokens,
            top_p=self.generate_cfg["top_p"],
            temp=self.generate_cfg["temperature"],
        )

        response = ""
        # llama_cpp's low-level `generate` keeps sampling for as long as it is
        # iterated, so the stop conditions have to be enforced here.
        for produced, token in enumerate(token_stream, start=1):
            if token == eos_token:
                break
            response += decoder.decode(self.model.detokenize([token]))
            yield response
            if max_tokens is not None and produced >= max_tokens:
                break

        # Flush bytes still buffered from an incomplete trailing character.
        tail = decoder.decode(b"", final=True)
        if tail:
            response += tail
            yield response

class WebUI:
    def __init__(self, assistant: LlamaAssistant, config: dict = None):
        self.assistant = assistant
        self.config = config or {}
        
    def create_interface(self):
        with gr.Blocks() as demo:
            gr.Markdown(self.config.get("description", """
            # LLaMA-O1-Supervised-1129 Demo
            An experimental research model focused on advancing AI reasoning capabilities.
            
            **To start a new chat**, click "clear" and start a new dialog.
            """))

            chatbot = gr.ChatInterface(
                self.assistant.generate,
                title=self.config.get("title", "LLaMA-O1-Supervised-1129 | Demo"),
                description=self.config.get("description", "Edit Settings below if needed."),
                examples=self.config.get("examples", [
                    ["How many r's are in the word strawberry?"],
                    ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                    ['Find the least odd prime factor of $2019^8+1$.'],
                ]),
                cache_examples=False,
                fill_height=True
            )

            with gr.Accordion("Adjust Parameters", open=False):
                gr.Slider(
                    minimum=128,
                    maximum=8192,
                    value=self.assistant.generate_cfg["max_tokens"],
                    step=1,
                    label="Max Tokens"
                )
                gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=self.assistant.generate_cfg["temperature"],
                    step=0.1,
                    label="Temperature"
                )
                gr.Slider(
                    minimum=0.05,
                    maximum=1.0,
                    value=self.assistant.generate_cfg["top_p"],
                    step=0.01,
                    label="Top-p (nucleus sampling)"
                )

            gr.Markdown(self.config.get("license", "--- MIT License ---"))

        return demo

    def run(self, **kwargs):
        demo = self.create_interface()
        demo.launch(**kwargs)

def app_gui():
    """Assemble the model and UI settings, then start the web demo.

    The model repository, model file, temperature and top-p are taken from
    the environment variables REPO_ID, MODEL_FILE, T and P respectively,
    each with a built-in fallback. The UI is then run with
    ``concurrency_limit=80``.
    """
    env = os.environ

    # Sampling settings; T and P arrive as strings when set, hence float().
    sampling = {
        "max_tokens": 512,
        "temperature": float(env.get("T", 0.7)),
        "top_p": float(env.get("P", 0.95)),
    }
    model_config = {
        "repo_id": env.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
        "model_file": env.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
        "generate_cfg": sampling,
    }

    # Example prompts offered in the chat interface.
    example_prompts = [
        ["How many r's are in the word strawberry?"],
        ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
        ['Find the least odd prime factor of $2019^8+1$.'],
    ]
    ui_config = {
        "title": "LLaMA-O1-Supervised-1129 | Demo",
        "description": "LLaMA-O1-Supervised-1129 is an experimental research model focused on advancing AI reasoning capabilities.",
        "examples": example_prompts,
        "license": "--- MIT License ---",
    }

    assistant = LlamaAssistant(model_config)
    web_ui = WebUI(assistant, ui_config)
    web_ui.run(concurrency_limit=80)

# Start the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    app_gui()