File size: 5,223 Bytes
b377b1e
fc46f2c
 
ee950e1
 
fc46f2c
b377b1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc46f2c
7271565
 
fc46f2c
b377b1e
 
ee950e1
b377b1e
 
 
ee950e1
b377b1e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
from typing import Iterator, List, Optional, Tuple, Union

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

class LlamaAssistant:
    """Chat backend that streams completions from a GGUF model via llama.cpp.

    The model file is fetched with ``hf_hub_download`` and loaded into a
    ``Llama`` instance. Every user message is wrapped in ``self.template``
    before it is tokenized and sampled.

    Attributes:
        model: The loaded ``Llama`` instance.
        template: Prompt template; ``{content}`` is replaced by the message.
        generate_cfg: Sampling settings with the keys ``max_tokens``,
            ``temperature`` and ``top_p``.
    """

    # Fallback sampling settings; keys supplied by the caller override these.
    DEFAULT_GENERATE_CFG = {
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.95,
    }

    def __init__(self, model_config: dict):
        """Download the model file and set up prompt/sampling configuration.

        Args:
            model_config: Optional keys are ``repo_id`` and ``model_file``
                (which file to download from which Hub repository) and
                ``generate_cfg`` (a dict of sampling settings; missing keys
                fall back to ``DEFAULT_GENERATE_CFG``).
        """
        self.model = Llama(
            model_path=hf_hub_download(
                repo_id=model_config.get("repo_id", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
                filename=model_config.get("model_file", "llama-o1-supervised-1129-q4_k_m.gguf"),
            )
        )
        self.template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
        # Merge over the defaults so a partial generate_cfg cannot cause a
        # KeyError later when generate() looks up a missing setting.
        self.generate_cfg = {
            **self.DEFAULT_GENERATE_CFG,
            **(model_config.get("generate_cfg") or {}),
        }

    def _format_prompt(self, message: str) -> str:
        """Return ``message`` embedded in the model's prompt template."""
        return self.template.format(content=message)

    def generate(
        self,
        message: str,
        history: Optional[List[Tuple[str, str]]] = None,
    ) -> Iterator[str]:
        """Stream the model's reply to ``message``.

        Args:
            message: The user's message; it is inserted into the template.
            history: Earlier chat turns. Accepted so the method can be used
                as a chat callback, but not used when building the prompt.

        Yields:
            The reply accumulated so far, once per generated token.
            Generation stops at the model's end-of-sequence token or after
            ``generate_cfg["max_tokens"]`` tokens, whichever comes first.
            A ``max_tokens`` of ``None`` or a non-positive value disables
            the cap.
        """
        cfg = self.generate_cfg
        max_tokens = cfg.get("max_tokens")
        capped = max_tokens is not None and max_tokens > 0

        prompt_tokens = self.model.tokenize(
            self._format_prompt(message).encode("utf-8")
        )
        eos_token = self.model.token_eos()

        # Collect raw bytes rather than decoding token by token: one token
        # may carry only part of a multi-byte UTF-8 character, and decoding
        # that fragment on its own would raise UnicodeDecodeError.
        raw = bytearray()
        token_stream = self.model.generate(
            prompt_tokens,
            top_p=cfg["top_p"],
            temp=cfg["temperature"],
        )
        for produced, token in enumerate(token_stream, start=1):
            if token == eos_token:
                break
            raw += self.model.detokenize([token])
            # errors="ignore" hides a still-incomplete trailing character
            # until the bytes that complete it have arrived.
            yield raw.decode("utf-8", errors="ignore")
            if capped and produced >= max_tokens:
                break

class WebUI:
    """Gradio front end for a streaming chat assistant.

    Attributes:
        assistant: Object exposing ``generate(message, history)`` (a
            generator of partial replies) and a ``generate_cfg`` dict.
        config: UI settings; recognised keys are ``title``,
            ``description``, ``examples`` and ``license``.
    """

    def __init__(self, assistant: "LlamaAssistant", config: Optional[dict] = None):
        """Store the assistant and the (optional) UI configuration."""
        self.assistant = assistant
        self.config = config or {}

    def create_interface(self):
        """Build the demo without launching it.

        The three sampling sliders are registered as additional inputs of
        the chat interface, so their current values are passed along with
        every message and actually influence generation.

        Returns:
            The assembled ``gr.Blocks`` object.
        """
        cfg = self.assistant.generate_cfg

        def respond(message, history, max_tokens, temperature, top_p):
            # The assistant reads its sampling settings from generate_cfg,
            # so write this request's slider values there before
            # delegating. NOTE: generate_cfg is a single dict on the
            # assistant, i.e. it is shared by every session.
            self.assistant.generate_cfg.update(
                max_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
            )
            yield from self.assistant.generate(message, history)

        with gr.Blocks() as demo:
            gr.Markdown(self.config.get("description", """
            # LLaMA-O1-Supervised-1129 Demo
            An experimental research model focused on advancing AI reasoning capabilities.
            
            **To start a new chat**, click "clear" and start a new dialog.
            """))

            # render=False: the chat interface places these inside its own
            # accordion instead of rendering them here.
            max_tokens_slider = gr.Slider(
                minimum=128,
                maximum=8192,
                value=cfg.get("max_tokens", 512),
                step=1,
                label="Max Tokens",
                render=False,
            )
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=cfg.get("temperature", 0.7),
                step=0.1,
                label="Temperature",
                render=False,
            )
            top_p_slider = gr.Slider(
                minimum=0.05,
                maximum=1.0,
                value=cfg.get("top_p", 0.95),
                step=0.01,
                label="Top-p (nucleus sampling)",
                render=False,
            )

            gr.ChatInterface(
                respond,
                title=self.config.get("title", "LLaMA-O1-Supervised-1129 | Demo"),
                description=self.config.get("description", "Edit Settings below if needed."),
                examples=self.config.get("examples", [
                    ["How many r's are in the word strawberry?"],
                    ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                    ['Find the least odd prime factor of $2019^8+1$.'],
                ]),
                cache_examples=False,
                fill_height=True,
                additional_inputs=[
                    max_tokens_slider,
                    temperature_slider,
                    top_p_slider,
                ],
                additional_inputs_accordion=gr.Accordion(
                    label="Adjust Parameters",
                    open=False,
                    render=False,
                ),
            )

            gr.Markdown(self.config.get("license", "--- MIT License ---"))

        return demo

    def run(self, **kwargs):
        """Build the interface and launch it.

        Args:
            **kwargs: Forwarded to ``launch()``. The one exception is
                ``concurrency_limit``: ``launch()`` has no such parameter,
                so it is applied through
                ``queue(default_concurrency_limit=...)`` instead of being
                forwarded (which would raise ``TypeError``).
        """
        demo = self.create_interface()
        concurrency_limit = kwargs.pop("concurrency_limit", None)
        if concurrency_limit is not None:
            demo = demo.queue(default_concurrency_limit=concurrency_limit)
        demo.launch(**kwargs)

def app_gui():
    """Assemble the model and UI settings, then start the web demo.

    Environment variables ``REPO_ID``, ``MODEL_FILE``, ``T`` (temperature)
    and ``P`` (top-p) override the built-in defaults.
    """
    env = os.environ

    # Sampling settings handed to the assistant.
    sampling = {
        "max_tokens": 512,
        "temperature": float(env.get("T", 0.7)),
        "top_p": float(env.get("P", 0.95)),
    }

    # Which model file to fetch, plus how to sample from it.
    model_config = dict(
        repo_id=env.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
        model_file=env.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
        generate_cfg=sampling,
    )

    # Example prompts offered in the chat window.
    example_prompts = [
        ["How many r's are in the word strawberry?"],
        ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
        ['Find the least odd prime factor of $2019^8+1$.'],
    ]

    # Texts shown by the web page.
    ui_config = dict(
        title="LLaMA-O1-Supervised-1129 | Demo",
        description="LLaMA-O1-Supervised-1129 is an experimental research model focused on advancing AI reasoning capabilities.",
        examples=example_prompts,
        license="--- MIT License ---",
    )

    # Load the model, wrap it in the web interface and serve it.
    web_ui = WebUI(LlamaAssistant(model_config), ui_config)
    web_ui.run(concurrency_limit=80)

# Start the demo only when this file is executed as a script.
if __name__ == "__main__":
    app_gui()