from typing import Generator, Optional

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference for free.
SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry,
focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF. Many thanks!

**To start a new chat**, click "Clear" and begin a new dialog.
'''

LICENSE = """
--- MIT License ---
"""

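# Prompt template for the model's reasoning-tree format (as the tags below
# suggest): each turn is a node with a parent ("father") id and a local id,
# the user's problem sits inside <start_of_thought><problem>...<end_of_thought>,
# and the model continues from the trailing <expansion> tag.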
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"

class OptimizedLLMInterface:
    _model_instance = None  # Singleton pattern

    def __init__(
        self,
        model_repo_id: str = "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF",
        model_filename: str = "LLaMA-O1-Supervised-1129-q2_k.gguf",
    ):
        if OptimizedLLMInterface._model_instance is None:
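            # hf_hub_download caches the file locally (under ~/.cache/huggingface
            # by default), so repeated launches skip the download.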
            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
            OptimizedLLMInterface._model_instance = Llama(
                model_path=model_path,
                n_ctx=256,             # Small context window, keeps prompt processing fast
                n_threads=4,           # Fixed CPU thread count
                n_batch=1,             # Single-token batches for low latency
                verbose=False,         # Disable llama.cpp logging
                seed=-1,               # -1 = use a random seed each run
                logits_all=False,      # Keep logits for the last token only
                embedding=False,       # Disable embedding mode
                tensor_split=None,     # No multi-GPU tensor splitting
                rope_freq_base=10000,  # Default RoPE settings
                rope_freq_scale=1.0,
                main_gpu=0,
            )
        self.model = OptimizedLLMInterface._model_instance
        
        # Pre-tokenize the fixed template halves once; each request then only
        # needs to tokenize the user's message.
        template_parts = template.split("{content}")
        self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
        self._suffix_tokens = self.model.tokenize(template_parts[1].encode())

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 128,    # Reduced max tokens
        temperature: float = 0.7,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        try:
            # Fast token preparation
            message_tokens = self.model.tokenize(message.encode())
            input_tokens = []
            input_tokens.extend(self._prefix_tokens)
            input_tokens.extend(message_tokens)
            input_tokens.extend(self._suffix_tokens)
            
            output = ""
            batch = []
            batch_size = 4  # Small batch size for faster responses
            
            for token in self.model.generate(
                input_tokens,
                top_p=top_p,
                temp=temperature,
                top_k=1,           # Minimal top_k
                repeat_penalty=1.0, # No repeat penalty
                mirostat_mode=0,   # Disable mirostat
                min_p=0.05,        # Allow more diversity
                typical_p=1.0,     # Disable typical sampling
                presence_penalty=0,
                frequency_penalty=0,
            ):
                batch.append(token)
                if len(batch) >= batch_size:
                    text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
                    output += text
                    yield output
                    batch = []
            
            if batch:
                text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
                output += text
                yield output
                
        except Exception as e:
            yield f"Error: {str(e)}"

def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        # additional_inputs wires the sliders to generate_response's
        # max_tokens, temperature, and top_p parameters; standalone sliders
        # rendered after the interface would never reach the handler.
        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
                ["Find the least odd prime factor of $2019^8+1$."],
            ],
            cache_examples=False,
            fill_height=True,
            additional_inputs=[
                gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens"),
                gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
                gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
            ],
            additional_inputs_accordion="Adjust Parameters",
        )

        gr.Markdown(LICENSE)
    
    return demo

def main():
    llm = OptimizedLLMInterface()
    demo = create_demo(llm)
    
    demo.launch(
        share=False,
        quiet=True
    )

if __name__ == "__main__":
    main()