Spaces:

SimpleBerry
/

LLaMA-O1-Supervised-1129-Demo

Running

File size: 5,212 Bytes

fc46f2c
4d7e82f
fc46f2c
ee950e1
 
fc46f2c
4d7e82f
 
 
 
b377b1e
4d7e82f
b377b1e
4d7e82f
 
b377b1e
4d7e82f
 
 
b377b1e
4d7e82f
b377b1e
4d7e82f
43b7c77
 
4d7e82f
 
 
 
 
43b7c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b377b1e
4d7e82f
 
 
 
43b7c77
 
4d7e82f
 
43b7c77
 
 
 
 
 
 
 
 
 
 
 
4d7e82f
 
 
 
 
43b7c77
 
4d7e82f
43b7c77
 
 
 
 
 
 
 
 
 
 
 
b377b1e
4d7e82f
 
 
 
b377b1e
4d7e82f
 
 
 
 
 
 
 
 
43b7c77
4d7e82f
 
b377b1e
4d7e82f
43b7c77
 
 
b377b1e
4d7e82f
 
 
ee950e1
4d7e82f
43b7c77
 
4d7e82f
43b7c77
4d7e82f
43b7c77
 
 
 
ee950e1
4d7e82f

import os
from typing import Generator, Optional
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Markdown rendered at the top of the page. Two trailing spaces before a
# newline are intentional: they are Markdown hard line breaks.
DESCRIPTION = (
    "\n"
    "# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.\n"
    "SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.  \n"
    "Focused on advancing AI reasoning capabilities.  \n"
    "\n"
    "## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!\n"
    "\n"
    '**To start a new chat**, click "clear" and start a new dialog.\n'
)

# Markdown footer rendered below the chat widget.
LICENSE = "\n--- MIT License ---\n"

# Prompt skeleton. `{content}` marks where the user's message is inserted;
# the text before and after it is tokenized separately by the interface.
template = (
    "<start_of_father_id>-1<end_of_father_id>"
    "<start_of_local_id>0<end_of_local_id>"
    "<start_of_thought><problem>{content}<end_of_thought>"
    "<start_of_rating><positive_rating><end_of_rating>\n"
    "<start_of_father_id>0<end_of_father_id>"
    "<start_of_local_id>1<end_of_local_id>"
    "<start_of_thought><expansion>"
)

class OptimizedLLMInterface:
    """Streaming text generator backed by one shared llama.cpp model.

    The GGUF model is loaded at most once per process and cached on the
    class; every instance reuses it. The prompt is built from the
    module-level ``template`` by placing the user's message where the
    ``{content}`` placeholder sits.
    """

    _model_instance = None  # Class-level model instance for singleton pattern

    def __init__(
        self,
        model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
    ):
        """Load (or reuse) the model and pre-tokenize the fixed prompt parts.

        Args:
            model_repo_id: Hugging Face Hub repository holding the GGUF file.
            model_filename: Name of the GGUF file inside that repository.

        Note:
            Both arguments are only consulted when no model has been loaded
            yet; later instances reuse the cached model regardless of them.
        """
        # Only create model instance once
        if OptimizedLLMInterface._model_instance is None:
            OptimizedLLMInterface._model_instance = Llama(
                model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
                n_ctx=512,  # Reduced context size for speed
                n_threads=4,  # Fixed thread count
                n_batch=32,  # Smaller batch size for faster responses
                logits_all=False,
                embedding=False,
                seed=-1,  # -1 lets llama.cpp pick a random seed
                verbose=False,  # Disable logging
                offload_kqv=True,
            )
        self.model = OptimizedLLMInterface._model_instance

        # Pre-compute template parts
        template_parts = template.split("{content}")
        self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
        # tokenize() prepends BOS by default; only the very first prompt
        # segment may carry it, so it is disabled for the suffix.
        self._suffix_tokens = self.model.tokenize(
            template_parts[1].encode(), add_bos=False
        )

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 256,  # Reduced max tokens
        temperature: float = 0.7,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Stream the model's answer to ``message``.

        Args:
            message: The user's question, inserted into the prompt template.
            history: Unused by this method; accepted so the method matches
                the ``(message, history, ...)`` chat-callback shape.
            max_tokens: Upper bound on the number of generated tokens. The
                effective bound is also limited by the free context space.
            temperature: Sampling temperature forwarded to the model.
            top_p: Nucleus-sampling threshold forwarded to the model.

        Yields:
            The full response text accumulated so far (not just the newest
            piece), refreshed every few tokens and once more at the end.

        Raises:
            ValueError: If the prompt alone already fills the context window.
        """
        # The prefix already starts with BOS, so the message must not add one.
        input_tokens = [
            *self._prefix_tokens,
            *self.model.tokenize(message.encode(), add_bos=False),
            *self._suffix_tokens,
        ]

        # Never ask for more tokens than the context window can still hold.
        free_context = self.model.n_ctx() - len(input_tokens)
        if free_context <= 0:
            raise ValueError(
                "Prompt is too long for the model context window "
                f"({len(input_tokens)} tokens, limit {self.model.n_ctx()})."
            )
        token_budget = min(max_tokens, free_context)
        if token_budget <= 0:
            return

        eos_token = self.model.token_eos()
        batch_size = 8  # Detokenize and emit in small batches
        pending = []
        # Raw bytes of the whole response. Decoding the full buffer each time
        # keeps multi-byte UTF-8 characters that straddle a batch boundary
        # intact instead of dropping them.
        raw = b""
        produced = 0

        for token in self.model.generate(
            input_tokens,
            top_p=top_p,
            temp=temperature,
            # top_k=1 keeps only the most likely token, so decoding is
            # effectively greedy and temperature/top_p have little effect.
            top_k=1,  # Minimal sampling for speed
            repeat_penalty=1.0,  # Disable repeat penalty
        ):
            # Llama.generate() does not stop by itself at end-of-sequence.
            if token == eos_token:
                break
            pending.append(token)
            produced += 1
            budget_spent = produced >= token_budget
            if len(pending) >= batch_size or budget_spent:
                raw += self.model.detokenize(pending)
                pending = []
                yield raw.decode('utf-8', errors='ignore')
            if budget_spent:
                break

        # Handle remaining tokens
        if pending:
            raw += self.model.detokenize(pending)
            yield raw.decode('utf-8', errors='ignore')

def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    """Create the Gradio interface.

    The page shows the description, a chat widget driven by
    ``llm_interface.generate_response``, a collapsed "Adjust Parameters"
    accordion and the license footer.

    Args:
        llm_interface: Object whose ``generate_response`` method is used as
            the chat callback.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` app.
    """
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                ['Find the least odd prime factor of $2019^8+1$.'],
            ],
            cache_examples=True,  # Enable example caching
            fill_height=True,
            # The sliders must be handed to ChatInterface as additional
            # inputs; otherwise their values never reach the callback. They
            # are passed positionally after (message, history), i.e. as
            # max_tokens, temperature, top_p.
            # render=False defers rendering to ChatInterface, which places
            # the sliders inside the accordion below the chat.
            additional_inputs_accordion=gr.Accordion(
                label="Adjust Parameters", open=False, render=False
            ),
            additional_inputs=[
                gr.Slider(minimum=128, maximum=2048, value=256, step=128, label="Max Tokens", render=False),
                gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature", render=False),
                gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p", render=False),
            ],
        )

        gr.Markdown(LICENSE)

    return demo

def main():
    """Load the model, build the web UI and start serving it."""
    # Constructing the interface loads the model.
    interface = OptimizedLLMInterface()

    app = create_demo(interface)
    # Cap the number of queued requests at 10.
    app.queue(max_size=10)
    app.launch(quiet=True)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()