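"""Gradio chat demo for SimpleBerry/LLaMA-O1-Supervised-1129, served from a
Q4_K_M GGUF quantization with llama-cpp-python."""
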
import os
from typing import Generator, Optional
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# UI text and prompt template kept from the original Space
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference, free of charge.
SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry,  
focused on advancing AI reasoning capabilities.  

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF. Many thanks!

**To start a new chat**, click "clear" and begin a new dialog.
'''

LICENSE = """
--- MIT License ---
"""

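# Prompt template expected by the model. The <start_of_*>/<end_of_*> markers
# appear to encode a tree of reasoning steps (each "thought" node carries a
# parent id, a local id, and a rating); the trailing <expansion> tag cues the
# model to continue expanding the reasoning from the problem statement.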
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"

class OptimizedLLMInterface:
    def __init__(
        self,
        model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
        context_size: int = 32768,
        num_threads: int = 8,
    ):
        """Download the GGUF weights (cached by huggingface_hub) and load the model."""
        self.model = Llama(
            model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
            n_ctx=context_size,
            n_threads=num_threads,
            n_batch=512,       # Larger batch size for better CPU utilization
            logits_all=False,  # Only compute logits for the last token
            embedding=False,   # Text generation only; no embedding mode
            offload_kqv=True   # Offload the KV cache to the GPU when one is available
        )

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 512,
        temperature: float = 0.9,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Stream a response, stopping at EOS or the max-token budget."""
        # The prompt template is single-turn, so `history` (supplied by
        # gr.ChatInterface) is accepted but not folded into the prompt.
        input_text = template.format(content=message)
        input_tokens = self.model.tokenize(input_text.encode('utf-8'))

        output = b""
        generated = 0
        for token in self.model.generate(
            input_tokens,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=1.1
        ):
            if token == self.model.token_eos():
                break
            output += self.model.detokenize([token])
            generated += 1
            # Decode the accumulated bytes each step so multi-byte UTF-8
            # sequences split across token boundaries don't raise mid-stream.
            yield output.decode('utf-8', errors='ignore')
            if generated >= max_tokens:
                break

def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    """Create the Gradio interface"""
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        chatbot = gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Adjust generation settings in the accordion below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                ['Find the least odd prime factor of $2019^8+1$.'],
            ],
            cache_examples=False,
            fill_height=True,
            # Wire the sliders into generate_response: ChatInterface passes
            # additional_inputs after `message` and `history`, matching the
            # max_tokens / temperature / top_p parameters above.
            additional_inputs=[
                gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens"),
                gr.Slider(minimum=0.1, maximum=1.5, value=0.9, step=0.1, label="Temperature"),
                gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)"),
            ],
        )

        gr.Markdown(LICENSE)
    
    return demo

def main():
    # Initialize the optimized LLM interface
    llm = OptimizedLLMInterface(
        num_threads=os.cpu_count() or 8  # Automatically use available CPU cores
    )
    
    # Create and launch the demo
    demo = create_demo(llm)
    demo.queue(max_size=10)  # Limit queue size to prevent overload
    demo.launch(quiet=True)

if __name__ == "__main__":
    main()
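
# To try this locally (assuming the file is saved as app.py, the usual entry
# point for a Hugging Face Space):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py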