from typing import Generator, Optional

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.

SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry, focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF. Many thanks!

**To start a new chat**, click "clear" and start a new dialog.
'''

LICENSE = """
--- MIT License ---
"""

# Prompt template applied to each user message; "{content}" is replaced
# with the message text before tokenization.
template = "-10{content}\n01"


class OptimizedLLMInterface:
    _model_instance = None  # Singleton: one shared Llama instance per process

    def __init__(
        self,
        model_repo_id: str = "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF",
        model_filename: str = "LLaMA-O1-Supervised-1129-q2_k.gguf",
    ):
        if OptimizedLLMInterface._model_instance is None:
            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
            OptimizedLLMInterface._model_instance = Llama(
                model_path=model_path,
                n_ctx=256,             # Minimal context window for speed
                n_threads=4,           # Fixed thread count
                n_batch=1,             # Single-token batches for low latency
                verbose=False,         # Disable logging
                seed=-1,               # -1 picks a random seed
                logits_all=False,      # Only compute logits for the last token
                embedding=False,       # Disable embeddings
                tensor_split=None,     # No tensor splitting
                rope_freq_base=10000,  # Default RoPE settings
                rope_freq_scale=1.0,
                main_gpu=0,
            )
        self.model = OptimizedLLMInterface._model_instance

        # Pre-tokenize the static template parts so each request only has to
        # tokenize the user message. Only the prefix should carry a BOS token.
        template_parts = template.split("{content}")
        self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
        self._suffix_tokens = self.model.tokenize(template_parts[1].encode(), add_bos=False)

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 128,
        temperature: float = 0.7,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Stream the model's reply, yielding the accumulated text so far."""
        try:
            # Assemble the prompt from the pre-tokenized template parts.
            message_tokens = self.model.tokenize(message.encode(), add_bos=False)
            input_tokens = []
            input_tokens.extend(self._prefix_tokens)
            input_tokens.extend(message_tokens)
            input_tokens.extend(self._suffix_tokens)

            output = ""
            batch = []
            batch_size = 4  # Detokenize in small batches for faster streaming
            eos_token = self.model.token_eos()
            generated = 0

            for token in self.model.generate(
                input_tokens,
                top_p=top_p,
                temp=temperature,
                top_k=40,            # Standard top-k; 1 would make sampling greedy
                repeat_penalty=1.0,  # No repeat penalty
                mirostat_mode=0,     # Disable mirostat
                min_p=0.05,          # Allow more diversity
                typical_p=1.0,       # Disable typical sampling
                presence_penalty=0,
                frequency_penalty=0,
            ):
                # Stop at end-of-sequence or once the token budget is spent.
                if token == eos_token:
                    break
                batch.append(token)
                generated += 1
                if len(batch) >= batch_size:
                    output += self.model.detokenize(batch).decode('utf-8', errors='ignore')
                    yield output
                    batch = []
                if generated >= max_tokens:
                    break

            # Flush any tokens left in the final partial batch.
            if batch:
                output += self.model.detokenize(batch).decode('utf-8', errors='ignore')
                yield output
        except Exception as e:
            yield f"Error: {str(e)}"


def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
                ["Find the least odd prime factor of $2019^8+1$."],
            ],
            cache_examples=False,
            fill_height=True,
            # Wire the sliders to generate_response's extra parameters,
            # in the same order as the function signature.
            additional_inputs=[
                gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens"),
                gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
                gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
            ],
            additional_inputs_accordion=gr.Accordion(label="Adjust Parameters", open=False, render=False),
        )

        gr.Markdown(LICENSE)

    return demo


def main():
    llm = OptimizedLLMInterface()
    demo = create_demo(llm)
    demo.launch(
        share=False,
        quiet=True,
    )


if __name__ == "__main__":
    main()
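
# Usage sketch (the file name app.py is an assumption; package names follow
# the imports above):
#
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
#
# The first run downloads the GGUF weights from the Hugging Face Hub via
# hf_hub_download and caches them locally; later runs reuse the cache.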