from typing import Generator, Optional

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference, free of charge.

SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry, focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, many thanks!

**To start a new chat**, click "clear" and start a new dialog.
'''

LICENSE = """
---
MIT License
---
"""

# Prompt template built from the model's reasoning special tokens: the user
# question replaces {content} and the model continues from the <expansion> tag.
template = (
    "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id>"
    "<start_of_thought><problem>{content}<end_of_thought>"
    "<start_of_rating><positive_rating><end_of_rating>\n"
    "<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id>"
    "<start_of_thought><expansion>"
)


class OptimizedLLMInterface:
    # Shared across instances so the GGUF weights are loaded only once.
    _model_instance = None

    def __init__(
        self,
        model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
    ):
        if OptimizedLLMInterface._model_instance is None:
            OptimizedLLMInterface._model_instance = Llama(
                model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
                n_ctx=512,
                n_threads=4,
                n_batch=32,
                logits_all=False,
                embedding=False,
                seed=-1,
                verbose=False,
                offload_kqv=True,
            )
        self.model = OptimizedLLMInterface._model_instance

        # Pre-tokenize the fixed parts of the template so only the user message
        # needs tokenizing per request. special=True maps the template tags to
        # their special token ids; the suffix must not receive an extra BOS.
        template_parts = template.split("{content}")
        self._prefix_tokens = self.model.tokenize(template_parts[0].encode(), add_bos=True, special=True)
        self._suffix_tokens = self.model.tokenize(template_parts[1].encode(), add_bos=False, special=True)

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 256,
        temperature: float = 0.7,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        message_tokens = self.model.tokenize(message.encode(), add_bos=False, special=False)

        input_tokens = []
        input_tokens.extend(self._prefix_tokens)
        input_tokens.extend(message_tokens)
        input_tokens.extend(self._suffix_tokens)

        output = ""
        batch = []
        batch_size = 8  # detokenize in small chunks to reduce per-token overhead
        generated = 0
        eos_token = self.model.token_eos()

        try:
            # Llama.generate() has no max_tokens argument, so the token budget
            # and the end-of-sequence check are enforced manually in the loop.
            for token in self.model.generate(
                input_tokens,
                top_p=top_p,
                temp=temperature,
                top_k=1,  # top_k=1 makes decoding effectively greedy
                repeat_penalty=1.0,
            ):
                if token == eos_token:
                    break

                batch.append(token)
                generated += 1

                if len(batch) >= batch_size:
                    output += self.model.detokenize(batch).decode('utf-8', errors='ignore')
                    yield output
                    batch = []

                if generated >= max_tokens:
                    break

            # Flush any tokens left in the final partial batch.
            if batch:
                output += self.model.detokenize(batch).decode('utf-8', errors='ignore')
                yield output

        except Exception as e:
            yield f"Error: {str(e)}"


def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        # The sliders are created unrendered and passed as additional inputs so
        # their values are actually forwarded to generate_response; the
        # ChatInterface renders them inside the "Adjust Parameters" accordion.
        max_tokens = gr.Slider(minimum=128, maximum=2048, value=256, step=128, label="Max Tokens", render=False)
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature", render=False)
        top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p", render=False)

        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
                ["Find the least odd prime factor of $2019^8+1$."],
            ],
            cache_examples=False,  # avoid running generation at build time just to cache examples
            fill_height=True,
            additional_inputs=[max_tokens, temperature, top_p],
            additional_inputs_accordion="Adjust Parameters",
        )

        gr.Markdown(LICENSE)

    return demo


def main():
    llm = OptimizedLLMInterface()
    demo = create_demo(llm)
    demo.launch(quiet=True)


if __name__ == "__main__":
    main()