import os
from typing import Generator, Optional
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference at no cost.
SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry,
focused on advancing AI reasoning capabilities.
## This Space builds on Lyte/LLaMA-O1-Supervised-1129-GGUF; many thanks!
**To start a new chat**, click "Clear" and begin a new dialog.
'''
LICENSE = """
--- MIT License ---
"""
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
class OptimizedLLMInterface:
_model_instance = None # Class-level model instance for singleton pattern
def __init__(
self,
model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
):
"""Initialize optimized LLM interface with aggressive performance settings"""
# Only create model instance once
if OptimizedLLMInterface._model_instance is None:
OptimizedLLMInterface._model_instance = Llama(
model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
n_ctx=512, # Reduced context size for speed
n_threads=4, # Fixed thread count
n_batch=32, # Smaller batch size for faster responses
logits_all=False,
embedding=False,
                seed=-1,  # -1 picks a random seed on each run
verbose=False, # Disable logging
offload_kqv=True,
)
self.model = OptimizedLLMInterface._model_instance
# Pre-compute template parts
template_parts = template.split("{content}")
        # special=True lets the template's control tokens be parsed as tokens;
        # add_bos=False keeps a second BOS token out of the middle of the prompt
        self._prefix_tokens = self.model.tokenize(template_parts[0].encode(), special=True)
        self._suffix_tokens = self.model.tokenize(template_parts[1].encode(), add_bos=False, special=True)
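        # With the template halves cached above, each request only tokenizes
        # the user's message; the fixed scaffolding is tokenized exactly once.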
def generate_response(
self,
message: str,
history: Optional[list] = None,
        max_tokens: int = 256,  # Cap on newly generated tokens (enforced below)
temperature: float = 0.7,
top_p: float = 0.95,
) -> Generator[str, None, None]:
"""Optimized response generation"""
# Fast token combination
message_tokens = self.model.tokenize(message.encode())
input_tokens = []
input_tokens.extend(self._prefix_tokens)
input_tokens.extend(message_tokens)
input_tokens.extend(self._suffix_tokens)
        # Detokenize in small batches and yield the cumulative output,
        # stopping at EOS or once the max_tokens budget is spent
        output = ""
        batch = []
        batch_size = 8  # Tokens per detokenize/yield step
        eos_token = self.model.token_eos()
        tokens_generated = 0
        for token in self.model.generate(
            input_tokens,
            top_p=top_p,
            temp=temperature,
            top_k=1,  # top_k=1 is effectively greedy decoding (temperature/top_p have little effect)
            repeat_penalty=1.0,  # No repetition penalty
        ):
            if token == eos_token:
                break
            batch.append(token)
            if len(batch) >= batch_size:
                output += self.model.detokenize(batch).decode('utf-8', errors='ignore')
                yield output
                batch = []
            tokens_generated += 1
            if tokens_generated >= max_tokens:
                break
# Handle remaining tokens
if batch:
text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
output += text
yield output
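# Minimal usage sketch (assumes the GGUF file downloads successfully and fits
# in memory; the prompt is illustrative):
#
#   llm = OptimizedLLMInterface()
#   for partial in llm.generate_response("How many r's are in 'strawberry'?"):
#       print(partial)
#
# Each yielded value is the full response so far, which is the cumulative
# streaming format gr.ChatInterface expects from a generator function.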
def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
"""Create the Gradio interface"""
with gr.Blocks() as demo:
gr.Markdown(DESCRIPTION)
        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                ['Find the least odd prime factor of $2019^8+1$.'],
            ],
            cache_examples=True,  # Pre-compute example answers at startup
            fill_height=True,
            # Wire the sliders into generate_response; additional_inputs are
            # passed positionally after (message, history), so their order
            # must match (max_tokens, temperature, top_p)
            additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
            additional_inputs=[
                gr.Slider(minimum=128, maximum=2048, value=256, step=128, label="Max Tokens"),
                gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
                gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
            ],
        )
gr.Markdown(LICENSE)
return demo
def main():
# Initialize with performance settings
llm = OptimizedLLMInterface()
# Create and launch the demo with minimal overhead
demo = create_demo(llm)
demo.queue(max_size=10)
demo.launch(
quiet=True,
)
if __name__ == "__main__":
    main()