File size: 5,499 Bytes
fc46f2c 4d7e82f fc46f2c 93fb83c ee950e1 fc46f2c 4d7e82f b377b1e 4d7e82f b377b1e 4d7e82f b377b1e 4d7e82f b377b1e 4d7e82f b377b1e 4d7e82f 93fb83c 43b7c77 4d7e82f 037da0c 4d7e82f 43b7c77 93fb83c 43b7c77 93fb83c 43b7c77 93fb83c 43b7c77 b377b1e 4d7e82f 93fb83c 43b7c77 4d7e82f f5f11cd 93fb83c f5f11cd 93fb83c f5f11cd 43b7c77 f5f11cd b377b1e 4d7e82f b377b1e 4d7e82f 93fb83c 4d7e82f b377b1e 4d7e82f 93fb83c 43b7c77 b377b1e 4d7e82f ee950e1 4d7e82f 43b7c77 4d7e82f f5f11cd 43b7c77 93fb83c f5f11cd 43b7c77 ee950e1 4d7e82f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import os
from typing import Generator, Optional
import gradio as gr
from llama_cpp import Llama, LlamaGrammar
from huggingface_hub import hf_hub_download
# Markdown shown at the top of the page, above the chat widget.
DESCRIPTION = """
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
Focused on advancing AI reasoning capabilities.
## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
**To start a new chat**, click "clear" and start a new dialog.
"""

# Markdown shown at the bottom of the page.
LICENSE = "\n--- MIT License ---\n"

# Prompt skeleton: the user's question replaces ``{content}``.  The first line
# holds the problem node (father id -1, local id 0); the second line opens an
# expansion node (father id 0, local id 1) that the model is left to complete.
template = (
    "<start_of_father_id>-1<end_of_father_id>"
    "<start_of_local_id>0<end_of_local_id>"
    "<start_of_thought><problem>{content}<end_of_thought>"
    "<start_of_rating><positive_rating><end_of_rating>\n"
    "<start_of_father_id>0<end_of_father_id>"
    "<start_of_local_id>1<end_of_local_id>"
    "<start_of_thought><expansion>"
)
class OptimizedLLMInterface:
    """Streaming text-generation wrapper around one shared llama.cpp model.

    The model is loaded the first time an instance is created and cached on
    the class, so later instances reuse it.  Note that the constructor
    arguments of those later instances are therefore ignored.
    """

    _model_instance = None  # Model shared by every instance of this class.

    def __init__(
        self,
        model_repo_id: str = "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF",
        model_filename: str = "LLaMA-O1-Supervised-1129-q2_k.gguf",
    ):
        """Load the model on first use and pre-tokenize the prompt template.

        Args:
            model_repo_id: Hugging Face repository passed to
                ``hf_hub_download``.
            model_filename: File inside that repository to load.
        """
        if OptimizedLLMInterface._model_instance is None:
            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
            OptimizedLLMInterface._model_instance = Llama(
                model_path=model_path,
                n_ctx=256,  # Small context window; prompt + reply must fit in it
                n_threads=4,  # Fixed thread count
                # NOTE(review): n_batch is the prompt-processing batch size, so
                # 1 evaluates the prompt one token at a time -- confirm intended.
                n_batch=1,
                verbose=False,  # Disable logging
                seed=-1,  # -1 asks llama.cpp for a random seed
                logits_all=False,  # Disable logits
                embedding=False,  # Disable embeddings
                tensor_split=None,  # No tensor splitting
                rope_freq_base=10000,  # Default RoPE settings
                rope_freq_scale=1.0,
                main_gpu=0,
            )
        self.model = OptimizedLLMInterface._model_instance

        # The template halves never change, so tokenize them once.  The
        # markers are parsed as special tokens (as the library does for
        # completion prompts), and only the prefix carries the BOS token:
        # ``tokenize`` adds one by default, which would otherwise put a BOS in
        # the middle of the prompt.
        prefix_text, suffix_text = template.split("{content}")
        self._prefix_tokens = self.model.tokenize(
            prefix_text.encode(), add_bos=True, special=True
        )
        self._suffix_tokens = self.model.tokenize(
            suffix_text.encode(), add_bos=False, special=True
        )

    def generate_response(
        self,
        message: str,
        history: Optional[list] = None,
        max_tokens: int = 128,
        temperature: float = 0.7,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        """Stream the model's reply to ``message``.

        Args:
            message: The user's question; it is inserted into the template.
            history: Accepted so the method fits a chat callback signature;
                it is not used, every prompt is built from ``message`` alone.
            max_tokens: Upper bound on the number of generated tokens.  It is
                additionally capped by the room left in the context window.
            temperature: Sampling temperature forwarded to the model.
            top_p: Nucleus-sampling threshold forwarded to the model.

        Yields:
            The reply accumulated so far (not just the newest piece), updated
            every few tokens.  If anything raises, a single ``"Error: ..."``
            string is yielded instead of propagating the exception.
        """
        try:
            # The user text is tokenized as plain text and without BOS: the
            # prefix already starts the sequence.
            message_tokens = self.model.tokenize(
                message.encode(), add_bos=False, special=False
            )
            input_tokens = [*self._prefix_tokens, *message_tokens, *self._suffix_tokens]

            # ``generate`` never stops by itself, so enforce the caller's
            # limit and do not run past the end of the context window.  When
            # the prompt alone does not fit, the model call below reports it.
            token_budget = int(max_tokens)
            context_room = self.model.n_ctx() - len(input_tokens)
            if context_room > 0:
                token_budget = min(token_budget, context_room)

            eos_token = self.model.token_eos()
            flush_every = 4  # Tokens collected between UI updates
            raw = b""  # Every generated byte so far
            pending = []
            produced = 0

            if token_budget > 0:
                for token in self.model.generate(
                    input_tokens,
                    top_p=top_p,
                    temp=temperature,
                    # NOTE(review): top_k=1 keeps only the most likely token,
                    # so temperature/top_p/min_p probably have no visible
                    # effect -- confirm this is intended.
                    top_k=1,
                    repeat_penalty=1.0,  # No repeat penalty
                    mirostat_mode=0,  # Disable mirostat
                    min_p=0.05,
                    typical_p=1.0,  # Disable typical sampling
                    presence_penalty=0,
                    frequency_penalty=0,
                ):
                    if token == eos_token:
                        break
                    pending.append(token)
                    produced += 1
                    if len(pending) >= flush_every:
                        raw += self.model.detokenize(pending)
                        pending = []
                        # Decode the whole buffer each time: a multi-byte
                        # character split across two flushes is skipped now
                        # and shows up once its remaining bytes arrive.
                        yield raw.decode('utf-8', errors='ignore')
                    if produced >= token_budget:
                        break

            if pending:
                raw += self.model.detokenize(pending)
                yield raw.decode('utf-8', errors='ignore')
            elif not raw:
                # Nothing was generated; still hand the caller one value.
                yield ""
        except Exception as e:
            # UI boundary: show the failure in the chat instead of crashing.
            yield f"Error: {str(e)}"
def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
    """Build the Gradio page: description, chat widget, parameters, license.

    Args:
        llm_interface: Object whose ``generate_response`` method is used as
            the chat callback.

    Returns:
        The assembled ``gr.Blocks`` app, not yet launched.
    """
    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        # The sliders are created unrendered and handed to ChatInterface as
        # additional inputs.  That is what actually feeds their values to the
        # callback; the order must match the parameters that follow
        # ``message`` and ``history`` in ``generate_response``.
        max_tokens_slider = gr.Slider(
            minimum=64, maximum=512, value=128, step=64, label="Max Tokens", render=False
        )
        temperature_slider = gr.Slider(
            minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature", render=False
        )
        top_p_slider = gr.Slider(
            minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p", render=False
        )

        gr.ChatInterface(
            llm_interface.generate_response,
            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
            description="Edit Settings below if needed.",
            examples=[
                ["How many r's are in the word strawberry?"],
                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
                ['Find the least odd prime factor of $2019^8+1$.'],
            ],
            cache_examples=False,
            fill_height=True,
            additional_inputs=[max_tokens_slider, temperature_slider, top_p_slider],
            additional_inputs_accordion=gr.Accordion(
                label="Adjust Parameters", open=False, render=False
            ),
        )

        gr.Markdown(LICENSE)
    return demo
def main():
    """Create the model wrapper, build the UI around it and serve it."""
    launch_options = {"share": False, "quiet": True}
    app = create_demo(OptimizedLLMInterface())
    app.launch(**launch_options)
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()