Spaces:

SimpleBerry
/

LLaMA-O1-Supervised-1129-Demo

Running

App Files Files Community

LLaMA-O1-Supervised-1129-Demo / app.py

jwu323

Update app.py

037da0c verified 10 months ago

raw

history blame

5.5 kB

	import os
	from typing import Generator, Optional
	import gradio as gr
	from llama_cpp import Llama, LlamaGrammar
	from huggingface_hub import hf_hub_download

	# Markdown shown at the top of the demo page.
	# A plain "|" is used in the heading: "\|" is not a valid escape sequence in a
	# normal Python string literal and triggers a warning on recent interpreters.
	DESCRIPTION = '''
	# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
	SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
	Focused on advancing AI reasoning capabilities.

	## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!

	To start a new chat, click "clear" and start a new dialog.
	'''

	# Markdown shown at the bottom of the demo page.
	LICENSE = """
	--- MIT License ---
	"""

	# Prompt scaffold. The user's question replaces ``{content}`` inside the
	# <problem> node; the scaffold ends with an opened <expansion> tag.
	template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"

	class OptimizedLLMInterface:
	    """Wrapper around a single shared llama.cpp model used by the chat demo.

	    The first instance downloads the GGUF file and loads it; later instances
	    reuse the model stored on the class. The two halves of the module-level
	    ``template`` (before and after ``{content}``) are tokenized once here so
	    each request only has to tokenize the user's message.
	    """

	    # Shared by every instance so the model file is loaded only once.
	    _model_instance = None

	    def __init__(
	        self,
	        model_repo_id: str = "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF",
	        model_filename: str = "LLaMA-O1-Supervised-1129-q2_k.gguf",
	    ):
	        """Load (or reuse) the model and pre-tokenize the prompt template.

	        Args:
	            model_repo_id: Hugging Face Hub repository holding the GGUF file.
	            model_filename: Name of the GGUF file inside that repository.

	        Note:
	            Both arguments are only consulted when no model has been loaded
	            yet; once the class-level instance exists it is reused as-is.
	        """
	        if OptimizedLLMInterface._model_instance is None:
	            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
	            OptimizedLLMInterface._model_instance = Llama(
	                model_path=model_path,
	                n_ctx=256,  # Small context window to keep memory and latency low
	                n_threads=4,  # Fixed thread count
	                # Prompt-processing batch size. A value of 1 would feed the
	                # prompt one token at a time; match the context size instead.
	                n_batch=256,
	                verbose=False,  # Disable logging
	                seed=-1,  # -1 asks llama.cpp for a random seed
	                logits_all=False,  # Only the last token's logits are needed
	                embedding=False,  # Disable embeddings
	                tensor_split=None,  # No tensor splitting
	                # 0.0 defers to the RoPE settings stored in the GGUF metadata
	                # rather than overriding them with hard-coded values.
	                rope_freq_base=0.0,
	                rope_freq_scale=0.0,
	                main_gpu=0,
	            )
	        self.model = OptimizedLLMInterface._model_instance

	        # Pre-tokenize the template parts. Only the prefix carries BOS, and
	        # special=True lets the template's control tags map to their own
	        # token ids when the vocabulary defines them.
	        prefix_text, _, suffix_text = template.partition("{content}")
	        self._prefix_tokens = self.model.tokenize(
	            prefix_text.encode("utf-8"), add_bos=True, special=True
	        )
	        self._suffix_tokens = self.model.tokenize(
	            suffix_text.encode("utf-8"), add_bos=False, special=True
	        )

	    def generate_response(
	        self,
	        message: str,
	        history: Optional[list] = None,
	        max_tokens: int = 128,  # Reduced max tokens
	        temperature: float = 0.7,
	        top_p: float = 0.95,
	    ) -> Generator[str, None, None]:
	        """Stream the model's reply to ``message`` as a growing string.

	        Args:
	            message: The user's question; inserted between the pre-tokenized
	                template prefix and suffix.
	            history: Accepted so the method matches the chat callback
	                signature; it is not used when building the prompt.
	            max_tokens: Upper bound on the number of generated tokens. It is
	                further limited by the space left in the context window.
	            temperature: Sampling temperature forwarded to the model.
	            top_p: Nucleus-sampling threshold forwarded to the model.

	        Yields:
	            The accumulated reply text after every small batch of tokens.
	            If anything fails, a single ``"Error: ..."`` string is yielded
	            instead of raising, so the UI always receives a message.
	        """
	        import codecs

	        try:
	            # The prefix already starts with BOS, so do not add another one.
	            message_tokens = self.model.tokenize(message.encode("utf-8"), add_bos=False)
	            input_tokens = [*self._prefix_tokens, *message_tokens, *self._suffix_tokens]

	            # Never ask for more tokens than the context window can hold.
	            token_budget = min(int(max_tokens), self.model.n_ctx() - len(input_tokens))
	            if token_budget <= 0:
	                yield "Error: the prompt is too long for the model's context window."
	                return

	            eos_token = self.model.token_eos()
	            # An incremental decoder keeps the trailing bytes of a character
	            # that is split across two batches instead of discarding them.
	            decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

	            output = ""
	            pending = []
	            generated = 0
	            batch_size = 4  # Small batch size for faster responses

	            for token in self.model.generate(
	                input_tokens,
	                top_p=top_p,
	                temp=temperature,
	                # NOTE(review): top_k=1 keeps only the most likely candidate,
	                # so temperature/top_p/min_p have little effect — confirm
	                # that greedy decoding is intended.
	                top_k=1,
	                repeat_penalty=1.0,  # No repeat penalty
	                mirostat_mode=0,  # Disable mirostat
	                min_p=0.05,
	                typical_p=1.0,  # Disable typical sampling
	                presence_penalty=0,
	                frequency_penalty=0,
	            ):
	                # The token stream does not end by itself: stop explicitly at
	                # end-of-sequence or when the budget is used up.
	                if token == eos_token:
	                    break
	                pending.append(token)
	                generated += 1
	                if len(pending) >= batch_size:
	                    output += decoder.decode(self.model.detokenize(pending))
	                    yield output
	                    pending = []
	                if generated >= token_budget:
	                    break

	            if pending:
	                output += decoder.decode(self.model.detokenize(pending), final=True)
	                yield output

	        except Exception as e:
	            # UI boundary: report the failure as chat text rather than crash.
	            yield f"Error: {str(e)}"

	def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
	    """Build the Gradio page: description, chat interface and license note.

	    Args:
	        llm_interface: Object whose ``generate_response`` method is used as
	            the chat callback.

	    Returns:
	        The assembled ``gr.Blocks`` application (not yet launched).
	    """
	    with gr.Blocks() as demo:
	        gr.Markdown(DESCRIPTION)

	        # The sliders are handed to the chat interface as additional inputs so
	        # their values reach generate_response(message, history, max_tokens,
	        # temperature, top_p) in that order. render=False postpones rendering
	        # so that the chat interface can place them inside its own accordion.
	        parameter_inputs = [
	            gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens", render=False),
	            gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature", render=False),
	            gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p", render=False),
	        ]

	        gr.ChatInterface(
	            llm_interface.generate_response,
	            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
	            description="Edit Settings below if needed.",
	            examples=[
	                ["How many r's are in the word strawberry?"],
	                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
	                ['Find the least odd prime factor of $2019^8+1$.'],
	            ],
	            cache_examples=False,
	            fill_height=True,
	            additional_inputs=parameter_inputs,
	            additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False, render=False),
	        )

	        gr.Markdown(LICENSE)

	    return demo

	def main():
	    """Create the model wrapper, build the demo around it and launch it.

	    The app is launched with ``share=False`` and ``quiet=True``.
	    """
	    app = create_demo(OptimizedLLMInterface())
	    launch_options = {"share": False, "quiet": True}
	    app.launch(**launch_options)


	# Start the demo only when this file is executed as a script.
	if __name__ == "__main__":
	    main()