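"""Gradio demo that streams completions from a GGUF build of the
LLaMA-O1-Supervised-1129 model through llama-cpp-python."""
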
from typing import Iterator, List, Optional, Tuple
import os

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


class LlamaAssistant:
    def __init__(self, model_config: dict):
        # Fetch the GGUF weights from the Hugging Face Hub (cached after the
        # first download) and load them with llama-cpp-python.
        self.model = Llama(
            model_path=hf_hub_download(
                repo_id=model_config.get("repo_id", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
                filename=model_config.get("model_file", "llama-o1-supervised-1129-q4_k_m.gguf"),
            )
        )
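        # Prompt template for LLaMA-O1-Supervised-1129: the user's problem is
        # wrapped in the model's tree-structured "thought" markup, and the
        # completion continues from the trailing <expansion> tag.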
        self.template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
        self.generate_cfg = model_config.get("generate_cfg", {
            "max_tokens": 512,
            "temperature": 0.7,
            "top_p": 0.95,
        })

    def _format_prompt(self, message: str) -> str:
        return self.template.format(content=message)
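
    # Stream the reply token by token so the UI can render partial output.
    # NOTE: `history` is accepted for gr.ChatInterface compatibility but is
    # not folded back into the prompt; each turn is answered independently.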
    def generate(self, message: str, history: Optional[List[Tuple[str, str]]] = None) -> Iterator[str]:
        input_text = self._format_prompt(message)
        inputs = self.model.tokenize(input_text.encode("utf-8"))
        buffer = b""
        for n_tokens, token in enumerate(self.model.generate(
            inputs,
            top_p=self.generate_cfg["top_p"],
            temp=self.generate_cfg["temperature"],
        )):
            # Llama.generate() samples indefinitely, so stop explicitly at the
            # end-of-sequence token or once the max_tokens budget is spent.
            if token == self.model.token_eos() or n_tokens >= self.generate_cfg["max_tokens"]:
                break
            buffer += self.model.detokenize([token])
            # Decode the accumulated bytes each step: a multi-byte UTF-8
            # character can straddle two tokens, and decoding a single token's
            # bytes in isolation can raise UnicodeDecodeError.
            yield buffer.decode("utf-8", errors="ignore")


class WebUI:
    def __init__(self, assistant: LlamaAssistant, config: dict = None):
        self.assistant = assistant
        self.config = config or {}

    def create_interface(self):
        with gr.Blocks() as demo:
            gr.Markdown(self.config.get("description", """
            # LLaMA-O1-Supervised-1129 Demo
            An experimental research model focused on advancing AI reasoning capabilities.

            **To start a new chat**, click "clear" and start a new dialog.
            """))
            chatbot = gr.ChatInterface(
                self.assistant.generate,
                title=self.config.get("title", "LLaMA-O1-Supervised-1129 | Demo"),
                description=self.config.get("description", "Edit Settings below if needed."),
                examples=self.config.get("examples", [
                    ["How many r's are in the word strawberry?"],
                    ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
                    ["Find the least odd prime factor of $2019^8+1$."],
                ]),
                cache_examples=False,
                fill_height=True,
            )
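            # NOTE: as written, the sliders below only display the defaults;
            # changing them does not update assistant.generate_cfg.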
with gr.Accordion("Adjust Parameters", open=False):
gr.Slider(
minimum=128,
maximum=8192,
value=self.assistant.generate_cfg["max_tokens"],
step=1,
label="Max Tokens"
)
gr.Slider(
minimum=0.1,
maximum=1.5,
value=self.assistant.generate_cfg["temperature"],
step=0.1,
label="Temperature"
)
gr.Slider(
minimum=0.05,
maximum=1.0,
value=self.assistant.generate_cfg["top_p"],
step=0.01,
label="Top-p (nucleus sampling)"
)
gr.Markdown(self.config.get("license", "--- MIT License ---"))
return demo
def run(self, **kwargs):
demo = self.create_interface()
demo.launch(**kwargs)


def app_gui():
    # Define model configuration
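    # The REPO_ID, MODEL_FILE, T, and P environment variables override the
    # defaults below.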
    model_config = {
        "repo_id": os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
        "model_file": os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
        "generate_cfg": {
            "max_tokens": 512,
            "temperature": float(os.environ.get("T", 0.7)),
            "top_p": float(os.environ.get("P", 0.95)),
        },
    }

    # UI configuration
    ui_config = {
        "title": "LLaMA-O1-Supervised-1129 | Demo",
        "description": "LLaMA-O1-Supervised-1129 is an experimental research model focused on advancing AI reasoning capabilities.",
        "examples": [
            ["How many r's are in the word strawberry?"],
            ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
            ["Find the least odd prime factor of $2019^8+1$."],
        ],
        "license": "--- MIT License ---",
    }

    # Create and run the web interface
    assistant = LlamaAssistant(model_config)
    WebUI(assistant, ui_config).run(concurrency_limit=80)


if __name__ == '__main__':
    app_gui()
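
# Example invocation (file name is hypothetical; adjust to this script's
# actual path):
#   T=0.5 P=0.9 python app.py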