# import spaces
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download, snapshot_download
import accelerate
accelerator = accelerate.Accelerator()
# Load the model and tokenizer from Hugging Face
model_path = snapshot_download(
    repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto')
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference, free of charge.

SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry,
focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, many thanks!

**To start a new chat**, click "clear" and start a new dialogue.
'''
LICENSE = """
--- MIT License ---
"""
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
def llama_o1_template(data):
    # Wrap the raw user message in the thought-tree prompt format above.
    text = template.format(content=data)
    return text

def format_response(response):
    # Strip the thought-tree control tokens so the reply reads as plain text.
    response = response.replace('<start_of_father_id>', '')
    response = response.replace('<end_of_father_id><start_of_local_id>', '')
    response = response.replace('<end_of_local_id><start_of_thought>', ', ')
    response = response.replace('<end_of_thought><start_of_rating>', '')
    response = response.replace('<end_of_rating>', '')
    response = response.replace('<positive_rating>', '')
    response = response.replace('<negative_rating>', '')
    return response

# @spaces.GPU
def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    input_text = llama_o1_template(message)
    inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)

    # Generate with the model; max_new_tokens bounds the completion itself,
    # whereas max_length would also count the prompt tokens.
    output = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    # Decode with the control tokens intact, then strip the thought-tree
    # markup before handing the text to the chat UI.
    response = tokenizer.decode(output[0], skip_special_tokens=False)
    yield format_response(response)
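
# A minimal streaming variant (a sketch, not part of the original Space):
# transformers' TextIteratorStreamer lets generate() run in a background
# thread while text pieces are yielded as they are produced, so the chat UI
# can render the reply incrementally instead of waiting for the full output.
from threading import Thread
from transformers import TextIteratorStreamer

def generate_text_streaming(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    inputs = tokenizer(llama_o1_template(message), return_tensors="pt").to(accelerator.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        ),
    )
    thread.start()
    partial = ""
    for piece in streamer:
        partial += piece
        yield format_response(partial)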

with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    chatbot = gr.ChatInterface(
        generate_text,
        title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
        description="Edit Settings below if needed.",
        examples=[
            ["How many r's are in the word strawberry?"],
            ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
            ["Find the least odd prime factor of $2019^8+1$."],
        ],
        cache_examples=True,
        fill_height=True,
    )
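
    # Note: as written, the sliders below are display-only; gr.ChatInterface
    # only forwards controls that are passed via its additional_inputs
    # argument. A hedged sketch of that wiring (names are illustrative):
    #
    #   max_tokens_slider = gr.Slider(1024, 8192, value=2048, step=1, label="Max Tokens")
    #   chatbot = gr.ChatInterface(generate_text, additional_inputs=[max_tokens_slider, ...])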
    with gr.Accordion("Adjust Parameters", open=False):
        gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")

    gr.Markdown(LICENSE)
if __name__ == "__main__":
    demo.launch()