import spaces  # imported first so the Hugging Face ZeroGPU runtime can hook CUDA initialization

import os

import accelerate
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import snapshot_download

# Accelerator gives us a device handle that works on both CPU and GPU.
accelerator = accelerate.Accelerator()

# Download the model snapshot from the Hugging Face Hub and load it locally.
model_path = snapshot_download(
    repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference, free of charge.
SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry,
focused on advancing AI reasoning capabilities.

## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, many thanks!

**To start a new chat**, click "clear" and begin a new dialogue.
'''

LICENSE = """
--- MIT License ---
"""

template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"

def llama_o1_template(data):
    """Wrap a raw user query in the model's expected prompt template."""
    return template.format(content=data)
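
# For illustration, with a hypothetical query:
#   llama_o1_template("What is 2 + 2?")
# returns the template string with the question substituted for {content},
# ending in an open <expansion> tag for the model to complete.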

def format_response(response):
    """Replace the model's structural tags with human-readable markers."""
    response = response.replace('<start_of_father_id>', '')
    response = response.replace('<end_of_father_id><start_of_local_id>', '👉')
    response = response.replace('<end_of_local_id><start_of_thought>', ', ')
    response = response.replace('<end_of_thought><start_of_rating>', '')
    response = response.replace('<end_of_rating>', '')
    response = response.replace('<positive_rating>', '👍')
    response = response.replace('<negative_rating>', '👎')
    return response
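
# Example: "<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought>hello"
# is rendered as "0👉1, hello" for display.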
    
@spaces.GPU  # request ZeroGPU hardware for the duration of this call
def generate_text_gpu(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    input_text = llama_o1_template(message)
    inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)

    # max_new_tokens bounds only the generated continuation; max_length would also
    # count the prompt tokens and could leave no room for a response.
    output = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    response = tokenizer.decode(output[0], skip_special_tokens=False)
    yield format_response(response)

def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    # Prefer the GPU path; fall back to generating locally if it fails.
    try:
        yield from generate_text_gpu(message, history, max_tokens, temperature, top_p)
        return
    except Exception as e:
        print(e)

    input_text = llama_o1_template(message)
    inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)

    output = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )

    response = tokenizer.decode(output[0], skip_special_tokens=False)
    yield format_response(response)

with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)

    chatbot = gr.ChatInterface(
        generate_text,
        title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
        description="Edit Settings below if needed.",
        examples=[
            ["How many r's are in the word strawberry?"],
            ["If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?"],
            ["Find the least odd prime factor of $2019^8+1$."],
        ],
        cache_examples=True,
        fill_height=True,
        # Pass the sliders as additional inputs so their values actually reach
        # generate_text; their order matches its (max_tokens, temperature, top_p) arguments.
        additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
        additional_inputs=[
            gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens"),
            gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)"),
        ],
    )

    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.launch()