Di Zhang committed
Commit bf7cf6b · verified · 1 Parent(s): 22dfef8

Update app.py

Files changed (1)
  1. app.py +35 -30
app.py CHANGED
@@ -1,30 +1,31 @@
-
 import spaces
-
 import os
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from huggingface_hub import hf_hub_download, snapshot_download
-import accelerate
+from huggingface_hub import snapshot_download
+import torch
+from accelerate import Accelerator
 
-accelerator = accelerate.Accelerator()
+# Initialize Accelerator for efficient multi-GPU/Zero optimization
+accelerator = Accelerator()
 
-# Load the model and tokenizer from Hugging Face
+# Load the model and tokenizer
 model_path = snapshot_download(
     repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
 )
 
 tokenizer = AutoTokenizer.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    torch_dtype=torch.float16,
+    device_map="auto"
+).eval()
 
 DESCRIPTION = '''
-# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
-SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
-Focused on advancing AI reasoning capabilities.
-
-## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
+# SimpleBerry/LLaMA-O1-Supervised-1129 | Optimized for Streaming and Hugging Face Zero Space.
+This model is experimental and focused on advancing AI reasoning capabilities.
 
-**To start a new chat**, click "clear" and start a new dialogue.
+**To start a new chat**, click "clear" and begin a fresh dialogue.
 '''
 
 LICENSE = """
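Reviewer note: the loading change above swaps the implicit fp32 load for explicit float16 plus `.eval()`, roughly halving weight memory. A minimal sketch of what this buys, assuming a CUDA machine (the repo id is inlined here instead of the `snapshot_download` path, for brevity):

```python
import torch
from transformers import AutoModelForCausalLM

# Load the same way the new app.py does: half precision, Accelerate placement.
model = AutoModelForCausalLM.from_pretrained(
    "SimpleBerry/LLaMA-O1-Supervised-1129",
    torch_dtype=torch.float16,   # ~2 bytes per parameter instead of 4
    device_map="auto",           # Accelerate decides GPU/CPU placement
).eval()                         # inference mode: disables dropout etc.

# hf_device_map records where each module landed (all on cuda:0 with one GPU).
print(model.hf_device_map)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params/1e9:.1f}B params, ~{n_params*2/1e9:.1f} GB of weights in fp16")
```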
  LICENSE = """
@@ -34,7 +35,6 @@ LICENSE = """
34
  template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
35
 
36
  def llama_o1_template(data):
37
- #query = data['query']
38
  text = template.format(content=data)
39
  return text
40
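For reviewers unfamiliar with the format: the template (unchanged here apart from dropping the dead `#query` line) wraps the user message in the model's tree-structured thought markup, seeding the problem as node 0 and opening an `<expansion>` node 1 for generation to continue. A quick illustration, using the template string from the file above split only for readability:

```python
# The template string from app.py, split across lines for readability.
template = (
    "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id>"
    "<start_of_thought><problem>{content}<end_of_thought>"
    "<start_of_rating><positive_rating><end_of_rating>\n"
    "<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id>"
    "<start_of_thought><expansion>"
)

def llama_o1_template(data):
    return template.format(content=data)

# The formatted prompt ends inside <expansion>, so the model's first
# generated tokens continue that open thought node.
print(llama_o1_template("How many r's are in the word strawberry?"))
```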
 
@@ -43,25 +43,30 @@ def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95)
43
  input_text = llama_o1_template(message)
44
  inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
45
 
46
- # Generate the text with the model
47
- output = model.generate(
48
- **inputs,
49
- max_length=max_tokens,
50
- temperature=temperature,
51
- top_p=top_p,
52
- do_sample=True,
53
- )
54
-
55
- response = tokenizer.decode(output[0], skip_special_tokens=True)
56
- yield response
 
 
 
 
 
57
 
58
  with gr.Blocks() as demo:
59
  gr.Markdown(DESCRIPTION)
60
 
61
  chatbot = gr.ChatInterface(
62
  generate_text,
63
- title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
64
- description="Edit Settings below if needed.",
65
  examples=[
66
  ["How many r's are in the word strawberry?"],
67
  ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
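One caveat on the new generation loop: `model.generate()` runs to completion and returns its result in one piece; with `return_dict_in_generate=True`, iterating over the result walks the fields of the output object rather than streaming tokens, so this loop does not yield incrementally. If token-by-token streaming is the intent, the usual transformers pattern is a `TextIteratorStreamer` fed by `generate()` on a background thread. A sketch under that assumption (note `max_new_tokens` rather than `max_length`, which also counts prompt tokens):

```python
from threading import Thread
from transformers import TextIteratorStreamer

def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    # Single-GPU assumption: model.device is where the inputs belong.
    inputs = tokenizer(llama_o1_template(message), return_tensors="pt").to(model.device)

    # The streamer receives tokens as generate() produces them; generate()
    # itself blocks, so it runs on a worker thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
    Thread(target=model.generate, kwargs=dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,   # budget for generated tokens only
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )).start()

    partial = ""
    for chunk in streamer:   # decoded text fragments, in generation order
        partial += chunk
        yield partial        # ChatInterface re-renders the message on each yield
```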
@@ -72,9 +77,9 @@ with gr.Blocks() as demo:
     )
 
     with gr.Accordion("Adjust Parameters", open=False):
-        gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
-        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
-        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
+        max_tokens_slider = gr.Slider(minimum=128, maximum=2048, value=512, step=1, label="Max Tokens")
+        temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.9, step=0.1, label="Temperature")
+        top_p_slider = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
 
     gr.Markdown(LICENSE)
 
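A final note: renaming the sliders gives them handles but still leaves them disconnected; `gr.ChatInterface` only forwards components listed in `additional_inputs`, so `max_tokens`, `temperature`, and `top_p` keep their defaults. A possible wiring, assuming the Gradio 4.x `ChatInterface` API:

```python
# Hand the sliders to ChatInterface so their values reach generate_text as
# its max_tokens / temperature / top_p arguments (matched by position).
chatbot = gr.ChatInterface(
    generate_text,
    title="SimpleBerry/LLaMA-O1-Supervised-1129 | Optimized Demo",
    description="Adjust settings below as needed.",
    additional_inputs=[max_tokens_slider, temperature_slider, top_p_slider],
    additional_inputs_accordion="Adjust Parameters",  # replaces the manual Accordion
)
```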