Di Zhang committed on
Commit
bedd3fe
·
verified ·
1 Parent(s): bda8afc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -36
app.py CHANGED
@@ -1,30 +1,29 @@
1
  import spaces
 
2
  import os
3
  import gradio as gr
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
- from huggingface_hub import snapshot_download
6
- import torch
7
- from accelerate import Accelerator
8
 
9
- # Initialize Accelerator for efficient multi-GPU/Zero optimization
10
- accelerator = Accelerator()
11
 
12
- # Load the model and tokenizer
13
  model_path = snapshot_download(
14
  repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
15
  )
16
 
17
  tokenizer = AutoTokenizer.from_pretrained(model_path)
18
- model = AutoModelForCausalLM.from_pretrained(
19
- model_path,
20
- torch_dtype=torch.float16,
21
- device_map="auto"
22
- ).eval()
23
 
24
  DESCRIPTION = '''
25
- # SimpleBerry/LLaMA-O1-Supervised-1129 | Optimized for Streaming and Hugging Face Zero Space.
26
- This model is experimental and focused on advancing AI reasoning capabilities.
27
- **To start a new chat**, click "clear" and begin a fresh dialogue.
 
 
 
 
28
  '''
29
 
30
  LICENSE = """
@@ -34,6 +33,7 @@ LICENSE = """
34
  template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
35
 
36
  def llama_o1_template(data):
 
37
  text = template.format(content=data)
38
  return text
39
 
@@ -42,30 +42,25 @@ def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95)
42
  input_text = llama_o1_template(message)
43
  inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
44
 
45
- # Stream generation, token by token
46
- with torch.no_grad():
47
- for output in model.generate(
48
- **inputs,
49
- max_length=max_tokens,
50
- temperature=temperature,
51
- top_p=top_p,
52
- do_sample=True,
53
- use_cache=True,
54
- pad_token_id=tokenizer.eos_token_id,
55
- return_dict_in_generate=True,
56
- output_scores=False
57
- ):
58
- # Return text with special tokens included
59
- generated_text = tokenizer.decode(output, skip_special_tokens=False)
60
- yield generated_text
61
 
62
  with gr.Blocks() as demo:
63
  gr.Markdown(DESCRIPTION)
64
 
65
  chatbot = gr.ChatInterface(
66
  generate_text,
67
- title="SimpleBerry/LLaMA-O1-Supervised-1129 | Optimized Demo",
68
- description="Adjust settings below as needed.",
69
  examples=[
70
  ["How many r's are in the word strawberry?"],
71
  ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
@@ -76,11 +71,11 @@ with gr.Blocks() as demo:
76
  )
77
 
78
  with gr.Accordion("Adjust Parameters", open=False):
79
- max_tokens_slider = gr.Slider(minimum=128, maximum=2048, value=512, step=1, label="Max Tokens")
80
- temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.9, step=0.1, label="Temperature")
81
- top_p_slider = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
82
 
83
  gr.Markdown(LICENSE)
84
 
85
  if __name__ == "__main__":
86
- demo.launch()
 
1
  import spaces
2
+
3
  import os
4
  import gradio as gr
5
  from transformers import AutoTokenizer, AutoModelForCausalLM
6
+ from huggingface_hub import hf_hub_download, snapshot_download
7
+ import accelerate
 
8
 
9
+ accelerator = accelerate.Accelerator()
 
10
 
11
+ # Load the model and tokenizer from Hugging Face
12
  model_path = snapshot_download(
13
  repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
14
  )
15
 
16
  tokenizer = AutoTokenizer.from_pretrained(model_path)
17
+ model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
 
 
 
 
18
 
19
  DESCRIPTION = '''
20
+ # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
21
+ SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
22
+ Focused on advancing AI reasoning capabilities.
23
+
24
+ ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
25
+
26
+ **To start a new chat**, click "clear" and start a new dialogue.
27
  '''
28
 
29
  LICENSE = """
 
33
  template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
34
 
35
  def llama_o1_template(data):
36
+ #query = data['query']
37
  text = template.format(content=data)
38
  return text
39
 
 
42
  input_text = llama_o1_template(message)
43
  inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
44
 
45
+ # Generate the text with the model
46
+ output = model.generate(
47
+ **inputs,
48
+ max_length=max_tokens,
49
+ temperature=temperature,
50
+ top_p=top_p,
51
+ do_sample=True,
52
+ )
53
+
54
+ response = tokenizer.decode(output[0], skip_special_tokens=True)
55
+ yield response
 
 
 
 
 
56
 
57
  with gr.Blocks() as demo:
58
  gr.Markdown(DESCRIPTION)
59
 
60
  chatbot = gr.ChatInterface(
61
  generate_text,
62
+ title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
63
+ description="Edit Settings below if needed.",
64
  examples=[
65
  ["How many r's are in the word strawberry?"],
66
  ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
 
71
  )
72
 
73
  with gr.Accordion("Adjust Parameters", open=False):
74
+ gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
75
+ gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
76
+ gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
77
 
78
  gr.Markdown(LICENSE)
79
 
80
  if __name__ == "__main__":
81
+ demo.launch()