Locutusque committed on
Commit
6615fb0
·
verified ·
1 Parent(s): bfe628d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -26
app.py CHANGED
@@ -10,53 +10,69 @@ def load_model(model_name):
10
  return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
11
  @spaces.GPU()
12
  def generate(
 
13
  model_name,
14
  system,
15
- user_input,
16
  temperature=0.4,
17
  top_p=0.95,
18
  min_p=0.1,
19
  top_k=50,
20
  max_new_tokens=256,
21
  ):
22
- pipe = load_model(model_name)
23
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
24
- print(tokenizer)
25
- pipe.tokenizer = tokenizer
 
26
 
27
- # Set tokenize correctly. Otherwise ticking the box breaks it.
28
- if model_name == "M4-ai/tau-1.8B":
29
- prompt = user_input
30
- else:
31
- prompt = f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"
32
- streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
33
- generation_kwargs = dict(text_inputs=prompt, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_p=top_p, min_p=min_p, top_k=top_k,
34
- temperature=temperature, num_beams=1, repetition_penalty=1.1)
35
- t = Thread(target=pipe.__call__, kwargs=generation_kwargs)
36
- t.start()
37
- outputs = []
38
- for chunk in streamer:
39
- outputs.append(chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  yield "".join(outputs)
 
 
 
41
 
42
- model_choices = ["Locutusque/Apollo-2.0-Nemo-2407-12B", "Locutusque/Apollo-2.0-Llama-3.1-8B", "Locutusque/Llama-3-NeuralHermes-Pro-8B", "Locutusque/Hercules-5.0-Qwen2-7B", "Locutusque/Llama-3-NeuralHercules-5.0-8B", "Locutusque/Hercules-5.0-Index-1.9B", "Locutusque/Llama-3-Hercules-5.0-8B"]
43
  # What are the best options?
44
- g = gr.Interface(
45
  fn=generate,
46
- inputs=[
47
  gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
48
  gr.components.Textbox(lines=2, label="System Prompt", value="You are an AI."),
49
- gr.components.Textbox(lines=2, label="Prompt", value="Write me a Python program that calculates the factorial of a given number."),
50
  gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
51
  gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
52
  gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
53
  gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
54
  gr.components.Slider(minimum=1, maximum=2048, step=1, value=1024, label="Max tokens"),
55
  ],
56
- outputs=[gr.Textbox(lines=10, label="Output")],
57
  title="Locutusque's Language Models",
58
  description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
59
- concurrency_limit=1
60
  )
61
-
62
- g.launch(max_threads=4)
 
10
  return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
11
  @spaces.GPU()
12
  def generate(
13
+ history,
14
  model_name,
15
  system,
 
16
  temperature=0.4,
17
  top_p=0.95,
18
  min_p=0.1,
19
  top_k=50,
20
  max_new_tokens=256,
21
  ):
22
+ try:
23
+ pipe = load_model(model_name)
24
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
25
+ print(tokenizer)
26
+ pipe.tokenizer = tokenizer
27
 
28
+ prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
29
+ for (user_turn, assistant_turn) in history:
30
+ prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
31
+ prompt += f"<|im_start|>user\n{history[-1][0]}<|im_end|>\n<|im_start|>assistant\n"
32
+
33
+ streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
34
+ generation_kwargs = dict(
35
+ text_inputs=prompt,
36
+ streamer=streamer,
37
+ max_new_tokens=max_new_tokens,
38
+ do_sample=True,
39
+ top_p=top_p,
40
+ min_p=min_p,
41
+ top_k=top_k,
42
+ temperature=temperature,
43
+ num_beams=1,
44
+ repetition_penalty=1.1
45
+ )
46
+
47
+ t = Thread(target=pipe.__call__, kwargs=generation_kwargs)
48
+ t.start()
49
+
50
+ outputs = []
51
+ for chunk in streamer:
52
+ outputs.append(chunk)
53
+ yield "".join(outputs)
54
+ except StopAsyncIteration:
55
+ print("Stream stopped unexpectedly.")
56
  yield "".join(outputs)
57
+ except Exception as e:
58
+ print(f"An error occurred: {e}")
59
+ yield "An error occurred during generation."
60
 
61
+ model_choices = ["Locutusque/Apollo-2.0-Llama-3.1-8B", "Locutusque/Llama-3-NeuralHermes-Pro-8B", "Locutusque/Hercules-5.0-Qwen2-7B", "Locutusque/Llama-3-NeuralHercules-5.0-8B", "Locutusque/Hercules-5.0-Index-1.9B", "Locutusque/Llama-3-Hercules-5.0-8B"]
62
  # What are the best options?
63
+ g = gr.ChatInterface(
64
  fn=generate,
65
+ additional_inputs=[
66
  gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
67
  gr.components.Textbox(lines=2, label="System Prompt", value="You are an AI."),
 
68
  gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
69
  gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
70
  gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
71
  gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
72
  gr.components.Slider(minimum=1, maximum=2048, step=1, value=1024, label="Max tokens"),
73
  ],
 
74
  title="Locutusque's Language Models",
75
  description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
 
76
  )
77
+ if __name__ == "__main__":
78
+ g.launch()