KantaHayashiAI committed on
Commit
6c5de50
·
verified ·
1 Parent(s): 85ab2f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -46
app.py CHANGED
@@ -1,17 +1,14 @@
1
  import os
2
- from collections.abc import Iterator
3
- from threading import Thread
4
-
5
  import gradio as gr
6
  import spaces
7
  import torch
8
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
9
 
10
  DESCRIPTION = """\
11
  # EvaByte-SFT
12
 
13
- EvaByte is a byte-level language model that combines multibyte prediction with the efficient EVA attention mechanism.
14
- This page hosts [`EvaByte/EvaByte-SFT`](https://huggingface.co/EvaByte/EvaByte-SFT), fine-tuned via supervised instruction data to enable chat and general instruction-following capabilities.
15
  For full details on architecture, training recipe, and benchmarks, see their blog post and the project repository:
16
 
17
  - Blog: <https://hkunlp.github.io/blog/2025/evabyte>
@@ -25,71 +22,61 @@ MAX_INPUT_TOKEN_LENGTH = 32000
25
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
26
 
27
  tokenizer = AutoTokenizer.from_pretrained("EvaByte/EvaByte", trust_remote_code=True)
28
- model = AutoModelForCausalLM.from_pretrained("evabyte/EvaByte-SFT", torch_dtype=torch.bfloat16, trust_remote_code=True).eval().to("cuda")
 
 
 
 
 
29
 
30
  @spaces.GPU(duration=90)
31
  def generate(
32
  message: str,
33
  chat_history: list[dict],
34
- max_new_tokens: int = 1024,
35
  temperature: float = 0.6,
36
  top_p: float = 0.9,
37
- ) -> Iterator[str]:
 
38
  conversation = [*chat_history, {"role": "user", "content": message}]
 
 
 
 
 
39
 
40
- input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
41
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
42
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
43
- gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
 
 
 
44
  input_ids = input_ids.to(model.device)
45
 
46
- streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
47
- generate_kwargs = dict(
48
- {"input_ids": input_ids},
49
- #streamer=streamer,
50
  max_new_tokens=max_new_tokens,
51
  do_sample=True,
52
  top_p=top_p,
53
  temperature=temperature,
54
  )
55
- t = Thread(target=model.multi_byte_generate, kwargs=generate_kwargs)
56
- t.start()
57
 
58
- outputs = []
59
- for text in streamer:
60
- outputs.append(text)
61
- yield "".join(outputs)
62
 
63
 
64
  demo = gr.ChatInterface(
65
  fn=generate,
66
  additional_inputs=[
67
- gr.Slider(
68
- label="Max new tokens",
69
- minimum=1,
70
- maximum=MAX_MAX_NEW_TOKENS,
71
- step=1,
72
- value=DEFAULT_MAX_NEW_TOKENS,
73
- ),
74
- gr.Slider(
75
- label="Temperature",
76
- minimum=0.1,
77
- maximum=4.0,
78
- step=0.1,
79
- value=0.6,
80
- ),
81
- gr.Slider(
82
- label="Top-p (nucleus sampling)",
83
- minimum=0.05,
84
- maximum=1.0,
85
- step=0.05,
86
- value=0.9,
87
- ),
88
- ],
89
- stop_btn=None,
90
- examples=[
91
- ["Write me an English pangram."],
92
  ],
 
 
93
  cache_examples=False,
94
  type="messages",
95
  description=DESCRIPTION,
 
1
  import os
 
 
 
2
  import gradio as gr
3
  import spaces
4
  import torch
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
 
7
  DESCRIPTION = """\
8
  # EvaByte-SFT
9
 
10
+ EvaByte is a byte-level language model that combines multibyte prediction with the efficient EVA attention mechanism.
11
+ This page hosts [EvaByte/EvaByte-SFT](https://huggingface.co/EvaByte/EvaByte-SFT), fine-tuned via supervised instruction data to enable chat and general instruction-following capabilities.
12
  For full details on architecture, training recipe, and benchmarks, see their blog post and the project repository:
13
 
14
  - Blog: <https://hkunlp.github.io/blog/2025/evabyte>
 
22
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
23
 
24
  tokenizer = AutoTokenizer.from_pretrained("EvaByte/EvaByte", trust_remote_code=True)
25
+ model = AutoModelForCausalLM.from_pretrained(
26
+ "EvaByte/EvaByte-SFT",
27
+ torch_dtype=torch.bfloat16,
28
+ trust_remote_code=True,
29
+ ).eval().to(device)
30
+
31
 
32
  @spaces.GPU(duration=90)
33
  def generate(
34
  message: str,
35
  chat_history: list[dict],
36
+ max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
37
  temperature: float = 0.6,
38
  top_p: float = 0.9,
39
+ ) -> str:
40
+
41
  conversation = [*chat_history, {"role": "user", "content": message}]
42
+ input_ids = tokenizer.apply_chat_template(
43
+ conversation,
44
+ add_generation_prompt=True,
45
+ return_tensors="pt"
46
+ )
47
 
 
48
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
49
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
50
+ gr.Warning(
51
+ f"Trimmed input to the last {MAX_INPUT_TOKEN_LENGTH} tokens because it exceeded the limit."
52
+ )
53
+
54
  input_ids = input_ids.to(model.device)
55
 
56
+ output_ids = model.multi_byte_generate(
57
+ input_ids=input_ids,
 
 
58
  max_new_tokens=max_new_tokens,
59
  do_sample=True,
60
  top_p=top_p,
61
  temperature=temperature,
62
  )
 
 
63
 
64
+ generated_segment = output_ids[0][input_ids.shape[1]:]
65
+ return tokenizer.decode(generated_segment, skip_special_tokens=False, clean_up_tokenization_spaces=False)
 
 
66
 
67
 
68
  demo = gr.ChatInterface(
69
  fn=generate,
70
  additional_inputs=[
71
+ gr.Slider("Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS,
72
+ step=1, value=DEFAULT_MAX_NEW_TOKENS),
73
+ gr.Slider("Temperature", minimum=0.1, maximum=4.0,
74
+ step=0.1, value=0.6),
75
+ gr.Slider("Top-p (nucleus sampling)", minimum=0.05,
76
+ maximum=1.0, step=0.05, value=0.9),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  ],
78
+ stop_btn=None,
79
+ examples=[["Write me an English pangram."]],
80
  cache_examples=False,
81
  type="messages",
82
  description=DESCRIPTION,