import os
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

DESCRIPTION = """\
# EvaByte [Byte-Level LLM]

EvaByte is an efficient byte-level language model with multibyte prediction and EVA attention, built by the University of Hong Kong and SambaNova Systems.  
This Space is an unofficial demo of the instruction-tuned version [EvaByte/EvaByte-SFT](https://huggingface.co/EvaByte/EvaByte-SFT).  
For full details on architecture, training recipe, and benchmarks, see their blog post and the project repository:

- Blog: <https://hkunlp.github.io/blog/2025/evabyte>  
- GitHub: <https://github.com/OpenEvaByte/evabyte>

If you liked this Space, follow me on Twitter: [@KantaHayashiAI](https://x.com/KantaHayashiAI)
"""

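# Generation limits; EvaByte tokenizes at the byte level, so these counts are roughly byte counts.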
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 32000

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

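# EvaByte ships custom modeling code on the Hub, so trust_remote_code=True is required.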
tokenizer = AutoTokenizer.from_pretrained("EvaByte/EvaByte-SFT", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "EvaByte/EvaByte-SFT",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval().to(device)


@spaces.GPU(duration=60)
def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    temperature: float = 0.6,
    top_p: float = 0.9,
) -> str:
    """Generate a reply for `message` given the accumulated `chat_history`."""
    conversation = [*chat_history, {"role": "user", "content": message}]
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(
            f"Trimmed input to the last {MAX_INPUT_TOKEN_LENGTH} tokens because it exceeded the limit."
        )

    input_ids = input_ids.to(model.device)

    # multi_byte_generate is EvaByte's multibyte-prediction decoding path.
    output_ids = model.multi_byte_generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
    )

    generated_segment = output_ids[0][input_ids.shape[1]:]
    return tokenizer.decode(generated_segment, skip_special_tokens=True)


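# Chat UI: expose generate() through a Gradio ChatInterface with sampling controls.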
demo = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        ),
    ],
    stop_btn=None,           
    examples=[["Write me an English pangram."]],
    cache_examples=False,
    type="messages",
    description=DESCRIPTION,
    fill_height=True,
)

if __name__ == "__main__":
    demo.queue(max_size=20).launch()