Spaces: Running on Zero

import spaces
import gradio as gr
import time
from threading import Thread
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-32B-Instruct"
assistant_id = "Qwen/Qwen2.5-0.5B-Instruct"

# Main model in 4-bit (requires `bitsandbytes`); the small assistant model runs in FP16 on the same device.
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_id).to(device=model.device, dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
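

# On ZeroGPU Spaces, the `@spaces.GPU` decorator requests a GPU for the duration of each call to the
# decorated function and releases it afterwards, so the Space only holds a GPU while generating.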
@spaces.GPU
def run_generation(user_text, use_assistant, temperature, max_new_tokens):
    # Greedy decoding for near-zero temperatures, sampling otherwise.
    do_sample = temperature >= 0.1

    # Tokenize the user text (the model and tokenizer are loaded globally above).
    model_inputs = tokenizer([user_text], return_tensors="pt").to(model.device)

    # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
    # in the main thread. The timeout on the streamer handles exceptions raised in the generation thread.
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        # `assistant_model` turns on assisted generation: the small model drafts tokens, the main model verifies them.
        assistant_model=assistant_model if use_assistant else None,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.95,
        temperature=float(temperature),
        top_k=50,
        eos_token_id=-1,  # ensures `max_new_tokens` new tokens are always generated, can't reach EOS
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    start = time.time()
    t.start()

    # Pull the generated text from the streamer and accumulate it, yielding the text generated so far and the
    # throughput (tokens per second) after each new chunk.
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        time_so_far = time.time() - start
        tokens_so_far = tokenizer(model_output, return_tensors="pt").input_ids.shape[1]
        yield [model_output, round(tokens_so_far / time_so_far, 2)]


def reset_textbox():
    return gr.update(value='')


with gr.Blocks() as demo:
    gr.Markdown(
        "# 🤗 Assisted Generation Demo\n"
        f"- Model: {model_id} (4-bit quantization, ~16GB)\n"
        f"- Assistant Model: {assistant_id} (FP16, ~1GB)\n"
        "- Recipe for a speedup: a) the main model is >10x larger than the assistant; b) the assistant was trained similarly; c) the CPU is not a bottleneck"
    )
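
    # Why the recipe matters: the assistant drafts candidate tokens cheaply and the main model verifies whole drafts
    # in a single forward pass, so assisted generation only pays off when drafting is much cheaper than verification
    # and the assistant agrees with the main model often enough for its drafts to be accepted.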
    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                value="A sequence: one, two, three, ",
                label="Prompt"
            )
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")

        with gr.Column(scale=1, min_width=200):
            gr.Markdown("### Generation Settings")
            use_assistant = gr.Checkbox(label="Use Assisted Generation", value=True)
            max_new_tokens = gr.Slider(
                minimum=1, maximum=500, value=100, step=1, interactive=True, label="Max New Tokens",
            )
            temperature = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.6, step=0.05, interactive=True, label="Temperature (0.0 = Greedy)",
            )
            gr.Markdown("### Tokens per second")
            tokens_per_second = gr.Textbox(lines=1, interactive=False, show_label=False)
    generate_inputs = [user_text, use_assistant, temperature, max_new_tokens]
    generate_outputs = [model_output, tokens_per_second]
    user_text.submit(run_generation, generate_inputs, generate_outputs)
    button_submit.click(run_generation, generate_inputs, generate_outputs)

demo.queue(max_size=16).launch()
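
# A minimal sketch of querying this demo programmatically with `gradio_client` from a separate script.
# The Space id below is a placeholder, and `api_name="/run_generation"` assumes the default name Gradio
# derives from the function; for a streaming endpoint, `predict` returns the final yielded values.
#
#     from gradio_client import Client
#
#     client = Client("<user>/<space-name>")  # placeholder Space id
#     text, tokens_per_second = client.predict(
#         "A sequence: one, two, three, ",  # user_text
#         True,                             # use_assistant
#         0.6,                              # temperature
#         100,                              # max_new_tokens
#         api_name="/run_generation",
#     )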