Spaces:

BlinkDL
/

RWKV-Gradio-2

Running on T4

File size: 5,378 Bytes

f714b01
8a953d3
f714b01
 
 
 
 
8a953d3
 
1d5e556
 
 
 
 
8a953d3
 
1d5e556
dc1d70c
f714b01
8a953d3
 
 
 
f714b01
 
315ea19
8a953d3
f714b01
8a953d3
 
 
f714b01
3bcbfb1
 
 
 
 
8a953d3
3bcbfb1
 
 
 
 
 
 
 
 
 
 
 
 
 
eded8df
315ea19
3bcbfb1
 
 
 
 
 
 
 
 
 
c197986
 
234ee14
 
ab56f98
 
3bcbfb1
 
 
f714b01
c4f1727
3ddf274
8a953d3
 
37d48ee
eea6bc4
37d48ee
c4f1727
 
c197986
 
 
8a953d3
315ea19
37d48ee
c197986
 
3ddf274
8a953d3
315ea19
 
 
 
c197986
 
 
 
8a953d3
6e2a0af
315ea19
c197986
315ea19
c197986

import gradio as gr
import os, gc, copy, torch, re
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)
ctx_limit = 1024
title = "RWKV-x060-eng_single_round_test-1B6-20240427-ctx1024"

os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)

from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="BlinkDL/temp-latest-training-models", filename=f"{title}.pth")
model = RWKV(model=model_path, strategy='cuda fp16')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")

def generate_prompt(instruction):
    instruction = instruction.strip().replace('\r\n','\n')
    instruction = re.sub(r'\n+', '\n', instruction)
    return f"User: {instruction}\n\nAssistant:"""

def evaluate(
    ctx,
    token_count=500,
    temperature=1.0,
    top_p=0.3,
    presencePenalty = 0.3,
    countPenalty = 0.3,
):
    args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
                     alpha_frequency = countPenalty,
                     alpha_presence = presencePenalty,
                     token_ban = [], # ban the generation of some tokens
                     token_stop = [0]) # stop generation whenever you see any token here
    ctx = generate_prompt(ctx)
    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}
    state = None
    for i in range(int(token_count)):
        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        for xxx in occurrence:
            occurrence[xxx] *= 0.996        
        if token not in occurrence:
            occurrence[token] = 1
        else:
            occurrence[token] += 1
        
        tmp = pipeline.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'{timestamp} - vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')  
    del out
    del state
    gc.collect()
    torch.cuda.empty_cache()
    yield out_str.strip()

examples = [
    ["Build a rocket to land on moon.", 700, 1, 0.3, 0.3, 0.3],
    ["How can I craft an engaging story featuring vampires on Mars?", 700, 1, 0.3, 0.3, 0.3],
    ["How can I persuade Elon Musk to follow me on Twitter?", 700, 1, 0.3, 0.3, 0.3],
    ["Write a story using the following information: a man named Alex chops a tree down.", 700, 1, 0.3, 0.3, 0.3],
    ["Create a Python program to simulate moon landing.", 700, 1, 0.3, 0.3, 0.3],
    ["Write a simple website in HTML. When a user clicks the button, it shows a random joke from a list of 4 jokes.", 700, 1, 0.3, 0.3, 0.3],
]

##########################################################################

with gr.Blocks(title=title) as demo:
    gr.HTML(f"<div style=\"text-align: center;\">\n<h1>{title}</h1>\n</div>")
    with gr.Tab("Raw Generation"):
        gr.Markdown(f"This is [RWKV-6](https://huggingface.co/BlinkDL/temp-latest-training-models) with 1.6B params [state-tuned](https://twitter.com/BlinkDL_AI/status/1784496793075744966) on single-round English Q & A. RWKV is a 100% attention-free RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM), and we have [300+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). Demo limited to ctxlen {ctx_limit}.")
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(lines=2, label="Prompt", value="Build a rocket to land on moon.")
                token_count = gr.Slider(10, 700, label="Max Tokens", step=10, value=700)
                temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
                top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.3)
                presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0)
                count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=1)
            with gr.Column():
                with gr.Row():
                    submit = gr.Button("Submit", variant="primary")
                    clear = gr.Button("Clear", variant="secondary")
                output = gr.Textbox(label="Output", lines=50)
        data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, samples_per_page=50, label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
        submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
        clear.click(lambda: None, [], [output])
        data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])

demo.queue(concurrency_count=1, max_size=10)
demo.launch(share=False)