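# Gradio demo Space for the RWKV-x060 1.6B single-round English chat model
# (RWKV-6, a 100% attention-free RNN). Loads the checkpoint from the Hugging
# Face Hub, runs it on the GPU in fp16, and streams generations into the UI.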
import gradio as gr
import os, gc, copy, torch, re
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)
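# NVML handle for GPU 0; used below to log VRAM usage after each generation.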
ctx_limit = 1024
title = "RWKV-x060-eng_single_round_test-1B6-20240427-ctx1024"
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="BlinkDL/temp-latest-training-models", filename=f"{title}.pth")
# model_path = f"E:/{title}"
model = RWKV(model=model_path, strategy='cuda fp16')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
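# PIPELINE wraps the world-vocab tokenizer ("rwkv_vocab_v20230424") and the
# encode/decode/sample_logits helpers used in the generation loop below.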
def generate_prompt(instruction):
    instruction = instruction.strip().replace('\r\n', '\n')
    instruction = re.sub(r'\n+', '\n', instruction)
    return f"User: {instruction}\n\nAssistant:"
def evaluate(
    ctx,
    token_count=500,
    temperature=1.0,
    top_p=0.3,
    presencePenalty=0.3,
    countPenalty=0.3,
):
    args = PIPELINE_ARGS(temperature=max(0.2, float(temperature)), top_p=float(top_p),
                         alpha_frequency=countPenalty,
                         alpha_presence=presencePenalty,
                         token_ban=[],    # ban the generation of some tokens
                         token_stop=[0])  # stop generation whenever you see any token here
    ctx = generate_prompt(ctx)

    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}
    state = None
    for i in range(int(token_count)):
        # feed the (truncated) prompt on the first step, then one token at a time
        out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
        # apply presence/frequency penalties to tokens that have already appeared
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        # decay the penalty counts so earlier tokens are penalized less over time
        for xxx in occurrence:
            occurrence[xxx] *= 0.996
        if token not in occurrence:
            occurrence[token] = 1
        else:
            occurrence[token] += 1

        # only emit text once it decodes cleanly (no UTF-8 replacement character)
        tmp = pipeline.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'{timestamp} - vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')

    del out
    del state
    gc.collect()
    torch.cuda.empty_cache()
    yield out_str.strip()
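# `evaluate` is a generator: it yields the accumulated output string whenever
# new tokens decode cleanly, then yields the final text once more after cleanup.
# A minimal sketch of calling it outside the Gradio UI (kept commented out so
# it does not run on Space startup; the prompt text is only an illustration):
#
#   for partial in evaluate("Write a haiku about Mars.", token_count=100):
#       print(partial)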
examples = [
    ["How can I craft an engaging story featuring vampires on Mars?", 700, 1, 0.3, 0.3, 0.3],
    ["Write a simple website in HTML. When a user clicks the button, it shows a random joke from a list of 4 jokes.", 700, 1, 0.3, 0.3, 0.3],
    ["Write C++ code to land on moon.", 700, 1, 0.3, 0.3, 0.3],
    ["Write a story using the following information: a man named Alex chops a tree down.", 700, 1, 0.3, 0.3, 0.3],
    ["How can I persuade Elon Musk to follow me on Twitter?", 700, 1, 0.3, 0.3, 0.3],
]
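# Each example row above maps positionally to the `evaluate` inputs wired up in
# the gr.Dataset below: prompt, max tokens, temperature, top_p, presence
# penalty, count penalty.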
##########################################################################
with gr.Blocks(title=title) as demo:
gr.HTML(f"<div style=\"text-align: center;\">\n<h1>{title}</h1>\n</div>")
with gr.Tab("Raw Generation"):
gr.Markdown(f"This is [RWKV-6](https://huggingface.co/BlinkDL/temp-latest-training-models) with 1.6B params tuned on <b>single-round English</b> Q & A - a 100% attention-free RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). And we have [200+ Github RWKV projects](https://github.com/search?o=desc&p=1&q=rwkv&s=updated&type=Repositories). Demo limited to ctxlen {ctx_limit}.")
with gr.Row():
with gr.Column():
prompt = gr.Textbox(lines=2, label="Prompt", value="How can we craft an engaging story featuring vampires on Mars?")
token_count = gr.Slider(10, 700, label="Max Tokens", step=10, value=700)
temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.3)
presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0)
count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=1)
with gr.Column():
with gr.Row():
submit = gr.Button("Submit", variant="primary")
clear = gr.Button("Clear", variant="secondary")
output = gr.Textbox(label="Output", lines=50)
data = gr.Dataset(components=[prompt, token_count, temperature, top_p, presence_penalty, count_penalty], samples=examples, samples_per_page=50, label="Example Instructions", headers=["Prompt", "Max Tokens", "Temperature", "Top P", "Presence Penalty", "Count Penalty"])
submit.click(evaluate, [prompt, token_count, temperature, top_p, presence_penalty, count_penalty], [output], concurrency_limit=1)
clear.click(lambda: None, [], [output])
data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty])
demo.queue(max_size=10)
demo.launch(share=False)