|
import gradio as gr |
|
from transformers import pipeline |
|
import torch |
|
import subprocess |
|
import spaces |
|
import os |
|
|
|
|
|
|
|
# Best-effort install of flash-attn at startup. FLASH_ATTENTION_SKIP_CUDA_BUILD
# is set for the pip process only. The variable is merged into a copy of the
# current environment: passing a bare dict as ``env`` would replace the whole
# environment, so pip would run without PATH, HOME, proxy settings, etc.
# The return code is deliberately not checked, so a failed install does not
# stop the app from starting.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
|
|
|
# Module-level text-generation pipeline, built once at import time and reused
# by every request. torch_dtype is passed as bfloat16.
generator = pipeline(
    task='text-generation',
    model='M4-ai/tau-0.5B',
    torch_dtype=torch.bfloat16,
)
|
@spaces.GPU
def generate_text(prompt, temperature, top_p, top_k, repetition_penalty, max_length):
    """Sample a completion for ``prompt`` from the module-level ``generator``.

    The model is moved to CUDA for the duration of the call and moved back to
    the CPU afterwards, including when generation raises.

    Args:
        prompt: Text handed to the pipeline as the generation prompt.
        temperature: Sampling temperature; coerced to ``float``.
        top_p: Nucleus-sampling threshold; coerced to ``float``.
        top_k: Top-k sampling cutoff; coerced to ``int``.
        repetition_penalty: Repetition penalty; coerced to ``float``.
        max_length: Forwarded as ``max_new_tokens``; coerced to ``int``.

    Returns:
        The ``'generated_text'`` field of the first pipeline output. The
        pipeline is called with ``return_full_text=False``.
    """
    # Normalise numeric types before touching the GPU. The sampling options are
    # type-checked downstream (e.g. temperature is expected to be a float), and
    # a UI slider may hand over a whole-number value such as 2 as an int.
    # NOTE(review): confirm the exact types delivered by the deployed gradio version.
    temperature = float(temperature)
    top_p = float(top_p)
    top_k = int(top_k)
    repetition_penalty = float(repetition_penalty)
    max_new_tokens = int(max_length)

    generator.model.cuda()
    generator.device = torch.device("cuda")
    try:
        outputs = generator(
            prompt,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            return_full_text=False,
        )
        return outputs[0]['generated_text']
    finally:
        # Always hand the model back to the CPU, even if generation failed, so
        # the pipeline is not left pointing at CUDA after this call returns.
        generator.model.cpu()
        generator.device = torch.device("cpu")
|
|
|
# Gradio UI: one prompt box plus five sampling sliders. The input components
# are listed in the same order as the parameters of generate_text
# (prompt, temperature, top_p, top_k, repetition_penalty, max_length).
iface = gr.Interface(
    title="M4-ai/tau-0.5B",
    description="Try out the M4-ai/tau-0.5B model for free!",
    fn=generate_text,
    inputs=[
        gr.Textbox(
            label="Prompt",
            value="Write me a Python program that calculates the factorial of a given number.",
            lines=2,
        ),
        gr.Slider(label="Temperature", value=0.8, minimum=0.1, maximum=2.0, step=0.01),
        gr.Slider(label="Top p", value=0.95, minimum=0.0, maximum=1.0, step=0.01),
        gr.Slider(label="Top k", value=40, minimum=0, maximum=100, step=1),
        gr.Slider(label="Repetition Penalty", value=1.10, minimum=1.0, maximum=2.0, step=0.01),
        gr.Slider(label="Max Length", value=1024, minimum=5, maximum=4096, step=5),
    ],
    outputs=gr.Textbox(label="Generated Text"),
)

# Start the web app when the script is executed.
iface.launch()