import gradio as gr
import spaces
import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline
)


# Load the model at startup; on ZeroGPU the GPU itself is only attached while a
# @spaces.GPU-decorated function (here, `predict`) is running, so the loader
# does not need the decorator.
def load_pipeline():
    # Fetch config, tokenizer, and weights from the "myr1" subfolder of the repo.
    config = AutoConfig.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    # Wrap the model and tokenizer in a Transformers text-generation pipeline.
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
    )
    return text_pipeline


# Build the pipeline once when the Space starts.
text_pipeline = load_pipeline()


@spaces.GPU(duration=60)
def predict(prompt, max_new_tokens=64):
    # A GPU is allocated for up to 60 seconds while this function runs.
    outputs = text_pipeline(
        prompt,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=0.7
    )
    return outputs[0]["generated_text"]


with gr.Blocks() as demo:
    gr.Markdown("## My LLM Inference (ZeroGPU)")
    prompt = gr.Textbox(label="Prompt")
    max_nt = gr.Slider(1, 200, value=64, step=1, label="Max New Tokens")
    output = gr.Textbox(label="Generated Text")

    btn = gr.Button("Generate")
    btn.click(fn=predict, inputs=[prompt, max_nt], outputs=output)

demo.launch()