import gradio as gr
import spaces
import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline
)
def load_pipeline():
    # -- load config, tokenizer & model from wuhp/myr1 --
    config = AutoConfig.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("wuhp/myr1", subfolder="myr1", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype=torch.float16,  # half precision
        device_map="auto",
        trust_remote_code=True
    )
    # optional: load generation config if you have generation_config.json
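    # (sketch, not part of the original snippet: if the repo ships a
    #  generation_config.json in the same "myr1" subfolder, it could be
    #  attached to the model roughly like this)
    # from transformers import GenerationConfig
    # model.generation_config = GenerationConfig.from_pretrained(
    #     "wuhp/myr1", subfolder="myr1"
    # )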
    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
    )
    return text_pipeline
# We'll load it once and store globally
text_pipeline = load_pipeline()
# 1) Decorate your GPU-dependent function(s)
@spaces.GPU(duration=60)  # default is 60s, can increase if needed
def predict(prompt, max_new_tokens=64):
    # ZeroGPU only attaches a GPU while a decorated function is running,
    # so the per-request inference call is the one that needs the decorator
    outputs = text_pipeline(
        prompt, max_new_tokens=int(max_new_tokens), do_sample=True, temperature=0.7
    )
    return outputs[0]["generated_text"]
# 2) Build your Gradio app
with gr.Blocks() as demo:
    gr.Markdown("## My LLM Inference (ZeroGPU)")
    prompt = gr.Textbox(label="Prompt")
    max_nt = gr.Slider(1, 200, value=64, step=1, label="Max New Tokens")
    output = gr.Textbox(label="Generated Text")
    btn = gr.Button("Generate")
    btn.click(fn=predict, inputs=[prompt, max_nt], outputs=output)
demo.launch()