import gradio as gr
import spaces
import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
)


# 1) Load the model once at startup.
#    On ZeroGPU, model loading happens at import time; the GPU itself is only
#    attached while a @spaces.GPU-decorated function is running.
def load_pipeline():
    # -- load config, tokenizer & model from wuhp/myr1 --
    config = AutoConfig.from_pretrained(
        "wuhp/myr1", subfolder="myr1", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "wuhp/myr1", subfolder="myr1", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        "wuhp/myr1",
        subfolder="myr1",
        config=config,
        torch_dtype=torch.float16,  # half precision
        device_map="auto",          # requires accelerate
        trust_remote_code=True,
    )
    # optional: load a GenerationConfig here if the repo ships generation_config.json
    return pipeline("text-generation", model=model, tokenizer=tokenizer)


# Load once and keep a global handle.
text_pipeline = load_pipeline()


# 2) Decorate the GPU-dependent function. This is the function that runs per
#    request, so it is the one that needs the ZeroGPU decorator.
@spaces.GPU(duration=60)  # default is 60s; increase if generation takes longer
def predict(prompt, max_new_tokens=64):
    outputs = text_pipeline(
        prompt,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=0.7,
    )
    return outputs[0]["generated_text"]


# 3) Build the Gradio app.
with gr.Blocks() as demo:
    gr.Markdown("## My LLM Inference (ZeroGPU)")
    prompt = gr.Textbox(label="Prompt")
    max_nt = gr.Slider(1, 200, value=64, step=1, label="Max New Tokens")
    output = gr.Textbox(label="Generated Text")
    btn = gr.Button("Generate")
    btn.click(fn=predict, inputs=[prompt, max_nt], outputs=output)

demo.launch()
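
# ---------------------------------------------------------------------------
# Deployment note (assumption, not part of the original snippet): when this
# file is used as app.py in a ZeroGPU Space, the Space also needs a
# requirements.txt along these lines. Package names are the standard ones the
# imports above rely on; pinning exact versions is left to the reader.
#
#   gradio
#   spaces
#   torch
#   transformers
#   accelerate    # needed because the model is loaded with device_map="auto"
# ---------------------------------------------------------------------------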