import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model and tokenizer once on startup
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-220m")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5p-220m")

# Run on GPU if available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


def generate(prompt):
    """Generate a completion for the prompt using beam search."""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_length=2048,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=3,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Create the Gradio interface
iface = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(lines=10, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Output"),
    title="CodeT5+ 220M Server",
    description="A web interface for interacting with the Salesforce CodeT5+ 220M model.",
    allow_flagging="never",
)

# Launch the interface, listening on all network interfaces
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)
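
# --- Example client (separate script) ---
# Once the server is up, the interface can also be queried programmatically.
# A minimal sketch using the gradio_client package, assuming it is installed
# (`pip install gradio_client`) and the server is reachable locally on port
# 7860; "/predict" is the default api_name a gr.Interface exposes.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")
result = client.predict("def hello_world():", api_name="/predict")
print(result)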