import os

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
if not huggingface_token:
    raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Note: this second assignment overrides the one above, so Phi-3 is the model actually loaded.
model_id = "microsoft/Phi-3-mini-128k-instruct"

# device_map-style value ("auto"), not a torch.device such as "cuda"
device = "auto"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)
print(model_id, device, dtype)


@spaces.GPU
def generate_text(prompt, system_message="You are a helpful assistant."):
    # Load the model inside the @spaces.GPU-decorated function so the GPU is
    # only claimed while a request is being served (ZeroGPU Spaces pattern).
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=dtype, device_map=device, token=huggingface_token
    )
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=dtype,
        device_map=device,
    )

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    result = text_generator(messages, max_new_tokens=256, do_sample=True, temperature=0.7)

    # The chat-style pipeline returns the whole conversation; walk it backwards
    # and return the content of the last assistant message.
    generated_output = result[0]["generated_text"]
    if isinstance(generated_output, list):
        for message in reversed(generated_output):
            if message.get("role") == "assistant":
                return message.get("content", "No content found.")
        return "No assistant response found."
    else:
        return "Unexpected output format."


iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=3, label="Input Prompt"),
        gr.Textbox(lines=2, label="System Message", value="You are a helpful assistant."),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="Llama 3.1 8B Instruct Text Generation",
    description="Enter a prompt and an optional system message to generate text using the Llama 3.1 8B Instruct model.",
)

if __name__ == "__main__":
    iface.launch()