import gradio as gr from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig from peft import PeftModel, LoraConfig from unsloth.chat_templates import get_chat_template # Define the path where the model and adapters are saved model_path = "yentinglin/Llama-3-Taiwan-8B-Instruct" # Update this to your model path adapter_path = "netmouse/Llama-3-Taiwan-8B-Instruct-finetuning-by-promisedchat" # Assuming adapter is stored in the same path # Load the tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path) # Load the base model config config = AutoConfig.from_pretrained(model_path) # Load the base model without quantization configurations # Ensure that bitsandbytes is not used by removing any reference to 4bit or 8bit base_model = AutoModelForCausalLM.from_pretrained(model_path, config=config, ignore_mismatched_sizes=True) # Load the LoRA adapter model = PeftModel.from_pretrained(base_model, adapter_path) def generate_text(input_text): inputs = tokenizer.apply_chat_template( messages, tokenize = True, add_generation_prompt = True, # Must add for generation return_tensors = "pt", ).to("cuda") #input_ids = tokenizer.encode(input_text, return_tensors='pt') outputs = model.generate(inputs, max_length=50, num_return_sequences=1) generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) return generated_text iface = gr.Interface(fn=generate_text, inputs="text", outputs="text") iface.launch()