from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr

# Model and tokenizer paths
model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"

# Load the model
print("Loading the model...")
model = AutoModelForCausalLM.from_pretrained(model_name)

# Apply dynamic quantization to reduce model size and improve CPU performance
print("Applying quantization...")
model = torch.quantization.quantize_dynamic(
    model,               # Model to quantize
    {torch.nn.Linear},   # Layer types to quantize (Linear layers)
    dtype=torch.qint8,   # Quantized data type
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the inference function
def single_inference(question):
    messages = [{"role": "user", "content": question}]

    # Tokenize the input using the model's chat template
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cpu")  # Ensure everything runs on CPU

    # Stop generation at either the EOS token or the Llama 3 end-of-turn token
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    # Generate a response
    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.2,
    )

    # Decode only the newly generated tokens (skip the prompt)
    response = outputs[0][input_ids.shape[-1]:]
    output = tokenizer.decode(response, skip_special_tokens=True)
    return output


# Gradio interface
print("Setting up Gradio app...")
interface = gr.Interface(
    fn=single_inference,
    inputs="text",
    outputs="text",
    title="Chatbot",
    description="Ask me anything!"
)

# Launch the Gradio app
interface.launch()
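
# A minimal sketch of a direct, non-Gradio call for a quick CPU smoke test,
# assuming the model and tokenizer above loaded successfully. It is left
# commented out because interface.launch() blocks; to try it, move it above
# the launch call. The prompt string is purely illustrative.
# print(single_inference("What can you help me with?"))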