import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Lighter StarCoder variant to keep memory requirements modest
MODEL_NAME = "bigcode/starcoderbase-3b"

# Ensure the Hugging Face token is provided (the StarCoder weights are gated)
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face token. Set HUGGINGFACE_TOKEN as an environment variable.")

# 4-bit NF4 quantization (bitsandbytes) to reduce memory usage
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load tokenizer and model with authentication; device_map="auto" handles placement,
# so no explicit .to(device) is needed (and must not be used with a quantized model)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True
)


def generate_code(prompt: str, max_tokens: int = 256) -> str:
    """Generates code based on the input prompt."""
    if not prompt.strip():
        return "Error: Empty prompt provided."
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)
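

# Minimal usage sketch (not part of the original script): running the module directly
# exercises generate_code with a small completion prompt. The example prompt and the
# 64-token budget below are illustrative assumptions, not values from the source.
if __name__ == "__main__":
    example_prompt = "def fibonacci(n: int) -> int:\n"
    print(generate_code(example_prompt, max_tokens=64))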