"""Load Falcon-7B-Instruct in 4-bit (NF4) quantization and run one generation.

Uses a resharded checkpoint with smaller safetensors shards so the model can
be loaded in low-RAM environments.
"""
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# 4-bit NF4 weights with double quantization; matmuls computed in fp16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Resharded version with smaller safetensors chunks for low-RAM environments.
model_id = "vilsonrodrigues/falcon-7b-instruct-sharded"

model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # let accelerate place layers across available devices
    quantization_config=quantization_config,
    trust_remote_code=True,  # Falcon repos ship custom modeling code
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE: named `text_generator` (not `pipeline`) so the assignment does not
# shadow the `pipeline` factory imported from transformers above.
text_generator = pipeline(
    "text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=296,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # Falcon has no dedicated pad token; reuse EOS for padding.
    pad_token_id=tokenizer.eos_token_id,
)

print(text_generator("Hello"))