Does not run with torch 2.4, transformers 4.4:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator
#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
#model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version
compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
cache_dir = '.'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
patch_linearlayers(model, patch_add_quant_config, quant_config)
#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
prepare_for_inference(model, backend="torchao_int4")
#prepare_for_inference(model, backend="bitblas") #takes a while to init...
#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Warm-up takes a while
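#Illustrative only: for longer contexts, pass cache_size= as mentioned above (the 8192 value here is an assumption, size it to your prompt length)
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial", cache_size=8192).warmup()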
gen.generate("Write an essay about large language models", print_tokens=True)
gen.generate("Tell me a funny joke!", print_tokens=True)
gen.generate("How to make a yummy chocolate cake?", print_tokens=True)
ValueError Traceback (most recent call last)
in <cell line: 0>()
13 compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17
5 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
179
180 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 181 raise ValueError(
182 "rope_scaling
must be a dictionary with two fields, type
and factor
, " f"got {self.rope_scaling}"
183 )
ValueError: rope_scaling
must be a dictionary with two fields, type
and factor
, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 8 files: 100% 8/8 [00:00<00:00, 681.41it/s]
ValueError Traceback (most recent call last)
in <cell line: 0>()
13 compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17
5 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
179
180 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 181 raise ValueError(
182 "rope_scaling
must be a dictionary with two fields, type
and factor
, " f"got {self.rope_scaling}"
183 )
ValueError: rope_scaling
must be a dictionary with two fields, type
and factor
, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
This is working fine; make sure you have an updated version of transformers.
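For example, in a Colab cell (no specific version pinned; any recent transformers release that understands the extended Llama 3 rope_scaling format should load this config), then restart the runtime:

!pip install -U transformers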