not run

#1 opened by rakmik

torch 2.4
transformers 4.4

import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
#model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version

compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
cache_dir = '.'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
patch_linearlayers(model, patch_add_quant_config, quant_config)

#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
prepare_for_inference(model, backend="torchao_int4")
#prepare_for_inference(model, backend="bitblas") #takes a while to init...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)
gen.generate("Tell me a funny joke!", print_tokens=True)
gen.generate("How to make a yummy chocolate cake?", print_tokens=True)

ValueError                                Traceback (most recent call last)
in <cell line: 0>()
13 compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17

5 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
179
180 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 181 raise ValueError(
182 "rope_scaling must be a dictionary with two fields, type and factor, " f"got {self.rope_scaling}"
183 )

ValueError: rope_scaling must be a dictionary with two fields, type and factor, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
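The message itself points at the cause: the llama3-style rope_scaling dict (rope_type, high_freq_factor, low_freq_factor, original_max_position_embeddings) shipped with Llama 3.1, and older transformers releases still validate rope_scaling against the legacy two-field {type, factor} schema, raising exactly this ValueError. A minimal sanity check before loading (the 4.43.0 cutoff is an assumption based on when Llama 3.1 support landed in transformers):

from packaging import version
import transformers

#Assumed cutoff: support for rope_type "llama3" landed around
#transformers 4.43.0 (the Llama 3.1 release); treat as a sketch
MIN_TRANSFORMERS = "4.43.0"
if version.parse(transformers.__version__) < version.parse(MIN_TRANSFORMERS):
    raise RuntimeError(
        f"transformers {transformers.__version__} predates Llama 3.1 support; "
        f"upgrade with: pip install -U 'transformers>={MIN_TRANSFORMERS}'"
    )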

!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 8 files: 100% 8/8 [00:00<00:00, 681.41it/s]

ValueError Traceback (most recent call last)
in <cell line: 0>()
13 compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17

5 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
179
180 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 181 raise ValueError(
182 "rope_scaling must be a dictionary with two fields, type and factor, " f"got {self.rope_scaling}"
183 )

ValueError: rope_scaling must be a dictionary with two fields, type and factor, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}

mobicham (Mobius Labs GmbH org)

This is working fine; make sure you have an updated version of transformers.
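In practice that means upgrading transformers (e.g. pip install -U transformers, note that the pip command above only pins torch and does not touch transformers) and restarting the runtime. A quick way to confirm the fix took, under the same version assumption as above:

from transformers import AutoConfig

#With a recent transformers, the Llama 3.1 config should now parse cleanly
#instead of raising the rope_scaling ValueError shown earlier
cfg = AutoConfig.from_pretrained('mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq')
print(cfg.rope_scaling)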

mobicham changed discussion status to closed
