Does not run with torch 2.4, transformers 4.4:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import HFGenerator
#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq' #no calib version
#model_id = 'mobiuslabsgmbh/Llama-3.1-8b-instruct_4bitgs64_hqq_calib' #calibrated version
compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
cache_dir = '.'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
quant_config = BaseQuantizeConfig(nbits=4, group_size=64, quant_scale=False, quant_zero=False, axis=1)
patch_linearlayers(model, patch_add_quant_config, quant_config)
#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
prepare_for_inference(model, backend="torchao_int4")
#prepare_for_inference(model, backend="bitblas") #takes a while to init...
#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Warm-up takes a while
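#Illustrative only: for longer contexts, pass cache_size= as mentioned above (the 8192 value here is an assumption, size it to your prompt length)
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial", cache_size=8192).warmup()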
gen.generate("Write an essay about large language models", print_tokens=True)
gen.generate("Tell me a funny joke!", print_tokens=True)
gen.generate("How to make a yummy chocolate cake?", print_tokens=True)
ValueError Traceback (most recent call last)
in <cell line: 0>()
13 compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17
5 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
179
180 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 181 raise ValueError(
182 "rope_scaling
must be a dictionary with two fields, type
and factor
, " f"got {self.rope_scaling}"
183 )
ValueError: rope_scaling
must be a dictionary with two fields, type
and factor
, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 8 files: 100% 8/8 [00:00<00:00, 681.41it/s]
ValueError Traceback (most recent call last)
in <cell line: 0>()
13 compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
14 cache_dir = '.'
---> 15 model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype)
16 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
17
5 frames
/usr/local/lib/python3.11/dist-packages/transformers/models/llama/configuration_llama.py in _rope_scaling_validation(self)
179
180 if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
--> 181 raise ValueError(
182 "rope_scaling
must be a dictionary with two fields, type
and factor
, " f"got {self.rope_scaling}"
183 )
ValueError: rope_scaling
must be a dictionary with two fields, type
and factor
, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
This is working fine; make sure you have an updated version of transformers.
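For example, in a Colab cell (no specific version pinned; any recent transformers release that understands the extended Llama 3 rope_scaling format should load this config), then restart the runtime:

!pip install -U transformers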