Update README.md
Browse files

README.md (CHANGED)
```diff
@@ -19,7 +19,7 @@ base_model: meta-llama/Meta-Llama-3.1-70B-Instruct
 # Meta-Llama-3.1-70B-Instruct-quantized.w8a8
 
 ## Model Overview
-- **Model Architecture:**
+- **Model Architecture:** LlamaForCausalLM
 - **Input:** Text
 - **Output:** Text
 - **Model Optimizations:**
```
````diff
@@ -88,12 +88,14 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 This model was created by using the [llm-compressor](https://github.com/vllm-project/llm-compressor) library as presented in the code snippet below.
 
 ```python
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from datasets import load_dataset
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
 
+model_stub = "meta-llama/Meta-Llama-3.1-70B-Instruct"
+model_name = model_stub.split("/")[-1]
 
 num_samples = 256
 max_seq_len = 8192
@@ -104,30 +106,42 @@ def preprocess_fn(example):
     return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
 
 ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
-ds = ds.shuffle().select(range(num_samples))
 ds = ds.map(preprocess_fn)
 
 recipe = GPTQModifier(
+    targets="Linear",
+    scheme="W8A8",
+    ignore=["lm_head"],
+    dampening_frac=0.1,
+)
+
+device_map = calculate_offload_device_map(
+    model_stub,
+    reserve_for_hessians=True,
+    num_gpus=2,
+    torch_dtype="auto",
 )
 
-model =
+model = AutoModelForCausalLM.from_pretrained(
+    model_stub,
+    device_map=device_map,
+    torch_dtype="auto",
 )
 
 oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=max_seq_len,
+    num_calibration_samples=num_samples,
 )
 
+save_path = model_name + "-quantized.w8a8"
+model.save_pretrained(save_path)
+tokenizer.save_pretrained(save_path)
+print(f"Model and tokenizer saved to: {save_path}")
 ```
````
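After this change the snippet writes the compressed model to `save_path`, and that checkpoint is what vLLM loads. A minimal offline-inference sketch, assuming vLLM is installed, the `Meta-Llama-3.1-70B-Instruct-quantized.w8a8` directory produced above is on disk, and enough GPUs are available; the model path, prompt, sampling values, and `tensor_parallel_size` are illustrative:

```python
# Minimal sketch: offline inference on the quantized checkpoint with vLLM.
# Assumes "Meta-Llama-3.1-70B-Instruct-quantized.w8a8" is the directory written
# by model.save_pretrained(save_path) above (or a Hub id for the uploaded model).
from vllm import LLM, SamplingParams

llm = LLM(
    model="Meta-Llama-3.1-70B-Instruct-quantized.w8a8",
    tensor_parallel_size=2,  # illustrative; a 70B checkpoint typically spans multiple GPUs
)
params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=128)
outputs = llm.generate(["Summarize what W8A8 quantization changes about a model."], params)
print(outputs[0].outputs[0].text)
```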
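The hunk context above also notes that vLLM supports OpenAI-compatible serving. A sketch of that flow, assuming a server started with vLLM's `vllm serve` CLI is listening on the default `localhost:8000` and the `openai` client is installed; the served model name, port, and prompt are illustrative:

```python
# Query an OpenAI-compatible vLLM server, started for example with:
#   vllm serve Meta-Llama-3.1-70B-Instruct-quantized.w8a8 --tensor-parallel-size 2
# Assumes the server listens on localhost:8000 (vLLM's default).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    model="Meta-Llama-3.1-70B-Instruct-quantized.w8a8",
    messages=[{"role": "user", "content": "What does the w8a8 suffix in this model's name mean?"}],
    max_tokens=128,
)
print(completion.choices[0].message.content)
```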