Update README.md
README.md (CHANGED)
@@ -19,7 +19,7 @@ base_model: meta-llama/Meta-Llama-3.1-70B-Instruct
 # Meta-Llama-3.1-70B-Instruct-quantized.w8a8
 
 ## Model Overview
-- **Model Architecture:**
+- **Model Architecture:** LlamaForCausalLM
 - **Input:** Text
 - **Output:** Text
 - **Model Optimizations:**

@@ -88,12 +88,14 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 This model was created using the [llm-compressor](https://github.com/vllm-project/llm-compressor) library as presented in the code snippet below.
 
 ```python
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from datasets import load_dataset
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
 
-
+model_stub = "meta-llama/Meta-Llama-3.1-70B-Instruct"
+model_name = model_stub.split("/")[-1]
 
 num_samples = 256
 max_seq_len = 8192

@@ -104,30 +106,42 @@ def preprocess_fn(example):
     return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
 
 ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
-ds = ds.shuffle().select(range(num_samples))
 ds = ds.map(preprocess_fn)
 
 recipe = GPTQModifier(
-
-
-
-
+    targets="Linear",
+    scheme="W8A8",
+    ignore=["lm_head"],
+    dampening_frac=0.1,
+)
+
+device_map = calculate_offload_device_map(
+    model_stub,
+    reserve_for_hessians=True,
+    num_gpus=2,
+    torch_dtype="auto",
 )
 
-model =
-
-
+model = AutoModelForCausalLM.from_pretrained(
+    model_stub,
+    device_map=device_map,
+    torch_dtype="auto",
 )
 
 oneshot(
-
-
-
-
-
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=max_seq_len,
+    num_calibration_samples=num_samples,
 )
 
-
+
+save_path = model_name + "-quantized.w8a8"
+model.save_pretrained(save_path)
+tokenizer.save_pretrained(save_path)
+print(f"Model and tokenizer saved to: {save_path}")
+
 ```
 
 
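Once the snippet above has run, the export can be sanity-checked: llm-compressor serializes the quantization scheme into the checkpoint's `config.json`, which vLLM reads at load time. A minimal sketch, assuming the `save_path` produced by the snippet and compressed-tensors-style metadata (the exact field names are an assumption, not shown in this card):

```python
import json
from pathlib import Path

# Path written by the snippet above.
save_path = "Meta-Llama-3.1-70B-Instruct-quantized.w8a8"

# The exported config.json should carry the quantization metadata.
config = json.loads((Path(save_path) / "config.json").read_text())
qconfig = config.get("quantization_config", {})
print(qconfig.get("quant_method"))               # expected: "compressed-tensors" (assumption)
print(sorted(qconfig.get("config_groups", {})))  # per-layer-group W8A8 schemes
```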
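The hunk context above ("vLLM also supports OpenAI-compatible serving") points to how the card deploys this checkpoint. A minimal offline-generation sketch with vLLM, assuming the model is published under the repo id implied by the card title and that two GPUs are available (repo id and GPU count are assumptions):

```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Assumed repo id, taken from the card title.
model_id = "neuralmagic/Meta-Llama-3.1-70B-Instruct-quantized.w8a8"

# Build a chat-formatted prompt, mirroring the calibration preprocessing above.
tokenizer = AutoTokenizer.from_pretrained(model_id)
messages = [{"role": "user", "content": "Explain W8A8 quantization in two sentences."}]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

# tensor_parallel_size=2 assumes two GPUs for the 70B W8A8 checkpoint.
llm = LLM(model=model_id, tensor_parallel_size=2)
outputs = llm.generate([prompt], SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256))
print(outputs[0].outputs[0].text)
```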
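For the OpenAI-compatible route named in the same hunk context, a sketch of querying a running vLLM server with the `openai` client; the server launch command, port, and repo id are all assumptions:

```python
# Assumes a server already started with something like:
#   vllm serve neuralmagic/Meta-Llama-3.1-70B-Instruct-quantized.w8a8 --tensor-parallel-size 2
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # vLLM ignores the key by default
response = client.chat.completions.create(
    model="neuralmagic/Meta-Llama-3.1-70B-Instruct-quantized.w8a8",  # assumed repo id
    messages=[{"role": "user", "content": "Hello! What model are you?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```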