nm-research committed on
Commit 8d0dcbb · verified · 1 Parent(s): cc881d0

Update README.md

Files changed (1)
  1. README.md +31 -17
README.md CHANGED
@@ -19,7 +19,7 @@ base_model: meta-llama/Meta-Llama-3.1-70B-Instruct
 # Meta-Llama-3.1-70B-Instruct-quantized.w8a8
 
 ## Model Overview
-- **Model Architecture:** Meta-Llama-3
+- **Model Architecture:** LlamaForCausalLM
 - **Input:** Text
 - **Output:** Text
 - **Model Optimizations:**
@@ -88,12 +88,14 @@ vLLM also supports OpenAI-compatible serving. See the [documentation](https://do
 This model was created by using the [llm-compressor](https://github.com/vllm-project/llm-compressor) library as presented in the code snippet below.
 
 ```python
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from datasets import load_dataset
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
 
-model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
+model_stub = "meta-llama/Meta-Llama-3.1-70B-Instruct"
+model_name = model_stub.split("/")[-1]
 
 num_samples = 256
 max_seq_len = 8192
@@ -104,30 +106,42 @@ def preprocess_fn(example):
     return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
 
 ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
-ds = ds.shuffle().select(range(num_samples))
 ds = ds.map(preprocess_fn)
 
 recipe = GPTQModifier(
-    targets="Linear",
-    scheme="W8A8",
-    ignore=["lm_head"],
-    dampening_frac=0.1,
+    targets="Linear",
+    scheme="W8A8",
+    ignore=["lm_head"],
+    dampening_frac=0.1,
+)
+
+device_map = calculate_offload_device_map(
+    model_stub,
+    reserve_for_hessians=True,
+    num_gpus=2,
+    torch_dtype="auto",
 )
 
-model = SparseAutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
+model = AutoModelForCausalLM.from_pretrained(
+    model_stub,
+    device_map="auto",
+    dtype="auto",
 )
 
 oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=max_seq_len,
-    num_calibration_samples=num_samples,
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=max_seq_len,
+    num_calibration_samples=num_samples,
 )
 
-model.save_pretrained("Meta-Llama-3.1-70B-Instruct-quantized.w8a8")
+
+save_path = model_name + "-quantized.w8a8"
+model.save_pretrained(save_path)
+tokenizer.save_pretrained(save_path)
+print(f"Model and tokenizer saved to: {save_path}")
+
 ```
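
Note on the updated snippet above: it computes a `device_map` with `calculate_offload_device_map`, whose `reserve_for_hessians` option leaves room for GPTQ's Hessian buffers during calibration, but the model is then loaded with `device_map="auto"`. The sketch below shows how the computed map would typically be passed to `from_pretrained` instead. This is an illustration of the helper's usage, not code from the commit; the two-GPU setting is carried over from the snippet and may not match your hardware.

```python
# Sketch only (not part of the commit): load the model onto the device map computed by
# calculate_offload_device_map, which reserves memory for GPTQ Hessians during calibration.
from transformers import AutoModelForCausalLM
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

model_stub = "meta-llama/Meta-Llama-3.1-70B-Instruct"

device_map = calculate_offload_device_map(
    model_stub,
    reserve_for_hessians=True,  # leave headroom for GPTQ's Hessian accumulation
    num_gpus=2,                 # same two-GPU assumption as the snippet above
    torch_dtype="auto",
)

model = AutoModelForCausalLM.from_pretrained(
    model_stub,
    device_map=device_map,  # use the computed map rather than "auto"
    torch_dtype="auto",
)
```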
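
The second hunk's context also mentions vLLM's OpenAI-compatible serving. As a hedged illustration (none of this is in the commit): once the saved checkpoint is being served, for example with `vllm serve Meta-Llama-3.1-70B-Instruct-quantized.w8a8`, it can be queried with the standard OpenAI client. The host, port, and served model name below are assumptions.

```python
# Sketch only: query a vLLM OpenAI-compatible server assumed to be serving the
# quantized checkpoint produced by the snippet above (default port 8000, dummy API key).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Meta-Llama-3.1-70B-Instruct-quantized.w8a8",  # assumed served model name
    messages=[{"role": "user", "content": "Give a one-sentence summary of INT8 (W8A8) quantization."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```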