import os
from typing import Tuple, List
import gradio as gr
import torch
import spaces
from dataclasses import dataclass
from huggingface_hub import HfApi, CommitOperationAdd
from transformers import AutoProcessor
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class

@dataclass
class CommitInfo:
    repo_url: str

HF_TOKEN = os.environ.get("HF_TOKEN")  # optional Space secret; users supply their own token in the UI

def get_model_class(class_name: str):
    """Dynamically import and return the specified model class from transformers"""
    try:
        # Default to AutoModelForCausalLM if not specified
        if not class_name:
            from transformers import AutoModelForCausalLM
            return AutoModelForCausalLM
            
        exec(f"from transformers import {class_name}")
        return eval(class_name)
    except Exception as e:
        raise ValueError(f"Failed to import model class {class_name}: {str(e)}")

def parse_ignore_list(ignore_str: str) -> List[str]:
    """Parse comma-separated ignore list string into list"""
    if not ignore_str:
        return ["lm_head"]  # Default ignore list
    return [item.strip() for item in ignore_str.split(',') if item.strip()]
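
# Example (illustrative): parse_ignore_list("lm_head, re:vision_model.*")
# returns ["lm_head", "re:vision_model.*"]; an empty string yields the
# default ["lm_head"].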

def create_quantized_model(
    model_id: str,
    work_dir: str,
    api: HfApi,
    ignore_list: List[str],
    model_class_name: str
) -> Tuple[str, List[Tuple[str, Exception]]]:
    """Quantize model to FP8 and save to disk"""
    
    errors = []
    try:
        # Get the appropriate model class
        model_class = get_model_class(model_class_name)
        wrapped_model_class = wrap_hf_model_class(model_class)
        
        # Load model with ZeroGPU; forward the user's token so private
        # source models can be downloaded
        model = wrapped_model_class.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype="auto",
            trust_remote_code=True,
            _attn_implementation="eager",
            token=api.token,
        )
        processor = AutoProcessor.from_pretrained(
            model_id, trust_remote_code=True, token=api.token
        )

        # Configure quantization
        recipe = QuantizationModifier(
            targets="Linear",
            scheme="FP8_DYNAMIC",
            ignore=ignore_list,
        )
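        # The FP8_DYNAMIC scheme uses static per-channel quantization for
        # weights and dynamic per-token quantization for activations, so no
        # calibration data is required.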

        # Apply quantization
        save_dir = os.path.join(work_dir, f"{model_id.split('/')[-1]}-FP8-dynamic")
        oneshot(model=model, recipe=recipe, output_dir=save_dir)
        processor.save_pretrained(save_dir)
        
        return save_dir, errors
        
    except Exception as e:
        errors.append((model_id, e))
        raise

def push_to_hub(
    api: HfApi,
    model_id: str,
    quantized_path: str,
    token: str,
    ignore_list: List[str],
    model_class_name: str,
) -> CommitInfo:
    """Create new repository with quantized model"""
    
    # Name the new repo after the source model
    new_model_name = f"{model_id.split('/')[-1]}-fp8"
    
    # Get the token owner's username
    token_owner = api.whoami(token)["name"]
    
    # Create the new repo under the token owner's account
    target_repo = f"{token_owner}/{new_model_name}"
    
    # Create model card content
    model_card = f"""---
language:
- en
license: apache-2.0
tags:
- fp8
- quantized
- llmcompressor
base_model: {model_id}
quantization_config:
  ignored_layers: {ignore_list}
  model_class: {model_class_name}
---

# {new_model_name}

This is an FP8-quantized version of [{model_id}](https://huggingface.co/{model_id}) created with [LLM Compressor](https://github.com/vllm-project/llm-compressor).

## Quantization Details

- Weights quantized to FP8 (static, per-channel)
- Activations quantized to FP8 (dynamic, per-token)
- Linear layers targeted for quantization
- Ignored layers: {ignore_list}
- Model class: {model_class_name}

## Usage

```python
from transformers import {model_class_name}, AutoProcessor

model = {model_class_name}.from_pretrained("{target_repo}")
processor = AutoProcessor.from_pretrained("{target_repo}")
```
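
This checkpoint is saved in the `compressed-tensors` format used by LLM Compressor and can also be served directly with [vLLM](https://github.com/vllm-project/vllm).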
"""

    # Create new repository
    api.create_repo(
        repo_id=target_repo,
        private=False,
        exist_ok=True,
    )

    # Prepare operations for upload
    operations = [
        CommitOperationAdd(path_in_repo="README.md", path_or_content=model_card),
    ]
    
    # Add all files from quantized model
    for root, _, files in os.walk(quantized_path):
        for file in files:
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, quantized_path)
            operations.append(
                CommitOperationAdd(
                    path_in_repo=relative_path,
                    path_or_content=file_path
                )
            )

    # Upload the README and all model files in a single commit
    api.create_commit(
        repo_id=target_repo,
        operations=operations,
        commit_message=f"Add FP8 quantized version of {model_id}",
    )

    return CommitInfo(repo_url=f"https://huggingface.co/{target_repo}")

@spaces.GPU(duration=300)  # 5 minutes timeout for large models
def run(
    model_id: str,
    is_private: bool,
    token: str,
    ignore_str: str,
    model_class_name: str
) -> str:
    """Main function to handle quantization and model upload"""
    
    if not token or not model_id:
        return """
        ### Invalid input 🐞
        
        Please provide both a token and model_id.
        """
        
    try:
        # Parse ignore list
        ignore_list = parse_ignore_list(ignore_str)
        
        # Set up API with user's token
        api = HfApi(token=token)
        
        print("Processing model:", model_id)
        print("Ignore list:", ignore_list)
        print("Model class:", model_class_name)
        
        # Create working directory
        work_dir = "quantized_models"
        os.makedirs(work_dir, exist_ok=True)
        
        # Quantize model
        quantized_path, errors = create_quantized_model(
            model_id,
            work_dir,
            api,
            ignore_list,
            model_class_name
        )
        
        # Upload quantized model to new repository
        commit_info = push_to_hub(
            api,
            model_id,
            quantized_path,
            token,
            ignore_list,
            model_class_name
        )

        response = f"""
        ### Success πŸ”₯

        Your model has been successfully quantized to FP8 and uploaded to a new repository:

        [{commit_info.repo_url}]({commit_info.repo_url})
        
        Configuration:
        - Ignored layers: {ignore_list}
        - Model class: {model_class_name}
        
        You can use this model directly with the transformers library!
        """
        
        if errors:
            response += "\nWarnings during quantization:\n"
            response += "\n".join(f"Warning for {name}: {e}" for name, e in errors)
            
        return response
        
    except Exception as e:
        return f"""
        ### Error 😒

        An error occurred during processing:
        {str(e)}
        """

# Gradio Interface
DESCRIPTION = """
# Convert any model to FP8 using LLM Compressor

This space will quantize your model to FP8 format using LLM Compressor and create a new model repository under your account.

The steps are:
1. Paste your HuggingFace token (from hf.co/settings/tokens) - needs write access
2. Enter the model ID you want to quantize
3. (Optional) Customize ignored layers and model class
4. Click "Submit"
5. You'll get a link to your new quantized model repository! πŸš€

## Advanced Options:
- **Ignore List**: Comma-separated list of layer patterns to ignore during quantization. Examples:
  - Llama: `lm_head`
  - Phi3v: `re:.*lm_head,re:model.vision_embed_tokens.*`
  - Pixtral: `re:.*lm_head,re:multi_modal_projector.*`
  - Llama Vision: `re:.*lm_head,re:multi_modal_projector.*,re:vision_model.*`
- **Model Class**: Specific model class from transformers (default: AutoModelForCausalLM). Examples:
  - `MllamaForConditionalGeneration`
  - `Qwen2VLForConditionalGeneration`
  - `LlavaForConditionalGeneration`

Note: 
- Processing may take several minutes depending on the model size
- The quantized model will be created as a new public repository under your account
- Your token needs write access to create the new repository
"""

title = "FP8 Quantization with LLM Compressor"

with gr.Blocks(title=title) as demo:
    gr.Markdown(DESCRIPTION)
    
    with gr.Row():
        with gr.Column():
            model_id = gr.Text(
                max_lines=1,
                label="model_id",
                placeholder="huggingface/model-name"
            )
            is_private = gr.Checkbox(
                label="Private model (requires read access to original model)"
            )
            token = gr.Text(
                max_lines=1,
                label="your_hf_token (requires write access)",
                placeholder="hf_..."
            )
            ignore_str = gr.Text(
                max_lines=1,
                label="ignore_list (comma-separated)",
                placeholder="lm_head,re:vision_model.*",
                value="lm_head"
            )
            model_class_name = gr.Text(
                max_lines=1,
                label="model_class_name (optional)",
                placeholder="AutoModelForCausalLM",
                value="AutoModelForCausalLM"
            )
            
            with gr.Row():
                clean = gr.ClearButton()
                submit = gr.Button("Submit", variant="primary")
        
        with gr.Column():
            output = gr.Markdown()
    
    submit.click(
        run,
        inputs=[model_id, is_private, token, ignore_str, model_class_name],
        outputs=output,
        concurrency_limit=1
    )

demo.queue(max_size=10).launch(show_api=True)