import os
from typing import Tuple, List
import gradio as gr
import torch
import spaces
from dataclasses import dataclass
from huggingface_hub import HfApi, CommitOperationAdd
from transformers import AutoProcessor
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class

@dataclass
class CommitInfo:
    repo_url: str

HF_TOKEN = os.environ.get("HF_TOKEN")  # optional Space secret; users supply their own token in the UI

def get_model_class(class_name: str):
    """Dynamically import and return the specified model class from transformers"""
    try:
        # Default to AutoModelForCausalLM if not specified
        if not class_name:
            from transformers import AutoModelForCausalLM
            return AutoModelForCausalLM
            
        exec(f"from transformers import {class_name}")
        return eval(class_name)
    except Exception as e:
        raise ValueError(f"Failed to import model class {class_name}: {str(e)}")

def parse_ignore_list(ignore_str: str) -> List[str]:
    """Parse comma-separated ignore list string into list"""
    if not ignore_str:
        return ["lm_head"]  # Default ignore list
    return [item.strip() for item in ignore_str.split(',') if item.strip()]
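
# Example (illustrative): parse_ignore_list("lm_head, re:vision_model.*")
# returns ["lm_head", "re:vision_model.*"]; an empty string yields the
# default ["lm_head"].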

def create_quantized_model(
    model_id: str,
    work_dir: str,
    api: HfApi,
    ignore_list: List[str],
    model_class_name: str
) -> Tuple[str, List[Tuple[str, Exception]]]:
    """Quantize model to FP8 and save to disk"""
    
    errors = []
    try:
        # Get the appropriate model class
        model_class = get_model_class(model_class_name)
        wrapped_model_class = wrap_hf_model_class(model_class)
        
        # Load model with ZeroGPU; forward the user's token so private
        # source models can be downloaded
        model = wrapped_model_class.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype="auto",
            trust_remote_code=True,
            _attn_implementation="eager",
            token=api.token,
        )
        processor = AutoProcessor.from_pretrained(
            model_id, trust_remote_code=True, token=api.token
        )

        # Configure quantization
        recipe = QuantizationModifier(
            targets="Linear",
            scheme="FP8_DYNAMIC",
            ignore=ignore_list,
        )
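        # The FP8_DYNAMIC scheme uses static per-channel quantization for
        # weights and dynamic per-token quantization for activations, so no
        # calibration data is required.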

        # Apply quantization
        save_dir = os.path.join(work_dir, f"{model_id.split('/')[-1]}-FP8-dynamic")
        oneshot(model=model, recipe=recipe, output_dir=save_dir)
        processor.save_pretrained(save_dir)
        
        return save_dir, errors
        
    except Exception as e:
        errors.append((model_id, e))
        raise

def push_to_hub(
    api: HfApi,
    model_id: str,
    quantized_path: str,
    token: str,
    ignore_list: List[str],
    model_class_name: str,
) -> CommitInfo:
    """Create new repository with quantized model"""
    
    # Name the new repo after the source model
    new_model_name = f"{model_id.split('/')[-1]}-fp8"
    
    # Get the token owner's username
    token_owner = api.whoami(token)["name"]
    
    # Create the new repo under the token owner's account
    target_repo = f"{token_owner}/{new_model_name}"
    
    # Create model card content
    model_card = f"""---
language:
- en
license: apache-2.0
tags:
- fp8
- quantized
- llmcompressor
base_model: {model_id}
quantization_config:
  ignored_layers: {ignore_list}
  model_class: {model_class_name}
---

# {new_model_name}

This is an FP8-quantized version of [{model_id}](https://huggingface.co/{model_id}) created with [LLM Compressor](https://github.com/vllm-project/llm-compressor).

## Quantization Details

- Weights quantized to FP8 (static, per-channel)
- Activations quantized to FP8 (dynamic, per-token)
- Linear layers targeted for quantization
- Ignored layers: {ignore_list}
- Model class: {model_class_name}

## Usage

```python
from transformers import {model_class_name}, AutoProcessor

model = {model_class_name}.from_pretrained("{target_repo}")
processor = AutoProcessor.from_pretrained("{target_repo}")
```
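
This checkpoint is saved in the `compressed-tensors` format used by LLM Compressor and can also be served directly with [vLLM](https://github.com/vllm-project/vllm).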
"""

    # Create new repository
    api.create_repo(
        repo_id=target_repo,
        private=False,
        exist_ok=True,
    )

    # Prepare operations for upload
    operations = [
        CommitOperationAdd(path_in_repo="README.md", path_or_content=model_card),
    ]
    
    # Add all files from quantized model
    for root, _, files in os.walk(quantized_path):
        for file in files:
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, quantized_path)
            operations.append(
                CommitOperationAdd(
                    path_in_repo=relative_path,
                    path_or_content=file_path
                )
            )

    # Upload the README and all model files in a single commit
    api.create_commit(
        repo_id=target_repo,
        operations=operations,
        commit_message=f"Add FP8 quantized version of {model_id}",
    )

    return CommitInfo(repo_url=f"https://huggingface.co/{target_repo}")

@spaces.GPU(duration=300)  # 5 minutes timeout for large models
def run(
    model_id: str,
    is_private: bool,
    token: str,
    ignore_str: str,
    model_class_name: str
) -> str:
    """Main function to handle quantization and model upload"""
    
    if not token or not model_id:
        return """
        ### Invalid input 🐞
        
        Please provide both a token and model_id.
        """
        
    try:
        # Parse ignore list
        ignore_list = parse_ignore_list(ignore_str)
        
        # Set up API with user's token
        api = HfApi(token=token)
        
        print("Processing model:", model_id)
        print("Ignore list:", ignore_list)
        print("Model class:", model_class_name)
        
        # Create working directory
        work_dir = "quantized_models"
        os.makedirs(work_dir, exist_ok=True)
        
        # Quantize model
        quantized_path, errors = create_quantized_model(
            model_id,
            work_dir,
            api,
            ignore_list,
            model_class_name
        )
        
        # Upload quantized model to new repository
        commit_info = push_to_hub(
            api,
            model_id,
            quantized_path,
            token,
            ignore_list,
            model_class_name
        )

        response = f"""
        ### Success πŸ”₯

        Your model has been successfully quantized to FP8 and uploaded to a new repository:

        [{commit_info.repo_url}]({commit_info.repo_url})
        
        Configuration:
        - Ignored layers: {ignore_list}
        - Model class: {model_class_name}
        
        You can use this model directly with the transformers library!
        """
        
        if errors:
            response += "\nWarnings during quantization:\n"
            response += "\n".join(f"Warning for {name}: {e}" for name, e in errors)
            
        return response
        
    except Exception as e:
        return f"""
        ### Error 😒

        An error occurred during processing:
        {str(e)}
        """

# Gradio Interface
DESCRIPTION = """
# Convert any model to FP8 using LLM Compressor

This space will quantize your model to FP8 format using LLM Compressor and create a new model repository under your account.

The steps are:
1. Paste your HuggingFace token (from hf.co/settings/tokens) - needs write access
2. Enter the model ID you want to quantize
3. (Optional) Customize ignored layers and model class
4. Click "Submit"
5. You'll get a link to your new quantized model repository! πŸš€

## Advanced Options:
- **Ignore List**: Comma-separated list of layer patterns to ignore during quantization. Examples:
  - Llama: `lm_head`
  - Phi3v: `re:.*lm_head,re:model.vision_embed_tokens.*`
  - Pixtral: `re:.*lm_head,re:multi_modal_projector.*`
  - Llama Vision: `re:.*lm_head,re:multi_modal_projector.*,re:vision_model.*`
- **Model Class**: Specific model class from transformers (default: AutoModelForCausalLM). Examples:
  - `MllamaForConditionalGeneration`
  - `Qwen2VLForConditionalGeneration`
  - `LlavaForConditionalGeneration`

Note: 
- Processing may take several minutes depending on the model size
- The quantized model will be created as a new public repository under your account
- Your token needs write access to create the new repository
"""

title = "FP8 Quantization with LLM Compressor"

with gr.Blocks(title=title) as demo:
    gr.Markdown(DESCRIPTION)
    
    with gr.Row():
        with gr.Column():
            model_id = gr.Text(
                max_lines=1,
                label="model_id",
                placeholder="huggingface/model-name"
            )
            is_private = gr.Checkbox(
                label="Private model (requires read access to original model)"
            )
            token = gr.Text(
                max_lines=1,
                label="your_hf_token (requires write access)",
                placeholder="hf_..."
            )
            ignore_str = gr.Text(
                max_lines=1,
                label="ignore_list (comma-separated)",
                placeholder="lm_head,re:vision_model.*",
                value="lm_head"
            )
            model_class_name = gr.Text(
                max_lines=1,
                label="model_class_name (optional)",
                placeholder="AutoModelForCausalLM",
                value="AutoModelForCausalLM"
            )
            
            with gr.Row():
                clean = gr.ClearButton()
                submit = gr.Button("Submit", variant="primary")
        
        with gr.Column():
            output = gr.Markdown()
    
    submit.click(
        run,
        inputs=[model_id, is_private, token, ignore_str, model_class_name],
        outputs=output,
        concurrency_limit=1
    )

demo.queue(max_size=10).launch(show_api=True)