Spaces:

pytorch
/

torchao-my-repo

Running

App Files Files Community

MekkCyber committed on Jan 8

Commit

63d14c6

1 Parent(s): ce0c4f3

add app logic

Browse files

Files changed (3) hide show

README.md +15 -6
app.py +221 -4
requirement.txt +6 -0

README.md CHANGED Viewed

@@ -1,13 +1,22 @@
 ---
-title: TorchAO
-emoji: 🚀
-colorFrom: gray
-colorTo: blue
 sdk: gradio
-sdk_version: 5.10.0
 app_file: app.py
 pinned: false
-short_description: TorchAO Quantization
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: QuantizationTorchAODraft
+emoji: 💻
+colorFrom: blue
+colorTo: red
 sdk: gradio
+sdk_version: 4.27.0
 app_file: app.py
 pinned: false
+hf_oauth: true
+# optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
+hf_oauth_expiration_minutes: 480
+# optional; see the "Scopes" section of the Hugging Face Spaces OAuth documentation. "openid profile" is always included.
+hf_oauth_scopes:
+ - read-repos
+ - write-repos
+ - manage-repos
+ - inference-api
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,7 +1,224 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
+import tempfile
+from huggingface_hub import HfApi
+from huggingface_hub import list_models
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from packaging import version
+import os
+import spaces
+def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
+    """Build the greeting text for the current visitor.
+
+    Args:
+        profile: The visitor's OAuth profile, or ``None`` when nobody is
+            logged in.
+        oauth_token: Accepted alongside the profile; not read by this function.
+
+    Returns:
+        ``"Hello !"`` when ``profile`` is ``None``, otherwise a greeting that
+        contains ``profile.name``.
+    """
+    # An anonymous visitor contributes nothing between "Hello " and "!".
+    who = "" if profile is None else f"{profile.name} "
+    return f"Hello {who}!"
+def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, group_size, model_name, quantized_model_name):
+    """Tell whether the destination repository is already listed for ``username``.
+
+    The destination is ``{username}/{quantized_model_name}`` when a custom name
+    is given. Otherwise it is derived from the last path component of
+    ``model_name`` plus a ``-torchao-<quantization type>`` suffix, with an
+    extra ``-gs_<group_size>`` for ``int4_weight_only``.
+
+    Returns:
+        A message string when the repository is already listed or when any
+        step raised; ``None`` when the repository is not listed.
+    """
+    try:
+        # Ids of every model that list_models reports for this author.
+        known_ids = {entry.id for entry in list_models(author=username, token=oauth_token.token)}
+        if quantized_model_name:
+            repo_name = f"{username}/{quantized_model_name}"
+        elif quantization_type == "int4_weight_only":
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+        else:
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+        if repo_name not in known_ids:
+            return None  # Model does not exist
+        return f"Model '{repo_name}' already exists in your repository."
+    except Exception as e:
+        # Every failure (including a missing token) is reported as text.
+        return f"Error checking model existence: {str(e)}"
+def create_model_card(model_name, quantization_type, group_size):
+    """Render the Markdown model card (README.md content) for a quantized model.
+
+    Args:
+        model_name: Name inserted into the ``base_model`` front matter, the
+            title, the description and the usage snippet.
+        quantization_type: Quantization scheme named in the card.
+        group_size: Shown only when ``quantization_type`` is
+            ``"int4_weight_only"``; otherwise the card shows ``None``.
+
+    Returns:
+        The card text, starting with YAML front matter and ending with a
+        fenced Python usage example.
+    """
+    # The usage example's code fence is closed explicitly; an unterminated
+    # fence makes Markdown renderers treat the rest of the page as code.
+    model_card = f"""---
+base_model:
+- {model_name}
+---
+# {model_name} (Quantized)
+## Description
+This model is a quantized version of the original model `{model_name}`. It has been quantized using {quantization_type} quantization with torchao.
+## Quantization Details
+- **Quantization Type**: {quantization_type}
+- **Group Size**: {group_size if quantization_type == "int4_weight_only" else None}
+## Usage
+You can use this model in your applications by loading it directly from the Hugging Face Hub:
+```python
+from transformers import AutoModel
+model = AutoModel.from_pretrained("{model_name}")
+```
+"""
+    return model_card
+@spaces.GPU
+def load_model_gpu(model_name, quantization_config, auth_token) :
+    """Load ``model_name`` in bfloat16 with ``quantization_config`` applied.
+
+    Runs under the ``spaces.GPU`` decorator.
+
+    Args:
+        model_name: Hub model id to load.
+        quantization_config: Quantization config forwarded to ``from_pretrained``.
+        auth_token: Object exposing a ``.token`` attribute, or ``None`` to
+            load without an explicit token.
+
+    Returns:
+        The model returned by ``AutoModel.from_pretrained``.
+    """
+    # `token` replaces the deprecated `use_auth_token` keyword.
+    hub_token = auth_token.token if auth_token is not None else None
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, token=hub_token)
+def load_model_cpu(model_name, quantization_config, auth_token) :
+    """Load ``model_name`` in bfloat16 with ``quantization_config`` applied.
+
+    Args:
+        model_name: Hub model id to load.
+        quantization_config: Quantization config forwarded to ``from_pretrained``.
+        auth_token: Object exposing a ``.token`` attribute, or ``None`` to
+            load without an explicit token.
+
+    Returns:
+        The model returned by ``AutoModel.from_pretrained``.
+    """
+    # `token` replaces the deprecated `use_auth_token` keyword.
+    hub_token = auth_token.token if auth_token is not None else None
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, token=hub_token)
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
+    """Build a ``TorchAoConfig`` and load ``model_name`` with it.
+
+    Args:
+        model_name: Hub model id to load.
+        quantization_type: Quantization scheme passed to ``TorchAoConfig``.
+        group_size: Passed to ``TorchAoConfig`` only for ``"int4_weight_only"``.
+        auth_token: Forwarded to the loader.
+        username: Accepted but not read by this function.
+        device: ``"cuda"`` selects ``load_model_gpu``; any other value selects
+            ``load_model_cpu``.
+
+    Returns:
+        The model returned by the selected loader.
+    """
+    print(f"Quantizing model: {quantization_type}")
+    # Only the int4 weight-only scheme receives a group size.
+    extra_options = {"group_size": group_size} if quantization_type == "int4_weight_only" else {}
+    quantization_config = TorchAoConfig(quantization_type, **extra_options)
+    loader = load_model_gpu if device == "cuda" else load_model_cpu
+    return loader(model_name, quantization_config=quantization_config, auth_token=auth_token)
+def save_model(model, model_name, quantization_type, group_size=128, username=None, auth_token=None, quantized_model_name=None):
+    """Serialize ``model`` with a model card and upload it to the Hub.
+
+    The destination repository is ``{username}/{quantized_model_name}`` when a
+    custom name is given. Otherwise it is derived from the last path component
+    of ``model_name`` plus a ``-torchao-<quantization type>`` suffix, with an
+    extra ``-gs_<group_size>`` for ``int4_weight_only``.
+
+    Args:
+        model: Object exposing ``save_pretrained``.
+        model_name: Source model id used to derive the default repo name.
+        quantization_type: Quantization scheme, used in the repo name and card.
+        group_size: Used in the repo name and card for ``int4_weight_only``.
+        username: Namespace that owns the destination repository.
+        auth_token: Object exposing a ``.token`` attribute, or ``None``.
+        quantized_model_name: Optional override for the repository name.
+
+    Returns:
+        The ``https://huggingface.co/...`` URL of the destination repository.
+    """
+    print("Saving quantized model")
+    hub_token = auth_token.token if auth_token is not None else None
+    # The repo name does not depend on the temp directory, so resolve it first.
+    if quantized_model_name :
+        repo_name = f"{username}/{quantized_model_name}"
+    elif quantization_type == "int4_weight_only" :
+        repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+    else :
+        repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # `token` replaces the deprecated `use_auth_token` keyword.
+        model.save_pretrained(tmpdirname, safe_serialization=False, token=hub_token)
+        # NOTE(review): the card is rendered with the destination repo name, so
+        # its `base_model` entry names the quantized repo rather than
+        # `model_name` - confirm this is intended.
+        model_card = create_model_card(repo_name, quantization_type, group_size)
+        # Explicit UTF-8 keeps the README independent of the platform's locale.
+        with open(os.path.join(tmpdirname, "README.md"), "w", encoding="utf-8") as f:
+            f.write(model_card)
+        # Push to Hub
+        api = HfApi(token=hub_token)
+        api.create_repo(repo_name, exist_ok=True)
+        api.upload_folder(
+            folder_path=tmpdirname,
+            repo_id=repo_name,
+            repo_type="model",
+        )
+    return f"https://huggingface.co/{repo_name}"
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
+    """Handle the "Quantize and Save Model" button.
+
+    Checks, in order: that a token and a profile are present, that the
+    destination repository is not already listed, and that the
+    ``int4_weight_only`` scheme is not combined with the ``cpu`` device.
+    If every check passes, the model is quantized and uploaded.
+
+    Returns:
+        An error or conflict message when a check fails, otherwise the value
+        returned by ``save_model``.
+    """
+    if oauth_token is None :
+        return "Error : Please Sign In to your HuggingFace account to use the quantizer"
+    if not profile:
+        return "Error: Please Sign In to your HuggingFace account to use the quantizer"
+    owner = profile.username
+    conflict = check_model_exists(oauth_token, owner, quantization_type, group_size, model_name, quantized_model_name)
+    if conflict :
+        return conflict
+    if device == "cpu" and quantization_type == "int4_weight_only" :
+        return "int4_weight_only not supported on cpu"
+    # Exceptions raised while quantizing or saving are not caught here; they
+    # propagate to the caller.
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, owner, device)
+    return save_model(quantized_model, model_name, quantization_type, group_size, owner, oauth_token, quantized_model_name)
+# Page stylesheet: centers elements carrying the "center-button" class.
+# It is passed to gr.Blocks at construction time. The original assigned
+# `app.css` twice, so only this last stylesheet ever took effect.
+CUSTOM_CSS = """
+.center-button {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    margin: 0 auto; /* Center horizontally */
+}
+"""
+with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as app:
+    gr.Markdown(
+        """
+        # 🚀 LLM Model Quantization App
+        Quantize your favorite Hugging Face models and save them to your profile!
+        """
+    )
+    gr.LoginButton(elem_id="login-button", elem_classes="center-button")
+    # Greeting area, filled by `hello` when the page loads.
+    m1 = gr.Markdown()
+    app.load(hello, inputs=None, outputs=m1)
+    with gr.Row():
+        with gr.Column():
+            model_name = HuggingfaceHubSearch(
+                label="Hub Model ID",
+                placeholder="Search for model id on Huggingface",
+                search_type="model",
+            )
+            quantization_type = gr.Dropdown(
+                label="Quantization Type",
+                choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
+                value="int8_weight_only"
+            )
+            group_size = gr.Number(
+                label="Group Size (only for int4_weight_only)",
+                value=128,
+                interactive=True
+            )
+            device = gr.Dropdown(
+                label="Device (int4 only works with cuda)",
+                choices=["cuda", "cpu"],
+                value="cuda"
+            )
+            quantized_model_name = gr.Textbox(
+                label="Model Name (optional : to override default)",
+                value="",
+                interactive=True
+            )
+        with gr.Column():
+            quantize_button = gr.Button("Quantize and Save Model", variant="primary")
+            output_link = gr.Textbox(label="Quantized Model Link")
+    gr.Markdown(
+        """
+        ## Instructions
+        1. Login to your HuggingFace account
+        2. Enter the name of the Hugging Face LLM model you want to quantize (Make sure you have access to it)
+        3. Choose the quantization type.
+        4. Optionally, specify the group size.
+        5. Optionally, choose a custom name for the quantized model
+        6. Click "Quantize and Save Model" to start the process.
+        7. Once complete, you'll receive a link to the quantized model on Hugging Face.
+        Note: This process may take some time depending on the model size and your hardware you can check the container logs to see where are you at in the process!
+        """
+    )
+    # Only the five form values are listed as inputs. The `profile` and
+    # `oauth_token` parameters of quantize_and_save are presumably injected by
+    # Gradio from their type annotations - verify against the Gradio OAuth docs.
+    quantize_button.click(
+        fn=quantize_and_save,
+        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
+        outputs=[output_link]
+    )
+# Launch the app
+app.launch()

requirement.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+git+https://github.com/huggingface/transformers.git@main#egg=transformers
+accelerate
+torchao
+huggingface-hub
+https://gradio-builds.s3.amazonaws.com/4485dd46a8e4b3f5b35e42d52f291b72fdc1a952/gradio-4.39.0-py3-none-any.whl
+gradio-huggingfacehub-search