MekkCyber committed on
Commit f187eb1 · 1 Parent(s): 80ac8e4
Files changed (1)
  1. app.py +19 -19
app.py CHANGED
@@ -9,6 +9,10 @@ from packaging import version
 import os
 import spaces
 
+MAP_QUANT_TYPE_TO_NAME = {
+    "int4_weight_only": "int4wo", "int8_weight_only": "int8wo", "int8_dynamic_activation_int8_weight": "int8da8w"
+}
+
 def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
     # ^ expect a gr.OAuthProfile object as input to get the user's profile
     # if the user is not logged in, profile will be None
@@ -25,9 +29,9 @@ def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization
         repo_name = f"{username}/{quantized_model_name}"
     else :
         if quantization_type == "int4_weight_only" :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
         else :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}"
 
     if repo_name in model_names:
         return f"Model '{repo_name}' already exists in your repository."
@@ -61,23 +65,19 @@ model = AutoModel.from_pretrained("{model_name}")"""
 
     return model_card
 
-@spaces.GPU
-def load_model_gpu(model_name, quantization_config, auth_token) :
-    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+def load_model(model_name, quantization_config, auth_token) :
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, device_map="auto", use_auth_token=auth_token.token)
 
 def load_model_cpu(model_name, quantization_config, auth_token) :
     return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
 
-def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only" :
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
     else :
         quantization_config = TorchAoConfig(quantization_type)
-    if device == "cuda" :
-        model = load_model_gpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
-    else :
-        model = load_model_cpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
+    model = load_model(model_name, quantization_config=quantization_config, auth_token=auth_token)
 
     return model
 
@@ -109,7 +109,7 @@ def save_model(model, model_name, quantization_type, group_size=128, username=No
 
     return f"https://huggingface.co/{repo_name}"
 
-def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
     if oauth_token is None :
         return "Error : Please Sign In to your HuggingFace account to use the quantizer"
     if not profile:
@@ -117,10 +117,10 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
     exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
     if exists_message :
         return exists_message
-    if quantization_type == "int4_weight_only" and device == "cpu" :
+    if quantization_type == "int4_weight_only" :
         return "int4_weight_only not supported on cpu"
     # try :
-    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
     return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
     # except Exception as e :
     #     return e
@@ -158,11 +158,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             value=128,
             interactive=True
         )
-        device = gr.Dropdown(
-            label="Device (int4 only works with cuda)",
-            choices=["cuda", "cpu"],
-            value="cuda"
-        )
+        # device = gr.Dropdown(
+        #     label="Device (int4 only works with cuda)",
+        #     choices=["cuda", "cpu"],
+        #     value="cuda"
+        # )
         quantized_model_name = gr.Textbox(
             label="Model Name (optional : to override default)",
             value="",
@@ -215,7 +215,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
 
     quantize_button.click(
         fn=quantize_and_save,
-        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
+        inputs=[model_name, quantization_type, group_size, quantized_model_name],
         outputs=[output_link]
     )
 