MekkCyber committed · Commit f187eb1 · 1 parent: 80ac8e4

fix

app.py CHANGED
```diff
@@ -9,6 +9,10 @@ from packaging import version
 import os
 import spaces
 
+MAP_QUANT_TYPE_TO_NAME = {
+    "int4_weight_only": "int4wo", "int8_weight_only": "int8wo", "int8_dynamic_activation_int8_weight": "int8da8w"
+}
+
 def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
     # ^ expect a gr.OAuthProfile object as input to get the user's profile
     # if the user is not logged in, profile will be None
```
```diff
@@ -25,9 +29,9 @@ def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization
         repo_name = f"{username}/{quantized_model_name}"
     else :
         if quantization_type == "int4_weight_only" :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs{group_size}"
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
         else :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}"
 
     if repo_name in model_names:
         return f"Model '{repo_name}' already exists in your repository."
```
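For context, a minimal sketch of what the new naming scheme produces; the username and model id below are made up:

```python
# Sketch of the default repo naming used above (example names are hypothetical).
MAP_QUANT_TYPE_TO_NAME = {
    "int4_weight_only": "int4wo",
    "int8_weight_only": "int8wo",
    "int8_dynamic_activation_int8_weight": "int8da8w",
}

def default_repo_name(username, model_name, quantization_type, group_size=128):
    base = model_name.split("/")[-1]  # drop the original namespace
    suffix = MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]
    if quantization_type == "int4_weight_only":
        # int4 weight-only quantization is grouped, so the group size is
        # encoded in the repo name as well
        return f"{username}/{base}-torchao-{suffix}-gs{group_size}"
    return f"{username}/{base}-torchao-{suffix}"

print(default_repo_name("alice", "meta-llama/Llama-3.2-1B", "int4_weight_only"))
# -> alice/Llama-3.2-1B-torchao-int4wo-gs128
```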
```diff
@@ -61,23 +65,19 @@ model = AutoModel.from_pretrained("{model_name}")"""
 
     return model_card
 
-@spaces.GPU
-def load_model_gpu(model_name, quantization_config, auth_token) :
-    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+def load_model(model_name, quantization_config, auth_token) :
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, device_map="auto", use_auth_token=auth_token.token)
 
 def load_model_cpu(model_name, quantization_config, auth_token) :
     return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
 
-def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only" :
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
     else :
         quantization_config = TorchAoConfig(quantization_type)
-    if device == "cuda" :
-        model = load_model_gpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
-    else :
-        model = load_model_cpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
+    model = load_model(model_name, quantization_config=quantization_config, auth_token=auth_token)
 
     return model
 
```
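Outside the Space, the load-and-quantize path above is the standard torchao integration in transformers. A minimal sketch, assuming a recent transformers with torchao installed and a small placeholder model id:

```python
# Minimal sketch of torchao quantization via transformers (placeholder model id).
import torch
from transformers import AutoModel, TorchAoConfig

# "int4_weight_only" additionally takes group_size; the other types take none.
quantization_config = TorchAoConfig("int8_weight_only")

model = AutoModel.from_pretrained(
    "facebook/opt-125m",                      # placeholder; any Hub model id works
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,  # weights are quantized at load time
    device_map="auto",                        # matches the new load_model above
)
```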
```diff
@@ -109,7 +109,7 @@ def save_model(model, model_name, quantization_type, group_size=128, username=No
 
     return f"https://huggingface.co/{repo_name}"
 
-def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
     if oauth_token is None :
         return "Error : Please Sign In to your HuggingFace account to use the quantizer"
     if not profile:
```
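save_model itself is unchanged here apart from its caller; it ends by returning the repo URL. A hypothetical reduction of that upload step (push_quantized is a made-up name, not the app's function):

```python
# Hypothetical reduction of the save step: push the quantized model to the
# user's namespace with their OAuth token, then return the repo URL.
def push_quantized(model, repo_name, oauth_token):
    model.push_to_hub(repo_name, token=oauth_token.token)
    return f"https://huggingface.co/{repo_name}"
```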
```diff
@@ -117,10 +117,10 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
     exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
     if exists_message :
         return exists_message
-    if quantization_type == "int4_weight_only" and device == "cpu" :
+    if quantization_type == "int4_weight_only" :
         return "int4_weight_only not supported on cpu"
     # try :
-    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
     return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
     # except Exception as e :
     #     return e
```
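The sign-in guard at the top of quantize_and_save follows Gradio's OAuth pattern: parameters annotated gr.OAuthProfile / gr.OAuthToken are injected per request and are None for anonymous visitors. A standalone sketch, assuming a Space with OAuth enabled and a gr.LoginButton in the UI:

```python
# Standalone sketch of the sign-in guard (not the app's exact function).
import gradio as gr

def guarded(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
    # Both arguments are injected by Gradio from the login session.
    if oauth_token is None or not profile:
        return "Error : Please Sign In to your HuggingFace account to use the quantizer"
    return f"Signed in as {profile.username}"
```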
```diff
@@ -158,11 +158,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             value=128,
             interactive=True
         )
-        device = gr.Dropdown(
-            label="Device (int4 only works with cuda)",
-            choices=["cuda", "cpu"],
-            value="cuda"
-        )
+        # device = gr.Dropdown(
+        #     label="Device (int4 only works with cuda)",
+        #     choices=["cuda", "cpu"],
+        #     value="cuda"
+        # )
         quantized_model_name = gr.Textbox(
             label="Model Name (optional : to override default)",
             value="",
```
```diff
@@ -215,7 +215,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
 
     quantize_button.click(
         fn=quantize_and_save,
-        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
+        inputs=[model_name, quantization_type, group_size, quantized_model_name],
         outputs=[output_link]
     )
 
```
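Note that profile and oauth_token never appear in inputs: Gradio matches them by type annotation and supplies them itself, so the click handler only lists the four visible widgets. A reduced sketch of the surrounding Blocks wiring, with the widget constructors simplified (labels and choices here are illustrative):

```python
# Reduced sketch of the event wiring (widget definitions simplified).
import gradio as gr

with gr.Blocks() as app:
    gr.LoginButton()  # enables OAuth injection for quantize_and_save
    model_name = gr.Textbox(label="Model")
    quantization_type = gr.Dropdown(choices=list(MAP_QUANT_TYPE_TO_NAME), value="int8_weight_only")
    group_size = gr.Number(value=128)
    quantized_model_name = gr.Textbox(label="Model Name (optional : to override default)")
    output_link = gr.Markdown()
    quantize_button = gr.Button("Quantize")

    quantize_button.click(
        fn=quantize_and_save,  # OAuth args are injected, so they are not in inputs
        inputs=[model_name, quantization_type, group_size, quantized_model_name],
        outputs=[output_link],
    )
```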