MekkCyber committed · Commit f187eb1 · 1 Parent(s): 80ac8e4
fix
app.py
CHANGED
@@ -9,6 +9,10 @@ from packaging import version
 import os
 import spaces
 
+MAP_QUANT_TYPE_TO_NAME = {
+    "int4_weight_only": "int4wo", "int8_weight_only": "int8wo", "int8_dynamic_activation_int8_weight": "int8da8w"
+}
+
 def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
     # ^ expect a gr.OAuthProfile object as input to get the user's profile
     # if the user is not logged in, profile will be None
@@ -25,9 +29,9 @@ def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization
         repo_name = f"{username}/{quantized_model_name}"
     else :
         if quantization_type == "int4_weight_only" :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
         else :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}"
 
     if repo_name in model_names:
         return f"Model '{repo_name}' already exists in your repository."
@@ -61,23 +65,19 @@ model = AutoModel.from_pretrained("{model_name}")"""
 
     return model_card
 
-
-
-    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+def load_model(model_name, quantization_config, auth_token) :
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, device_map="auto", use_auth_token=auth_token.token)
 
 def load_model_cpu(model_name, quantization_config, auth_token) :
     return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
 
-def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only" :
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
     else :
         quantization_config = TorchAoConfig(quantization_type)
-
-        model = load_model_gpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
-    else :
-        model = load_model_cpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
+    model = load_model(model_name, quantization_config=quantization_config, auth_token=auth_token)
 
     return model
 
@@ -109,7 +109,7 @@ def save_model(model, model_name, quantization_type, group_size=128, username=No
 
     return f"https://huggingface.co/{repo_name}"
 
-def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
     if oauth_token is None :
         return "Error : Please Sign In to your HuggingFace account to use the quantizer"
     if not profile:
@@ -117,10 +117,10 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
     exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
     if exists_message :
         return exists_message
-    if quantization_type == "int4_weight_only"
+    if quantization_type == "int4_weight_only" :
         return "int4_weight_only not supported on cpu"
     # try :
-    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
     return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
     # except Exception as e :
     #     return e
@@ -158,11 +158,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             value=128,
             interactive=True
         )
-        device = gr.Dropdown(
-
-
-
-        )
+        # device = gr.Dropdown(
+        #     label="Device (int4 only works with cuda)",
+        #     choices=["cuda", "cpu"],
+        #     value="cuda"
+        # )
         quantized_model_name = gr.Textbox(
             label="Model Name (optional : to override default)",
             value="",
@@ -215,7 +215,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
 
     quantize_button.click(
         fn=quantize_and_save,
-        inputs=[model_name, quantization_type, group_size, quantized_model_name
+        inputs=[model_name, quantization_type, group_size, quantized_model_name],
         outputs=[output_link]
     )
 
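For context, a minimal sketch of the default repo naming this commit introduces. The `default_repo_name` helper and the example usernames/models below are illustrative only, not part of app.py; the dict and f-strings mirror the ones added in the diff:

# Sketch: how the new MAP_QUANT_TYPE_TO_NAME lookup shapes default repo ids.
MAP_QUANT_TYPE_TO_NAME = {
    "int4_weight_only": "int4wo",
    "int8_weight_only": "int8wo",
    "int8_dynamic_activation_int8_weight": "int8da8w",
}

def default_repo_name(username, model_name, quantization_type, group_size=128):
    # Hypothetical helper mirroring the f-strings in check_model_exists.
    short = MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]
    base = f"{username}/{model_name.split('/')[-1]}-torchao-{short}"
    if quantization_type == "int4_weight_only":
        # int4 repo names also encode the group size, e.g. "-gs128".
        return f"{base}-gs{group_size}"
    return base

print(default_repo_name("alice", "meta-llama/Llama-3.2-1B", "int8_weight_only"))
# alice/Llama-3.2-1B-torchao-int8wo
print(default_repo_name("alice", "meta-llama/Llama-3.2-1B", "int4_weight_only"))
# alice/Llama-3.2-1B-torchao-int4wo-gs128

The short suffixes keep repo ids readable while still distinguishing the torchao quantization scheme, and the group-size tag only matters for int4, where it is a tunable parameter.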