MekkCyber committed on
Commit f187eb1 · 1 Parent(s): 80ac8e4
Files changed (1)
  1. app.py +19 -19
app.py CHANGED
@@ -9,6 +9,10 @@ from packaging import version
 import os
 import spaces
 
+MAP_QUANT_TYPE_TO_NAME = {
+    "int4_weight_only": "int4wo", "int8_weight_only": "int8wo", "int8_dynamic_activation_int8_weight": "int8da8w"
+}
+
 def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
     # ^ expect a gr.OAuthProfile object as input to get the user's profile
     # if the user is not logged in, profile will be None
@@ -25,9 +29,9 @@ def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization
         repo_name = f"{username}/{quantized_model_name}"
     else :
         if quantization_type == "int4_weight_only" :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
         else :
-            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}"
 
     if repo_name in model_names:
         return f"Model '{repo_name}' already exists in your repository."
@@ -61,23 +65,19 @@ model = AutoModel.from_pretrained("{model_name}")"""
 
     return model_card
 
-@spaces.GPU
-def load_model_gpu(model_name, quantization_config, auth_token) :
-    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+def load_model(model_name, quantization_config, auth_token) :
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, device_map="auto", use_auth_token=auth_token.token)
 
 def load_model_cpu(model_name, quantization_config, auth_token) :
     return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
 
-def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only" :
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
     else :
         quantization_config = TorchAoConfig(quantization_type)
-    if device == "cuda" :
-        model = load_model_gpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
-    else :
-        model = load_model_cpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
+    model = load_model(model_name, quantization_config=quantization_config, auth_token=auth_token)
 
     return model
 
@@ -109,7 +109,7 @@ def save_model(model, model_name, quantization_type, group_size=128, username=No
 
     return f"https://huggingface.co/{repo_name}"
 
-def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
     if oauth_token is None :
         return "Error : Please Sign In to your HuggingFace account to use the quantizer"
     if not profile:
@@ -117,10 +117,10 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
     exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
     if exists_message :
         return exists_message
-    if quantization_type == "int4_weight_only" and device == "cpu" :
+    if quantization_type == "int4_weight_only" :
         return "int4_weight_only not supported on cpu"
     # try :
-    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
     return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
     # except Exception as e :
     #     return e
@@ -158,11 +158,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             value=128,
             interactive=True
         )
-        device = gr.Dropdown(
-            label="Device (int4 only works with cuda)",
-            choices=["cuda", "cpu"],
-            value="cuda"
-        )
+        # device = gr.Dropdown(
+        #     label="Device (int4 only works with cuda)",
+        #     choices=["cuda", "cpu"],
+        #     value="cuda"
+        # )
         quantized_model_name = gr.Textbox(
             label="Model Name (optional : to override default)",
             value="",
@@ -215,7 +215,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
 
     quantize_button.click(
         fn=quantize_and_save,
-        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
+        inputs=[model_name, quantization_type, group_size, quantized_model_name],
         outputs=[output_link]
     )
 