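"""Gradio Space that converts a Hugging Face model to the MLC-LLM format.

Pipeline: validate the request, download the original weights, run
`mlc_llm convert_weight` and `mlc_llm gen_config`, then upload the
converted weights plus a generated model card to a new private repo on
the logged-in user's Hugging Face account.
"""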
import os
import shutil
import subprocess

os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr

import huggingface_hub
from huggingface_hub import HfApi
from huggingface_hub import ModelCard

from gradio_huggingfacehub_search import HuggingfaceHubSearch

from textwrap import dedent

HF_PATH = "https://huggingface.co/"

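# Conversation templates accepted by `mlc_llm gen_config --conv-template`.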
CONV_TEMPLATES = [
    "llama-3",
    "llama-3_1",
    "chatml",
    "chatml_nosystem",
    "qwen2",
    "open_hermes_mistral",
    "neural_hermes_mistral",
    "llama_default",
    "llama-2",
    "mistral_default",
    "gpt2",
    "codellama_completion",
    "codellama_instruct",
    "vicuna_v1.1",
    "conv_one_shot",
    "redpajama_chat",
    "rwkv_world",
    "rwkv",
    "gorilla",
    "gorilla-openfunctions-v2",
    "guanaco",
    "dolly",
    "oasst",
    "stablelm",
    "stablecode_completion",
    "stablecode_instruct",
    "minigpt",
    "moss",
    "LM",
    "stablelm-3b",
    "gpt_bigcode",
    "wizardlm_7b",
    "wizard_coder_or_math",
    "glm",
    "custom",  # for web-llm only
    "phi-2",
    "phi-3",
    "phi-3-vision",
    "stablelm-2",
    "gemma_instruction",
    "orion",
    "llava",
    "hermes2_pro_llama3",
    "hermes3_llama-3_1",
    "tinyllama_v1_0",
    "aya-23",
]

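# Quantization codes follow the qAfB(_id) scheme: A is the number of bits for
# storing weights, B the number of bits for storing activations, and the
# optional _id distinguishes algorithm variants (e.g. AWQ).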
QUANTIZATIONS = ["q0f16", "q0f32", "q3f16_1", "q4f16_1", "q4f32_1", "q4f16_awq"]

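# `model_type` values (read from the model's config.json) that MLC-LLM can convert.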
SUPPORTED_MODEL_TYPES = ['llama', 
                         'mistral', 
                         'gemma', 
                         'gemma2', 
                         'gpt2', 
                         'mixtral', 
                         'gpt_neox', 
                         'gpt_bigcode', 
                         'phi-msft', 
                         'phi', 
                         'phi3', 
                         'phi3_v', 
                         'qwen', 
                         'qwen2', 
                         'qwen2_moe', 
                         'stablelm', 
                         'baichuan', 
                         'internlm', 
                         'internlm2', 
                         'rwkv5', 
                         'orion', 
                         'llava', 
                         'rwkv6', 
                         'chatglm', 
                         'eagle', 
                         'bert', 
                         'medusa', 
                         'starcoder2', 
                         'cohere', 
                         'minicpm']

# Cooperative cancellation flag; polled between pipeline stages.
is_cancelled = False

def button_click(hf_model_id, conv_template, quantization, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
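    """Run the full conversion pipeline and return a status message.

    `oauth_token` is injected automatically by Gradio's OAuth integration
    when the user logs in via gr.LoginButton; it is not an input component.
    """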
    global is_cancelled
    is_cancelled = False

    if oauth_token is None or oauth_token.token is None:
        return "Log in to Hugging Face to use this space"
    elif not hf_model_id:
        return "Enter a Hugging Face model ID"
    elif not conv_template:
        return "Select a conversation template"
    elif not quantization:
        return "Select a quantization method"
    
    progress(0, desc="Verifying inputs...")
    
    api = HfApi(token=oauth_token.token)
    model_dir_name = hf_model_id.split("/")[-1]
    mlc_model_name = f"{model_dir_name}-{quantization}-MLC"

    os.makedirs("dist/models", exist_ok=True)
    subprocess.run(["git", "lfs", "install"], check=False)

    model_info = api.repo_info(hf_model_id)
    if not isinstance(model_info, huggingface_hub.hf_api.ModelInfo):
        shutil.rmtree("dist", ignore_errors=True)
        return "The entered Hugging Face model ID is not a model repository"
    model_type = (model_info.config or {}).get("model_type")
    if model_type not in SUPPORTED_MODEL_TYPES:
        shutil.rmtree("dist", ignore_errors=True)
        return f"Model type ({model_type}) is currently not supported by MLC-LLM"

    progress(0.1, desc="Downloading weights from Hugging Face...")

    try:
        api.snapshot_download(repo_id=hf_model_id, local_dir=f"./dist/models/{model_dir_name}")
    except Exception as error:
        shutil.rmtree("dist", ignore_errors=True)
        return str(error)

    if is_cancelled:
        is_cancelled = False
        shutil.rmtree("dist", ignore_errors=True)
        return "Conversion cancelled"
    
    progress(0.5, desc="Converting weights to MLC format...")

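    # With a hypothetical input, the command below amounts to e.g.:
    #   mlc_llm convert_weight ./dist/models/Llama-2-7b-hf \
    #       --quantization q4f16_1 -o dist/Llama-2-7b-hf-q4f16_1-MLC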
    convert_weight_result = subprocess.run(
        ["mlc_llm", "convert_weight", f"./dist/models/{model_dir_name}",
         "--quantization", quantization,
         "-o", f"dist/{mlc_model_name}"],
        capture_output=True, text=True)
    if convert_weight_result.returncode != 0:
        shutil.rmtree("dist", ignore_errors=True)
        return convert_weight_result.stderr
    
    if is_cancelled:
        is_cancelled = False
        shutil.rmtree("dist", ignore_errors=True)
        return "Conversion cancelled"
    
    progress(0.8, desc="Generating config...")
    
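    # gen_config writes mlc-chat-config.json and the processed tokenizer
    # files into the output directory alongside the converted weights.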
    gen_config_result = subprocess.run(
        ["mlc_llm", "gen_config", f"./dist/models/{model_dir_name}",
         "--quantization", quantization,
         "--conv-template", conv_template,
         "-o", f"dist/{mlc_model_name}"],
        capture_output=True, text=True)
    if gen_config_result.returncode != 0:
        shutil.rmtree("dist", ignore_errors=True)
        return gen_config_result.stderr
    
    if is_cancelled:
        is_cancelled = False
        shutil.rmtree("dist", ignore_errors=True)
        return "Conversion cancelled"
    
    progress(0.9, desc="Creating your Hugging Face repo...")

    # push to HF
    user_name = api.whoami()["name"]
    created_repo_url = api.create_repo(repo_id=f"{user_name}/{mlc_model_name}", private=True)
    created_repo_id = created_repo_url.repo_id

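    # upload_large_folder is the HfApi helper meant for multi-gigabyte
    # folders: it uploads in resumable chunks spread over several commits.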
    api.upload_large_folder(folder_path=f"./dist/{mlc_model_name}",
                            repo_id=created_repo_id,
                            repo_type="model")
    
    # push model card to HF
    card = ModelCard.load(hf_model_id, token=oauth_token.token)
    if not card.data.tags:
        card.data.tags = []
    card.data.tags.append("mlc-ai")
    card.data.tags.append("MLC-Weight-Conversion")
    card.data.base_model = hf_model_id

    card.text = dedent(
        f"""
        # {created_repo_id}
        This model was compiled using MLC-LLM with {quantization} quantization from [{hf_model_id}]({HF_PATH}{hf_model_id}).
        The conversion was done using the [MLC-Weight-Conversion](https://huggingface.co/spaces/mlc-ai/MLC-Weight-Conversion) space.

        To run this model, please first install [MLC-LLM](https://llm.mlc.ai/docs/install/mlc_llm.html#install-mlc-packages).

        To chat with the model on your terminal:
        ```bash
        mlc_llm chat HF://{created_repo_id}
        ```

        For more information on how to use MLC-LLM, please visit the MLC-LLM [documentation](https://llm.mlc.ai/docs/index.html).
        """
    )
    card.save("./dist/README.md")

    api.upload_file(path_or_fileobj="./dist/README.md",
                    path_in_repo="README.md",
                    repo_id=created_repo_id,
                    repo_type="model")

    shutil.rmtree("dist", ignore_errors=True)
    return "Success! Your converted model is now in a private repo on your Hugging Face account."

def quit_button_click():
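    """Ask a running conversion to stop; the flag is checked between stages."""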
    global is_cancelled
    is_cancelled = True


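# UI: login button, model search box, template and quantization dropdowns,
# convert/quit buttons, and a textbox for the status message.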
with gr.Blocks() as demo:
    gr.LoginButton()
    gr.Markdown(
    """
    # Compile your LLM with MLC-LLM and run it locally!
    ### This space takes a Hugging Face model ID and converts the model using your selected conversation template and quantization method.
    """)
    model_id = HuggingfaceHubSearch(
        label="HF Model ID",
        placeholder="Search for your model on Hugging Face",
        search_type="model",
    )
    conv = gr.Dropdown(CONV_TEMPLATES, label="Conversation Template")
    quant = gr.Dropdown(QUANTIZATIONS, label="Quantization Method", info="Quantization codes have the form qAfB(_id): A is the number of bits for storing weights, B the number of bits for storing activations, and the optional _id is an integer distinguishing quantization algorithm variants (e.g. symmetric, non-symmetric, AWQ).")
    btn = gr.Button("Convert to MLC")
    btn2 = gr.Button("Quit")
    out = gr.Textbox(label="Conversion Result")
    btn.click(fn=button_click, inputs=[model_id, conv, quant], outputs=out)
    btn2.click(fn=quit_button_click)

demo.launch()