	Better isolation + various improvements (#133)

- better isolation, various improvements (69238fc734088ef8151d981d534809fddf742b7a)
- resolve conflict (db4c9bba61725d35b576116877b699b003c661df)
- .dockerignore +2 -1
- .gitignore +1 -0
- app.py +216 -169
    	
.dockerignore CHANGED

@@ -1,2 +1,3 @@
 /downloads
-/llama.cpp
+/llama.cpp
+/outputs
    	
.gitignore CHANGED

@@ -164,3 +164,4 @@ cython_debug/
 /downloads
 !/downloads/.keep
 /llama.cpp
+/outputs
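Both ignore files gain a `/outputs` entry because the updated app.py writes its conversion artifacts into per-run temporary directories created under `outputs/`. A minimal sketch of that pattern, with a hypothetical file name standing in for the real conversion output:

```python
import os
import tempfile
from pathlib import Path

# Each run gets its own scratch directory under outputs/; it is deleted
# automatically when the with-block exits, so nothing under outputs/ should
# ever be committed or baked into the Docker image.
os.makedirs("outputs", exist_ok=True)

with tempfile.TemporaryDirectory(dir="outputs") as outdir:
    fp16 = Path(outdir) / "model.fp16.gguf"  # hypothetical artifact name
    fp16.write_bytes(b"")                    # stand-in for the real conversion step
    print("Working in", outdir)

# outdir and everything inside it is gone at this point.
```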
    	
app.py CHANGED

@@ -12,21 +12,34 @@ from textwrap import dedent
 from apscheduler.schedulers.background import BackgroundScheduler
 
 
+# used for restarting the space
 HF_TOKEN = os.environ.get("HF_TOKEN")
+CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
+
+# escape HTML for logging
+def escape(s: str) -> str:
+    s = s.replace("&", "&amp;") # Must be done first!
+    s = s.replace("<", "&lt;")
+    s = s.replace(">", "&gt;")
+    s = s.replace('"', "&quot;")
+    s = s.replace("\n", "<br/>")
+    return s
+
+def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
+    imatrix_command = [
+        "./llama.cpp/llama-imatrix",
+        "-m", model_path,
+        "-f", train_data_path,
+        "-ngl", "99",
+        "--output-frequency", "10",
+        "-o", output_path,
+    ]
+
+    if not os.path.isfile(model_path):
         raise Exception(f"Model file not found: {model_path}")
 
     print("Running imatrix command...")
+    process = subprocess.Popen(imatrix_command, shell=False)
 
     try:
         process.wait(timeout=60)  # added wait

@@ -39,36 +52,54 @@ def generate_importance_matrix(model_path, train_data_path):
             print("Imatrix proc still didn't term. Forecfully terming process...")
             process.kill()
 
-    os.chdir("..")
     print("Importance matrix generation completed.")
 
-def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
+def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
+    print(f"Model path: {model_path}")
+    print(f"Output dir: {outdir}")
+
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
 
+    split_cmd = [
+        "./llama.cpp/llama-gguf-split",
+        "--split",
+    ]
     if split_max_size:
+        split_cmd.append("--split-max-size")
+        split_cmd.append(split_max_size)
+    else:
+        split_cmd.append("--split-max-tensors")
+        split_cmd.append(str(split_max_tensors))
+
+    # args for output
+    model_path_prefix = '.'.join(model_path.split('.')[:-1]) # remove the file extension
+    split_cmd.append(model_path)
+    split_cmd.append(model_path_prefix)
+
     print(f"Split command: {split_cmd}")
 
+    result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
     print(f"Split command stdout: {result.stdout}")
     print(f"Split command stderr: {result.stderr}")
 
     if result.returncode != 0:
+        stderr_str = result.stderr.decode("utf-8")
+        raise Exception(f"Error splitting the model: {stderr_str}")
     print("Model split successfully!")
+
+    # remove the original model file if needed
+    if os.path.exists(model_path):
+        os.remove(model_path)
+
+    model_file_prefix = model_path_prefix.split('/')[-1]
+    print(f"Model file name prefix: {model_file_prefix}")
+    sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
     if sharded_model_files:
         print(f"Sharded model files: {sharded_model_files}")
         api = HfApi(token=oauth_token.token)
         for file in sharded_model_files:
+            file_path = os.path.join(outdir, file)
             print(f"Uploading file: {file_path}")
             try:
                 api.upload_file(
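The `llama-imatrix` and `llama-gguf-split` invocations above are now built as argument lists and executed with `shell=False`, so model names and user-supplied paths are passed as discrete arguments instead of being interpolated into a shell command line. A small illustrative sketch of why that matters (the path and the `echo` stand-in are not from the app):

```python
import subprocess

# Illustrative path containing spaces and a shell metacharacter.
model_path = "outputs/demo/my model; echo pwned.gguf"

# List form with shell=False, as in the updated generate_importance_matrix
# and split_upload_model: every element reaches the program verbatim, so the
# path stays a single argument and the ';' is never interpreted by a shell.
result = subprocess.run(["echo", "-m", model_path], shell=False,
                        capture_output=True, text=True)
print(result.stdout.strip())  # prints: -m outputs/demo/my model; echo pwned.gguf
```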
@@ -87,7 +118,6 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
-    fp16 = f"{model_name}.fp16.gguf"
 
     try:
         api = HfApi(token=oauth_token.token)

@@ -111,160 +141,177 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         if not os.path.exists("downloads"):
             os.makedirs("downloads")
 
+        if not os.path.exists("outputs"):
+            os.makedirs("outputs")
+
+        with tempfile.TemporaryDirectory(dir="outputs") as outdir:
+            fp16 = str(Path(outdir)/f"{model_name}.fp16.gguf")
+
+            with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
+                # Keep the model name as the dirname so the model name metadata is populated correctly
+                local_dir = Path(tmpdir)/model_name
+                print(local_dir)
+                api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
+                print("Model downloaded successfully!")
+                print(f"Current working directory: {os.getcwd()}")
+                print(f"Model directory contents: {os.listdir(local_dir)}")
+
+                config_dir = local_dir/"config.json"
+                adapter_config_dir = local_dir/"adapter_config.json"
+                if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
+                    raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
+
+                result = subprocess.run([
+                    "python", CONVERSION_SCRIPT, local_dir, "--outtype", "f16", "--outfile", fp16
+                ], shell=False, capture_output=True)
+                print(result)
+                if result.returncode != 0:
+                    stderr_str = result.stderr.decode("utf-8")
+                    raise Exception(f"Error converting to fp16: {stderr_str}")
+                print("Model converted to fp16 successfully!")
+                print(f"Converted model path: {fp16}")
+
+            imatrix_path = Path(outdir)/"imatrix.dat"
+
+            if use_imatrix:
+                if train_data_file:
+                    train_data_path = train_data_file.name
+                else:
+                    train_data_path = "llama.cpp/groups_merged.txt" #fallback calibration dataset
+
+                print(f"Training data file path: {train_data_path}")
+
+                if not os.path.isfile(train_data_path):
+                    raise Exception(f"Training data file not found: {train_data_path}")
+
+                generate_importance_matrix(fp16, train_data_path, imatrix_path)
+            else:
+                print("Not using imatrix quantization.")
+
+            # Quantize the model
+            quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
+            quantized_gguf_path = str(Path(outdir)/quantized_gguf_name)
+            if use_imatrix:
+                quantise_ggml = [
+                    "./llama.cpp/llama-quantize",
+                    "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method
+                ]
+            else:
+                quantise_ggml = [
+                    "./llama.cpp/llama-quantize",
+                    fp16, quantized_gguf_path, q_method
+                ]
+            result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
             if result.returncode != 0:
+                stderr_str = result.stderr.decode("utf-8")
+                raise Exception(f"Error quantizing: {stderr_str}")
+            print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
+            print(f"Quantized model path: {quantized_gguf_path}")
 
+            # Create empty repo
+            username = whoami(oauth_token.token)["name"]
+            new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
+            new_repo_id = new_repo_url.repo_id
+            print("Repo created successfully!", new_repo_url)
 
+            try:
+                card = ModelCard.load(model_id, token=oauth_token.token)
+            except:
+                card = ModelCard("")
+            if card.data.tags is None:
+                card.data.tags = []
+            card.data.tags.append("llama-cpp")
+            card.data.tags.append("gguf-my-repo")
+            card.data.base_model = model_id
+            card.text = dedent(
+                f"""
+                # {new_repo_id}
+                This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
+                Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
+
+                ## Use with llama.cpp
+                Install llama.cpp through brew (works on Mac and Linux)
+
+                ```bash
+                brew install llama.cpp
+
+                ```
+                Invoke the llama.cpp server or the CLI.
+
+                ### CLI:
+                ```bash
+                llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
+                ```
+
+                ### Server:
+                ```bash
+                llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
+                ```
+
+                Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
+
+                Step 1: Clone llama.cpp from GitHub.
+                ```
+                git clone https://github.com/ggerganov/llama.cpp
+                ```
+
+                Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
+                ```
+                cd llama.cpp && LLAMA_CURL=1 make
+                ```
+
+                Step 3: Run inference through the main binary.
+                ```
+                ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
+                ```
+                or
+                ```
+                ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
+                ```
+                """
+            )
+            readme_path = Path(outdir)/"README.md"
+            card.save(readme_path)
 
+            if split_model:
+                split_upload_model(str(quantized_gguf_path), outdir, new_repo_id, oauth_token, split_max_tensors, split_max_size)
+            else:
+                try:
+                    print(f"Uploading quantized model: {quantized_gguf_path}")
+                    api.upload_file(
+                        path_or_fileobj=quantized_gguf_path,
+                        path_in_repo=quantized_gguf_name,
+                        repo_id=new_repo_id,
+                    )
+                except Exception as e:
+                    raise Exception(f"Error uploading quantized model: {e}")
 
+            if os.path.isfile(imatrix_path):
+                try:
+                    print(f"Uploading imatrix.dat: {imatrix_path}")
+                    api.upload_file(
+                        path_or_fileobj=imatrix_path,
+                        path_in_repo="imatrix.dat",
+                        repo_id=new_repo_id,
+                    )
+                except Exception as e:
+                    raise Exception(f"Error uploading imatrix.dat: {e}")
+
+            api.upload_file(
+                path_or_fileobj=readme_path,
+                path_in_repo="README.md",
+                repo_id=new_repo_id,
+            )
+            print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
 
+        # end of the TemporaryDirectory(dir="outputs") block; temporary outputs are deleted here
 
         return (
+            f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
             "llama.png",
         )
     except Exception as e:
+        return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
 
 
 css="""/* Custom CSS to allow scrolling */

@@ -332,7 +379,7 @@ with gr.Blocks(css=css) as demo:
 
     split_max_size = gr.Textbox(
         label="Max File Size",
-        info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
+        info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
         visible=False
     )
 
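Error handling now routes exception text through the new `escape()` helper before embedding it in the HTML status panel, so stderr from the conversion tools cannot inject markup into the page. A short usage sketch built on the helper added in this commit (the exception message itself is made up):

```python
def escape(s: str) -> str:
    s = s.replace("&", "&amp;")  # must be done first
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace('"', "&quot;")
    s = s.replace("\n", "<br/>")
    return s

try:
    raise Exception('Error converting to fp16: unexpected token "<eos>"')  # made-up message
except Exception as e:
    html = f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>'
    print(html)  # "<eos>" is shown literally instead of being parsed as a tag
```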

