Spaces:

Steven10429
/

apply_lora_and_quantize

Paused

App Files Files Community

Steven10429 committed on Feb 13

Commit

039130e

1 Parent(s): abba0b6

1

Browse files

Files changed (2) hide show

.gitignore +2 -1
app.py +32 -12

.gitignore CHANGED Viewed

@@ -1,2 +1,3 @@
 *.log
-output

 *.log
+output
+temp

app.py CHANGED Viewed

@@ -53,8 +53,7 @@ def check_system_resources(model_name):
     log.info(f"Total system memory: {MEMORY}GB")
     model_size_gb = get_model_size_in_gb(model_name)
-    required_memory_gb_16bit = model_size_gb * 1.5
-    required_memory_gb = required_memory_gb_16bit
     log.info(f"Estimated required memory for model: {required_memory_gb:.1f}GB")
@@ -124,20 +123,21 @@ def download_and_merge_model(base_model_name, lora_model_name, output_dir, devic
     """
     os.makedirs("temp", exist_ok=True)
     log.info("Loading base model...")
-    model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16)
     log.info("Loading adapter tokenizer...")
     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name, trust_remote_code=True, device_map="auto", force_download=True)
     log.info("Resizing token embeddings...")
     added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
     model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
     log.info("Loading LoRA adapter...")
-    peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16)
     log.info("Merging and unloading model...")
     model = peft_model.merge_and_unload()
     log.info("Saving model...")
     model.save_pretrained(output_dir)
     adapter_tokenizer.save_pretrained(output_dir)
     del model, peft_model
     return output_dir
 @timeit
@@ -192,22 +192,25 @@ def quantize(model_path, repo_id, quant_method=None):
     os.makedirs(model_output_dir, exist_ok=True)
     # 中间文件保存在 model_output 目录下
-    guff_16 = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")
-    if not os.path.exists(guff_16):
         log.info(f"正在将模型转换为GGML格式")
         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
-        convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16}"
         print(f"syscall:[{convert_cmd}]")
         os.system(convert_cmd)
     else:
         log.info(f"GGML中间文件已存在，跳过转换")
     # 最终文件保存在 model_output 目录下
     final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
     log.info(f"正在进行{quant_method}量化")
     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
-    quant_cmd = f"{quantize_bin} {guff_16} {final_path} {quant_method}"
     print(f"syscall:[{quant_cmd}]")
     if not os.path.exists(final_path):
@@ -294,12 +297,9 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
         model_path = download_and_merge_model(base_model_name, lora_model_name, output_dir, device)
-        # 量化模型
-        for quant_method in quant_methods:
-            quantize(output_dir, repo_name, quant_method=quant_method)
         create_readme(repo_name, base_model_name, lora_model_name, quant_methods)
         # 上传合并后的模型和量化模型
         api.upload_large_folder(
             folder_path=model_path,
@@ -310,6 +310,26 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
         )
         log.info("Upload completed.")
         # rm -rf model_path
         shutil.rmtree(model_path)
         log.info("Removed model from local")

     log.info(f"Total system memory: {MEMORY}GB")
     model_size_gb = get_model_size_in_gb(model_name)
+    required_memory_gb = model_size_gb * 2.5
     log.info(f"Estimated required memory for model: {required_memory_gb:.1f}GB")
     """
     os.makedirs("temp", exist_ok=True)
     log.info("Loading base model...")
+    model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16, cache_dir="temp")
     log.info("Loading adapter tokenizer...")
     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name, trust_remote_code=True, device_map="auto", force_download=True)
     log.info("Resizing token embeddings...")
     added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
     model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
     log.info("Loading LoRA adapter...")
+    peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16, cache_dir="temp")
     log.info("Merging and unloading model...")
     model = peft_model.merge_and_unload()
     log.info("Saving model...")
     model.save_pretrained(output_dir)
     adapter_tokenizer.save_pretrained(output_dir)
     del model, peft_model
+    shutil.rmtree("temp") # to save space due to huggingface space limit(50GB)
     return output_dir
 @timeit
     os.makedirs(model_output_dir, exist_ok=True)
     # 中间文件保存在 model_output 目录下
+    guff_16_path =f"./{repo_id}-f16.gguf"
+    if not os.path.exists(guff_16_path):
         log.info(f"正在将模型转换为GGML格式")
         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
+        convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16_path}"
         print(f"syscall:[{convert_cmd}]")
         os.system(convert_cmd)
     else:
         log.info(f"GGML中间文件已存在，跳过转换")
+    if quant_method == "fp16":
+        return guff_16_path  # for upload to hub
     # 最终文件保存在 model_output 目录下
     final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
     log.info(f"正在进行{quant_method}量化")
     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
+    quant_cmd = f"{quantize_bin} {guff_16_path} {final_path} {quant_method}"
     print(f"syscall:[{quant_cmd}]")
     if not os.path.exists(final_path):
         model_path = download_and_merge_model(base_model_name, lora_model_name, output_dir, device)
         create_readme(repo_name, base_model_name, lora_model_name, quant_methods)
         # 上传合并后的模型和量化模型
         api.upload_large_folder(
             folder_path=model_path,
         )
         log.info("Upload completed.")
+        # remove model for space limit
+        shutil.rmtree(model_path)
+        os.makedirs(os.path.join(output_dir, "quantized"), exist_ok=True)
+        if len(quant_methods) > 0:
+            quantize(output_dir, repo_name, "fp16") # for
+            # 量化模型
+            for quant_method in quant_methods:
+                quantize(output_dir, repo_name, quant_method=quant_method)
+            os.system(f"mv ./{repo_name}-f16.gguf ./{output_dir}/quantized/")
+        api.upload_folder(
+            folder_path=os.path.join(output_dir, "quantized"),
+            path_in_repo="quantized",
+            repo_id=repo_name,
+            repo_type="model",
+            num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
+            print_report_every=10,
+        )
         # rm -rf model_path
         shutil.rmtree(model_path)
         log.info("Removed model from local")