Spaces:

zetavg
/

LLaMA-LoRA-Tuner-UI-Demo

Runtime error

App Files Files Community

zetavg committed on Apr 18, 2023

Commit

6148b7c

unverified ·

1 Parent(s): 0054cc5

improve speed of switching models by offloading unused ones to cpu ram instead of unloading

Browse files

Files changed (3) hide show

llama_lora/globals.py +17 -1
llama_lora/ui/main_page.py +0 -1
llama_lora/utils/model_lru_cache.py +68 -0

llama_lora/globals.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 import subprocess
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -7,6 +9,7 @@ from numba import cuda
 import nvidia_smi
 from .utils.lru_cache import LRUCache
 from .lib.finetune import train
@@ -34,7 +37,7 @@ class Global:
     generation_force_stopped_at = None
     # Model related
-    loaded_models = LRUCache(1)
     loaded_tokenizers = LRUCache(1)
     new_base_model_that_is_ready_to_be_used = None
     name_of_new_base_model_that_is_ready_to_be_used = None
@@ -89,6 +92,7 @@ if commit_hash:
 def load_gpu_info():
     try:
         cc_cores_per_SM_dict = {
             (2, 0): 32,
@@ -135,8 +139,20 @@ def load_gpu_info():
             f"GPU total memory: {total_memory} bytes ({total_memory_mb:.2f} MB) ({total_memory_gb:.2f} GB)")
         Global.gpu_total_memory = total_memory
     except Exception as e:
         print(f"Notice: cannot get GPU info: {e}")
 load_gpu_info()

 import os
 import subprocess
+import psutil
+import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 import nvidia_smi
 from .utils.lru_cache import LRUCache
+from .utils.model_lru_cache import ModelLRUCache
 from .lib.finetune import train
     generation_force_stopped_at = None
     # Model related
+    loaded_models = ModelLRUCache(1)
     loaded_tokenizers = LRUCache(1)
     new_base_model_that_is_ready_to_be_used = None
     name_of_new_base_model_that_is_ready_to_be_used = None
 def load_gpu_info():
+    print("")
     try:
         cc_cores_per_SM_dict = {
             (2, 0): 32,
             f"GPU total memory: {total_memory} bytes ({total_memory_mb:.2f} MB) ({total_memory_gb:.2f} GB)")
         Global.gpu_total_memory = total_memory
+        available_cpu_ram = psutil.virtual_memory().available
+        available_cpu_ram_mb = available_cpu_ram / (1024 ** 2)
+        available_cpu_ram_gb = available_cpu_ram / (1024 ** 3)
+        print(
+            f"CPU available memory: {available_cpu_ram} bytes ({available_cpu_ram_mb:.2f} MB) ({available_cpu_ram_gb:.2f} GB)")
+        preserve_loaded_models_count = math.floor((available_cpu_ram * 0.8) / total_memory) - 1
+        if preserve_loaded_models_count > 1:
+            print(f"Will keep {preserve_loaded_models_count} offloaded models in CPU RAM.")
+            Global.loaded_models = ModelLRUCache(preserve_loaded_models_count)
+            Global.loaded_tokenizers = LRUCache(preserve_loaded_models_count)
     except Exception as e:
         print(f"Notice: cannot get GPU info: {e}")
+    print("")
 load_gpu_info()

llama_lora/ui/main_page.py CHANGED Viewed

@@ -136,7 +136,6 @@ def main_page():
           const tokenizer_name = current_tokenizer_hint_elem && current_tokenizer_hint_elem.innerText;
           if (tokenizer_name && tokenizer_name !== base_model_name) {
-            document.querySelector('#global_tokenizer_select input').value = tokenizer_name;
             const btn = document.getElementById('use_custom_tokenizer_btn');
             if (btn) btn.click();
           }

           const tokenizer_name = current_tokenizer_hint_elem && current_tokenizer_hint_elem.innerText;
           if (tokenizer_name && tokenizer_name !== base_model_name) {
             const btn = document.getElementById('use_custom_tokenizer_btn');
             if (btn) btn.click();
           }

llama_lora/utils/model_lru_cache.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from collections import OrderedDict
+import gc
+import torch
+from ..lib.get_device import get_device
+device_type = get_device()
class ModelLRUCache:
    """LRU cache of models that parks inactive models on the CPU.

    Entries are kept in an ``OrderedDict`` ordered from least to most
    recently used.  When a model is requested through :meth:`get`, every
    other cached model that is not already on the CPU is moved there, and
    the requested model is moved to the module-level ``device_type``.

    Attributes:
        cache: ``OrderedDict`` mapping keys to cached models.
        capacity: Number of entries at which the oldest one is evicted.
            A capacity below 1 behaves like a capacity of 1.
    """

    # Sentinel meaning "do not skip any key" (``None`` could be a real key).
    _NO_KEY = object()

    def __init__(self, capacity=5):
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key):
        """Return the model stored under ``key`` on ``device_type``.

        Marks ``key`` as most recently used and moves every other cached
        model that is not on the CPU to the CPU first.

        Returns:
            The cached model, or ``None`` if ``key`` is not cached.
        """
        if key not in self.cache:
            return None

        # Move the accessed item to the end of the OrderedDict
        self.cache.move_to_end(key)

        if self._offload_to_cpu(skip_key=key):
            self._release_memory()

        model = self.cache[key]
        # Some wrappers expose the underlying model as ``.model``; check
        # both so a wrapper whose inner model was left behind is moved too.
        needs_move = (
            model.device.type != device_type
            or (hasattr(model, "model")
                and model.model.device.type != device_type)
        )
        if needs_move:
            model = model.to(device_type)
            # Keep the cache pointing at whatever ``.to`` returned, the
            # same way the offload path does.
            self.cache[key] = model
        return model

    def set(self, key, value):
        """Store ``value`` under ``key`` as the most recently used entry.

        If ``key`` is new and the cache is full, the least recently used
        entry is evicted first.
        """
        if key in self.cache:
            # Updating an entry counts as using it, so refresh its recency;
            # otherwise it could be the next one evicted.
            self.cache[key] = value
            self.cache.move_to_end(key)
            return

        self._evict_if_full()
        self.cache[key] = value

    def clear(self):
        """Remove every entry from the cache."""
        self.cache.clear()

    def prepare_to_set(self):
        """Make room before a new model is loaded and stored.

        Evicts the least recently used entry if the cache is full, moves
        all remaining models to the CPU, and then triggers garbage
        collection and a CUDA cache flush if anything was evicted or moved.
        """
        evicted = self._evict_if_full()
        moved = self._offload_to_cpu()
        # An eviction alone must also trigger the cleanup: with capacity 1
        # the only cached model is dropped and nothing is left to "move".
        if evicted or moved:
            self._release_memory()

    def _evict_if_full(self):
        """Drop the least recently used entry if at capacity; return True if dropped."""
        # The emptiness check prevents ``popitem`` from raising ``KeyError``
        # when ``capacity`` is 0 or negative.
        if self.cache and len(self.cache) >= self.capacity:
            self.cache.popitem(last=False)
            return True
        return False

    def _offload_to_cpu(self, skip_key=_NO_KEY):
        """Move every cached model except ``skip_key`` to the CPU; return True if any moved."""
        moved = False
        # Iterate over a snapshot because entries are reassigned in the loop.
        for name, model in list(self.cache.items()):
            if name == skip_key or model.device.type == 'cpu':
                continue
            self.cache[name] = model.to('cpu')
            moved = True
        return moved

    @staticmethod
    def _release_memory():
        """Run the garbage collector and ask PyTorch to empty its CUDA cache."""
        gc.collect()
        torch.cuda.empty_cache()