Tags: Text Generation · Transformers · PyTorch · English · llama · text-generation-inference · Inference Endpoints
Commit 48f8e5d by Conrad Lippert-Zajaczkowski · 1 parent: e31fd99

test specific

Files changed (1): handler.py (+3 −2)
handler.py CHANGED
@@ -17,14 +17,15 @@ class EndpointHandler:
     def __init__(self, path=""):
         # load the model
         print('starting to load tokenizer')
-        tokenizer = LlamaTokenizer.from_pretrained("/repository/tokenizer", local_files_only=True)
+        tokenizer = LlamaTokenizer.from_pretrained(".", local_files_only=True)
         print('loaded tokenizer')
         gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
         print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
         model = LlamaForCausalLM.from_pretrained(
-            "/repository/pytorch_model",
+            ".",
             device_map="auto",
             torch_dtype=dtype,
+            offload_folder="offload"
         )
         gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
         print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
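For context, here is a minimal sketch of what handler.py might look like after this commit. Only the `__init__` body mirrors the diff; everything around it (the imports, the `nvmlInit` call, `gpu_h1`, `dtype`, and the trailing `self.*` assignments) is an assumption filled in so the snippet runs on its own.

```python
# Hypothetical reconstruction of handler.py after commit 48f8e5d.
# Only the __init__ body comes from the diff; the surrounding setup
# (imports, nvmlInit, gpu_h1, dtype) is assumed.
import torch
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from transformers import LlamaForCausalLM, LlamaTokenizer

nvmlInit()
gpu_h1 = nvmlDeviceGetHandleByIndex(0)  # GPU 0, queried before and after loading
dtype = torch.float16                   # assumed; the diff only shows torch_dtype=dtype


class EndpointHandler:
    def __init__(self, path=""):
        # load the model
        print('starting to load tokenizer')
        tokenizer = LlamaTokenizer.from_pretrained(".", local_files_only=True)
        print('loaded tokenizer')
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        model = LlamaForCausalLM.from_pretrained(
            ".",
            device_map="auto",
            torch_dtype=dtype,
            # new in this commit: spill layers that don't fit in VRAM to disk
            offload_folder="offload",
        )
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        # presumably kept for the request handler; not shown in the hunk
        self.tokenizer = tokenizer
        self.model = model
```

Switching both `from_pretrained` paths from `/repository/...` to `"."` implies the tokenizer and weight files now sit in the handler's working directory, and `offload_folder` presumably gives accelerate a place to spill weights to disk when GPU memory runs short.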