Text Generation · Transformers · PyTorch · English · llama · text-generation-inference · Inference Endpoints
Conrad Lippert-Zajaczkowski committed · Commit 229a1e9 · 1 Parent(s): 96da633

gpu tracking

Files changed (1): handler.py (+19 -0)
handler.py CHANGED
@@ -2,23 +2,42 @@ import torch
from typing import Dict, List, Any
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

+from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
+
+nvmlInit()
+gpu_h1 = nvmlDeviceGetHandleByIndex(0)
+
+print('loaded_imports')
# get dtype
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16
+print('chose dtype', dtype)
+

class EndpointHandler:
    def __init__(self, path=""):
        # load the model
+        print('starting to load tokenizer')
        tokenizer = LlamaTokenizer.from_pretrained("/repository/tokenizer", local_files_only=True)
+        print('loaded tokenizer')
+        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
+        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        model = LlamaForCausalLM.from_pretrained(
            "/repository/pytorch_model",
            device_map="auto",
            torch_dtype=dtype,
        )
+        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
+        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
+
+        print('loaded model')
        # create inference pipeline
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        print('created pipeline')

    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
+        print('starting to call')
        inputs = data.pop("inputs", data)
+        print('inputs: ', inputs)
        parameters = data.pop("parameters", None)

        # pass inputs with all kwargs in data
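
Side note (not part of the commit): a minimal sketch of how the NVML readings added above could be wrapped in a small helper that reports VRAM in GiB instead of raw bytes. The helper name log_vram and its labels are illustrative assumptions; only the pynvml calls mirror those in handler.py.

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

nvmlInit()

def log_vram(device_index: int = 0, label: str = "") -> None:
    # nvmlDeviceGetMemoryInfo reports total/used/free in bytes; convert to GiB for readability
    handle = nvmlDeviceGetHandleByIndex(device_index)
    info = nvmlDeviceGetMemoryInfo(handle)
    gib = 1024 ** 3
    print(f"{label} vram total {info.total / gib:.2f} GiB, "
          f"used {info.used / gib:.2f} GiB, free {info.free / gib:.2f} GiB")

# e.g. call once before and once after LlamaForCausalLM.from_pretrained(...)
# to see how much VRAM the model weights occupy
log_vram(label="before load")

Note that pynvml is not bundled with transformers, so it presumably has to be declared as an extra dependency of the endpoint (e.g., via a requirements.txt in the repository) for these imports to resolve.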