import torch
from typing import Any, Dict, List
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# Initialise NVML so we can log GPU memory usage while the model loads.
nvmlInit()
gpu_h1 = nvmlDeviceGetHandleByIndex(0)
print('loaded_imports')

# Pick the compute dtype: bfloat16 on Ampere or newer (compute capability >= 8),
# float16 on older GPUs.
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
print('chose dtype', dtype)


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the tokenizer and model from the local repository snapshot.
        print('starting to load tokenizer')
        tokenizer = LlamaTokenizer.from_pretrained(
            "/repository/orca_tokenizer", local_files_only=True
        )
        print('loaded tokenizer')

        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')

        model = LlamaForCausalLM.from_pretrained(
            "/repository/pytorch_model",
            device_map="auto",
            torch_dtype=dtype,
            offload_folder="offload",
            local_files_only=True,
        )
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        print('loaded model')

        # Create the text-generation inference pipeline.
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
        print('created pipeline')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        print('starting to call')
        inputs = data.pop("inputs", data)
        print('inputs:', inputs)
        parameters = data.pop("parameters", None)

        # Run generation, passing any request parameters through as pipeline kwargs.
        if parameters is not None:
            prediction = self.pipeline(inputs, **parameters)
        else:
            prediction = self.pipeline(inputs)

        # The pipeline already returns a list of {"generated_text": ...} dicts,
        # which is what the endpoint serialises back to the caller.
        return prediction
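

# --- Local smoke test (not used by the Inference Endpoints runtime) ---
# A minimal sketch for exercising the handler outside the endpoint container.
# It assumes the /repository/* paths above exist on the local machine and that
# a GPU is available; the prompt and generation parameters are illustrative only.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "inputs": "### Instruction: Say hello.\n### Response:",
        "parameters": {"max_new_tokens": 32, "do_sample": False},
    }
    result = handler(payload)
    print(result)  # e.g. [{"generated_text": "..."}]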