import torch

from typing import Dict, List, Any
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# initialize NVML so we can report GPU memory usage around model loading
nvmlInit()
gpu_h1 = nvmlDeviceGetHandleByIndex(0)
print('loaded_imports')

# choose dtype: bfloat16 on compute capability >= 8 (Ampere or newer), float16 otherwise
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
print('chose dtype', dtype)


class EndpointHandler:
    def __init__(self, path=""):
        # load the tokenizer from the local repository
        print('starting to load tokenizer')
        self.tokenizer = LlamaTokenizer.from_pretrained(
            "/repository/orca_tokenizer", local_files_only=True
        )
        print('loaded tokenizer')

        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')

        # load the model, sharding it across available GPUs
        self.model = LlamaForCausalLM.from_pretrained(
            "/repository",
            device_map="auto",
            torch_dtype=dtype,
            local_files_only=True,
        )
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        print('loaded model')

        # create inference pipeline
        self.pipeline = pipeline(
            "text-generation", model=self.model, tokenizer=self.tokenizer
        )
        print('created pipeline')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        print('starting to call')
        inputs = data.pop("inputs", data)
        print('inputs: ', inputs)
        parameters = data.pop("parameters", None)

        # pass inputs with any generation kwargs supplied by the caller,
        # falling back to default sampling settings when none are given
        if parameters is not None:
            prediction = self.pipeline(inputs, **parameters)
        else:
            prediction = self.pipeline(
                inputs,
                do_sample=True,
                top_k=10,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                max_length=256,
            )

        # postprocess the prediction
        return prediction
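

# --- Optional local smoke test: a minimal sketch, not part of the Inference Endpoints
# contract. It assumes the model and tokenizer are actually present under /repository and
# that requests follow the usual {"inputs": ..., "parameters": {...}} payload shape this
# handler expects; the prompt text below is only a placeholder.
if __name__ == "__main__":
    handler = EndpointHandler()
    sample_payload = {
        "inputs": "Explain in one sentence what a GPU does.",  # placeholder prompt
        "parameters": {"do_sample": True, "top_k": 10, "max_length": 128},
    }
    # prints a list of {"generated_text": ...} dicts produced by the pipeline
    print(handler(sample_payload))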