Tags: Text Generation · Transformers · PyTorch · English · llama · text-generation-inference · Inference Endpoints
Commit 48f8e5d by Conrad Lippert-Zajaczkowski · 1 parent: e31fd99

test specific

Files changed (1): handler.py (+3 −2)
handler.py CHANGED
@@ -17,14 +17,15 @@ class EndpointHandler:
     def __init__(self, path=""):
         # load the model
         print('starting to load tokenizer')
-        tokenizer = LlamaTokenizer.from_pretrained("/repository/tokenizer", local_files_only=True)
+        tokenizer = LlamaTokenizer.from_pretrained(".", local_files_only=True)
         print('loaded tokenizer')
         gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
         print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
         model = LlamaForCausalLM.from_pretrained(
-            "/repository/pytorch_model",
+            ".",
             device_map="auto",
             torch_dtype=dtype,
+            offload_folder="offload"
         )
         gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
         print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
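For context, here is a minimal sketch of what handler.py might look like after this commit. Only the `__init__` body mirrors the diff; everything around it (the imports, the `nvmlInit` call, `gpu_h1`, `dtype`, and the trailing `self.*` assignments) is an assumption filled in so the snippet runs on its own.

```python
# Hypothetical reconstruction of handler.py after commit 48f8e5d.
# Only the __init__ body comes from the diff; the surrounding setup
# (imports, nvmlInit, gpu_h1, dtype) is assumed.
import torch
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from transformers import LlamaForCausalLM, LlamaTokenizer

nvmlInit()
gpu_h1 = nvmlDeviceGetHandleByIndex(0)  # GPU 0, queried before and after loading
dtype = torch.float16                   # assumed; the diff only shows torch_dtype=dtype


class EndpointHandler:
    def __init__(self, path=""):
        # load the model
        print('starting to load tokenizer')
        tokenizer = LlamaTokenizer.from_pretrained(".", local_files_only=True)
        print('loaded tokenizer')
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        model = LlamaForCausalLM.from_pretrained(
            ".",
            device_map="auto",
            torch_dtype=dtype,
            # new in this commit: spill layers that don't fit in VRAM to disk
            offload_folder="offload",
        )
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        # presumably kept for the request handler; not shown in the hunk
        self.tokenizer = tokenizer
        self.model = model
```

Switching both `from_pretrained` paths from `/repository/...` to `"."` implies the tokenizer and weight files now sit in the handler's working directory, and `offload_folder` presumably gives accelerate a place to spill weights to disk when GPU memory runs short.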