Text Generation · Transformers · PyTorch · English · llama · text-generation-inference · Inference Endpoints
Conrad Lippert-Zajaczkowski committed · Commit 229a1e9 · 1 Parent(s): 96da633

gpu tracking

Files changed (1): handler.py (+19 -0)
handler.py CHANGED
@@ -2,23 +2,42 @@ import torch
from typing import Dict, List, Any
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

+from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
+
+nvmlInit()
+gpu_h1 = nvmlDeviceGetHandleByIndex(0)
+
+print('loaded_imports')
# get dtype
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16
+print('chose dtype', dtype)
+

class EndpointHandler:
    def __init__(self, path=""):
        # load the model
+        print('starting to load tokenizer')
        tokenizer = LlamaTokenizer.from_pretrained("/repository/tokenizer", local_files_only=True)
+        print('loaded tokenizer')
+        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
+        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        model = LlamaForCausalLM.from_pretrained(
            "/repository/pytorch_model",
            device_map="auto",
            torch_dtype=dtype,
        )
+        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
+        print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
+
+        print('loaded model')
        # create inference pipeline
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        print('created pipeline')

    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
+        print('starting to call')
        inputs = data.pop("inputs", data)
+        print('inputs: ', inputs)
        parameters = data.pop("parameters", None)

        # pass inputs with all kwargs in data
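
Side note (not part of the commit): a minimal sketch of how the NVML readings added above could be wrapped in a small helper that reports VRAM in GiB instead of raw bytes. The helper name log_vram and its labels are illustrative assumptions; only the pynvml calls mirror those in handler.py.

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

nvmlInit()

def log_vram(device_index: int = 0, label: str = "") -> None:
    # nvmlDeviceGetMemoryInfo reports total/used/free in bytes; convert to GiB for readability
    handle = nvmlDeviceGetHandleByIndex(device_index)
    info = nvmlDeviceGetMemoryInfo(handle)
    gib = 1024 ** 3
    print(f"{label} vram total {info.total / gib:.2f} GiB, "
          f"used {info.used / gib:.2f} GiB, free {info.free / gib:.2f} GiB")

# e.g. call once before and once after LlamaForCausalLM.from_pretrained(...)
# to see how much VRAM the model weights occupy
log_vram(label="before load")

Note that pynvml is not bundled with transformers, so it presumably has to be declared as an extra dependency of the endpoint (e.g., via a requirements.txt in the repository) for these imports to resolve.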