Tags: Text Generation · Transformers · PyTorch · English · llama · text-generation-inference
Conrad Lippert-Zajaczkowski committed ba9cbaf · 1 parent: 26b366e

refined input

Files changed (1): handler.py (+12 -5)
handler.py CHANGED
@@ -17,11 +17,11 @@ class EndpointHandler:
     def __init__(self, path=""):
         # load the model
         print('starting to load tokenizer')
-        tokenizer = LlamaTokenizer.from_pretrained("/repository/orca_tokenizer", local_files_only=True)
+        self.tokenizer = LlamaTokenizer.from_pretrained("/repository/orca_tokenizer", local_files_only=True)
         print('loaded tokenizer')
         gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
         print(f'vram {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
-        model = LlamaForCausalLM.from_pretrained(
+        self.model = LlamaForCausalLM.from_pretrained(
             "/repository",
             device_map="auto",
             torch_dtype=dtype,
@@ -33,10 +33,10 @@ class EndpointHandler:

         print('loaded model')
         # create inference pipeline
-        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        self.pipeline = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
         print('created pipeline')

-    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         print('starting to call')
         inputs = data.pop("inputs", data)
         print('inputs: ', inputs)
@@ -46,6 +46,13 @@ class EndpointHandler:
         if parameters is not None:
             prediction = self.pipeline(inputs, **parameters)
         else:
-            prediction = self.pipeline(inputs)
+            prediction = self.pipeline(
+                inputs,
+                do_sample=True,
+                top_k=10,
+                num_return_sequences=1,
+                eos_token_id=self.tokenizer.eos_token_id,
+                max_length=200,
+            )
         # postprocess the prediction
         return prediction
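
For reference, a minimal sketch of how the refined handler might be exercised locally. The import path, instantiation, and payloads below are illustrative assumptions, not part of the commit; it also assumes the elided lines around old line 45 read "parameters" out of the request dict, as the `if parameters is not None:` branch implies.

# Hypothetical local smoke test for the handler above (not from the commit).
# Assumes handler.py is importable and that the model/tokenizer paths it
# hardcodes ("/repository", "/repository/orca_tokenizer") exist locally.
from handler import EndpointHandler

handler = EndpointHandler(path="/repository")

# No "parameters" key: the new defaults apply
# (do_sample=True, top_k=10, num_return_sequences=1, max_length=200).
default_out = handler({"inputs": "Explain LLaMA in one sentence."})

# An explicit "parameters" dict is forwarded to the pipeline unchanged.
custom_out = handler({
    "inputs": "Explain LLaMA in one sentence.",
    "parameters": {"do_sample": True, "top_k": 50, "max_length": 120},
})

# Per the new annotation, each call returns List[Dict[str, Any]],
# e.g. [{"generated_text": "..."}] for a text-generation pipeline.
print(default_out[0]["generated_text"])

Binding the tokenizer and model to self (rather than leaving them as locals in __init__) also keeps them reachable after construction, which is what lets the default branch in __call__ reference self.tokenizer.eos_token_id.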