scepter committed
Commit 4235293 · 1 Parent(s): 832013f

Create handler.py

Files changed (1): handler.py (+68 -0)
handler.py ADDED
@@ -0,0 +1,68 @@
+ from typing import Dict, List, Any
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ import torch
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # Load the model and tokenizer once at startup; the local checkout `path` is ignored and weights are pulled from the Hub.
+         self.model = AutoModelForCausalLM.from_pretrained("chavinlo/gpt4-x-alpaca")
+         self.tokenizer = AutoTokenizer.from_pretrained("chavinlo/gpt4-x-alpaca")
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+
+         # Alternative loading paths tried during development, kept for reference:
+         # quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
+         # device_map = {
+         #     "transformer.word_embeddings": 0,
+         #     "transformer.word_embeddings_layernorm": 0,
+         #     "lm_head": "cpu",
+         #     "transformer.h": 0,
+         #     "transformer.ln_f": 0,
+         # }
+         # path = "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g"
+         # self.model = AutoModelForCausalLM.from_pretrained(
+         #     path,
+         #     device_map="auto",
+         #     load_in_8bit=True,
+         #     # kwargs="--wbits 4 --groupsize 128",
+         #     # device_map=device_map,
+         #     # quantization_config=quantization_config,
+         # )
+         # self.tokenizer = AutoTokenizer.from_pretrained(path)
+         # self.pipeline = pipeline(
+         #     "conversational", model=self.model, tokenizer=self.tokenizer,
+         #     trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16,
+         # )
+
+         # Offline generation smoke test:
+         # rep = "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g"
+         # tokenizer = AutoTokenizer.from_pretrained(rep)
+         # model = AutoModelForCausalLM.from_pretrained(rep)
+         # inputs = tokenizer(["Today is"], return_tensors="pt")
+         # reply_ids = model.generate(**inputs, max_new_tokens=590)  # return_dict_in_generate=True, output_scores=True
+         # outputs = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]
+         # print(outputs)
+         # modelPath = "/"
+         # self.pipeline = pipeline("conversational", model=modelPath)
+         # Preload everything needed at inference here. Pseudo:
+         # self.model = load_model(path)
+         print("end")  # marks the end of initialization in the endpoint logs
+
+     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+         inputs = data.pop("inputs", data)
+         parameters = data.pop("parameters", None)
+
+         # preprocess: tokenize and move the tensors to the model's device
+         input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids.to(self.device)
+
+         # pass inputs with any generation kwargs supplied in the request
+         if parameters is not None:
+             outputs = self.model.generate(input_ids, **parameters)
+         else:
+             outputs = self.model.generate(input_ids)
+
+         # postprocess the prediction
+         prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         return [{"generated_text": prediction}]
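
For a quick local check before deploying, the handler can be invoked directly with the same inputs/parameters payload shape that __call__ expects. A minimal sketch, assuming handler.py sits in the working directory and the machine can hold the model in memory; the prompt text and generation settings are illustrative only:

# Local smoke test (hypothetical usage, not part of this commit)
from handler import EndpointHandler

handler = EndpointHandler(path=".")

# The request body mirrors the JSON an Inference Endpoint would receive:
# "inputs" is the prompt, "parameters" holds optional generate() kwargs.
payload = {
    "inputs": "Today is",
    "parameters": {"max_new_tokens": 32},
}

result = handler(payload)
print(result[0]["generated_text"])

Once deployed as an Inference Endpoint, the equivalent request would POST the same JSON body to the endpoint URL.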