from typing import Dict, List, Any
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch


class EndpointHandler:
    def __init__(self, path=""):
        # Load the model and tokenizer once at startup so inference requests stay fast.
        self.model = AutoModelForCausalLM.from_pretrained("chavinlo/gpt4-x-alpaca")
        self.tokenizer = AutoTokenizer.from_pretrained("chavinlo/gpt4-x-alpaca")
        # Run on GPU when available, otherwise fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # --- Alternative loading strategies kept for reference (commented out) ---
        #
        # 8-bit / offloaded loading of the 4-bit GPTQ checkpoint:
        # quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
        # device_map = {
        #     "transformer.word_embeddings": 0,
        #     "transformer.word_embeddings_layernorm": 0,
        #     "lm_head": "cpu",
        #     "transformer.h": 0,
        #     "transformer.ln_f": 0,
        # }
        # path = "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g"
        # self.model = AutoModelForCausalLM.from_pretrained(
        #     path,
        #     device_map="auto",
        #     load_in_8bit=True,
        #     # kwargs="--wbits 4 --groupsize 128",
        #     # device_map=device_map,
        #     # quantization_config=quantization_config,
        # )
        # self.tokenizer = AutoTokenizer.from_pretrained(path)
        # self.pipeline = pipeline(
        #     "conversational", model=self.model, tokenizer=self.tokenizer,
        #     trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16,
        # )
        #
        # Quick generation smoke test:
        # rep = "anon8231489123/gpt4-x-alpaca-13b-native-4bit-128g"
        # tokenizer = AutoTokenizer.from_pretrained(rep)
        # model = AutoModelForCausalLM.from_pretrained(rep)
        # inputs = tokenizer(["Today is"], return_tensors="pt")
        # reply_ids = model.generate(**inputs, max_new_tokens=590)  # return_dict_in_generate=True, output_scores=True
        # outputs = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]
        # print(outputs)
        #
        # Loading from a local path via a pipeline:
        # modelPath = "/"
        # self.pipeline = pipeline("conversational", model=modelPath)
        #
        # Preload everything needed at inference, e.g.:
        # self.model = load_model(path)
print("end") | |
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)
        # Preprocess: tokenize and move the input ids to the model's device.
        input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids.to(self.device)
        # Generate, forwarding any generation kwargs supplied in "parameters".
        if parameters is not None:
            outputs = self.model.generate(input_ids, **parameters)
        else:
            outputs = self.model.generate(input_ids)
        # Postprocess: decode the generated tokens back to text.
        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": prediction}]