dordonezc
/

Phi-3-mini-128k-instruct-4-endpoints

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

dordonezc committed on Jun 21, 2024

Commit

ce570f0

·

verified ·

1 Parent(s): a53dddc

Quantization

Files changed (1) hide show

handler.py +2 -1

handler.py CHANGED Viewed

@@ -4,7 +4,8 @@ from typing import Dict, List, Any
 class EndpointHandler():
     def __init__(self, path=""):
-      self.model = AutoModelForCausalLM.from_pretrained(path, device_map="cuda", torch_dtype="auto", trust_remote_code=True)
       self.tokenizer = AutoTokenizer.from_pretrained(path)
       self.pipeline = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)

 class EndpointHandler():
     def __init__(self, path=""):
+      self.quant = quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+      self.model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", quantization_config=self.quant, trust_remote_code=True)
       self.tokenizer = AutoTokenizer.from_pretrained(path)
       self.pipeline = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)