yuzhe committed
Commit a55dc79 · verified · 1 Parent(s): 77c95fb

Update handler.py

Files changed (1)
  1. handler.py +57 -22
handler.py CHANGED
@@ -1,29 +1,64 @@
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+# handler.py: lives at the root of the model repository
+from typing import Dict, Any
+import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+
+
 class EndpointHandler:
-    def __init__(self, model_dir: str, **kw):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+    """
+    Custom entry point expected by Hugging Face Inference Endpoints:
+      • __init__(model_dir, **kwargs): load the model
+      • __call__(inputs: Dict) -> Dict: handle a single request
+    """
+
+    def __init__(self, model_dir: str, **kwargs):
+        # 1️⃣ Tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_dir, trust_remote_code=True
+        )
+
+        # 2️⃣ Build an empty "shell" model (allocates no GPU memory)
         with init_empty_weights():
-            model = AutoModelForCausalLM.from_pretrained(
-                model_dir, torch_dtype="auto", trust_remote_code=True
+            base_model = AutoModelForCausalLM.from_pretrained(
+                model_dir,
+                torch_dtype=torch.float16,
+                trust_remote_code=True,
             )
+
+        # 3️⃣ Load the weight shards onto the two GPUs
         self.model = load_checkpoint_and_dispatch(
-            model, checkpoint=model_dir, device_map="auto"
-        )  # automatically shards layers across the GPUs
-    def __call__(self, data):
-        prompt = data["inputs"]
-
-        inputs = self.tokenizer(
-            prompt, return_tensors="pt"
-        ).to("cuda:0")  # 👈 put both input_ids and attention_mask on card 0
-
-        out_ids = self.model.generate(
-            **inputs,
-            max_new_tokens=256,
-        )
-        return {
-            "generated_text": self.tokenizer.decode(
-                out_ids[0], skip_special_tokens=True
-            )
-        }
+            base_model,
+            checkpoint=model_dir,
+            device_map="auto",  # automatically split layers across cuda:0 / cuda:1
+            dtype=torch.float16,
+        )
+
+        # 4️⃣ Default generation parameters
+        self.generation_kwargs = dict(
+            max_new_tokens=2048,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+        )
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+        """
+        Expected payload:
+        {
+            "inputs": "your prompt here"
+        }
+        """
+        prompt = data["inputs"]
+
+        # ➡️ Move only the input tensors to cuda:0 (same card as the model's first layer)
+        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda:0")
+
+        # Generate
+        with torch.inference_mode():
+            output_ids = self.model.generate(**inputs, **self.generation_kwargs)
+
+        generated_text = self.tokenizer.decode(
+            output_ids[0], skip_special_tokens=True
+        )
+        return {"generated_text": generated_text}