from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
import torch

MODEL_ID = "Qwen/Qwen3-32B"  # replace with your own model


def get_model():
    # ① Try bfloat16 first; both A100 and H100 support it natively
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",        # same logic as TGI: shard automatically across GPUs
        low_cpu_mem_usage=True,   # instantiate on CPU first, then stream weights to the GPU
        trust_remote_code=True,
    )


# ---- If bfloat16 still OOMs, switch to 4-bit quantization ----
# bnb_cfg = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )
# def get_model():
#     return AutoModelForCausalLM.from_pretrained(
#         MODEL_ID,
#         device_map="auto",
#         quantization_config=bnb_cfg,
#         trust_remote_code=True,
#     )

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = get_model()

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # The model is already dispatched by device_map="auto" in get_model(),
    # so it is not passed to the pipeline again here.
    torch_dtype=getattr(model, "dtype", torch.bfloat16),
)


# Wrapped in a class so `self` resolves; this matches the EndpointHandler
# contract used by Hugging Face Inference Endpoints custom handlers (handler.py).
class EndpointHandler:
    def __init__(self, *args, **kwargs):
        # Model, tokenizer, and pipeline are created once at module import time above.
        pass

    def __call__(self, data):
        # Accept either a raw string or the {"inputs": ...} request payload.
        prompt = data.get("inputs") if isinstance(data, dict) else data
        outputs = generator(prompt, max_new_tokens=256)
        return outputs
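

# --- Usage sketch (assumption: this file is deployed as a custom handler such as
# handler.py on a Hugging Face Inference Endpoint). The block below only shows how
# the handler could be exercised locally; the {"inputs": ...} payload shape mirrors
# what __call__ above expects. ---
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "Briefly explain why bfloat16 is preferred on A100/H100 GPUs."})
    print(result)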