from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
import torch
import os

MODEL_ID = "Qwen/Qwen3-32B"


def get_model():
    # Load the weights in bfloat16 and let accelerate shard them across the
    # available devices; low_cpu_mem_usage avoids a full extra copy in host RAM.
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
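

# BitsAndBytesConfig is imported above but never used. A minimal sketch of how
# a 4-bit quantized load could look is given below; the helper name and the
# NF4 settings are assumptions, not part of the original handler.
def get_model_4bit():
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
    )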


# Tokenizer, model, and pipeline are created once at import time so every
# request reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = get_model()

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=getattr(model, "dtype", torch.bfloat16),
)
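
# Qwen3 models ship a chat template. If the endpoint should accept chat-style
# messages rather than raw prompt strings, the prompt could be built roughly
# like this (a sketch, not part of the original handler):
#
#     messages = [{"role": "user", "content": "Hello!"}]
#     prompt = tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
#     outputs = generator(prompt, max_new_tokens=256)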


class EndpointHandler:
    # Assuming this file is a Hugging Face Inference Endpoints handler.py:
    # the service instantiates a class named EndpointHandler and calls it
    # once per request.
    def __init__(self, *args, **kwargs):
        # Everything is loaded at module import time above, so there is
        # nothing left to initialize here.
        pass

    def __call__(self, data):
        # Accept either the {"inputs": "..."} payload shape or a bare string.
        prompt = data.get("inputs") if isinstance(data, dict) else data
        outputs = generator(prompt, max_new_tokens=256)
        return outputs
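

# Hypothetical local smoke test; a deployed Inference Endpoint never runs this
# branch. It sends the same payload shape the endpoint would receive.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "Write a haiku about autumn."})
    print(result)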