from fastapi import FastAPI
from onnxruntime import InferenceSession
import numpy as np

app = FastAPI()

# Load the ONNX model once at import time. Only the model is loaded here;
# tokenization is left entirely to the client.
session = InferenceSession("model.onnx")

@app.post("/predict")
async def predict(inputs: dict):
    # Expect pre-tokenized input from the client.
    input_ids = np.array(inputs["input_ids"], dtype=np.int64)
    attention_mask = np.array(inputs["attention_mask"], dtype=np.int64)

    # Run the model; passing None as output_names returns all outputs.
    outputs = session.run(None, {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    })

    return {"embedding": outputs[0].tolist()}
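
# A minimal smoke test, as a sketch: FastAPI's TestClient exercises the
# endpoint in-process without starting a server. The token IDs below are
# illustrative placeholders, not real tokenizer output; a real client
# would tokenize its text with the same tokenizer the ONNX model was
# exported with, then post the resulting integer IDs.
if __name__ == "__main__":
    from fastapi.testclient import TestClient

    client = TestClient(app)
    resp = client.post("/predict", json={
        "input_ids": [[101, 7592, 102]],  # illustrative BERT-style IDs
        "attention_mask": [[1, 1, 1]],
    })
    print(resp.status_code)
    print(resp.json()["embedding"])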