"""FastAPI service that returns an embedding for a text query via ONNX Runtime."""

import json

import numpy as np
from fastapi import FastAPI
from onnxruntime import InferenceSession
from transformers import AutoTokenizer

app = FastAPI()

# Initialize components once at import time so every request reuses them.
tokenizer = AutoTokenizer.from_pretrained(
    "Xenova/multi-qa-mpnet-base-dot-v1",
    use_fast=False,  # Avoids framework dependencies
)
session = InferenceSession("model.onnx")

# Input names declared by the ONNX graph. Computed once because the graph
# does not change after the session is created.
_MODEL_INPUT_NAMES = frozenset(node.name for node in session.get_inputs())


def cosine_similarity(a, b):
    """Return the cosine similarity between *a* and *b*.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as *a*.

    Returns:
        ``dot(a, b) / (‖a‖ · ‖b‖)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and would otherwise be ``nan``).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator


@app.post("/predict")
async def predict(query: str):
    """Embed *query* and return the vector as a JSON-serializable list.

    Args:
        query: Text to embed.

    Returns:
        ``{"embedding": [...]}`` where the list is taken from the first
        model output, first batch row.

    NOTE(review): tokenization and inference are synchronous calls with no
    ``await``, so they run on the event loop for their full duration.
    """
    # Tokenize. ``truncation=True`` caps the sequence at the tokenizer's
    # configured maximum length instead of passing an over-long query on.
    encoded = tokenizer(query, truncation=True, return_tensors="np")

    # Feed only the tensors the graph declares; ONNX Runtime rejects feed
    # names it does not know. Cast to int64 as the original code did.
    feeds = {
        name: array.astype(np.int64)
        for name, array in encoded.items()
        if name in _MODEL_INPUT_NAMES
    }

    # Get embedding: first output, first (only) batch row.
    # NOTE(review): if the first output is per-token hidden states, this
    # additional [0] selects the first token's vector — confirm against
    # model.onnx.
    embedding = session.run(None, feeds)[0][0]
    return {"embedding": embedding.tolist()}