"""FastAPI service exposing a CRF-based named-entity tagger.

Endpoints:
    POST /predict  — tag a pre-tokenized sentence with the loaded CRF model.
    POST /split    — whitespace-tokenize a raw sentence string.
"""

import os

# HF_DATASETS_CACHE must be set BEFORE `datasets` is imported: the library
# reads cache-related environment variables at import time, so setting it
# after the import (as the original code did) has no effect.
os.environ["HF_DATASETS_CACHE"] = "/tmp"

import joblib
from datasets import load_dataset
from fastapi import FastAPI
from pydantic import BaseModel

# --- Load model and label map ---
crf = joblib.load("crf_model.pkl")
raw = load_dataset("DFKI-SLT/few-nerd", "supervised")
# int2str converts numeric ner_tag ids back to their string labels.
# NOTE(review): label_map is not used by the visible endpoints — kept for
# compatibility in case it is imported elsewhere; verify before removing.
label_map = raw['train'].features['ner_tags'].feature.int2str


def word2features(tokens: list[str], i: int) -> dict:
    """Build the CRF feature dict for the token at position *i*.

    Features cover the token's casing/shape plus a one-token window of
    lowercased context; BOS/EOS flags mark the sentence boundaries.

    Args:
        tokens: the full token sequence of one sentence.
        i: index of the token to featurize.

    Returns:
        Mapping of feature name to feature value, as expected by the CRF.
    """
    w = tokens[i]
    f = {
        'word.lower()': w.lower(),
        'word.isupper()': w.isupper(),
        'word.istitle()': w.istitle(),
        'word.isdigit()': w.isdigit(),
        'bias': 1.0,
    }
    if i > 0:
        f['prev.lower()'] = tokens[i - 1].lower()
    else:
        # No previous token: mark beginning-of-sentence.
        f['BOS'] = True
    if i < len(tokens) - 1:
        f['next.lower()'] = tokens[i + 1].lower()
    else:
        # No next token: mark end-of-sentence.
        f['EOS'] = True
    return f


def sentence_to_features(tokens: list[str]) -> list[dict]:
    """Return the per-token feature dicts for one sentence."""
    return [word2features(tokens, i) for i in range(len(tokens))]


# --- API schema ---
class SentenceRequest(BaseModel):
    # Pre-tokenized sentence: one string per token.
    tokens: list[str]


# --- Initialize app ---
app = FastAPI(title="NER with CRF")


@app.post("/predict")
def predict(req: SentenceRequest) -> dict:
    """Tag the request's tokens with the CRF model.

    Returns a JSON object with the echoed tokens and one predicted label
    per token.
    """
    features = [sentence_to_features(req.tokens)]
    y_pred = crf.predict(features)[0]
    # Coerce labels to plain str so the response is JSON-serializable.
    y_pred = list(map(str, y_pred))
    return {
        "tokens": req.tokens,
        "predicted_labels": y_pred,
    }


@app.post("/split")
def split(sent: str) -> dict:
    """Whitespace-tokenize *sent* and return the token list.

    `sent` is a required query parameter; the original left it unannotated,
    which gave FastAPI no type to validate — `str` makes the contract
    explicit without changing the endpoint's behavior.
    """
    tokens = sent.split()
    return {"tokens": tokens}