from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple

import numpy
from spacy.language import Language
from spacy.pipeline import TextCategorizer
from spacy.pipeline.textcat import textcat_score
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training import Example, validate_examples
from spacy.util import registry
from spacy.vocab import Vocab
from thinc.api import Model
from thinc.types import Floats2d


@Language.factory(
    "weighted_textcat",
    assigns=["doc.cats"],
    default_config={
        "threshold": 0.0,
        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
    },
    default_score_weights={
        "cats_score": 1.0,
        "cats_score_desc": None,
        "cats_micro_p": None,
        "cats_micro_r": None,
        "cats_micro_f": None,
        "cats_macro_p": None,
        "cats_macro_r": None,
        "cats_macro_f": None,
        "cats_macro_auc": None,
        "cats_f_per_type": None,
    },
)
def make_textcat(
    nlp: Language,
    name: str,
    model: Model[List[Doc], List[Floats2d]],
    threshold: float,
    scorer: Optional[Callable],
    class_weights: Optional[List],
) -> "TextCategorizer":
    """Create a TextCategorizer component. The text categorizer predicts categories
    over a whole document. It can learn one or more labels, and the labels are considered
    to be mutually exclusive (i.e. one true label per doc).

    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        scores for each category.
    threshold (float): Cutoff to consider a prediction "positive".
    scorer (Optional[Callable]): The scoring method.
    class_weights (Optional[List]): Per-label weights applied to the loss and
        its gradient. The literal string "null" is treated the same as None.
    RETURNS (TextCategorizer): A CustomTextcat instance sharing nlp.vocab.
    """
    # The string "null" is accepted as a stand-in for "no weights".
    if class_weights == "null":
        class_weights = None
    return CustomTextcat(
        nlp.vocab,
        model,
        name,
        threshold=threshold,
        scorer=scorer,
        weights=class_weights,
    )


# NOTE: this definition intentionally shadows the `textcat_score` imported
# from spacy.pipeline.textcat above; from here on the name refers to this one.
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    """Score the "cats" attribute of the examples as single-label categories.

    examples (Iterable[Example]): The examples to score.
    kwargs: Extra keyword arguments forwarded to Scorer.score_cats.
    RETURNS (Dict[str, Any]): The scores produced by Scorer.score_cats.
    """
    return Scorer.score_cats(
        examples,
        "cats",
        multi_label=False,
        **kwargs,
    )


@registry.scorers("spacy.textcat_scorer.v2")
def make_textcat_scorer():
    """Return the scoring function registered as "spacy.textcat_scorer.v2"."""
    return textcat_score


class CustomTextcat(TextCategorizer):
    """TextCategorizer whose loss and gradient can be scaled by class weights."""

    def __init__(
        self,
        vocab: Vocab,
        model: Model,
        name: str = "textcat",
        *,
        threshold: float,
        scorer: Optional[Callable] = textcat_score,
        weights: Optional[List[float]] = None,
    ) -> None:
        """Initialize a text categorizer for single-label classification.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        threshold (float): Unused, not needed for single-label (exclusive
            classes) classification.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_cats for the attribute "cats".
        weights (Optional[List[float]]): Weights multiplied into the loss and
            gradient in get_loss. None disables weighting.

        DOCS: https://spacy.io/api/textcategorizer#init
        """
        self.vocab = vocab
        self.model = model
        self.name = name
        self._rehearsal_model = None
        cfg: Dict[str, Any] = {
            "labels": [],
            "threshold": threshold,
            "positive_label": None,
        }
        self.cfg = dict(cfg)
        self.scorer = scorer
        # Always define the attribute so get_loss can test it; previously it
        # was left unset when no weights were given, which made get_loss raise
        # AttributeError.
        self.weights = None if weights is None else numpy.array(weights)

    def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        When weights were configured, the squared errors are weighted and
        normalised by (sum of weights * number of rows), and the gradient is
        scaled by the weights. Otherwise the plain mean squared error is used.

        examples (Iterable[Examples]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient. The gradient
            is the (optionally weighted) array of score differences.

        DOCS: https://spacy.io/api/textcategorizer#get_loss
        """
        validate_examples(examples, "TextCategorizer.get_loss")
        self._validate_categories(examples)
        truths, not_missing = self._examples_to_truth(examples)
        not_missing = self.model.ops.asarray(not_missing)  # type: ignore
        d_scores = scores - truths
        # Zero out the gradient for categories without a gold annotation.
        d_scores *= not_missing
        if self.weights is None:
            mean_square_error = (d_scores**2).mean()
        else:
            # Convert only when weights exist: the None check has to happen
            # before asarray, otherwise it can never be false.
            # Assumes weights broadcast against d_scores (one weight per
            # label) -- TODO confirm against the configured labels.
            weights = self.model.ops.asarray(self.weights)  # type: ignore
            squared = d_scores**2
            mean_square_error = (squared * weights).sum() / (
                weights.sum() * len(squared)
            )
            d_scores *= weights
        return float(mean_square_error), d_scores