|  | from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple | 
					
						
						|  | from spacy.util import registry | 
					
						
						|  | from thinc.types import Floats2d | 
					
						
						|  | from spacy.tokens import Doc | 
					
						
						|  | from spacy.pipeline import TextCategorizer | 
					
						
						|  | from spacy.training import Example, validate_examples | 
					
						
						|  | from spacy.pipeline.textcat import textcat_score | 
					
						
						|  | from spacy.vocab import Vocab | 
					
						
						|  | from spacy.scorer import Scorer | 
					
						
						|  | from spacy.language import Language | 
					
						
						|  | from thinc.api import Model | 
					
						
						|  | import numpy | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | @Language.factory( | 
					
						
						|  | "weighted_textcat", | 
					
						
						|  | assigns=["doc.cats"], | 
					
						
						|  | default_config={ | 
					
						
						|  | "threshold": 0.0, | 
					
						
						|  | "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, | 
					
						
						|  | }, | 
					
						
						|  | default_score_weights={ | 
					
						
						|  | "cats_score": 1.0, | 
					
						
						|  | "cats_score_desc": None, | 
					
						
						|  | "cats_micro_p": None, | 
					
						
						|  | "cats_micro_r": None, | 
					
						
						|  | "cats_micro_f": None, | 
					
						
						|  | "cats_macro_p": None, | 
					
						
						|  | "cats_macro_r": None, | 
					
						
						|  | "cats_macro_f": None, | 
					
						
						|  | "cats_macro_auc": None, | 
					
						
						|  | "cats_f_per_type": None, | 
					
						
						|  | }, | 
					
						
						|  | ) | 
					
						
						|  | def make_textcat( | 
					
						
						|  | nlp: Language, | 
					
						
						|  | name: str, | 
					
						
						|  | model: Model[List[Doc], List[Floats2d]], | 
					
						
						|  | threshold: float, | 
					
						
						|  | scorer: Optional[Callable], | 
					
						
						|  | class_weights: Optional[List], | 
					
						
						|  | ) -> "TextCategorizer": | 
					
						
						|  | """Create a TextCategorizer component. The text categorizer predicts categories | 
					
						
						|  | over a whole document. It can learn one or more labels, and the labels are considered | 
					
						
						|  | to be mutually exclusive (i.e. one true label per doc). | 
					
						
						|  |  | 
					
						
						|  | model (Model[List[Doc], List[Floats2d]]): A model instance that predicts | 
					
						
						|  | scores for each category. | 
					
						
						|  | threshold (float): Cutoff to consider a prediction "positive". | 
					
						
						|  | scorer (Optional[Callable]): The scoring method. | 
					
						
						|  | """ | 
					
						
						|  | if class_weights == "null": | 
					
						
						|  | class_weights = None | 
					
						
						|  | return CustomTextcat( | 
					
						
						|  | nlp.vocab, | 
					
						
						|  | model, | 
					
						
						|  | name, | 
					
						
						|  | threshold=threshold, | 
					
						
						|  | scorer=scorer, | 
					
						
						|  | weights=class_weights, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: | 
					
						
						|  | return Scorer.score_cats( | 
					
						
						|  | examples, | 
					
						
						|  | "cats", | 
					
						
						|  | multi_label=False, | 
					
						
						|  | **kwargs, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | @registry.scorers("spacy.textcat_scorer.v2") | 
					
						
						|  | def make_textcat_scorer(): | 
					
						
						|  | return textcat_score | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class CustomTextcat(TextCategorizer): | 
					
						
						|  | def __init__( | 
					
						
						|  | self, | 
					
						
						|  | vocab: Vocab, | 
					
						
						|  | model: Model, | 
					
						
						|  | name: str = "textcat", | 
					
						
						|  | *, | 
					
						
						|  | threshold: float, | 
					
						
						|  | scorer: Optional[Callable] = textcat_score, | 
					
						
						|  | weights: Optional[List[float]] = None, | 
					
						
						|  | ) -> None: | 
					
						
						|  | """Initialize a text categorizer for single-label classification. | 
					
						
						|  |  | 
					
						
						|  | vocab (Vocab): The shared vocabulary. | 
					
						
						|  | model (thinc.api.Model): The Thinc Model powering the pipeline component. | 
					
						
						|  | name (str): The component instance name, used to add entries to the | 
					
						
						|  | losses during training. | 
					
						
						|  | threshold (float): Unused, not needed for single-label (exclusive | 
					
						
						|  | classes) classification. | 
					
						
						|  | scorer (Optional[Callable]): The scoring method. Defaults to | 
					
						
						|  | Scorer.score_cats for the attribute "cats". | 
					
						
						|  |  | 
					
						
						|  | DOCS: https://spacy.io/api/textcategorizer#init | 
					
						
						|  | """ | 
					
						
						|  | self.vocab = vocab | 
					
						
						|  | self.model = model | 
					
						
						|  | self.name = name | 
					
						
						|  | self._rehearsal_model = None | 
					
						
						|  | cfg: Dict[str, Any] = { | 
					
						
						|  | "labels": [], | 
					
						
						|  | "threshold": threshold, | 
					
						
						|  | "positive_label": None, | 
					
						
						|  | } | 
					
						
						|  | self.cfg = dict(cfg) | 
					
						
						|  | self.scorer = scorer | 
					
						
						|  | if weights is not None: | 
					
						
						|  | self.weights = numpy.array(weights) | 
					
						
						|  |  | 
					
						
						|  | def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]: | 
					
						
						|  | """Find the loss and gradient of loss for the batch of documents and | 
					
						
						|  | their predicted scores. | 
					
						
						|  |  | 
					
						
						|  | examples (Iterable[Examples]): The batch of examples. | 
					
						
						|  | scores: Scores representing the model's predictions. | 
					
						
						|  | RETURNS (Tuple[float, float]): The loss and the gradient. | 
					
						
						|  |  | 
					
						
						|  | DOCS: https://spacy.io/api/textcategorizer#get_loss | 
					
						
						|  | """ | 
					
						
						|  | validate_examples(examples, "TextCategorizer.get_loss") | 
					
						
						|  | self._validate_categories(examples) | 
					
						
						|  | truths, not_missing = self._examples_to_truth(examples) | 
					
						
						|  | not_missing = self.model.ops.asarray(not_missing) | 
					
						
						|  | d_scores = scores - truths | 
					
						
						|  | d_scores *= not_missing | 
					
						
						|  | weights = self.model.ops.asarray(self.weights) | 
					
						
						|  | if weights is not None: | 
					
						
						|  | squared = d_scores**2 | 
					
						
						|  | mean_square_error = numpy.sum(squared * weights) / ( | 
					
						
						|  | numpy.sum(weights) * len(squared) | 
					
						
						|  | ) | 
					
						
						|  | d_scores *= weights | 
					
						
						|  | else: | 
					
						
						|  | mean_square_error = (d_scores**2).mean() | 
					
						
						|  | return float(mean_square_error), d_scores | 
					
						
						|  |  |