presidio_demo

Sleeping

App Files Files Community

presidio committed on Apr 4, 2023

Commit

1215818

1 Parent(s): cf32ce5

Upload 7 files

Browse files

Files changed (7) hide show

demo_text.txt +12 -0
index.md +22 -0
presidio_streamlit.py +293 -0
requirements.txt +8 -0
transformers_rec/__init__.py +5 -0
transformers_rec/configuration.py +116 -0
transformers_rec/transformers_recognizer.py +324 -0

demo_text.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+Here are a few example sentences we currently support:
+Hello, my name is David Johnson and I live in Maine.
+My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+On September 18 I visited microsoft.com and sent an email to [email protected],  from the IP 192.168.0.1.
+My passport: 191280342 and my phone number: (212) 555-1234.
+This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
+Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.

index.md ADDED Viewed

	@@ -0,0 +1,22 @@

+# Simple demo website for Presidio
+Here's a simple app, written in pure Python, to create a demo website for Presidio.
+The app is based on the [streamlit](https://streamlit.io/) package.
+## Requirements
+1. Install dependencies (preferably in a virtual environment)
+```sh
+pip install streamlit pandas presidio-analyzer presidio-anonymizer st-annotated-text torch transformers
+```
+2. Download the [presidio_streamlit.py](presidio_streamlit.py) file.
+3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation
+4. Start the app:
+```sh
+streamlit run presidio_streamlit.py
+```
+## Output
+Output should be similar to this screenshot:
+![image](https://user-images.githubusercontent.com/3776619/120109161-efe21080-c170-11eb-8a29-9eaf71e722ee.png)

presidio_streamlit.py ADDED Viewed

	@@ -0,0 +1,293 @@

+"""Streamlit app for Presidio."""
+from json import JSONEncoder
+from typing import List
+import pandas as pd
+import spacy
+import streamlit as st
+from annotated_text import annotated_text
+from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+from transformers_rec import (
+    STANFORD_COFIGURATION,
+    TransformersRecognizer,
+    BERT_DEID_CONFIGURATION,
+)
+# Helper methods
+@st.cache_resource
+def analyzer_engine(model_path: str):
+    """Return AnalyzerEngine.
+    :param model_path: Which model to use for NER:
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg"
+    """
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    # Set up NLP Engine according to the model of choice
+    if model_path == "en_core_web_lg":
+        if not spacy.util.is_package("en_core_web_lg"):
+            spacy.cli.download("en_core_web_lg")
+        nlp_configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
+        }
+    else:
+        if not spacy.util.is_package("en_core_web_sm"):
+            spacy.cli.download("en_core_web_sm")
+        # Using a small spaCy model + a HF NER model
+        transformers_recognizer = TransformersRecognizer(model_path=model_path)
+        if model_path == "StanfordAIMI/stanford-deidentifier-base":
+            transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
+        elif model_path == "obi/deid_roberta_i2b2":
+            transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+        # Use small spaCy model, no need for both spacy and HF models
+        # The transformers model is used here as a recognizer, not as an NlpEngine
+        nlp_configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+        }
+        registry.add_recognizer(transformers_recognizer)
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
+    return analyzer
+@st.cache_resource
+def anonymizer_engine():
+    """Return AnonymizerEngine."""
+    return AnonymizerEngine()
+@st.cache_data
+def get_supported_entities():
+    """Return supported entities from the Analyzer Engine."""
+    return analyzer_engine(st_model).get_supported_entities()
+@st.cache_data
+def analyze(**kwargs):
+    """Analyze input using Analyzer engine and input arguments (kwargs)."""
+    if "entities" not in kwargs or "All" in kwargs["entities"]:
+        kwargs["entities"] = None
+    return analyzer_engine(st_model).analyze(**kwargs)
+def anonymize(text: str, analyze_results: List[RecognizerResult]):
+    """Anonymize identified input using Presidio Anonymizer.
+    :param text: Full text
+    :param analyze_results: list of results from presidio analyzer engine
+    """
+    if st_operator == "mask":
+        operator_config = {
+            "type": "mask",
+            "masking_char": st_mask_char,
+            "chars_to_mask": st_number_of_chars,
+            "from_end": False,
+        }
+    elif st_operator == "encrypt":
+        operator_config = {"key": st_encrypt_key}
+    elif st_operator == "highlight":
+        operator_config = {"lambda": lambda x: x}
+    else:
+        operator_config = None
+    if st_operator == "highlight":
+        operator = "custom"
+    else:
+        operator = st_operator
+    res = anonymizer_engine().anonymize(
+        text,
+        analyze_results,
+        operators={"DEFAULT": OperatorConfig(operator, operator_config)},
+    )
+    return res
+def annotate(text: str, analyze_results: List[RecognizerResult]):
+    """
+    Highlights every identified entity on top of the text.
+    :param text: full text
+    :param analyze_results: list of analyzer results.
+    """
+    tokens = []
+    # Use the anonymizer to resolve overlaps
+    results = anonymize(text, analyze_results)
+    # sort by start index
+    results = sorted(results.items, key=lambda x: x.start)
+    for i, res in enumerate(results):
+        if i == 0:
+            tokens.append(text[: res.start])
+        # append entity text and entity type
+        tokens.append((text[res.start: res.end], res.entity_type))
+        # if another entity coming i.e. we're not at the last results element, add text up to next entity
+        if i != len(results) - 1:
+            tokens.append(text[res.end: results[i + 1].start])
+        # if no more entities coming, add all remaining text
+        else:
+            tokens.append(text[res.end:])
+    return tokens
+st.set_page_config(page_title="Presidio demo", layout="wide")
+# Sidebar
+st.sidebar.header(
+    """
+PII De-Identification with Microsoft Presidio
+"""
+)
+st.sidebar.info(
+    "Presidio is an open source customizable framework for PII detection and de-identification\n"
+    "[Code](https://aka.ms/presidio) | "
+    "[Tutorial](https://microsoft.github.io/presidio/tutorial/) | "
+    "[Installation](https://microsoft.github.io/presidio/installation/) | "
+    "[FAQ](https://microsoft.github.io/presidio/faq/)",
+    icon="ℹ️",
+)
+st.sidebar.markdown(
+    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"
+    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT)"
+    "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
+)
+st_model = st.sidebar.selectbox(
+    "NER model",
+    [
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg",
+    ],
+    index=1,
+)
+st.sidebar.markdown("> Note: Models might take some time to download. ")
+st_operator = st.sidebar.selectbox(
+    "De-identification approach",
+    ["redact", "replace", "mask", "hash", "encrypt", "highlight"],
+    index=1,
+)
+if st_operator == "mask":
+    st_number_of_chars = st.sidebar.number_input(
+        "number of chars", value=15, min_value=0, max_value=100
+    )
+    st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
+elif st_operator == "encrypt":
+    st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")
+st_threshold = st.sidebar.slider(
+    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
+)
+st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations to findings", value=False
+)
+st_entities = st.sidebar.multiselect(
+    label="Which entities to look for?",
+    options=get_supported_entities(),
+    default=list(get_supported_entities()),
+)
+# Main panel
+analyzer_load_state = st.info("Starting Presidio analyzer...")
+engine = analyzer_engine(model_path=st_model)
+analyzer_load_state.empty()
+# Read default text
+with open("demo_text.txt") as f:
+    demo_text = f.readlines()
+# Create two columns for before and after
+col1, col2 = st.columns(2)
+# Before:
+col1.subheader("Input string:")
+st_text = col1.text_area(
+    label="Enter text",
+    value="".join(demo_text),
+    height=400,
+)
+st_analyze_results = analyze(
+    text=st_text,
+    entities=st_entities,
+    language="en",
+    score_threshold=st_threshold,
+    return_decision_process=st_return_decision_process,
+)
+# After
+if st_operator != "highlight":
+    with col2:
+        st.subheader(f"Output")
+        st_anonymize_results = anonymize(st_text, st_analyze_results)
+        st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
+else:
+    st.subheader("Highlighted")
+    annotated_tokens = annotate(st_text, st_analyze_results)
+    # annotated_tokens
+    annotated_text(*annotated_tokens)
+# json result
+class ToDictEncoder(JSONEncoder):
+    """Encode dict to json."""
+    def default(self, o):
+        """Encode to JSON using to_dict."""
+        return o.to_dict()
+# table result
+st.subheader(
+    "Findings" if not st_return_decision_process else "Findings with decision factors"
+)
+if st_analyze_results:
+    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+    df["text"] = [st_text[res.start: res.end] for res in st_analyze_results]
+    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+        {
+            "entity_type": "Entity type",
+            "text": "Text",
+            "start": "Start",
+            "end": "End",
+            "score": "Confidence",
+        },
+        axis=1,
+    )
+    df_subset["Text"] = [st_text[res.start: res.end] for res in st_analyze_results]
+    if st_return_decision_process:
+        analysis_explanation_df = pd.DataFrame.from_records(
+            [r.analysis_explanation.to_dict() for r in st_analyze_results]
+        )
+        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+else:
+    st.text("No findings")

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+presidio-analyzer
+presidio-anonymizer
+streamlit
+pandas
+st-annotated-text
+faker
+torch
+transformers

transformers_rec/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .configuration import BERT_DEID_CONFIGURATION, STANFORD_COFIGURATION
+from .transformers_recognizer import TransformersRecognizer
+__all__ = ["BERT_DEID_CONFIGURATION", "STANFORD_COFIGURATION", "TransformersRecognizer"]

transformers_rec/configuration.py ADDED Viewed

	@@ -0,0 +1,116 @@

+STANFORD_COFIGURATION = {
+    "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
+    "PRESIDIO_SUPPORTED_ENTITIES": [
+        "LOCATION",
+        "PERSON",
+        "ORGANIZATION",
+        "AGE",
+        "PHONE_NUMBER",
+        "EMAIL",
+        "DATE_TIME",
+        "DEVICE",
+        "ZIP",
+        "PROFESSION",
+        "USERNAME"
+    ],
+    "LABELS_TO_IGNORE": ["O"],
+    "DEFAULT_EXPLANATION": "Identified as {} by the StanfordAIMI/stanford-deidentifier-base NER model",
+    "SUB_WORD_AGGREGATION": "simple",
+    "DATASET_TO_PRESIDIO_MAPPING": {
+        "DATE": "DATE_TIME",
+        "DOCTOR": "PERSON",
+        "PATIENT": "PERSON",
+        "HOSPITAL": "LOCATION",
+        "MEDICALRECORD": "O",
+        "IDNUM": "O",
+        "ORGANIZATION": "ORGANIZATION",
+        "ZIP": "ZIP",
+        "PHONE": "PHONE_NUMBER",
+        "USERNAME": "USERNAME",
+        "STREET": "LOCATION",
+        "PROFESSION": "PROFESSION",
+        "COUNTRY": "LOCATION",
+        "LOCATION-OTHER": "LOCATION",
+        "FAX": "PHONE_NUMBER",
+        "EMAIL": "EMAIL",
+        "STATE": "LOCATION",
+        "DEVICE": "DEVICE",
+        "ORG": "ORGANIZATION",
+        "AGE": "AGE",
+    },
+    "MODEL_TO_PRESIDIO_MAPPING": {
+        "PER": "PERSON",
+        "PERSON": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+        "AGE": "AGE",
+        "PATIENT": "PERSON",
+        "HCW": "PERSON",
+        "HOSPITAL": "LOCATION",
+        "PATORG": "ORGANIZATION",
+        "DATE": "DATE_TIME",
+        "PHONE": "PHONE_NUMBER",
+        "VENDOR": "ORGANIZATION",
+    },
+    "CHUNK_OVERLAP_SIZE": 40,
+    "CHUNK_SIZE": 600,
+}
+BERT_DEID_CONFIGURATION = {
+    "PRESIDIO_SUPPORTED_ENTITIES": [
+        "LOCATION",
+        "PERSON",
+        "ORGANIZATION",
+        "AGE",
+        "PHONE_NUMBER",
+        "EMAIL",
+        "DATE_TIME",
+        "ZIP",
+        "PROFESSION",
+        "USERNAME",
+    ],
+    "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
+    "LABELS_TO_IGNORE": ["O"],
+    "DEFAULT_EXPLANATION": "Identified as {} by the obi/deid_roberta_i2b2 NER model",
+    "SUB_WORD_AGGREGATION": "simple",
+    "DATASET_TO_PRESIDIO_MAPPING": {
+        "DATE": "DATE_TIME",
+        "DOCTOR": "PERSON",
+        "PATIENT": "PERSON",
+        "HOSPITAL": "ORGANIZATION",
+        "MEDICALRECORD": "O",
+        "IDNUM": "O",
+        "ORGANIZATION": "ORGANIZATION",
+        "ZIP": "O",
+        "PHONE": "PHONE_NUMBER",
+        "USERNAME": "",
+        "STREET": "LOCATION",
+        "PROFESSION": "PROFESSION",
+        "COUNTRY": "LOCATION",
+        "LOCATION-OTHER": "LOCATION",
+        "FAX": "PHONE_NUMBER",
+        "EMAIL": "EMAIL",
+        "STATE": "LOCATION",
+        "DEVICE": "O",
+        "ORG": "ORGANIZATION",
+        "AGE": "AGE",
+    },
+    "MODEL_TO_PRESIDIO_MAPPING": {
+        "PER": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+        "AGE": "AGE",
+        "ID": "O",
+        "EMAIL": "EMAIL",
+        "PATIENT": "PERSON",
+        "STAFF": "PERSON",
+        "HOSP": "ORGANIZATION",
+        "PATORG": "ORGANIZATION",
+        "DATE": "DATE_TIME",
+        "PHONE": "PHONE_NUMBER",
+    },
+    "CHUNK_OVERLAP_SIZE": 40,
+    "CHUNK_SIZE": 600,
+}

transformers_rec/transformers_recognizer.py ADDED Viewed

	@@ -0,0 +1,324 @@

+import copy
+import logging
+from typing import Optional, List
+import torch
+from presidio_analyzer import (
+    RecognizerResult,
+    EntityRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpArtifacts
+from .configuration import BERT_DEID_CONFIGURATION
+logger = logging.getLogger("presidio-analyzer")
+try:
+    from transformers import (
+        AutoTokenizer,
+        AutoModelForTokenClassification,
+        pipeline,
+        TokenClassificationPipeline,
+    )
+except ImportError:
+    logger.error("transformers_rec is not installed")
+class TransformersRecognizer(EntityRecognizer):
+    """
+    Wrapper for a transformers_rec model, if needed to be used within Presidio Analyzer.
+    The class loads models hosted on HuggingFace - https://huggingface.co/
+    and loads the model and tokenizer into a TokenClassification pipeline.
+    Samples are split into short text chunks, ideally shorter than max_length input_ids of the individual model,
+    to avoid truncation by the Tokenizer and loss of information
+    A configuration object should be maintained for each dataset-model combination and translate
+    entities names into a standardized view. A sample of a configuration file is attached in
+    the example.
+    :param supported_entities: List of entities to run inference on
+    :type supported_entities: Optional[List[str]]
+    :param pipeline: Instance of a TokenClassificationPipeline including a Tokenizer and a Model, defaults to None
+    :type pipeline: Optional[TokenClassificationPipeline], optional
+    :param model_path: string referencing a HuggingFace uploaded model to be used for Inference, defaults to None
+    :type model_path: Optional[str], optional
+    :example
+    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+    >model_path = "obi/deid_roberta_i2b2"
+    >transformers_recognizer = TransformersRecognizer(model_path=model_path,
+    >supported_entities = model_configuration.get("PRESIDIO_SUPPORTED_ENTITIES"))
+    >transformers_recognizer.load_transformer(**model_configuration)
+    >registry = RecognizerRegistry()
+    >registry.add_recognizer(transformers_recognizer)
+    >analyzer = AnalyzerEngine(registry=registry)
+    >sample = "My name is Christopher and I live in Irbid."
+    >results = analyzer.analyze(sample, language="en",return_decision_process=True)
+    >for result in results:
+    >    print(result,'----', sample[result.start:result.end])
+    """
+    def load(self) -> None:
+        pass
+    def __init__(
+        self,
+        model_path: Optional[str] = None,
+        pipeline: Optional[TokenClassificationPipeline] = None,
+        supported_entities: Optional[List[str]] = None,
+    ):
+        if not supported_entities:
+            supported_entities = BERT_DEID_CONFIGURATION[
+                "PRESIDIO_SUPPORTED_ENTITIES"
+            ]
+        super().__init__(
+            supported_entities=supported_entities,
+            name=f"Transformers model {model_path}",
+        )
+        self.model_path = model_path
+        self.pipeline = pipeline
+        self.is_loaded = False
+        self.aggregation_mechanism = None
+        self.ignore_labels = None
+        self.model_to_presidio_mapping = None
+        self.entity_mapping = None
+        self.default_explanation = None
+        self.text_overlap_length = None
+        self.chunk_length = None
+    def load_transformer(self, **kwargs) -> None:
+        """Load external configuration parameters and set default values.
+        :param kwargs: define default values for class attributes and modify pipeline behavior
+        **DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format
+        **MODEL_TO_PRESIDIO_MAPPING (dict) -  defines mapping entity strings from chosen model format to Presidio format
+        **SUB_WORD_AGGREGATION(str) - define how to aggregate sub-word tokens into full words and spans as defined
+        in HuggingFace https://huggingface.co/transformers/v4.8.0/main_classes/pipelines.html#transformers.TokenClassificationPipeline # noqa
+        **CHUNK_OVERLAP_SIZE (int) - number of overlapping characters in each text chunk
+        when splitting a single text into multiple inferences
+        **CHUNK_SIZE (int) - number of characters in each chunk of text
+        **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
+        **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
+        """
+        self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
+        self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {})
+        self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"])
+        self.aggregation_mechanism = kwargs.get("SUB_WORD_AGGREGATION", "simple")
+        self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
+        self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
+        self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
+        if not self.pipeline:
+            if not self.model_path:
+                self.model_path = "obi/deid_roberta_i2b2"
+                logger.warning(
+                    f"Both 'model' and 'model_path' arguments are None. Using default model_path={self.model_path}"
+                )
+        self._load_pipeline()
+    def _load_pipeline(self) -> None:
+        """Initialize NER transformers_rec pipeline using the model_path provided"""
+        logging.debug(f"Initializing NER pipeline using {self.model_path} path")
+        device = 0 if torch.cuda.is_available() else -1
+        self.pipeline = pipeline(
+            "ner",
+            model=AutoModelForTokenClassification.from_pretrained(self.model_path),
+            tokenizer=AutoTokenizer.from_pretrained(self.model_path),
+            # Will attempt to group sub-entities to word level
+            aggregation_strategy=self.aggregation_mechanism,
+            device=device,
+            framework="pt",
+            ignore_labels=self.ignore_labels,
+        )
+        self.is_loaded = True
+    def get_supported_entities(self) -> List[str]:
+        """
+        Return supported entities by this model.
+        :return: List of the supported entities.
+        """
+        return self.supported_entities
+    # Class to use transformers_rec with Presidio as an external recognizer.
+    def analyze(
+        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
+    ) -> List[RecognizerResult]:
+        """
+        Analyze text using transformers_rec model to produce NER tagging.
+        :param text : The text for analysis.
+        :param entities: Not working properly for this recognizer.
+        :param nlp_artifacts: Not used by this recognizer.
+        :return: The list of Presidio RecognizerResult constructed from the recognized
+            transformers_rec detections.
+        """
+        results = list()
+        # Run transformer model on the provided text
+        ner_results = self._get_ner_results_for_text(text)
+        for res in ner_results:
+            entity = self.model_to_presidio_mapping.get(res["entity_group"], None)
+            if not entity:
+                continue
+            res["entity_group"] = self.__check_label_transformer(res["entity_group"])
+            textual_explanation = self.default_explanation.format(res["entity_group"])
+            explanation = self.build_transformers_explanation(
+                float(round(res["score"], 2)), textual_explanation, res["word"]
+            )
+            transformers_result = self._convert_to_recognizer_result(res, explanation)
+            results.append(transformers_result)
+        return results
+    @staticmethod
+    def split_text_to_word_chunks(
+        input_length: int, chunk_length: int, overlap_length: int
+    ) -> List[List]:
+        """The function calculates chunks of text with size chunk_length. Each chunk has overlap_length number of
+        words to create context and continuity for the model
+        :param input_length: Length of input_ids for a given text
+        :type input_length: int
+        :param chunk_length: Length of each chunk of input_ids.
+        Should match the max input length of the transformer model
+        :type chunk_length: int
+        :param overlap_length: Number of overlapping words in each chunk
+        :type overlap_length: int
+        :return: List of start and end positions for individual text chunks
+        :rtype: List[List]
+        """
+        if input_length < chunk_length:
+            return [[0, input_length]]
+        if chunk_length <= overlap_length:
+            logger.warning(
+                "overlap_length should be shorter than chunk_length, setting overlap_length to by half of chunk_length"
+            )
+            overlap_length = chunk_length // 2
+        return [
+            [i, min([i + chunk_length, input_length])]
+            for i in range(
+                0, input_length - overlap_length, chunk_length - overlap_length
+            )
+        ]
+    def _get_ner_results_for_text(self, text: str) -> List[dict]:
+        """The function runs model inference on the provided text.
+        The text is split into chunks with n overlapping characters.
+        The results are then aggregated and duplications are removed.
+        :param text: The text to run inference on
+        :type text: str
+        :return: List of entity predictions on the word level
+        :rtype: List[dict]
+        """
+        model_max_length = self.pipeline.tokenizer.model_max_length
+        # calculate inputs based on the text
+        text_length = len(text)
+        # split text into chunks
+        logger.info(
+            f"splitting the text into chunks, length {text_length} > {model_max_length*2}"
+        )
+        predictions = list()
+        chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
+            text_length, self.chunk_length, self.text_overlap_length
+        )
+        # iterate over text chunks and run inference
+        for chunk_start, chunk_end in chunk_indexes:
+            chunk_text = text[chunk_start:chunk_end]
+            chunk_preds = self.pipeline(chunk_text)
+            # align indexes to match the original text - add to each position the value of chunk_start
+            aligned_predictions = list()
+            for prediction in chunk_preds:
+                prediction_tmp = copy.deepcopy(prediction)
+                prediction_tmp["start"] += chunk_start
+                prediction_tmp["end"] += chunk_start
+                aligned_predictions.append(prediction_tmp)
+            predictions.extend(aligned_predictions)
+        # remove duplicates
+        predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
+        return predictions
+    @staticmethod
+    def _convert_to_recognizer_result(
+        prediction_result: dict, explanation: AnalysisExplanation
+    ) -> RecognizerResult:
+        """The method parses NER model predictions into a RecognizerResult format to enable down the stream analysis
+        :param prediction_result: A single example of entity prediction
+        :type prediction_result: dict
+        :param explanation: Textual representation of model prediction
+        :type explanation: str
+        :return: An instance of RecognizerResult which is used to model evaluation calculations
+        :rtype: RecognizerResult
+        """
+        transformers_results = RecognizerResult(
+            entity_type=prediction_result["entity_group"],
+            start=prediction_result["start"],
+            end=prediction_result["end"],
+            score=float(round(prediction_result["score"], 2)),
+            analysis_explanation=explanation,
+        )
+        return transformers_results
+    def build_transformers_explanation(
+        self,
+        original_score: float,
+        explanation: str,
+        pattern: str,
+    ) -> AnalysisExplanation:
+        """
+        Create explanation for why this result was detected.
+        :param original_score: Score given by this recognizer
+        :param explanation: Explanation string
+        :param pattern: Regex pattern used
+        :return Structured explanation and scores of a NER model prediction
+        :rtype: AnalysisExplanation
+        """
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=float(original_score),
+            textual_explanation=explanation,
+            pattern=pattern,
+        )
+        return explanation
+    def __check_label_transformer(self, label: str) -> str:
+        """The function validates the predicted label is identified by Presidio
+        and maps the string into a Presidio representation
+        :param label: Predicted label by the model
+        :type label: str
+        :return: Returns the predicted entity if the label is found in model_to_presidio mapping dictionary
+        and is supported by Presidio entities
+        :rtype: str
+        """
+        if label == "O":
+            return label
+        # convert model label to presidio label
+        entity = self.model_to_presidio_mapping.get(label, None)
+        if entity is None:
+            logger.warning(f"Found unrecognized label {label}, returning entity as 'O'")
+            return "O"
+        if entity not in self.supported_entities:
+            logger.warning(f"Found entity {entity} which is not supported by Presidio")
+            return "O"
+        return entity