File size: 4,568 Bytes
bafcf39
 
7810536
 
bafcf39
6319afc
7810536
 
bafcf39
f0f9378
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bafcf39
 
 
 
 
7810536
 
bafcf39
 
 
 
 
 
 
 
 
7810536
bafcf39
 
 
 
 
7810536
bafcf39
 
7810536
bafcf39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7810536
bafcf39
7810536
 
 
bafcf39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union

import gradio as gr

# from tqdm import tqdm
from presidio_analyzer import DictAnalyzerResult, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts


def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
    """
    Create a RecognizerResult from a dictionary of entity attributes.

    The expected keys follow the AWS Comprehend PII-entity response shape,
    e.g. {
        "Type": "NAME",
        "BeginOffset": 24,
        "EndOffset": 32,
        "Score": 0.8
    }

    :param data: Dictionary carrying "Type", "BeginOffset", "EndOffset"
        and "Score" keys; any missing key yields a None field.
    :return: RecognizerResult built from the extracted values.
    """

    entity_type = data.get("Type")
    start = data.get("BeginOffset")
    end = data.get("EndOffset")
    score = data.get("Score")
    # The source dictionary carries no explanation or metadata, so these
    # positional arguments are filled with None.
    analysis_explanation = None
    recognition_metadata = None

    return RecognizerResult(
        entity_type, start, end, score, analysis_explanation, recognition_metadata
    )


def analyze_iterator_custom(
    self,
    texts: Iterable[Union[str, bool, float, int]],
    language: str,
    list_length: int,
    progress=gr.Progress(),
    **kwargs,
) -> List[List[RecognizerResult]]:
    """
    Analyze each item of an iterable with the Presidio analyzer engine.

    :param texts: Iterable of values to analyse (each coerced to str).
    :param language: Language code handed to the analyzer.
    :param list_length: Number of items in the input; only consumed by the
        optional progress bar below.
    :param progress: Gradio progress tracker (injected by Gradio).
    :param kwargs: Extra keyword arguments forwarded to
        `AnalyzerEngine.analyze`.
    :return: One list of RecognizerResult per input item.
    """

    # Coerce/validate the incoming values before batching.
    validated_texts = self._validate_types(texts)

    # Running the NLP pipeline over the whole batch at once is faster than
    # per-item processing.
    batch: Iterator[Tuple[str, NlpArtifacts]] = (
        self.analyzer_engine.nlp_engine.process_batch(
            texts=validated_texts, language=language
        )
    )

    # Uncomment to surface per-row progress in the UI:
    # batch = progress.tqdm(batch, total=list_length,
    #                       desc="Analysing text for personal information",
    #                       unit="rows")

    analyzed: List[List[RecognizerResult]] = []
    for item, artifacts in batch:
        analyzed.append(
            self.analyzer_engine.analyze(
                text=str(item),
                nlp_artifacts=artifacts,
                language=language,
                **kwargs,
            )
        )

    return analyzed


def analyze_dict(
    self,
    input_dict: Dict[str, Union[Any, Iterable[Any]]],
    language: str,
    keys_to_skip: Optional[List[str]] = None,
    **kwargs,
) -> Iterator[DictAnalyzerResult]:
    """
    Analyze a dictionary of keys (strings) and values/iterable of values.

    Non-string values are returned as is.

    :param input_dict: The input dictionary for analysis
    :param language: Input language
    :param keys_to_skip: Keys to ignore during analysis
    :param kwargs: Additional keyword arguments
    for the `AnalyzerEngine.analyze` method.
    Use this to pass arguments to the analyze method,
    such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
    See `AnalyzerEngine.analyze` for the full list.
    :raises ValueError: If a value is neither a scalar, a dict, nor an
        iterable.
    """

    # Pull any caller-supplied context out of kwargs so it can be extended
    # with each dictionary key before delegating to analyze().
    context = []
    if "context" in kwargs:
        context = kwargs["context"]
        del kwargs["context"]

    if not keys_to_skip:
        keys_to_skip = []

    for key, value in input_dict.items():
        # Falsy values and explicitly skipped keys pass through unanalyzed.
        if not value or key in keys_to_skip:
            yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
            continue  # skip this key as requested

        # The key itself serves as additional recognizer context.
        specific_context = context[:]
        specific_context.append(key)

        if isinstance(value, (str, int, bool, float)):
            # BUG FIX: this branch previously passed context=[key], which
            # discarded any caller-supplied context; use the accumulated
            # specific_context like the dict/iterable branches do.
            results: List[RecognizerResult] = self.analyzer_engine.analyze(
                text=str(value),
                language=language,
                context=specific_context,
                **kwargs,
            )
        elif isinstance(value, dict):
            # Recursively iterate nested dicts, narrowing the skip list to
            # the entries scoped under this key.
            new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
            results = self.analyze_dict(
                input_dict=value,
                language=language,
                context=specific_context,
                keys_to_skip=new_keys_to_skip,
                **kwargs,
            )
        elif isinstance(value, Iterable):
            # NOTE(review): len() assumes a sized iterable (e.g. a list);
            # a generator would raise TypeError here — confirm callers only
            # pass sized collections.
            list_length = len(value)

            results: List[List[RecognizerResult]] = analyze_iterator_custom(
                self,
                texts=value,
                language=language,
                context=specific_context,
                list_length=list_length,
                **kwargs,
            )
        else:
            raise ValueError(f"type {type(value)} is unsupported.")

        yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)