# Gradio demo: browse PII-annotated samples from the rbiswasfc/pii-datamix dataset.
import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span

# download spacy model --
# Run the download with the interpreter that is executing this script so the
# model lands in the environment the app actually uses; a bare `python` is
# resolved through PATH and may be a different interpreter.
_download_result = subprocess.run(
    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
    check=False,
)
if _download_result.returncode != 0:
    # Best effort: the model may already be installed, so warn instead of
    # aborting; loading the model later will fail loudly if it is missing.
    print(
        "Warning: spaCy model download exited with code "
        f"{_download_result.returncode}"
    )

# # set up colors for PII types ---
# options = {
#     "colors": {
#         "NAME_STUDENT": "#7FDBFF",  # Soft blue
#         "EMAIL": "#008080",         # Dark cyan
#         "USERNAME": "#C3B1E1",      # Pastel violet
#         "ID_NUM": "#2ECC40",        # Medium green
#         "PHONE_NUM": "#FF851B",     # Deep orange
#         "URL_PERSONAL": "#4682B4",  # Steel blue
#         "STREET_ADDRESS": "#808000",  # Muted olive
#     }
# }

# displaCy rendering options: one highlight colour per PII label ---
options = dict(
    colors=dict(
        NAME_STUDENT="#6EB5FF",    # Lighter blue
        EMAIL="#42D4B5",           # Light teal
        USERNAME="#D8B4E2",        # Light lavender
        ID_NUM="#7AE88F",          # Light green
        PHONE_NUM="#FFB87D",       # Light peach
        URL_PERSONAL="#C9B4E2",    # Pale purple
        STREET_ADDRESS="#B4B77F",  # Light olive
    ),
)

# download datamix ---


def download_data(repo_id="rbiswasfc/pii-datamix", local_dir="./data"):
    """Download a dataset snapshot from the Hugging Face Hub.

    Args:
        repo_id: Hub dataset repository to fetch. Defaults to the PII
            datamix used by this app.
        local_dir: Directory passed to `snapshot_download` as the download
            target. Defaults to ``./data``.

    Returns:
        The value returned by `snapshot_download` (per the huggingface_hub
        documentation, the path of the local snapshot folder).
    """
    snapshot_path = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        local_dir=local_dir,
    )
    print("Data downloaded!")
    return snapshot_path


download_data()

# load data ---
# Decode as UTF-8 explicitly: without it `open` uses the platform default
# encoding, which can mis-decode non-ASCII text on some systems.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# The top-level keys of the datamix are the selectable subsets.
subsets = list(data.keys())
# Selectable PII types are the labels in the colour map, plus "Random",
# which render_sample treats as "do not filter by PII type".
pii_types = list(options["colors"].keys())
pii_types.append("Random")

# Loaded once at start-up; render_sample uses its vocab to build Docs.
nlp = spacy.load("en_core_web_sm")
# render sample --


def _bio_to_spans(labels):
    """Collapse BIO tags into (start, end, entity_type) token ranges (end exclusive)."""
    spans = []
    start = None  # token index of the currently open entity, if any
    for index, label in enumerate(labels):
        if label.startswith('B-'):
            if start is not None:  # End the previous entity
                spans.append((start, index, labels[start][2:]))
            start = index  # Start a new entity
        elif label.startswith('I-') and start is not None:
            continue  # Continue the entity
        elif start is not None:
            # Any other tag ends the open entity. Every token since `start`
            # extended it, so the exclusive end is the current index.
            spans.append((start, index, labels[start][2:]))
            start = None

    # Add the last entity if the sequence ends while one is still open
    if start is not None:
        spans.append((start, len(labels), labels[start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from a subset and render its PII entities as HTML.

    Args:
        subset: Key into the module-level `data` mapping.
        pii_type: PII label the sample must contain (checked with
            ``pii_type in sample['piis']``), or "Random" for no filtering.

    Returns:
        A ``(info, html)`` tuple: ``info`` is ``{'document': <value of
        sample['document']>}`` and ``html`` is the displaCy entity markup.
        If no sample matches, ``info`` is ``{'document': None}`` and ``html``
        is a short message.
    """
    candidates = data.get(subset, [])
    if pii_type != "Random":
        # Filter up front instead of re-drawing until a match appears: the
        # re-draw loop never terminates when the subset has no such sample.
        candidates = [item for item in candidates if pii_type in item['piis']]
    if not candidates:
        return (
            {'document': None},
            "<p>No sample matches the selected subset and PII type.</p>",
        )
    sample = random.choice(candidates)

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # render
    # Build the Doc from the stored tokens and whitespace flags so that the
    # label indices line up with the Doc's tokens.
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )
    doc.ents = [
        Span(doc, start, end, entity_type)
        for start, end, entity_type in _bio_to_spans(sample['labels'])
    ]
    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return {'document': sample['document']}, output


# app layout & callback ---
# (Alternative look: pass theme=gr.themes.Soft() to gr.Blocks.)
demo = gr.Blocks()
with demo:
    # Selector row: data subset on the left, PII focus on the right.
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            choices=subsets,
            label="Subset",
            info="Select data subset...",
            value=subsets[0],
        )
        focus_pii = gr.Dropdown(
            choices=pii_types,
            label="PII Focus",
            info="Select a PII type to focus on...",
            value="Random",
        )

    # Trigger button, followed by the two output panels.
    sample_btn = gr.Button(value="Sample")
    document_id_display = gr.JSON(label="Document ID")
    sample_display = gr.HTML(label="Example")

    # callback ---
    # Clicking the button passes both dropdown values to render_sample and
    # routes its two return values to the JSON and HTML panels, in order.
    sample_btn.click(
        fn=render_sample,
        inputs=[subset_dropdown, focus_pii],
        outputs=[document_id_display, sample_display],
    )

# launch app ---
demo.launch()