File size: 2,959 Bytes
36da3fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb8c8f9
36da3fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span

# download spacy model ---
# Run the download with the interpreter that is executing this app
# (sys.executable) so the model is installed into the same environment that
# spacy.load() searches below; a bare `python` resolved through the shell may
# point at a different installation. The call stays best-effort: a failed
# download is reported but does not abort start-up, because the model may
# already be installed.
_model_download = subprocess.run(
    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
    check=False,
)
if _model_download.returncode != 0:
    print(
        "Warning: spaCy model download exited with status "
        f"{_model_download.returncode}"
    )

# set up colors for PII types ---
# Passed to displaCy as render options: each entity label is mapped to the
# highlight colour used for it. Key order is significant because the
# "PII Focus" dropdown choices are derived from these keys.
options = {
    "colors": dict(
        NAME_STUDENT="#7FDBFF",    # soft blue
        EMAIL="#008080",           # dark cyan
        USERNAME="#C3B1E1",        # pastel violet
        ID_NUM="#2ECC40",          # medium green
        PHONE_NUM="#FF851B",       # deep orange
        URL_PERSONAL="#4682B4",    # steel blue
        STREET_ADDRESS="#808000",  # muted olive
    ),
}


# download datamix ---


def download_data():
    """Download the ``rbiswasfc/pii_datamix`` dataset repo into ``./data``.

    Calls ``huggingface_hub.snapshot_download`` and prints a confirmation
    line once it returns. Returns ``None``.
    """
    snapshot_kwargs = {
        "repo_id": "rbiswasfc/pii_datamix",
        "repo_type": "dataset",
        "local_dir": "./data",
    }
    snapshot_download(**snapshot_kwargs)
    print("Data downloaded!")


download_data()

# load data ---
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, so non-ASCII text in the dataset loads identically everywhere.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Top-level keys of the JSON document are offered as the selectable subsets.
subsets = list(data)
# Every coloured PII label can be focused on; "Random" disables the filter.
pii_types = [*options["colors"], "Random"]

# Only the pipeline's vocab is used later, to build Doc objects from
# pre-tokenised samples.
nlp = spacy.load("en_core_web_sm")
# render sample ---


def _collect_entity_spans(labels):
    """Convert a BIO label sequence into ``(start, end, type)`` token spans.

    Args:
        labels: Sequence of string tags. ``B-<TYPE>`` opens an entity,
            ``I-<TYPE>`` extends the currently open one, anything else
            closes it. An ``I-`` tag with no open entity is ignored.

    Returns:
        List of ``(start, end, type)`` tuples with ``end`` exclusive. The
        type is always taken from the entity's own opening ``B-`` tag.
    """
    spans = []
    start = None  # index of the opening token of the entity being built
    for index, label in enumerate(labels):
        if label.startswith('B-'):
            if start is not None:
                # A new entity begins right after another one: close the
                # previous entity under *its own* type, not the new tag's.
                spans.append((start, index, labels[start][2:]))
            start = index
        elif label.startswith('I-') and start is not None:
            continue  # still inside the open entity
        elif start is not None:
            spans.append((start, index, labels[start][2:]))
            start = None
    if start is not None:
        # The sequence ended while an entity was still open: flush it.
        spans.append((start, len(labels), labels[start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from ``subset`` and render its PII entities as HTML.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: Either ``"Random"`` (no filtering) or a PII label that must
            be present in the chosen sample's ``'piis'`` entry.

    Returns:
        displaCy entity-visualiser HTML for the chosen sample, or a short
        HTML notice when the subset holds no sample matching ``pii_type``.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front instead of rejection-sampling in a loop, which
        # would never terminate when no sample contains the requested type.
        candidates = [item for item in candidates if pii_type in item['piis']]
    if not candidates:
        return "<p>No sample with the selected PII type was found in this subset.</p>"
    sample = random.choice(candidates)

    # Build the Doc from the sample's own tokenisation so that the label
    # indices line up with token indices.
    doc = spacy.tokens.Doc(nlp.vocab, words=sample['tokens'], spaces=sample['trailing_whitespace'])
    doc.ents = [
        Span(doc, start, end, entity_type)
        for start, end, entity_type in _collect_entity_spans(sample['labels'])
    ]
    return displacy.render(doc, style="ent", jupyter=False, options=options)


# app layout & callback ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Top row: the two selectors whose values are passed to render_sample.
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            choices=subsets,
            value=subsets[0],
            info="Select data subset...",
            label="Subset",
        )
        focus_pii = gr.Dropdown(
            choices=pii_types,
            value="Random",
            info="Select a PII type to focus on...",
            label="PII Focus",
        )

    sample_btn = gr.Button("Sample")
    sample_display = gr.HTML(label="Example")

    # callback ---
    # Each click calls render_sample with the current dropdown values and
    # writes the returned HTML into the display component.
    callback_inputs = [subset_dropdown, focus_pii]
    sample_btn.click(fn=render_sample, inputs=callback_inputs, outputs=sample_display)

# launch app ---
demo.launch()