File size: 3,061 Bytes
36da3fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3510abe
 
 
36da3fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801ee27
 
36da3fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import html
import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span

# download spacy model ---
# Install the model only when it is missing, and do it with the interpreter
# that is running this script so the package lands in the active environment.
# A failed install raises here instead of surfacing later in spacy.load().
if not spacy.util.is_package("en_core_web_sm"):
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )

# set up colors for PII types ---
# (label, hex color) pairs; the order here is the order the labels are listed
# wherever the keys of options["colors"] are enumerated.
_PII_COLOR_PAIRS = (
    ("NAME_STUDENT", "#7FDBFF"),    # Soft blue
    ("EMAIL", "#008080"),           # Dark cyan
    ("USERNAME", "#C3B1E1"),        # Pastel violet
    ("ID_NUM", "#2ECC40"),          # Medium green
    ("PHONE_NUM", "#FF851B"),       # Deep orange
    ("URL_PERSONAL", "#4682B4"),    # Steel blue
    ("STREET_ADDRESS", "#808000"),  # Muted olive
)

# Rendering options handed to displaCy: entity label -> highlight color.
options = {"colors": dict(_PII_COLOR_PAIRS)}


# download datamix ---


def download_data(repo_id="rbiswasfc/pii_datamix", local_dir="./data"):
    """Download a snapshot of a Hugging Face Hub dataset repository.

    Args:
        repo_id: Identifier of the dataset repository to fetch. Defaults to
            the PII datamix used by this app.
        local_dir: Directory the snapshot is written to. Defaults to
            ``./data``.
    """
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        local_dir=local_dir,
    )
    print("Data downloaded!")


download_data()

# load data ---
# Read the JSON as UTF-8 explicitly so decoding does not depend on the
# platform's default text encoding.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Dropdown choices: the subsets present in the data file, and every PII label
# that has a color assigned plus a catch-all "Random" entry.
subsets = list(data)
pii_types = [*options["colors"], "Random"]

nlp = spacy.load("en_core_web_sm")
# render sample ---


def _collect_entity_spans(labels):
    """Turn a sequence of B-/I- prefixed tags into entity token spans.

    Args:
        labels: One tag string per token. A tag starting with ``B-`` opens an
            entity, a tag starting with ``I-`` extends the open entity, and
            any other tag closes it.

    Returns:
        A list of ``(start, end, entity_type)`` tuples, where ``start`` is
        inclusive, ``end`` is exclusive, and ``entity_type`` is the opening
        tag with its two-character prefix removed.
    """
    spans = []
    start = end = None  # Token bounds of the entity currently being built.

    for index, label in enumerate(labels):
        if label.startswith("B-"):
            if start is not None:
                # Close the previous entity under its own type, not the type
                # of the entity that is about to start.
                spans.append((start, end, labels[start][2:]))
            start, end = index, index + 1
        elif label.startswith("I-") and start is not None:
            end = index + 1
        elif start is not None:
            spans.append((start, end, labels[start][2:]))
            start = None
        # An "I-" tag with no open entity is ignored.

    if start is not None:
        # The sequence ended while an entity was still open; emit it.
        spans.append((start, end, labels[start][2:]))

    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from a subset and render its entities as HTML.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: Label that must appear in the sample's ``piis`` entry, or
            ``"Random"`` to accept any sample.

    Returns:
        The markup produced by ``displacy.render`` for the chosen sample, or
        a short HTML notice when the subset has no sample satisfying
        ``pii_type``.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front: retrying random draws never terminates when no
        # sample in the subset carries the requested type.
        candidates = [item for item in candidates if pii_type in item['piis']]

    if not candidates:
        return (
            "<p>No matching sample found for subset "
            f"<b>{html.escape(str(subset))}</b> with PII focus "
            f"<b>{html.escape(str(pii_type))}</b>.</p>"
        )

    sample = random.choice(candidates)

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # Rebuild the document from its stored tokens and whitespace flags so the
    # token indices line up with the label sequence.
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )
    doc.ents = [
        Span(doc, start, end, entity_type)
        for start, end, entity_type in _collect_entity_spans(sample['labels'])
    ]

    return displacy.render(doc, style="ent", jupyter=False, options=options)


# app layout & callback ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
with gr.Blocks() as demo:
    # Selector row: which subset to draw from and which PII type to require.
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            choices=subsets,
            label="Subset",
            info="Select data subset...",
            value=subsets[0],
        )
        focus_pii = gr.Dropdown(
            choices=pii_types,
            label="PII Focus",
            info="Select a PII type to focus on...",
            value="Random",
        )

    sample_btn = gr.Button("Sample")
    sample_display = gr.HTML(label="Example")

    # Each click renders a newly drawn sample into the HTML panel.
    sample_btn.click(
        render_sample,
        [subset_dropdown, focus_pii],
        sample_display,
    )

# launch app ---
demo.launch()