File size: 3,061 Bytes
36da3fb 3510abe 36da3fb 801ee27 36da3fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import json
import os
import random
import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span
# download spacy model ---
# Installs the small English pipeline at startup via the spaCy CLI.
# The exit status is checked so that a failed download is reported here instead
# of surfacing later as an unrelated model-loading error. Startup still
# continues, because the model may already be installed from an earlier run.
if os.system('python -m spacy download en_core_web_sm') != 0:
    print("Warning: spaCy model download failed; continuing in case en_core_web_sm is already installed.")
# set up colors for PII types ---
# displaCy rendering options: the "colors" entry maps each PII entity label to
# the highlight colour used for it. Key order matters, because the PII dropdown
# choices are derived from these keys.
options = {
    "colors": dict(
        NAME_STUDENT="#7FDBFF",  # Soft blue
        EMAIL="#008080",  # Dark cyan
        USERNAME="#C3B1E1",  # Pastel violet
        ID_NUM="#2ECC40",  # Medium green
        PHONE_NUM="#FF851B",  # Deep orange
        URL_PERSONAL="#4682B4",  # Steel blue
        STREET_ADDRESS="#808000",  # Muted olive
    ),
}
# download datamix ---
def download_data():
    """Download the ``rbiswasfc/pii_datamix`` dataset repository into ``./data``.

    Delegates to ``huggingface_hub.snapshot_download`` and prints a
    confirmation line once that call returns.
    """
    request = dict(
        repo_id="rbiswasfc/pii_datamix",
        repo_type="dataset",
        local_dir="./data",
    )
    snapshot_download(**request)
    print("Data downloaded!")
download_data()

# load data ---
# Read explicitly as UTF-8: without an encoding argument open() decodes with
# the platform's locale encoding, which can corrupt or reject non-ASCII text
# in the JSON on some systems.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Top-level keys of the datamix are the selectable subsets.
subsets = list(data.keys())

# Dropdown choices: every colour-coded PII type plus "Random", which
# render_sample treats as "no PII filter".
pii_types = list(options["colors"].keys())
pii_types.append("Random")

# Only the pipeline's vocab is used later, to build Doc objects from
# pre-tokenised samples.
nlp = spacy.load("en_core_web_sm")
# render sample ---
def _bio_to_spans(labels):
    """Collapse a BIO tag sequence into ``(start, end, label)`` entity spans.

    ``start`` is inclusive and ``end`` exclusive (token indices). An entity
    opens on a ``B-`` tag, is extended by following ``I-`` tags, and is closed
    by any other tag or by the end of the sequence. Each span carries the type
    of the ``B-`` tag that opened it. ``I-`` tags that do not follow an open
    entity are ignored.
    """
    spans = []
    open_start = None  # index of the B- tag of the entity being built, if any
    for index, label in enumerate(labels):
        if label.startswith('I-') and open_start is not None:
            continue  # still inside the open entity
        if open_start is not None:
            # Any non-continuation tag closes the open entity. Use the label of
            # the tag that opened it, not the tag that happens to close it.
            spans.append((open_start, index, labels[open_start][2:]))
            open_start = None
        if label.startswith('B-'):
            open_start = index
    if open_start is not None:
        # Flush an entity that runs through the final token.
        spans.append((open_start, len(labels), labels[open_start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from *subset* and render its PII entities as HTML.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: ``"Random"`` to draw from every sample of the subset, or a
            PII type; then only samples whose ``'piis'`` entry contains it
            are eligible.

    Returns:
        The displaCy ``ent``-style HTML markup for the chosen sample, or a
        short HTML notice when no sample is eligible.

    The chosen sample's ``'document'`` value is printed to stdout. Each sample
    is read through its ``'piis'``, ``'document'``, ``'tokens'``,
    ``'trailing_whitespace'`` and ``'labels'`` keys.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front rather than retrying random draws, which would loop
        # forever when no sample in the subset carries the requested type.
        candidates = [entry for entry in candidates if pii_type in entry['piis']]
    if not candidates:
        return "<p>No matching sample found for the selected subset and PII type.</p>"
    sample = random.choice(candidates)

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # render
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )
    doc.ents = [
        Span(doc, start, end, label)
        for start, end, label in _bio_to_spans(sample['labels'])
    ]
    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return output
# app layout & callback ---
# Builds the Gradio UI: two dropdowns (data subset, PII focus), a "Sample"
# button and an HTML pane that shows the rendered sample.
# NOTE(review): leading indentation is missing in this copy of the file, so the
# nesting under gr.Blocks() / gr.Row() is not recoverable from here. Presumably
# the two dropdowns sit inside the row and the click wiring sits inside the
# Blocks context. Confirm against the original layout before re-indenting.
# Alternative constructor using the Soft theme, currently disabled:
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
with gr.Blocks() as demo:
with gr.Row():
# Subset selector; defaults to the first subset found in the loaded data.
subset_dropdown = gr.Dropdown(
subsets,
value=subsets[0],
label="Subset",
info="Select data subset..."
)
# PII-type selector; "Random" (the default) disables filtering in render_sample.
focus_pii = gr.Dropdown(
pii_types,
value="Random",
label="PII Focus",
info="Select a PII type to focus on..."
)
sample_btn = gr.Button("Sample")
sample_display = gr.HTML(label="Example")
# callback ---
# Clicking the button calls render_sample(subset, pii_type) with the current
# dropdown values and writes the returned HTML into sample_display.
sample_btn.click(
fn=render_sample,
inputs=[subset_dropdown, focus_pii],
outputs=sample_display,
)
# launch app ---
demo.launch()
|