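# NOTE: the following lines are file-viewer page residue captured along with
# this script; they are kept as comments so the module remains valid Python.
#   rbiswasfc's picture
#   initial version
#   36da3fb
#   raw
#   history blame
#   3.03 kB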
import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span
# Download spaCy model ---
# Install en_core_web_sm only when it is missing, and do it with the very
# interpreter that is running this app (sys.executable) so the model lands in
# the same environment; a bare "python" on PATH may be a different install.
# The exit status is deliberately not enforced (check=False), matching the
# original best-effort behaviour: if the download fails, the later
# spacy.load() call reports the missing model.
if not spacy.util.is_package("en_core_web_sm"):
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=False,
    )
# Set up colors for PII types ---
# displaCy render options: one hex color per PII label. Key order is kept
# because the label list derived from this mapping follows insertion order.
options = dict(
    colors=dict(
        NAME_STUDENT="#7FDBFF",  # Soft blue
        EMAIL="#008080",  # Dark cyan
        USERNAME="#C3B1E1",  # Pastel violet
        ID_NUM="#2ECC40",  # Medium green
        PHONE_NUM="#FF851B",  # Deep orange
        URL_PERSONAL="#4682B4",  # Steel blue
        STREET_ADDRESS="#808000",  # Muted olive
    ),
)
# Download datamix ---
def download_data():
    """Fetch the ``rbiswasfc/pii_datamix`` dataset repo into ``./data``.

    Prints a confirmation message once ``snapshot_download`` returns.
    """
    snapshot_args = {
        "repo_id": "rbiswasfc/pii_datamix",
        "repo_type": "dataset",
        "local_dir": "./data",
    }
    snapshot_download(**snapshot_args)
    print("Data downloaded!")


# Runs at import time so the data file exists before it is opened below.
download_data()
# Load data ---
# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform's locale default, which can mis-decode non-ASCII tokens.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Dropdown choices: the top-level keys of the data file, and every PII label
# that has a color plus a "Random" entry meaning "no PII filter".
subsets = list(data)
pii_types = [*options["colors"], "Random"]

# Loaded pipeline; render_sample uses its vocab to build Doc objects.
nlp = spacy.load("en_core_web_sm")
# Render sample ---
def _bio_spans(labels):
    """Collapse BIO tags into ``(start, end, pii_type)`` token spans.

    ``end`` is exclusive. A ``B-`` tag always opens a new span (closing any
    span that is still open), an ``I-`` tag extends the open span, and any
    other tag closes it. ``I-`` tags with no open span are ignored. Each span
    takes its type from the tag of its own first token.
    """
    spans = []
    start = None  # index of the first token of the open span, if any
    for index, label in enumerate(labels):
        if start is not None and label.startswith("I-"):
            continue  # still inside the open span
        if start is not None:
            # The open span ended on the previous token.
            spans.append((start, index, labels[start][2:]))
            start = None
        if label.startswith("B-"):
            start = index
    if start is not None:
        # Flush a span that runs through the final token.
        spans.append((start, len(labels), labels[start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from ``subset`` and render its PII entities.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: PII label the sample must contain (checked against the
            sample's ``'piis'`` entry), or ``"Random"`` for no filtering.

    Returns:
        The markup produced by ``displacy.render`` for the chosen sample, or
        a short HTML message when no sample in the subset matches.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front: rejection sampling never terminates when no
        # sample in the subset contains the requested PII type.
        candidates = [s for s in candidates if pii_type in s['piis']]
    if not candidates:
        return "<p>No samples with the selected PII type in this subset.</p>"
    sample = random.choice(candidates)

    # Rebuild the document from the pre-tokenized sample.
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )
    doc.ents = [
        Span(doc, start, end, pii_label)
        for start, end, pii_label in _bio_spans(sample['labels'])
    ]
    return displacy.render(doc, style="ent", jupyter=False, options=options)
# App layout & callback ---
# NOTE(review): the original indentation was lost; the nesting below (both
# dropdowns inside the row, button and HTML output beneath it, launch outside
# the Blocks context) is a reconstruction -- confirm against the deployed app.
app_theme = gr.themes.Default(
    primary_hue=gr.themes.colors.red,
    secondary_hue=gr.themes.colors.pink,
)

with gr.Blocks(theme=app_theme) as demo:
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset...",
        )
        focus_pii = gr.Dropdown(
            pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on...",
        )

    sample_btn = gr.Button("Sample")
    sample_display = gr.HTML(label="Example")

    # Callback ---
    # Clicking the button renders a sample for the two dropdown values and
    # writes the result into the HTML component.
    sample_btn.click(
        fn=render_sample,
        inputs=[subset_dropdown, focus_pii],
        outputs=sample_display,
    )

# Launch app ---
demo.launch()