File size: 3,753 Bytes
36da3fb d966e51 36da3fb 3919cdf 36da3fb 3919cdf 8334793 3919cdf 36da3fb 3d1ac96 36da3fb 46d4fe3 36da3fb 3510abe 36da3fb 7b1516b 36da3fb 1ee999b 36da3fb 3919cdf 36da3fb 801ee27 36da3fb 3919cdf 36da3fb 3919cdf 36da3fb 284fc35 36da3fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span
# download spacy model --
# Run the download with the interpreter that is executing this app (a bare
# `python` on PATH may belong to a different environment), skip it when the
# model package is already installed so start-up does not need the network,
# and fail loudly instead of ignoring a failed download.
if not spacy.util.is_package("en_core_web_sm"):
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
# Alternative color palette for PII types (disabled; the active palette is defined below) ---
# options = {
# "colors": {
# "NAME_STUDENT": "#7FDBFF", # Soft blue
# "EMAIL": "#008080", # Dark cyan
# "USERNAME": "#C3B1E1", # Pastel violet
# "ID_NUM": "#2ECC40", # Medium green
# "PHONE_NUM": "#FF851B", # Deep orange
# "URL_PERSONAL": "#4682B4", # Steel blue
# "STREET_ADDRESS": "#808000", # Muted olive
# }
# }
# displaCy render options: one highlight color (hex) per PII entity label.
# The order of the pairs is the insertion order of the resulting dict.
_PII_COLORS = (
    ("NAME_STUDENT", "#6EB5FF"),  # Lighter blue
    ("EMAIL", "#42D4B5"),  # Light teal
    ("USERNAME", "#D8B4E2"),  # Light lavender
    ("ID_NUM", "#7AE88F"),  # Light green
    ("PHONE_NUM", "#FFB87D"),  # Light peach
    ("URL_PERSONAL", "#C9B4E2"),  # Pale purple
    ("STREET_ADDRESS", "#B4B77F"),  # Light olive
)
options = {"colors": dict(_PII_COLORS)}
# download datamix ---
def download_data():
    """Download a snapshot of the ``rbiswasfc/pii-datamix`` dataset repo.

    The files are placed under ``./data`` and a confirmation line is printed
    once ``snapshot_download`` returns.
    """
    request = {
        "repo_id": "rbiswasfc/pii-datamix",
        "repo_type": "dataset",
        "local_dir": "./data",
    }
    snapshot_download(**request)
    print("Data downloaded!")
download_data()

# load data ---
# Pin the encoding: without it, open() decodes with the platform's locale
# default, which can corrupt or reject non-ASCII text in the JSON file.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Top-level keys of the JSON are offered as the selectable subsets.
subsets = list(data)

# Every colored PII label can be focused on; "Random" means "no filter".
pii_types = [*options["colors"], "Random"]

nlp = spacy.load("en_core_web_sm")
# render sample --
def _bio_spans(labels):
    """Collapse a BIO label sequence into ``(start, end, label)`` triples.

    ``end`` is exclusive. An entity opens on a ``B-`` tag, is extended by every
    directly following ``I-`` tag, and is closed by any other tag or by the end
    of the sequence. ``I-`` tags with no open entity are ignored. The entity
    label is the opening ``B-`` tag with its two-character prefix removed.
    """
    spans = []
    start = None  # index of the currently open entity's B- tag, if any
    for index, label in enumerate(labels):
        if start is not None and label.startswith("I-"):
            continue  # still inside the open entity
        if start is not None:
            # Any non-I tag (including a new B-) closes the open entity here.
            spans.append((start, index, labels[start][2:]))
            start = None
        if label.startswith("B-"):
            start = index
    if start is not None:
        # The sequence ended while an entity was still open.
        spans.append((start, len(labels), labels[start][2:]))
    return spans


def render_sample(subset, pii_type):
    """Pick a random sample from ``subset`` and render its PII entities as HTML.

    Args:
        subset: Key into the module-level ``data`` mapping.
        pii_type: Either ``"Random"`` (any sample qualifies) or a PII label
            that must be present in the chosen sample's ``'piis'`` field.

    Returns:
        A 2-tuple ``({'document': sample['document']}, html)`` where ``html``
        is the displaCy entity rendering of the sample's tokens.

    Raises:
        gr.Error: If the subset holds no sample matching ``pii_type``. The
            previous rejection-sampling loop never terminated in that case.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front instead of re-drawing until a match turns up; the
        # draw below is still uniform over the matching samples.
        candidates = [item for item in candidates if pii_type in item['piis']]
    if not candidates:
        raise gr.Error(
            f"Subset '{subset}' has no samples for PII type '{pii_type}'."
        )
    sample = random.choice(candidates)

    print("---" * 10)
    print(sample['document'])
    print("---" * 10)

    # Rebuild a Doc from the stored tokenization so the span indices line up
    # with the stored labels.
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )
    doc.ents = [
        Span(doc, start, end, label)
        for start, end, label in _bio_spans(sample['labels'])
    ]

    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return {'document': sample['document']}, output
# app layout & callback ---
# NOTE(review): indentation was lost in the reviewed copy of this file; the
# nesting below (only the two dropdowns share the row) is assumed -- confirm.
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
with gr.Blocks() as demo:
    # Selection controls.
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            choices=subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset...",
        )
        focus_pii = gr.Dropdown(
            choices=pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on...",
        )

    # Trigger and output widgets.
    sample_btn = gr.Button("Sample")
    document_id_display = gr.JSON(label="Document ID")
    sample_display = gr.HTML(label="Example")

    # callback ---
    # Each click feeds both dropdown values to render_sample and routes its
    # two return values to the JSON and HTML widgets, in that order.
    sample_btn.click(
        render_sample,
        [subset_dropdown, focus_pii],
        [document_id_display, sample_display],
    )

# launch app ---
demo.launch()
|