# File size: 2,959 Bytes

import json
import os
import random
import subprocess
import sys

import gradio as gr
import spacy
from huggingface_hub import snapshot_download
from spacy import displacy
from spacy.tokens import Span

# download spacy model ---
# Run the download with the interpreter that is executing this script
# (sys.executable) rather than whatever `python` happens to be first on PATH,
# so the model is installed into the environment that will later load it.
# The step stays best-effort: a failed download only prints a warning, because
# the model may already be installed.
if subprocess.run(
    [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
    check=False,
).returncode != 0:
    print(
        "Warning: downloading en_core_web_sm failed; "
        "loading the model will only work if it is already installed."
    )

# set up colors for PII types ---
# Maps each PII label to the hex color used to highlight it. The key order is
# significant: it is reused elsewhere in this file to build the list of PII
# types offered to the user.
options = dict(
    colors=dict(
        NAME_STUDENT="#7FDBFF",    # Soft blue
        EMAIL="#008080",           # Dark cyan
        USERNAME="#C3B1E1",        # Pastel violet
        ID_NUM="#2ECC40",          # Medium green
        PHONE_NUM="#FF851B",       # Deep orange
        URL_PERSONAL="#4682B4",    # Steel blue
        STREET_ADDRESS="#808000",  # Muted olive
    ),
)


# download datamix ---


def download_data():
    """Download a snapshot of the PII datamix dataset repo into ``./data``.

    Uses ``huggingface_hub.snapshot_download`` and prints a confirmation
    message once the call returns. Returns ``None``.
    """
    snapshot_args = {
        "repo_id": "rbiswasfc/pii_datamix",
        "repo_type": "dataset",
        "local_dir": "./data",
    }
    snapshot_download(**snapshot_args)
    print("Data downloaded!")


download_data()

# load data ---
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's locale default, which would garble or reject non-ASCII
# text on systems whose default is not UTF-8.
with open("./data/datamix.json", encoding="utf-8") as f:
    data = json.load(f)

# Top-level keys of the datamix are the selectable subsets.
subsets = list(data)
# Selectable PII types: every label that has a color, plus a "Random" option
# that applies no PII filter.
pii_types = [*options["colors"], "Random"]

# The pipeline is only used for its vocab when building Doc objects.
nlp = spacy.load("en_core_web_sm")
# render sample ---


def render_sample(subset, pii_type):
    """Pick a random sample from *subset* and render its PII spans as HTML.

    Args:
        subset: Key into the module-level ``data`` mapping; selects the pool
            of samples to draw from.
        pii_type: Either ``"Random"`` (no filtering) or a PII label; in the
            latter case only samples whose ``'piis'`` entry contains that
            label are eligible.

    Returns:
        The displaCy ``ent``-style HTML markup for the chosen sample, or a
        short HTML message when no sample in the subset matches.
    """
    candidates = data[subset]
    if pii_type != "Random":
        # Filter up front instead of rejection-sampling in a `while True`
        # loop, which never terminates when no sample contains the type.
        candidates = [s for s in candidates if pii_type in s['piis']]
    if not candidates:
        return "<p>No matching sample found for the selected subset and PII type.</p>"

    sample = random.choice(candidates)

    # render
    doc = spacy.tokens.Doc(
        nlp.vocab,
        words=sample['tokens'],
        spaces=sample['trailing_whitespace'],
    )

    # Convert the BIO tag sequence into entity spans. `start` is the token
    # index of the currently open entity, or None when outside an entity.
    labels = sample['labels']
    ents = []
    start = None
    for index, label in enumerate(labels):
        if start is not None and label.startswith('I-'):
            continue  # Still inside the open entity
        if start is not None:
            # Any non-"I-" tag closes the open entity. The span takes the type
            # of its own opening tag, not of the tag that terminated it.
            ents.append(Span(doc, start, index, labels[start][2:]))
            start = None
        if label.startswith('B-'):
            start = index  # Start a new entity
        # An "I-" tag with no open entity is ignored.

    if start is not None:
        # Flush an entity that extends to the last token.
        ents.append(Span(doc, start, len(labels), labels[start][2:]))

    doc.ents = ents
    output = displacy.render(doc, style="ent", jupyter=False, options=options)
    return output


# app layout & callback ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        subset_dropdown = gr.Dropdown(
            subsets,
            value=subsets[0],
            label="Subset",
            info="Select data subset..."
        )

        focus_pii = gr.Dropdown(
            pii_types,
            value="Random",
            label="PII Focus",
            info="Select a PII type to focus on..."
        )

    sample_btn = gr.Button("Sample")

    sample_display = gr.HTML(label="Example")

    # callback ---
    sample_btn.click(
        fn=render_sample,
        inputs=[subset_dropdown, focus_pii],
        outputs=sample_display,
    )

# launch app ---
demo.launch()