Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
|
|
7 |
from datasets import Features
|
8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
9 |
|
10 |
-
from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
|
11 |
|
12 |
MAX_ROWS = 100
|
13 |
T = TypeVar("T")
|
@@ -34,7 +34,7 @@ class track_iter:
|
|
34 |
self.next_idx += 1
|
35 |
yield item
|
36 |
|
37 |
-
def analyze_dataset(dataset: str) -> pd.DataFrame:
|
38 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
39 |
if "error" in info_resp:
|
40 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
@@ -52,8 +52,9 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
|
|
52 |
for presidio_entity in presidio_scan_entities(
|
53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
54 |
):
|
55 |
-
|
56 |
-
|
|
|
57 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
58 |
|
59 |
with gr.Blocks() as demo:
|
@@ -65,6 +66,12 @@ with gr.Blocks() as demo:
|
|
65 |
placeholder="Search for dataset id on Huggingface",
|
66 |
search_type="dataset",
|
67 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
]
|
69 |
button = gr.Button("Run Presidio Scan")
|
70 |
outputs = [
|
|
|
7 |
from datasets import Features
|
8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
9 |
|
10 |
+
from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
|
11 |
|
12 |
MAX_ROWS = 100
|
13 |
T = TypeVar("T")
|
|
|
34 |
self.next_idx += 1
|
35 |
yield item
|
36 |
|
37 |
+
def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
|
38 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
39 |
if "error" in info_resp:
|
40 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
|
|
52 |
for presidio_entity in presidio_scan_entities(
|
53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
54 |
):
|
55 |
+
if presidio_entity.type in enabled_presidio_entities:
|
56 |
+
presidio_entities.append(presidio_entity)
|
57 |
+
yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
58 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
59 |
|
60 |
with gr.Blocks() as demo:
|
|
|
66 |
placeholder="Search for dataset id on Huggingface",
|
67 |
search_type="dataset",
|
68 |
),
|
69 |
+
gr.CheckBoxGroup(
|
70 |
+
label="Presidio entities",
|
71 |
+
choices=analyzer.get_supported_entities(),
|
72 |
+
value=["PERSON", "CREDIT_CARD", "US_SSN", "PHONE_NUMBER", "EMAIL_ADDRESS", "IP_ADDRESS", "US_BANK_NUMBER", "EMAIL", "IBAN_CODE"],
|
73 |
+
interative=True,
|
74 |
+
),
|
75 |
]
|
76 |
button = gr.Button("Run Presidio Scan")
|
77 |
outputs = [
|