import os
from uuid import uuid4

import duckdb
import gradio as gr
import pandas as pd
import requests
from duckdb import DuckDBPyConnection, DuckDBPyRelation
from duckdb.typing import DuckDBPyType
from huggingface_hub import HfApi

Connection = DuckDBPyConnection
Table = DuckDBPyRelation
Dtype = DuckDBPyType

READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
memory_con = duckdb.connect(":memory:")
empty_tbl = memory_con.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")
PAGE_SIZE = 5
NUM_TRENDING_DATASETS = 10
NUM_USER_DATASETS = 10
SESSIONS_DIR = "s"  # also used as the query parameter name carrying the session id
URL = "https://huggingface.co/spaces/lhoestq/dataset-spreadsheets"

css = """
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
    background: var(--body-background-fill);
}
.gradio-container {
    padding: var(--size-4) 0 !important;
    max-width: 98% !important;
}
.cell-menu-button {
    z-index: -1;
}
"""


# Render a DuckDB relation as a DataFrame of JSON-encoded strings for display in gr.DataFrame
def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
    query = ", ".join("nullif(([" + col + "]::JSON)[0]::VARCHAR, 'null') AS " + col for col in tbl.columns)
    out = con.sql(f"SELECT {query} FROM tbl").df()
    return out


# Parse the JSON-encoded strings back into the dataset's original DuckDB types
def from_json_df(con: Connection, df: pd.DataFrame, columns: list[str], dtypes: list[Dtype]) -> Table:
    query = ", ".join(
        "if(" + col + " IS null, null, (" + col + "::JSON::" + str(dtype) + ")"
        + ("[2:-2]" if str(dtype) == "VARCHAR" else "")  # remove double quotes at the start and end
        + ") AS " + col
        for col, dtype in zip(columns, dtypes)
    )
    return con.sql(f"SELECT {query} FROM df")


# "edits" stores overridden rows keyed by rowid; "edited_dataset" overlays them on the original dataset
def setup_edits(con: Connection, dataset: str, pattern: str) -> None:
    con.sql(f"CREATE VIEW IF NOT EXISTS dataset AS SELECT * FROM 'hf://datasets/{dataset}/{pattern}'")
    empty_dataset_tbl = con.sql("SELECT * FROM dataset LIMIT 0;")
    columns = empty_dataset_tbl.columns
    dtypes = empty_dataset_tbl.dtypes
    con.sql(f"CREATE TABLE IF NOT EXISTS edits(rowid INTEGER PRIMARY KEY, {', '.join(col + ' ' + str(dtype) for col, dtype in zip(columns, dtypes))})")
    con.sql(
        "CREATE VIEW IF NOT EXISTS edited_dataset AS "
        "WITH edits_per_rowid AS (SELECT * FROM (SELECT unnest(range(max(rowid) + 1)) AS rowid FROM edits) LEFT JOIN edits USING (rowid) ORDER BY rowid) "
        f"SELECT {', '.join('ifnull(edits_per_rowid.' + col + ', dataset.' + col + ') AS ' + col for col in columns)} FROM dataset POSITIONAL JOIN edits_per_rowid"
    )


gr.set_static_paths(paths=[SESSIONS_DIR + "/"])
with gr.Blocks(css=css) as demo:
    session_state = gr.BrowserState()
    loading_codes_json = gr.JSON([], visible=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("# 🤗 (WIP) Hugging Face Dataset Spreadsheets 📝\n\nEdit any dataset on Hugging Face (full list here)")
            with gr.Group():
                with gr.Tab("Select Dataset"):
                    with gr.Row():
                        dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
                        subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                        split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
                with gr.Tab("Share Link"):
                    share_link_textbox = gr.Textbox(label="Copy the link to the Spreadsheet:", show_copy_button=True, interactive=False)
                with gr.Tab("Use Locally"):
                    use_locally_markdown = gr.Markdown()
    dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)

    def show_subset_dropdown(dataset: str):
        if dataset and "/" not in dataset.strip().strip("/"):
            # invalid dataset name: nothing to show (callers unpack a 2-tuple)
            return dict(choices=[], value="", visible=False), []
        resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
        loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
        subsets = [loading_code["config_name"] for loading_code in loading_codes]
        subset = (subsets or [""])[0]
        return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes

    def show_split_dropdown(subset: str, loading_codes: list[dict]):
        splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
        split = (splits or [""])[0]
        return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))

    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str):
        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
        if session and dataset and subset and split and pattern:
            duckdb_file = session + ".duckdb"
            os.makedirs(SESSIONS_DIR, exist_ok=True)
            con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
            setup_edits(con, dataset, pattern)
            # Uncomment to have one edit for testing
            # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
            tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE}")
            return dict(value=to_json_df(con, tbl))
        else:
            return dict(value=to_json_df(memory_con, empty_tbl))

    @demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown])
    def _fetch_datasets(session: str | None, request: gr.Request):
        datasets = list(HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
        session = request.query_params.get(SESSIONS_DIR) or session
        if session:
            namespace, dataset_name, subset, split, _ = session.split("--")
            dataset = namespace + "/" + dataset_name
            if "dataset" in request.query_params and request.query_params["dataset"] != dataset:
                session = None
                dataset = request.query_params["dataset"]
        else:
            dataset = request.query_params.get("dataset") or datasets[0].id
        subsets, loading_codes = show_subset_dropdown(dataset)
        subsets["value"] = subset if session else subsets["value"]
        splits = show_split_dropdown(subsets["value"], loading_codes)
        splits["value"] = split if session else splits["value"]
        session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
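        # Return a full UI update: trending dataset choices, the restored or newly
        # minted session id, the first page of (possibly edited) data, and the
        # share link / local-usage snippets pointing at this session's .duckdb file.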
        return {
            dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
            share_link_textbox: f"{URL}?{SESSIONS_DIR}={session}",
            use_locally_markdown: (
                f"""In DuckDB:\n\n```sql\nATTACH '{URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
                f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
            ),
        }

    @dataset_dropdown.select(inputs=[dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
    def _show_subset_dropdown(dataset: str):
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
        input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
        return {
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
        }

    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe])
    def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
        splits = show_split_dropdown(subset, loading_codes)
        session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
        input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session)
        return {
            split_dropdown: gr.Dropdown(**splits),
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
        }

    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe])
    def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
        session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
        input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session)
        return {
            session_state: session,
            dataframe: gr.DataFrame(**input_dataframe),
        }

    @dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json])
    def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict]):
        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
        if session and dataset and subset and split and pattern:
            duckdb_file = session + ".duckdb"
            os.makedirs(SESSIONS_DIR, exist_ok=True)
            con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
            setup_edits(con, dataset, pattern)
            empty_dataset_tbl = con.sql("SELECT * EXCLUDE (rowid) FROM edits LIMIT 0;")
            columns = empty_dataset_tbl.columns
            dtypes = empty_dataset_tbl.dtypes
            tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
            # TODO add edits for page > 1
            con.sql(f"INSERT OR REPLACE INTO edits SELECT * FROM (SELECT unnest(range({len(df)})) AS rowid) POSITIONAL JOIN tbl")
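            # Note: the POSITIONAL JOIN above pairs displayed row i with rowid i,
            # which assumes the edited rows come from the first page (see TODO).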
            print(f"Saved {dataset} edits")


if __name__ == "__main__":
    demo.launch()
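
# Illustrative local usage (a sketch, assuming a session file exists under s/;
# the exact filename appears in the "Share Link" / "Use Locally" tabs):
#
#   import duckdb
#   con = duckdb.connect("s/<namespace>--<dataset>--<subset>--<split>--<uuid>.duckdb")
#   con.sql("SELECT * FROM edited_dataset LIMIT 5").show()  # original rows with edits applied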