"""Gradio app that previews a Hugging Face dataset as a spreadsheet.

The user picks a dataset / subset / split, the first rows of the matching
Parquet files are loaded through DuckDB, and each column gets a dropdown that
can replace the column with a DuckDB text-function expression.
"""

from functools import partial, lru_cache

import duckdb
import gradio as gr
import pandas as pd
import requests
from huggingface_hub import HfApi


# Library functions (as reported by the compatible-libraries endpoint) whose
# loading codes describe plain Parquet files.
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
# Placeholder table shown while no dataset is loaded: 4 columns x 10 blank rows.
EMPTY_DF = pd.DataFrame([{str(i): "" for i in range(4)}] * 10)
# Number of per-column transform dropdowns created up front; the ones that are
# not needed for the current table are kept hidden.
MAX_NUM_COLUMNS = 20

css = """
@media (prefers-color-scheme: dark) {
    .transparent-dropdown, .transparent-dropdown .container .wrap {
        background: var(--bg-dark);
    }
}
@media (prefers-color-scheme: light) {
    .transparent-dropdown, .transparent-dropdown .container .wrap {
        background: var(--bg);
    }
}
input {
    -webkit-user-select: none;
    -moz-user-select: none;
    -ms-user-select: none;
    user-select: none;
}
.cell-menu-button {
    z-index: -1;
}
thead {
    display: none;
}
"""

# NOTE: the `//` comments below must each stay on their own line, otherwise
# they comment out the statement that follows them.
js = """
function setDataFrameReadonly() {
    MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
    var observer = new MutationObserver(function(mutations, observer) {
        // fired when a mutation occurs
        document.querySelectorAll('.readonly-dataframe div .table-wrap button svelte-virtual-table-viewport table tbody tr td .cell-wrap input').forEach(i => i.setAttribute("readonly", "true"));
    });
    // define what element should be observed by the observer
    // and what types of mutations trigger the callback
    observer.observe(document, {
        subtree: true,
        childList: true
    });
}
"""

# Table of text functions; only the `Name` column (function signatures such as
# ones containing the word "string") is used by this module.
text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")


@lru_cache(maxsize=3)
def duckdb_sql(query: str) -> duckdb.DuckDBPyRelation:
    """Run *query* through DuckDB, memoizing the relation for recent queries."""
    return duckdb.sql(query)


def prepare_function(func: str, placeholder: str, column_name: str) -> str:
    """Substitute *column_name* for the first *placeholder* in a signature.

    When *func* contains a ``(``, only the text between the first and second
    ``(`` is searched, so a function whose own name contains the placeholder
    is left intact. Otherwise the first occurrence anywhere is replaced.
    """
    if "(" not in func:
        return func.replace(placeholder, column_name, 1)
    parts = func.split("(")
    parts[1] = parts[1].replace(placeholder, column_name, 1)
    return "(".join(parts)


def _transform_dropdown_kwargs(column_names) -> list[dict]:
    """Build ``gr.Dropdown`` kwargs for every transform dropdown.

    Returns one visible entry per column (choices: the column itself followed
    by every "string" text function applied to it), padded with hidden entries
    up to ``MAX_NUM_COLUMNS``.
    """
    # NOTE(review): column names are inserted into SQL expressions unquoted, so
    # names that are not plain identifiers (e.g. "0" or names with spaces) will
    # not select the column - confirm whether quoting is wanted here.
    string_funcs = [text_func for text_func in text_functions_df.Name if "string" in text_func]
    visible = [
        dict(
            choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in string_funcs],
            value=column_name,
            container=False,
            interactive=True,
            allow_custom_value=True,
            visible=True,
        )
        for column_name in column_names
    ]
    hidden = [
        dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False)
        for _ in range(MAX_NUM_COLUMNS - len(visible))
    ]
    return visible + hidden


with gr.Blocks(css=css, js=js) as demo:
    # Hidden state shared between event handlers.
    loading_codes_json = gr.JSON(visible=False)
    dataset_subset_split_textbox = gr.Textbox(visible=False)
    input_dataframe = gr.DataFrame(visible=False)
    with gr.Group():
        with gr.Row():
            dataset_dropdown = gr.Dropdown(label="Open Dataset", allow_custom_value=True, scale=10)
            subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
            split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
            gr.LoginButton()
        with gr.Row():
            transform_dropdowns = [gr.Dropdown(**kwargs) for kwargs in _transform_dropdown_kwargs(EMPTY_DF.columns)]
        dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")

    def show_subset_dropdown(dataset: str) -> tuple[dict, list[dict]]:
        """Return ``(subset dropdown kwargs, loading codes)`` for *dataset*.

        Loading codes come from the datasets-server ``compatible-libraries``
        endpoint, keeping the first library that reads Parquet. A missing
        dataset id, or one without a ``namespace/name`` form, yields an empty
        dropdown and no loading codes without issuing a request.
        """
        if not dataset or "/" not in dataset.strip().strip("/"):
            loading_codes = []
        else:
            resp = requests.get(
                "https://datasets-server.huggingface.co/compatible-libraries",
                params={"dataset": dataset},  # let requests URL-encode the id
                timeout=3,
            ).json()
            loading_codes = next(
                (lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS),
                None,
            ) or []
        subsets = [loading_code["config_name"] for loading_code in loading_codes]
        subset = subsets[0] if subsets else ""
        return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes

    def show_split_dropdown(subset: str, loading_codes: list[dict]) -> dict:
        """Return split dropdown kwargs for *subset*, defaulting to its first split."""
        splits = next(
            (list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset),
            [],
        )
        split = splits[0] if splits else ""
        return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))

    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> list[dict]:
        """Load the first 10 rows of the selected split.

        Returns a list whose first item is the ``gr.DataFrame`` kwargs and
        whose remaining items are the kwargs of each transform dropdown.
        Falls back to ``EMPTY_DF`` when the selection does not resolve to a
        file pattern.
        """
        # `.get` because the split dropdown accepts custom values (and may be
        # empty), in which case there is simply no pattern to load.
        pattern = next(
            (loading_code["arguments"]["splits"].get(split) for loading_code in loading_codes if loading_code["config_name"] == subset),
            None,
        )
        if dataset and subset and split and pattern:
            # Double any single quote so a user-supplied dataset id cannot
            # terminate the SQL string literal.
            path = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
            input_df = duckdb_sql(f"SELECT * FROM '{path}' LIMIT 10").df()
        else:
            input_df = EMPTY_DF
        table_kwargs = dict(value=input_df, column_widths=[f"{1/len(input_df.columns):.0%}"] * len(input_df.columns))
        return [table_kwargs] + _transform_dropdown_kwargs(input_df.columns)

    def set_dataframe(input_df: pd.DataFrame, *transforms: str, column_index: int):
        """Apply the selected per-column SQL expressions to *input_df*.

        Empty transforms (hidden dropdowns) are skipped. If DuckDB rejects the
        query, a warning is shown and the untransformed table is returned.
        ``column_index`` identifies the dropdown that fired; it is unused.
        """
        try:
            # duckdb.sql must be called from this frame: the query refers to
            # the local variable `input_df` by name.
            # NOTE(review): transforms are user-editable SQL by design
            # (allow_custom_value=True on the dropdowns).
            return duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;").df()
        except Exception as e:
            # gr.Error only has an effect when raised; gr.Warning displays the
            # message while still letting us return the original table.
            gr.Warning(f"{type(e).__name__}: {e}")
            return input_df

    def _table_updates(table_kwargs: dict, dropdown_kwargs: list[dict]) -> dict:
        """Map the table components and transform dropdowns to their new state."""
        return {
            input_dataframe: gr.DataFrame(**table_kwargs),
            dataframe: gr.DataFrame(**table_kwargs),
            **dict(zip(transform_dropdowns, [gr.Dropdown(**kwargs) for kwargs in dropdown_kwargs])),
        }

    for column_index, transform_dropdown in enumerate(transform_dropdowns):
        transform_dropdown.select(partial(set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)

    @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
    def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
        """Populate the page on load.

        Lists trending Parquet datasets (plus the logged-in user's own), then
        opens the dataset given by the ``dataset`` query parameter, or the
        first listed one.
        """
        api = HfApi(token=oauth_token.token if oauth_token else None)
        datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
        if oauth_token and (user := api.whoami().get("name")):
            datasets += list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
        # Guard against an empty listing instead of failing on datasets[0].
        dataset = request.query_params.get("dataset") or (datasets[0].id if datasets else "")
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        table_kwargs, *dropdown_kwargs = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
        return {
            dataset_dropdown: gr.Dropdown(choices=[info.id for info in datasets], value=dataset),
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            **_table_updates(table_kwargs, dropdown_kwargs),
        }

    @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
    def _show_subset_dropdown(dataset: str):
        """Refresh subsets, splits and the table after a dataset is selected."""
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        table_kwargs, *dropdown_kwargs = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
        return {
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            **_table_updates(table_kwargs, dropdown_kwargs),
        }

    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
    def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
        """Refresh splits and the table after a subset is selected."""
        splits = show_split_dropdown(subset, loading_codes)
        table_kwargs, *dropdown_kwargs = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
        return {
            split_dropdown: gr.Dropdown(**splits),
            **_table_updates(table_kwargs, dropdown_kwargs),
        }

    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[input_dataframe, dataframe] + transform_dropdowns)
    def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> dict:
        """Refresh the table after a split is selected."""
        table_kwargs, *dropdown_kwargs = show_input_dataframe(dataset, subset, split, loading_codes)
        return _table_updates(table_kwargs, dropdown_kwargs)


if __name__ == "__main__":
    demo.launch()