lhoestq HF staff committed on
Commit
33da8e3
·
1 Parent(s): b347aa0

add share link + use locally

Browse files
Files changed (1) hide show
  1. app.py +110 -41
app.py CHANGED
@@ -1,20 +1,25 @@
1
- from functools import lru_cache
 
2
 
3
  import duckdb
4
  import gradio as gr
5
  import pandas as pd
6
  import requests
7
- from duckdb import DuckDBPyRelation
8
  from duckdb.typing import DuckDBPyType
9
  from huggingface_hub import HfApi
10
 
 
11
  Table = DuckDBPyRelation
12
  Dtype = DuckDBPyType
13
  READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
14
- EMPTY_TABLE = duckdb.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")
15
- PAGE_SIZE = 100
 
16
  NUM_TRENDING_DATASETS = 10
17
  NUM_USER_DATASETS = 10
 
 
18
  css = """
19
  .transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
20
  background: var(--body-background-fill);
@@ -23,32 +28,53 @@ css = """
23
  padding: var(--size-4) 0 !important;
24
  max-width: 98% !important;
25
  }
 
 
 
26
  """
27
 
28
- @lru_cache(maxsize=3)
29
- def cached_duckdb_sql(query: str) -> Table:
30
- return duckdb.sql(query)
 
31
 
32
- def to_json_df(tbl: Table) -> pd.DataFrame:
33
- query = ", ".join("nullif(([" + col + "]::JSON)[0]::VARCHAR, 'null') as " + col for col in tbl.columns)
34
- return duckdb.sql(f"SELECT {query} FROM tbl").df()
 
 
 
35
 
36
- def from_json_df(df: pd.DataFrame, dtypes: list[Dtype]) -> Table:
37
- query = ", ".join("(ifnull(" + col + ", 'null')::JSON)::" + dtype + " as " + col for col, dtype in zip(df.columns, dtypes))
38
- return duckdb.sql(f"SELECT {query} FROM df")
 
 
 
 
 
 
 
 
 
39
 
40
  with gr.Blocks(css=css) as demo:
41
- loading_codes_json = gr.JSON(visible=False)
 
42
  with gr.Row():
43
  with gr.Column():
44
  gr.Markdown("# <p style='text-align:center;'>πŸ€— (WIP) Hugging Face Dataset Spreadsheets πŸ“</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)")
45
  with gr.Group():
46
- with gr.Row():
47
- dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
48
- subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
49
- split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
50
- gr.LoginButton()
51
- dataframe = gr.DataFrame(to_json_df(EMPTY_TABLE), interactive=True, wrap=True)
 
 
 
 
52
 
53
  def show_subset_dropdown(dataset: str):
54
  if dataset and "/" not in dataset.strip().strip("/"):
@@ -64,60 +90,103 @@ with gr.Blocks(css=css) as demo:
64
  split = (splits or [""])[0]
65
  return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
66
 
67
- def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
68
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
69
- if dataset and subset and split and pattern:
70
- tbl = cached_duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {PAGE_SIZE}")
 
 
 
 
 
 
 
71
  else:
72
- tbl = EMPTY_TABLE
73
- return dict(value=to_json_df(tbl))
74
 
75
- @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
76
- def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
77
- api = HfApi(token=oauth_token.token if oauth_token else None)
78
- datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
79
- if oauth_token and (user := api.whoami().get("name")):
80
- datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
81
- dataset = request.query_params.get("dataset") or datasets[0].id
 
 
 
 
 
82
  subsets, loading_codes = show_subset_dropdown(dataset)
 
83
  splits = show_split_dropdown(subsets["value"], loading_codes)
84
- input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
 
 
85
  return {
86
  dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
87
  loading_codes_json: loading_codes,
88
  subset_dropdown: gr.Dropdown(**subsets),
89
  split_dropdown: gr.Dropdown(**splits),
 
90
  dataframe: gr.DataFrame(**input_dataframe),
 
 
 
 
 
91
  }
92
 
93
- @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, dataframe])
94
- def _show_subset_dropdown(dataset: str):
95
  subsets, loading_codes = show_subset_dropdown(dataset)
96
  splits = show_split_dropdown(subsets["value"], loading_codes)
97
- input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
 
98
  return {
99
  loading_codes_json: loading_codes,
100
  subset_dropdown: gr.Dropdown(**subsets),
101
  split_dropdown: gr.Dropdown(**splits),
 
102
  dataframe: gr.DataFrame(**input_dataframe),
103
  }
104
 
105
- @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, dataframe])
106
  def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
107
  splits = show_split_dropdown(subset, loading_codes)
108
- input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
 
109
  return {
110
  split_dropdown: gr.Dropdown(**splits),
 
111
  dataframe: gr.DataFrame(**input_dataframe),
112
  }
113
 
114
- @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[dataframe])
115
  def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
116
- input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes)
 
117
  return {
 
118
  dataframe: gr.DataFrame(**input_dataframe),
119
  }
120
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  if __name__ == "__main__":
123
  demo.launch()
 
1
+ import os
2
+ from uuid import uuid4
3
 
4
  import duckdb
5
  import gradio as gr
6
  import pandas as pd
7
  import requests
8
+ from duckdb import DuckDBPyConnection, DuckDBPyRelation
9
  from duckdb.typing import DuckDBPyType
10
  from huggingface_hub import HfApi
11
 
12
+ Connection = DuckDBPyConnection
13
  Table = DuckDBPyRelation
14
  Dtype = DuckDBPyType
15
  READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
16
+ memory_con = duckdb.connect(":memory:")
17
+ empty_tbl = memory_con.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")
18
+ PAGE_SIZE = 5
19
  NUM_TRENDING_DATASETS = 10
20
  NUM_USER_DATASETS = 10
21
+ SESSIONS_DIR = "s"
22
+ URL = "https://huggingface.co/spaces/lhoestq/dataset-spreadsheets"
23
  css = """
24
  .transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
25
  background: var(--body-background-fill);
 
28
  padding: var(--size-4) 0 !important;
29
  max-width: 98% !important;
30
  }
31
+ .cell-menu-button {
32
+ z-index: -1;
33
+ }
34
  """
35
 
36
+ def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
37
+ query = ", ".join("nullif(([" + col + "]::JSON)[0]::VARCHAR, 'null') AS " + col for col in tbl.columns)
38
+ out = con.sql(f"SELECT {query} FROM tbl").df()
39
+ return out
40
 
41
+ def from_json_df(con: Connection, df: pd.DataFrame, columns: list[str], dtypes: list[Dtype]) -> Table:
42
+ query = ", ".join(
43
+ "if(" + col + " IS null, null, (" + col + "::JSON::" + str(dtype) + ")"
44
+ + ("[2:-2]" if str(dtype) == "VARCHAR" else "") # remove double quotes at the start and end
45
+ + ") AS " + col for col, dtype in zip(columns, dtypes))
46
+ return con.sql(f"SELECT {query} FROM df")
47
 
48
+ def setup_edits(con: Connection, dataset: str, pattern: str) -> None:
49
+ con.sql(f"CREATE VIEW IF NOT EXISTS dataset AS SELECT * FROM 'hf://datasets/{dataset}/{pattern}'")
50
+ empty_dataset_tbl = con.sql("SELECT * FROM dataset LIMIT 0;")
51
+ columns = empty_dataset_tbl.columns
52
+ dtypes = empty_dataset_tbl.dtypes
53
+ con.sql(f"CREATE TABLE IF NOT EXISTS edits(rowid INTEGER PRIMARY KEY, {', '.join(col + ' ' + str(dtype) for col, dtype in zip(columns, dtypes))})")
54
+ con.sql(
55
+ "CREATE VIEW IF NOT EXISTS edited_dataset AS "
56
+ "WITH edits_per_rowid AS (SELECT * FROM (SELECT unnest(range(max(rowid) + 1)) AS rowid FROM edits) LEFT JOIN edits USING (rowid) ORDER BY rowid) "
57
+ f"SELECT {', '.join('ifnull(edits_per_rowid.' + col + ', dataset.' + col + ') AS ' + col for col in columns)} FROM dataset POSITIONAL JOIN edits_per_rowid"
58
+ )
59
+ gr.set_static_paths(paths=[SESSIONS_DIR + "/"])
60
 
61
  with gr.Blocks(css=css) as demo:
62
+ session_state = gr.BrowserState()
63
+ loading_codes_json = gr.JSON([], visible=False)
64
  with gr.Row():
65
  with gr.Column():
66
  gr.Markdown("# <p style='text-align:center;'>πŸ€— (WIP) Hugging Face Dataset Spreadsheets πŸ“</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)")
67
  with gr.Group():
68
+ with gr.Tab("Select Dataset"):
69
+ with gr.Row():
70
+ dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
71
+ subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
72
+ split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
73
+ with gr.Tab("Share Link"):
74
+ share_link_textbox = gr.Textbox(label="Copy the link to the Spreadsheet:", show_copy_button=True, interactive=False)
75
+ with gr.Tab("Use Locally"):
76
+ use_locally_markdown = gr.Markdown()
77
+ dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
78
 
79
  def show_subset_dropdown(dataset: str):
80
  if dataset and "/" not in dataset.strip().strip("/"):
 
90
  split = (splits or [""])[0]
91
  return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
92
 
93
+ def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str):
94
  pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
95
+ if session and dataset and subset and split and pattern:
96
+ duckdb_file = session + ".duckdb"
97
+ os.makedirs(SESSIONS_DIR, exist_ok=True)
98
+ con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
99
+ setup_edits(con, dataset, pattern)
100
+ # Uncomment to have one edit for testing
101
+ # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
102
+ tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE}")
103
+ return dict(value=to_json_df(con, tbl))
104
  else:
105
+ return dict(value=to_json_df(memory_con, empty_tbl))
106
+
107
 
108
+ @demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown])
109
+ def _fetch_datasets(session: str | None, request: gr.Request):
110
+ datasets = list(HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
111
+ session = request.query_params.get(SESSIONS_DIR) or session
112
+ if session:
113
+ namespace, dataset_name, subset, split, _ = session.split("--")
114
+ dataset = namespace + "/" + dataset_name
115
+ if "dataset" in request.query_params and request.query_params["dataset"] != dataset:
116
+ session = None
117
+ dataset = request.query_params["dataset"]
118
+ else:
119
+ dataset = request.query_params.get("dataset") or datasets[0].id
120
  subsets, loading_codes = show_subset_dropdown(dataset)
121
+ subsets["value"] = subset if session else subsets["value"]
122
  splits = show_split_dropdown(subsets["value"], loading_codes)
123
+ splits["value"] = split if session else splits["value"]
124
+ session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
125
+ input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
126
  return {
127
  dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
128
  loading_codes_json: loading_codes,
129
  subset_dropdown: gr.Dropdown(**subsets),
130
  split_dropdown: gr.Dropdown(**splits),
131
+ session_state: session,
132
  dataframe: gr.DataFrame(**input_dataframe),
133
+ share_link_textbox: f"{URL}?{SESSIONS_DIR}={session}",
134
+ use_locally_markdown: (
135
+ f"""In DuckDB:\n\n```sql\nATTACH '{URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
136
+ f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
137
+ )
138
  }
139
 
140
+ @dataset_dropdown.select(inputs=[dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
141
+ def _show_subset_dropdown(session: str | None, dataset: str):
142
  subsets, loading_codes = show_subset_dropdown(dataset)
143
  splits = show_split_dropdown(subsets["value"], loading_codes)
144
+ session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
145
+ input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
146
  return {
147
  loading_codes_json: loading_codes,
148
  subset_dropdown: gr.Dropdown(**subsets),
149
  split_dropdown: gr.Dropdown(**splits),
150
+ session_state: session,
151
  dataframe: gr.DataFrame(**input_dataframe),
152
  }
153
 
154
+ @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe])
155
  def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
156
  splits = show_split_dropdown(subset, loading_codes)
157
+ session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
158
+ input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session)
159
  return {
160
  split_dropdown: gr.Dropdown(**splits),
161
+ session_state: session,
162
  dataframe: gr.DataFrame(**input_dataframe),
163
  }
164
 
165
+ @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe])
166
  def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
167
+ session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
168
+ input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session)
169
  return {
170
+ session_state: session,
171
  dataframe: gr.DataFrame(**input_dataframe),
172
  }
173
+
174
+ @dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json])
175
+ def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict]):
176
+ pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
177
+ if session and dataset and subset and split and pattern:
178
+ duckdb_file = session + ".duckdb"
179
+ os.makedirs(SESSIONS_DIR, exist_ok=True)
180
+ con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
181
+ setup_edits(con, dataset, pattern)
182
+ empty_dataset_tbl = con.sql("SELECT * EXCLUDE (rowid) FROM edits LIMIT 0;")
183
+ columns = empty_dataset_tbl.columns
184
+ dtypes = empty_dataset_tbl.dtypes
185
+ tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
186
+ # TODO add edits for page > 1
187
+ con.sql(f"INSERT OR REPLACE INTO edits SELECT * FROM (SELECT unnest(range({len(df)})) AS rowid) POSITIONAL JOIN tbl")
188
+ print(f"Saved {dataset} edits")
189
+
190
 
191
  if __name__ == "__main__":
192
  demo.launch()