"""Gradio demo: run a DuckDB SQL query against a Hub dataset's parquet export.

The user supplies a dataset, config and split plus a SQL query containing the
``{table}`` wildcard; the wildcard is replaced by the ``hf://`` location of the
corresponding parquet file on the ``refs/convert/parquet`` revision.
"""

import duckdb
import gradio as gr
import pandas as pd
from huggingface_hub import HfFileSystem
from huggingface_hub.hf_file_system import safe_quote

PARQUET_REVISION = "refs/convert/parquet"
TABLE_WILDCARD = "{table}"

# Register the Hub filesystem so DuckDB can resolve ``hf://`` locations.
fs = HfFileSystem()
duckdb.register_filesystem(fs)


def _parquet_location(dataset, config, split):
    """Build the ``hf://`` location of the parquet file for a single split.

    Args:
        dataset: Dataset identifier entered by the user (e.g. ``"glue"``).
        config: Configuration name (e.g. ``"mnli"``).
        split: Split name (e.g. ``"train"``).

    Returns:
        The ``hf://datasets/...`` location string.

    Raises:
        ValueError: If any of the three fields is empty or whitespace-only.
    """
    fields = {"dataset": dataset, "config": config, "split": split}
    cleaned = {name: (value or "").strip() for name, value in fields.items()}
    missing = [name for name, value in cleaned.items() if not value]
    if missing:
        raise ValueError(f"Missing required field(s): {', '.join(missing)}.")

    dataset, config, split = cleaned["dataset"], cleaned["config"], cleaned["split"]
    # Only from one split.
    # NOTE(review): for namespaced datasets (e.g. "mstz/iris") the file name
    # built here contains the namespace as well — confirm this matches the
    # actual layout of the parquet revision.
    path = f"{config}/{dataset}-{split}.parquet"
    return f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{path}"


def greet(dataset, config, split, sql):
    """Run ``sql`` against the parquet file of one dataset split.

    Every occurrence of ``TABLE_WILDCARD`` in ``sql`` is replaced with the
    quoted parquet location before the query is handed to DuckDB.

    Args:
        dataset: Dataset identifier.
        config: Configuration name.
        split: Split name.
        sql: SQL query; must contain ``TABLE_WILDCARD``.

    Returns:
        A pandas DataFrame with the query result, or — on any failure — a
        single-column ``"Error"`` DataFrame carrying the error message.
    """
    try:
        if not sql or TABLE_WILDCARD not in sql:
            raise ValueError(f"Query must contain the {TABLE_WILDCARD} wildcard.")

        location = _parquet_location(dataset, config, split)
        print(location)

        # Embed the location as a SQL string literal; double any single quote
        # so an unusual name cannot terminate the literal early.
        quoted_location = "'" + location.replace("'", "''") + "'"
        query = sql.replace(TABLE_WILDCARD, quoted_location)

        result = duckdb.query(query).to_df()
        print("QUERY SUCCESSED")
    except Exception as error:
        # UI boundary: report every failure in the result table instead of
        # letting it propagate to Gradio.
        print(f"Error: {error}")
        return pd.DataFrame({"Error": [f"❌ {error}"]})
    return result


with gr.Blocks() as demo:
    gr.Markdown(" ## DuckDB demo using parquet revision")
    dataset = gr.Textbox(label="dataset", placeholder="mstz/iris")
    config = gr.Textbox(label="config", placeholder="iris")
    split = gr.Textbox(label="split", placeholder="train")
    sql = gr.Textbox(
        label="sql",
        placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
    )
    run_button = gr.Button("Run")
    gr.Markdown("### Result")
    cached_responses_table = gr.DataFrame()
    run_button.click(
        greet,
        inputs=[dataset, config, split, sql],
        outputs=cached_responses_table,
    )

if __name__ == "__main__":
    demo.launch()

# Example queries kept from the original scratch notes (apparently written
# against dataset="glue", config="mnli" — verify column names against the
# actual schema before use):
#   SELECT idx AS id, premise AS p FROM {table} LIMIT 2
#   SELECT max(idx) AS max FROM {table} LIMIT 2
#   SELECT idx FROM {table} ORDER BY idx DESC LIMIT 1