File size: 2,822 Bytes
6216400
5822bba
 
 
9b95e7f
aef303c
5822bba
aef303c
5822bba
554bcd2
5822bba
554bcd2
 
6216400
aef303c
66f4448
aef303c
 
 
67fdc30
 
aef303c
 
 
 
 
 
9b95e7f
554bcd2
 
aef303c
 
 
 
 
 
554bcd2
 
 
aef303c
9b95e7f
 
 
 
 
 
 
 
 
 
aef303c
 
 
 
 
 
9b95e7f
 
 
aef303c
9b95e7f
 
 
 
 
6216400
5822bba
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
import duckdb
from huggingface_hub import HfFileSystem
from huggingface_hub.hf_file_system import safe_quote
import pandas as pd
import requests

DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co"
PARQUET_REVISION="refs/convert/parquet"
TABLE_WILDCARD="{table}"

fs = HfFileSystem()
duckdb.register_filesystem(fs)

def get_parquet_files(dataset, config, split):
        response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={config}", timeout=60)
        if response.status_code != 200:
            raise Exception(response)
        
        response = response.json()
        parquet_files = response["parquet_files"]
        file_names = [content["filename"] for content in parquet_files if content["split"] == split]
        if len(file_names) == 0:
             raise Exception("No parquet files found for dataset")
        return file_names

def run_command(dataset, config, split, sql):
    try:
        if TABLE_WILDCARD not in sql:
            raise Exception(f"Query must contains {TABLE_WILDCARD} wildcard.")
        
        parquet_files = get_parquet_files(dataset, config, split)
        print(f"File names found: {','.join(parquet_files)}")
        parquet_first_file = parquet_files[0]
        print(f"Trying with the first one {parquet_first_file}")
        location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{config}/{parquet_first_file}"
        print(location)
        sql = sql.replace(TABLE_WILDCARD, f"'{location}'")
        result = duckdb.query(sql).to_df()
        print("Ok")
    except Exception as error:
        print(f"Error: {str(error)}")
        return pd.DataFrame({"Error": [f"❌ {str(error)}"]})
    return result

with gr.Blocks() as demo:
    gr.Markdown(" ## DuckDB demo using parquet revision")
    dataset = gr.Textbox(label="dataset", placeholder="mstz/iris")
    config = gr.Textbox(label="config", placeholder="iris")
    split = gr.Textbox(label="split", placeholder="train")
    sql = gr.Textbox(
            label="Query in sql format",
            placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
            value=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
            lines=3,
    )
    run_button = gr.Button("Run")
    gr.Markdown("### Result")
    cached_responses_table = gr.DataFrame()
    run_button.click(run_command, inputs=[dataset, config, split, sql], outputs=cached_responses_table)



if __name__ == "__main__":
    demo.launch()




# duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2").show()

# duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2")
# duckdb.query(f"SELECT max(idx) as max FROM '{location}' LIMIT 2")
# duckdb.query(f"SELECT idx  FROM '{location}'  ORDER BY idx DESC LIMIT 1").show()