File size: 2,925 Bytes
6216400
5822bba
 
 
9b95e7f
aef303c
5822bba
aef303c
5822bba
554bcd2
5822bba
554bcd2
 
6216400
aef303c
66f4448
aef303c
 
 
67fdc30
 
aef303c
 
 
 
 
 
9b95e7f
554bcd2
 
aef303c
 
 
 
 
 
554bcd2
 
 
aef303c
9b95e7f
 
 
 
 
 
518f556
 
 
 
aef303c
518f556
aef303c
 
 
 
9b95e7f
 
 
aef303c
9b95e7f
 
 
 
 
6216400
5822bba
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
import duckdb
from huggingface_hub import HfFileSystem
from huggingface_hub.hf_file_system import safe_quote
import pandas as pd
import requests

DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co"
PARQUET_REVISION="refs/convert/parquet"
TABLE_WILDCARD="{table}"

fs = HfFileSystem()
duckdb.register_filesystem(fs)

def get_parquet_files(dataset, config, split):
        response = requests.get(f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={config}", timeout=60)
        if response.status_code != 200:
            raise Exception(response)
        
        response = response.json()
        parquet_files = response["parquet_files"]
        file_names = [content["filename"] for content in parquet_files if content["split"] == split]
        if len(file_names) == 0:
             raise Exception("No parquet files found for dataset")
        return file_names

def run_command(dataset, config, split, sql):
    try:
        if TABLE_WILDCARD not in sql:
            raise Exception(f"Query must contains {TABLE_WILDCARD} wildcard.")
        
        parquet_files = get_parquet_files(dataset, config, split)
        print(f"File names found: {','.join(parquet_files)}")
        parquet_first_file = parquet_files[0]
        print(f"Trying with the first one {parquet_first_file}")
        location=f"hf://datasets/{dataset}@{safe_quote(PARQUET_REVISION)}/{config}/{parquet_first_file}"
        print(location)
        sql = sql.replace(TABLE_WILDCARD, f"'{location}'")
        result = duckdb.query(sql).to_df()
        print("Ok")
    except Exception as error:
        print(f"Error: {str(error)}")
        return pd.DataFrame({"Error": [f"❌ {str(error)}"]})
    return result

with gr.Blocks() as demo:
    gr.Markdown(" ## SQL Query using DuckDB for datasets server parquet files")
    dataset = gr.Textbox(label="dataset", placeholder="mstz/iris", value="mstz/iris")
    config = gr.Textbox(label="config", placeholder="iris", value="iris")
    split = gr.Textbox(label="split", placeholder="train", value="train")
    sql = gr.Textbox(
            label="Query in sql format - It should has {table} wildcard",
            placeholder=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
            value=f"SELECT sepal_length FROM {TABLE_WILDCARD} LIMIT 3",
            lines=3,
    )
    run_button = gr.Button("Run")
    gr.Markdown("### Result")
    cached_responses_table = gr.DataFrame()
    run_button.click(run_command, inputs=[dataset, config, split, sql], outputs=cached_responses_table)



if __name__ == "__main__":
    demo.launch()




# duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2").show()

# duckdb.query(f"SELECT idx as id, premise as p FROM '{location}' LIMIT 2")
# duckdb.query(f"SELECT max(idx) as max FROM '{location}' LIMIT 2")
# duckdb.query(f"SELECT idx  FROM '{location}'  ORDER BY idx DESC LIMIT 1").show()