Spaces:
Running
Running
Merge pull request #116 from thliang01/main
Browse files- duckdb/008_loading_parquet.py +266 -0
- duckdb/README.md +1 -0
duckdb/008_loading_parquet.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# /// script
|
2 |
+
# requires-python = ">=3.10"
|
3 |
+
# dependencies = [
|
4 |
+
# "marimo",
|
5 |
+
# "duckdb==1.2.1",
|
6 |
+
# "pyarrow==19.0.1",
|
7 |
+
# "plotly.express",
|
8 |
+
# "sqlglot==27.0.0",
|
9 |
+
# ]
|
10 |
+
# ///
|
11 |
+
|
12 |
+
import marimo
|
13 |
+
|
14 |
+
__generated_with = "0.14.10"
|
15 |
+
app = marimo.App(width="medium")
|
16 |
+
|
17 |
+
|
18 |
+
@app.cell(hide_code=True)
def _(mo):
    # Notebook title and author byline, rendered as Markdown.
    # The stray lone "#" line was dropped: it produced an empty heading
    # below the byline.
    mo.md(
        r"""
        # Loading Parquet files with DuckDB
        *By [Thomas Liang](https://github.com/thliang01)*
        """
    )
    return
|
28 |
+
|
29 |
+
|
30 |
+
@app.cell(hide_code=True)
def _(mo):
    # Introduction: why Parquet, and what the notebook covers.
    # The outline is a Markdown list (not a raw <ul> block) so the inline
    # code span `read_parquet` is rendered as code; Markdown syntax is not
    # processed inside block-level HTML.
    mo.md(
        r"""
        [Apache Parquet](https://parquet.apache.org/) is a popular columnar storage format, optimized for analytics. Its columnar nature allows query engines like DuckDB to read only the necessary columns, leading to significant performance gains, especially for wide tables.

        DuckDB has excellent, built-in support for reading Parquet files, making it incredibly easy to query and analyze Parquet data directly without a separate loading step.

        In this notebook, we'll explore how to load and analyze Airbnb's stock price data from a remote Parquet file:

        - Querying a remote Parquet file directly.
        - Using the `read_parquet` function for more control.
        - Creating a persistent table from a Parquet file.
        - Performing basic data analysis and visualization.
        """
    )
    return
|
48 |
+
|
49 |
+
|
50 |
+
@app.cell
def _():
    # Location of the remote Parquet file used by every query in the notebook.
    # Written as two adjacent literals; the resulting string is unchanged.
    AIRBNB_URL = (
        'https://huggingface.co/datasets/BatteRaquette58/airbnb-stock-price'
        '/resolve/main/data/airbnb-stock.parquet'
    )
    return (AIRBNB_URL,)
|
54 |
+
|
55 |
+
|
56 |
+
@app.cell(hide_code=True)
def _(mo):
    # Section heading for the direct `FROM` query example.
    mo.md(r"""## Using `FROM` to query Parquet files""")
    return
|
60 |
+
|
61 |
+
|
62 |
+
@app.cell(hide_code=True)
def _(mo):
    # Explanatory text introducing querying a Parquet file straight from a
    # `FROM` clause.
    mo.md(
        r"""
        The simplest way to query a Parquet file is to use it directly in a `FROM` clause, just like you would with a table. DuckDB will automatically detect that it's a Parquet file and read it accordingly.

        Let's query a dataset of Airbnb's stock price from Hugging Face.
        """
    )
    return
|
72 |
+
|
73 |
+
|
74 |
+
@app.cell
def _(AIRBNB_URL, mo):
    # Preview the first five rows by using the remote Parquet URL directly
    # as the FROM target.
    # The signature lists only names other cells actually provide; the
    # former `null` parameter referred to nothing defined in the notebook.
    mo.sql(
        f"""
        SELECT *
        FROM '{AIRBNB_URL}'
        LIMIT 5;
        """
    )
    return
|
84 |
+
|
85 |
+
|
86 |
+
@app.cell(hide_code=True)
def _(mo):
    # Section heading for the `read_parquet` example.
    mo.md(r"""## Using `read_parquet`""")
    return
|
90 |
+
|
91 |
+
|
92 |
+
@app.cell(hide_code=True)
def _(mo):
    # Explanatory text: when to prefer `read_parquet`, a few of its options,
    # and the column-selection example that follows.
    mo.md(
        r"""
        For more control, you can use the `read_parquet` table function. This is useful when you need to specify options, for example, when dealing with multiple files or specific data types.
        Some useful options for `read_parquet` include:

        - `binary_as_string=True`: Reads `BINARY` columns as `VARCHAR`.
        - `filename=True`: Adds a `filename` column with the path of the file for each row.
        - `hive_partitioning=True`: Enables reading of Hive-partitioned datasets.

        Here, we'll use `read_parquet` to select only a few relevant columns. This is much more efficient than `SELECT *` because DuckDB only needs to read the data for the columns we specify.
        """
    )
    return
|
107 |
+
|
108 |
+
|
109 |
+
@app.cell
def _(AIRBNB_URL, mo):
    # Select five named columns from the remote file through `read_parquet`
    # and show the first five rows.
    mo.sql(
        f"""
        SELECT Date, Open, "close_last", High, Low
        FROM read_parquet('{AIRBNB_URL}')
        LIMIT 5;
        """
    )
    return
|
119 |
+
|
120 |
+
|
121 |
+
@app.cell(hide_code=True)
def _(mo):
    # Explanatory text with a (non-executed) SQL snippet showing a glob
    # pattern passed to `read_parquet`.
    mo.md(
        r"""
        You can also read multiple Parquet files at once using a glob pattern. For example, to read all Parquet files in a directory `data/`:

        ```sql
        SELECT * FROM read_parquet('data/*.parquet');
        ```
        """
    )
    return
|
133 |
+
|
134 |
+
|
135 |
+
@app.cell(hide_code=True)
def _(mo):
    # Section heading for the table-creation example.
    mo.md(r"""## Creating a table from a Parquet file""")
    return
|
139 |
+
|
140 |
+
|
141 |
+
@app.cell(hide_code=True)
def _(mo):
    # Explanatory text motivating loading the Parquet data into a table.
    mo.md(
        r"""
        While querying Parquet files directly is powerful, sometimes it's useful to load the data into a persistent table within your DuckDB database. This can simplify subsequent queries and is a good practice if you'll be accessing the data frequently.
        """
    )
    return
|
149 |
+
|
150 |
+
|
151 |
+
@app.cell
def _(AIRBNB_URL, mo):
    # Create (or replace) the `airbnb_stock` table from the remote Parquet
    # file and keep the statement's result in `stock_table`.
    stock_table = mo.sql(
        f"""
        CREATE OR REPLACE TABLE airbnb_stock AS
        SELECT * FROM read_parquet('{AIRBNB_URL}');
        """
    )
    # NOTE(review): `airbnb_stock` is never assigned as a Python name in this
    # cell; it is only the table created by the SQL above. Presumably marimo
    # treats SQL-created tables as cell definitions so dependent cells can
    # list them as parameters — confirm.
    return airbnb_stock, stock_table
|
160 |
+
|
161 |
+
|
162 |
+
@app.cell(hide_code=True)
def _(mo, stock_table):
    # Interpolates the result of the CREATE TABLE statement into the
    # Markdown, followed by a sentence introducing the next query.
    mo.md(
        f"""
        {stock_table}

        Now that the `airbnb_stock` table is created, we can query it like any other SQL table.
        """
    )
    return
|
172 |
+
|
173 |
+
|
174 |
+
@app.cell
def _(airbnb_stock, mo):
    # Preview the first five rows of the `airbnb_stock` table.
    # `airbnb_stock` is not used in the Python code itself; listing it as a
    # parameter makes this cell depend on the cell that creates the table.
    mo.sql(
        f"""
        SELECT * FROM airbnb_stock LIMIT 5;
        """
    )
    return
|
182 |
+
|
183 |
+
|
184 |
+
@app.cell(hide_code=True)
def _(mo):
    # Section heading for the analysis/plotting part of the notebook.
    mo.md(r"""## Analysis and Visualization""")
    return
|
188 |
+
|
189 |
+
|
190 |
+
@app.cell(hide_code=True)
def _(mo):
    # One-line introduction to the closing-price query that follows.
    mo.md(r"""Let's perform a simple analysis: plotting the closing stock price over time.""")
    return
|
194 |
+
|
195 |
+
|
196 |
+
@app.cell
def _(airbnb_stock, mo):
    # Build the series to plot: a date column and the closing price, ordered
    # by date, exported to other cells as `stock_data`.
    # NOTE(review): `to_timestamp(Date)` assumes the `Date` column holds an
    # epoch-style numeric value — confirm against the dataset's schema.
    stock_data = mo.sql(
        f"""
        SELECT
            CAST(to_timestamp(Date) AS DATE) AS "Date",
            "close_last"
        FROM airbnb_stock
        ORDER BY "Date";
        """
    )
    return (stock_data,)
|
208 |
+
|
209 |
+
|
210 |
+
@app.cell(hide_code=True)
def _(mo):
    # One-line introduction to the Plotly chart that follows.
    mo.md(r"""Now we can easily visualize this result using marimo's integration with plotting libraries like Plotly.""")
    return
|
214 |
+
|
215 |
+
|
216 |
+
@app.cell
def _(px, stock_data):
    # Line chart of the closing price over time.
    # The helper is underscore-prefixed so it stays private to this cell.
    _axis_labels = {"Date": "Date", "close_last": "Closing Price (USD)"}
    px.line(
        stock_data,
        x="Date",
        y="close_last",
        title="Airbnb (ABNB) Stock Price Over Time",
        labels=_axis_labels,
    )
    return
|
226 |
+
|
227 |
+
|
228 |
+
@app.cell(hide_code=True)
def _(mo):
    # Section heading for the closing summary.
    mo.md(r"""## Conclusion""")
    return
|
232 |
+
|
233 |
+
|
234 |
+
@app.cell(hide_code=True)
def _(mo):
    # Closing summary of what the notebook covered.
    # The recap is a Markdown list (not a raw <ul> block) so the inline code
    # spans `FROM` and `read_parquet` are rendered as code; Markdown syntax
    # is not processed inside block-level HTML.
    mo.md(
        r"""
        In this notebook, we've seen how easy it is to work with Parquet files in DuckDB. We learned how to:

        - Query Parquet files directly from a URL using a simple `FROM` clause.
        - Use the `read_parquet` function for more fine-grained control and efficiency.
        - Load data from a Parquet file into a DuckDB table.
        - Seamlessly analyze and visualize the data using SQL and Python.

        DuckDB's native Parquet support makes it a powerful tool for interactive data analysis on large datasets without complex ETL pipelines.
        """
    )
    return
|
250 |
+
|
251 |
+
|
252 |
+
@app.cell
def _():
    # Imports shared by the other cells: `mo` (marimo) and `px`
    # (plotly.express) are returned so cells can take them as parameters.
    import marimo as mo
    import plotly.express as px
    return mo, px
|
257 |
+
|
258 |
+
|
259 |
+
@app.cell
def _():
    # Imports pyarrow without using or exporting it.
    # NOTE(review): presumably kept so the pinned `pyarrow` dependency is
    # loaded for result conversion — confirm whether it is still needed.
    import pyarrow
    return
|
263 |
+
|
264 |
+
|
265 |
+
# Run the notebook app when this file is executed as a script.
if __name__ == "__main__":
    app.run()
|
duckdb/README.md
CHANGED
@@ -29,3 +29,4 @@ Thanks to all our notebook authors!
|
|
29 |
|
30 |
* [Mustjaab](https://github.com/Mustjaab)
|
31 |
* [julius383](https://github.com/julius383)
|
|
|
|
29 |
|
30 |
* [Mustjaab](https://github.com/Mustjaab)
|
31 |
* [julius383](https://github.com/julius383)
|
32 |
+
* [thliang01](https://github.com/thliang01)
|