Spaces:
Running
Running
refactor: Improve DuckDB Parquet tutorial notebook
Browse files
- Add author attribution to notebook header
- Add sqlglot dependency for future SQL parsing capabilities
- Use consistent table references via variables instead of string literals
- Remove unused pyarrow import
- Improve markdown formatting for better readability
The notebook now properly references the created airbnb_stock table
through variables, making the code more maintainable and reducing
the risk of typos in table names.
- duckdb/008_loading_parquet.py +34 -34
duckdb/008_loading_parquet.py
CHANGED
@@ -5,6 +5,7 @@
|
|
5 |
# "duckdb==1.2.1",
|
6 |
# "pyarrow==19.0.1",
|
7 |
# "plotly.express",
|
|
|
8 |
# ]
|
9 |
# ///
|
10 |
|
@@ -16,7 +17,13 @@ app = marimo.App(width="medium")
|
|
16 |
|
17 |
@app.cell(hide_code=True)
|
18 |
def _(mo):
|
19 |
-
mo.md(
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
return
|
21 |
|
22 |
|
@@ -39,10 +46,11 @@ def _(mo):
|
|
39 |
)
|
40 |
return
|
41 |
|
|
|
42 |
@app.cell
|
43 |
def _():
|
44 |
AIRBNB_URL = 'https://huggingface.co/datasets/BatteRaquette58/airbnb-stock-price/resolve/main/data/airbnb-stock.parquet'
|
45 |
-
return AIRBNB_URL,
|
46 |
|
47 |
|
48 |
@app.cell(hide_code=True)
|
@@ -64,7 +72,7 @@ def _(mo):
|
|
64 |
|
65 |
|
66 |
@app.cell
|
67 |
-
def _(AIRBNB_URL, mo):
|
68 |
mo.sql(
|
69 |
f"""
|
70 |
SELECT *
|
@@ -86,8 +94,8 @@ def _(mo):
|
|
86 |
mo.md(
|
87 |
r"""
|
88 |
For more control, you can use the `read_parquet` table function. This is useful when you need to specify options, for example, when dealing with multiple files or specific data types.
|
89 |
-
|
90 |
Some useful options for `read_parquet` include:
|
|
|
91 |
- `binary_as_string=True`: Reads `BINARY` columns as `VARCHAR`.
|
92 |
- `filename=True`: Adds a `filename` column with the path of the file for each row.
|
93 |
- `hive_partitioning=True`: Enables reading of Hive-partitioned datasets.
|
@@ -148,23 +156,23 @@ def _(AIRBNB_URL, mo):
|
|
148 |
SELECT * FROM read_parquet('{AIRBNB_URL}');
|
149 |
"""
|
150 |
)
|
151 |
-
return stock_table
|
152 |
|
153 |
|
154 |
-
@app.cell
|
155 |
def _(mo, stock_table):
|
156 |
mo.md(
|
157 |
f"""
|
158 |
-
|
159 |
|
160 |
-
|
161 |
-
|
162 |
)
|
163 |
return
|
164 |
|
165 |
|
166 |
@app.cell
|
167 |
-
def _(mo):
|
168 |
mo.sql(
|
169 |
f"""
|
170 |
SELECT * FROM airbnb_stock LIMIT 5;
|
@@ -181,15 +189,12 @@ def _(mo):
|
|
181 |
|
182 |
@app.cell(hide_code=True)
|
183 |
def _(mo):
|
184 |
-
mo.md(
|
185 |
-
r"""
|
186 |
-
Let's perform a simple analysis: plotting the closing stock price over time.
|
187 |
-
"""
|
188 |
-
)
|
189 |
return
|
190 |
|
|
|
191 |
@app.cell
|
192 |
-
def _(mo):
|
193 |
stock_data = mo.sql(
|
194 |
f"""
|
195 |
SELECT
|
@@ -199,16 +204,12 @@ def _(mo):
|
|
199 |
ORDER BY "Date";
|
200 |
"""
|
201 |
)
|
202 |
-
return stock_data,
|
203 |
|
204 |
|
205 |
@app.cell(hide_code=True)
|
206 |
def _(mo):
|
207 |
-
mo.md(
|
208 |
-
r"""
|
209 |
-
Now we can easily visualize this result using marimo's integration with plotting libraries like Plotly.
|
210 |
-
"""
|
211 |
-
)
|
212 |
return
|
213 |
|
214 |
|
@@ -234,16 +235,16 @@ def _(mo):
|
|
234 |
def _(mo):
|
235 |
mo.md(
|
236 |
r"""
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
)
|
248 |
return
|
249 |
|
@@ -258,9 +259,8 @@ def _():
|
|
258 |
@app.cell
|
259 |
def _():
|
260 |
import pyarrow
|
261 |
-
return
|
262 |
|
263 |
|
264 |
if __name__ == "__main__":
|
265 |
app.run()
|
266 |
-
|
|
|
5 |
# "duckdb==1.2.1",
|
6 |
# "pyarrow==19.0.1",
|
7 |
# "plotly.express",
|
8 |
+
# "sqlglot==27.0.0",
|
9 |
# ]
|
10 |
# ///
|
11 |
|
|
|
17 |
|
18 |
@app.cell(hide_code=True)
|
19 |
def _(mo):
|
20 |
+
mo.md(
|
21 |
+
r"""
|
22 |
+
# Loading Parquet files with DuckDB
|
23 |
+
*By [Thomas Liang](https://github.com/thliang01)*
|
24 |
+
#
|
25 |
+
"""
|
26 |
+
)
|
27 |
return
|
28 |
|
29 |
|
|
|
46 |
)
|
47 |
return
|
48 |
|
49 |
+
|
50 |
@app.cell
|
51 |
def _():
|
52 |
AIRBNB_URL = 'https://huggingface.co/datasets/BatteRaquette58/airbnb-stock-price/resolve/main/data/airbnb-stock.parquet'
|
53 |
+
return (AIRBNB_URL,)
|
54 |
|
55 |
|
56 |
@app.cell(hide_code=True)
|
|
|
72 |
|
73 |
|
74 |
@app.cell
|
75 |
+
def _(AIRBNB_URL, mo, null):
|
76 |
mo.sql(
|
77 |
f"""
|
78 |
SELECT *
|
|
|
94 |
mo.md(
|
95 |
r"""
|
96 |
For more control, you can use the `read_parquet` table function. This is useful when you need to specify options, for example, when dealing with multiple files or specific data types.
|
|
|
97 |
Some useful options for `read_parquet` include:
|
98 |
+
|
99 |
- `binary_as_string=True`: Reads `BINARY` columns as `VARCHAR`.
|
100 |
- `filename=True`: Adds a `filename` column with the path of the file for each row.
|
101 |
- `hive_partitioning=True`: Enables reading of Hive-partitioned datasets.
|
|
|
156 |
SELECT * FROM read_parquet('{AIRBNB_URL}');
|
157 |
"""
|
158 |
)
|
159 |
+
return airbnb_stock, stock_table
|
160 |
|
161 |
|
162 |
+
@app.cell(hide_code=True)
|
163 |
def _(mo, stock_table):
|
164 |
mo.md(
|
165 |
f"""
|
166 |
+
{stock_table}
|
167 |
|
168 |
+
Now that the `airbnb_stock` table is created, we can query it like any other SQL table.
|
169 |
+
"""
|
170 |
)
|
171 |
return
|
172 |
|
173 |
|
174 |
@app.cell
|
175 |
+
def _(airbnb_stock, mo):
|
176 |
mo.sql(
|
177 |
f"""
|
178 |
SELECT * FROM airbnb_stock LIMIT 5;
|
|
|
189 |
|
190 |
@app.cell(hide_code=True)
|
191 |
def _(mo):
|
192 |
+
mo.md(r"""Let's perform a simple analysis: plotting the closing stock price over time.""")
|
|
|
|
|
|
|
|
|
193 |
return
|
194 |
|
195 |
+
|
196 |
@app.cell
|
197 |
+
def _(airbnb_stock, mo):
|
198 |
stock_data = mo.sql(
|
199 |
f"""
|
200 |
SELECT
|
|
|
204 |
ORDER BY "Date";
|
205 |
"""
|
206 |
)
|
207 |
+
return (stock_data,)
|
208 |
|
209 |
|
210 |
@app.cell(hide_code=True)
|
211 |
def _(mo):
|
212 |
+
mo.md(r"""Now we can easily visualize this result using marimo's integration with plotting libraries like Plotly.""")
|
|
|
|
|
|
|
|
|
213 |
return
|
214 |
|
215 |
|
|
|
235 |
def _(mo):
|
236 |
mo.md(
|
237 |
r"""
|
238 |
+
In this notebook, we've seen how easy it is to work with Parquet files in DuckDB. We learned how to:
|
239 |
+
<ul>
|
240 |
+
<li>Query Parquet files directly from a URL using a simple `FROM` clause.</li>
|
241 |
+
<li>Use the `read_parquet` function for more fine-grained control and efficiency.</li>
|
242 |
+
<li>Load data from a Parquet file into a DuckDB table.</li>
|
243 |
+
<li>Seamlessly analyze and visualize the data using SQL and Python.</li>
|
244 |
+
</ul>
|
245 |
+
|
246 |
+
DuckDB's native Parquet support makes it a powerful tool for interactive data analysis on large datasets without complex ETL pipelines.
|
247 |
+
"""
|
248 |
)
|
249 |
return
|
250 |
|
|
|
259 |
@app.cell
|
260 |
def _():
|
261 |
import pyarrow
|
262 |
+
return
|
263 |
|
264 |
|
265 |
if __name__ == "__main__":
|
266 |
app.run()
|
|