thliang01 committed on
Commit
e64bc6a
·
unverified ·
1 Parent(s): 97902b8

refactor: Improve DuckDB Parquet tutorial notebook

Browse files

- Add author attribution to notebook header
- Add sqlglot dependency for future SQL parsing capabilities
- Use consistent table references via variables instead of string literals
- Remove unused pyarrow import
- Improve markdown formatting for better readability

The notebook now properly references the created airbnb_stock table
through variables, making the code more maintainable and reducing
the risk of typos in table names.

Files changed (1) hide show
  1. duckdb/008_loading_parquet.py +34 -34
duckdb/008_loading_parquet.py CHANGED
@@ -5,6 +5,7 @@
5
  # "duckdb==1.2.1",
6
  # "pyarrow==19.0.1",
7
  # "plotly.express",
 
8
  # ]
9
  # ///
10
 
@@ -16,7 +17,13 @@ app = marimo.App(width="medium")
16
 
17
  @app.cell(hide_code=True)
18
  def _(mo):
19
- mo.md(r"""# Loading Parquet files with DuckDB""")
 
 
 
 
 
 
20
  return
21
 
22
 
@@ -39,10 +46,11 @@ def _(mo):
39
  )
40
  return
41
 
 
42
  @app.cell
43
  def _():
44
  AIRBNB_URL = 'https://huggingface.co/datasets/BatteRaquette58/airbnb-stock-price/resolve/main/data/airbnb-stock.parquet'
45
- return AIRBNB_URL,
46
 
47
 
48
  @app.cell(hide_code=True)
@@ -64,7 +72,7 @@ def _(mo):
64
 
65
 
66
  @app.cell
67
- def _(AIRBNB_URL, mo):
68
  mo.sql(
69
  f"""
70
  SELECT *
@@ -86,8 +94,8 @@ def _(mo):
86
  mo.md(
87
  r"""
88
  For more control, you can use the `read_parquet` table function. This is useful when you need to specify options, for example, when dealing with multiple files or specific data types.
89
-
90
  Some useful options for `read_parquet` include:
 
91
  - `binary_as_string=True`: Reads `BINARY` columns as `VARCHAR`.
92
  - `filename=True`: Adds a `filename` column with the path of the file for each row.
93
  - `hive_partitioning=True`: Enables reading of Hive-partitioned datasets.
@@ -148,23 +156,23 @@ def _(AIRBNB_URL, mo):
148
  SELECT * FROM read_parquet('{AIRBNB_URL}');
149
  """
150
  )
151
- return stock_table,
152
 
153
 
154
- @app.cell
155
  def _(mo, stock_table):
156
  mo.md(
157
  f"""
158
- {stock_table}
159
 
160
- Now that the `airbnb_stock` table is created, we can query it like any other SQL table.
161
- """
162
  )
163
  return
164
 
165
 
166
  @app.cell
167
- def _(mo):
168
  mo.sql(
169
  f"""
170
  SELECT * FROM airbnb_stock LIMIT 5;
@@ -181,15 +189,12 @@ def _(mo):
181
 
182
  @app.cell(hide_code=True)
183
  def _(mo):
184
- mo.md(
185
- r"""
186
- Let's perform a simple analysis: plotting the closing stock price over time.
187
- """
188
- )
189
  return
190
 
 
191
  @app.cell
192
- def _(mo):
193
  stock_data = mo.sql(
194
  f"""
195
  SELECT
@@ -199,16 +204,12 @@ def _(mo):
199
  ORDER BY "Date";
200
  """
201
  )
202
- return stock_data,
203
 
204
 
205
  @app.cell(hide_code=True)
206
  def _(mo):
207
- mo.md(
208
- r"""
209
- Now we can easily visualize this result using marimo's integration with plotting libraries like Plotly.
210
- """
211
- )
212
  return
213
 
214
 
@@ -234,16 +235,16 @@ def _(mo):
234
  def _(mo):
235
  mo.md(
236
  r"""
237
- In this notebook, we've seen how easy it is to work with Parquet files in DuckDB. We learned how to:
238
- <ul>
239
- <li>Query Parquet files directly from a URL using a simple `FROM` clause.</li>
240
- <li>Use the `read_parquet` function for more fine-grained control and efficiency.</li>
241
- <li>Load data from a Parquet file into a DuckDB table.</li>
242
- <li>Seamlessly analyze and visualize the data using SQL and Python.</li>
243
- </ul>
244
-
245
- DuckDB's native Parquet support makes it a powerful tool for interactive data analysis on large datasets without complex ETL pipelines.
246
- """
247
  )
248
  return
249
 
@@ -258,9 +259,8 @@ def _():
258
  @app.cell
259
  def _():
260
  import pyarrow
261
- return pyarrow,
262
 
263
 
264
  if __name__ == "__main__":
265
  app.run()
266
-
 
5
  # "duckdb==1.2.1",
6
  # "pyarrow==19.0.1",
7
  # "plotly.express",
8
+ # "sqlglot==27.0.0",
9
  # ]
10
  # ///
11
 
 
17
 
18
  @app.cell(hide_code=True)
19
  def _(mo):
20
+ mo.md(
21
+ r"""
22
+ # Loading Parquet files with DuckDB
23
+ *By [Thomas Liang](https://github.com/thliang01)*
24
+ #
25
+ """
26
+ )
27
  return
28
 
29
 
 
46
  )
47
  return
48
 
49
+
50
  @app.cell
51
  def _():
52
  AIRBNB_URL = 'https://huggingface.co/datasets/BatteRaquette58/airbnb-stock-price/resolve/main/data/airbnb-stock.parquet'
53
+ return (AIRBNB_URL,)
54
 
55
 
56
  @app.cell(hide_code=True)
 
72
 
73
 
74
  @app.cell
75
+ def _(AIRBNB_URL, mo, null):
76
  mo.sql(
77
  f"""
78
  SELECT *
 
94
  mo.md(
95
  r"""
96
  For more control, you can use the `read_parquet` table function. This is useful when you need to specify options, for example, when dealing with multiple files or specific data types.
 
97
  Some useful options for `read_parquet` include:
98
+
99
  - `binary_as_string=True`: Reads `BINARY` columns as `VARCHAR`.
100
  - `filename=True`: Adds a `filename` column with the path of the file for each row.
101
  - `hive_partitioning=True`: Enables reading of Hive-partitioned datasets.
 
156
  SELECT * FROM read_parquet('{AIRBNB_URL}');
157
  """
158
  )
159
+ return airbnb_stock, stock_table
160
 
161
 
162
+ @app.cell(hide_code=True)
163
  def _(mo, stock_table):
164
  mo.md(
165
  f"""
166
+ {stock_table}
167
 
168
+ Now that the `airbnb_stock` table is created, we can query it like any other SQL table.
169
+ """
170
  )
171
  return
172
 
173
 
174
  @app.cell
175
+ def _(airbnb_stock, mo):
176
  mo.sql(
177
  f"""
178
  SELECT * FROM airbnb_stock LIMIT 5;
 
189
 
190
  @app.cell(hide_code=True)
191
  def _(mo):
192
+ mo.md(r"""Let's perform a simple analysis: plotting the closing stock price over time.""")
 
 
 
 
193
  return
194
 
195
+
196
  @app.cell
197
+ def _(airbnb_stock, mo):
198
  stock_data = mo.sql(
199
  f"""
200
  SELECT
 
204
  ORDER BY "Date";
205
  """
206
  )
207
+ return (stock_data,)
208
 
209
 
210
  @app.cell(hide_code=True)
211
  def _(mo):
212
+ mo.md(r"""Now we can easily visualize this result using marimo's integration with plotting libraries like Plotly.""")
 
 
 
 
213
  return
214
 
215
 
 
235
  def _(mo):
236
  mo.md(
237
  r"""
238
+ In this notebook, we've seen how easy it is to work with Parquet files in DuckDB. We learned how to:
239
+ <ul>
240
+ <li>Query Parquet files directly from a URL using a simple `FROM` clause.</li>
241
+ <li>Use the `read_parquet` function for more fine-grained control and efficiency.</li>
242
+ <li>Load data from a Parquet file into a DuckDB table.</li>
243
+ <li>Seamlessly analyze and visualize the data using SQL and Python.</li>
244
+ </ul>
245
+
246
+ DuckDB's native Parquet support makes it a powerful tool for interactive data analysis on large datasets without complex ETL pipelines.
247
+ """
248
  )
249
  return
250
 
 
259
  @app.cell
260
  def _():
261
  import pyarrow
262
+ return
263
 
264
 
265
  if __name__ == "__main__":
266
  app.run()