thliang01 committed on
Commit
5d24bcb
·
unverified ·
1 Parent(s): ef43da6

feat: Expand Apache Arrow tutorial with advanced examples and performance benchmarks

Browse files

- Add comprehensive examples for converting between DuckDB, Arrow, and Polars/Pandas DataFrames
- Add advanced multi-source data joining example combining DuckDB tables, Polars DataFrames, and Pandas DataFrames
- Include performance demonstration with 1M row dataset showcasing zero-copy benefits
- Enhance documentation with detailed explanations of Arrow's columnar format advantages
- Demonstrate Arrow-based conversions using .to_arrow(), pl.from_arrow(), and .to_pandas() methods (zero-copy where the column types allow it)
- Improve code organization with hidden cells for better notebook readability
- Include timing measurements to demonstrate query performance on large datasets
- Expand summary section highlighting key learning outcomes

This enhancement provides users with more comprehensive examples of Apache Arrow's
capabilities, including real-world scenarios for combining heterogeneous data sources
and quantifiable performance benefits of the zero-copy architecture.

duckdb/011_working_with_apache_arrow.py CHANGED
@@ -41,6 +41,8 @@ def _(mo):
41
  - Create an Arrow table from a DuckDB query.
42
  - Load an Arrow table into DuckDB.
43
  - Convert between DuckDB, Arrow, and Polars/Pandas DataFrames.
 
 
44
  """
45
  )
46
  return
@@ -153,39 +155,237 @@ def _(mo, new_data):
153
  )
154
  return
155
 
156
- # Working in Interoperability with Polars and Pandas
157
 
158
- # @app.cell(hide_code=True)
159
- # def _(mo):
160
- # mo.md(
161
- # r"""
162
- # ## 3. Interoperability with Polars and Pandas
163
 
164
- # The real power of DuckDB's Arrow integration comes from its seamless interoperability with data frame libraries like Polars and Pandas. Because they all share the Arrow in-memory format, conversions are often zero-copy and extremely fast.
165
- # """
166
- # )
167
- # return
168
 
169
 
170
- # @app.cell(hide_code=True)
171
- # def _(mo):
172
- # mo.md(r"### From DuckDB to Polars/Pandas")
173
- # return
174
 
175
 
176
  @app.cell
177
- def _():
178
- import marimo as mo
179
- import plotly.express as px
180
- return mo, px
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
 
183
  @app.cell
184
  def _():
 
185
  import pyarrow as pa
186
  import polars as pl
187
  import pandas as pd
188
- return
189
 
190
 
191
  if __name__ == "__main__":
 
41
  - Create an Arrow table from a DuckDB query.
42
  - Load an Arrow table into DuckDB.
43
  - Convert between DuckDB, Arrow, and Polars/Pandas DataFrames.
44
+ - Combining data from multiple sources
45
+ - Performance benefits
46
  """
47
  )
48
  return
 
155
  )
156
  return
157
 
 
158
 
159
+ @app.cell(hide_code=True)
160
+ def _(mo):
161
+ mo.md(
162
+ r"""
163
+ ## 3. Convert between DuckDB, Arrow, and Polars/Pandas DataFrames.
164
 
165
+ The real power of DuckDB's Arrow integration comes from its seamless interoperability with data frame libraries like Polars and Pandas. Because they all share the Arrow in-memory format, conversions are often zero-copy and extremely fast.
166
+ """
167
+ )
168
+ return
169
 
170
 
171
+ @app.cell(hide_code=True)
172
+ def _(mo):
173
+ mo.md(r"### From DuckDB to Polars/Pandas")
174
+ return
175
 
176
 
177
  @app.cell
178
+ def _(pl, users_arrow_table):
179
+ # Convert the Arrow table to a Polars DataFrame
180
+ users_polars_df = pl.from_arrow(users_arrow_table)
181
+ users_polars_df
182
+ return (users_polars_df,)
183
+
184
+
185
+ @app.cell
186
+ def _(users_arrow_table):
187
+ # Convert the Arrow table to a Pandas DataFrame
188
+ users_pandas_df = users_arrow_table.to_pandas()
189
+ users_pandas_df
190
+ return (users_pandas_df,)
191
+
192
+
193
+ @app.cell(hide_code=True)
194
+ def _(mo):
195
+ mo.md(r"### From Polars/Pandas to DuckDB")
196
+ return
197
+
198
+
199
+ @app.cell
200
+ def _(pl):
201
+ # Create a Polars DataFrame
202
+ polars_df = pl.DataFrame({
203
+ "product_id": [101, 102, 103],
204
+ "product_name": ["Laptop", "Mouse", "Keyboard"],
205
+ "price": [1200.00, 25.50, 75.00]
206
+ })
207
+ polars_df
208
+ return (polars_df,)
209
+
210
+
211
+ @app.cell(hide_code=True)
212
+ def _(mo):
213
+ mo.md(r"Now we can query this Polars DataFrame directly in DuckDB:")
214
+ return
215
+
216
+
217
+ @app.cell
218
+ def _(mo, polars_df):
219
+ # Query the Polars DataFrame directly in DuckDB
220
+ mo.sql(
221
+ f"""
222
+ SELECT product_name, price
223
+ FROM polars_df
224
+ WHERE price > 50
225
+ ORDER BY price DESC;
226
+ """
227
+ )
228
+ return
229
+
230
+
231
+ @app.cell(hide_code=True)
232
+ def _(mo):
233
+ mo.md(r"Similarly, we can query a Pandas DataFrame:")
234
+ return
235
+
236
+
237
+ @app.cell
238
+ def _(pd):
239
+ # Create a Pandas DataFrame
240
+ pandas_df = pd.DataFrame({
241
+ "order_id": [1001, 1002, 1003, 1004],
242
+ "product_id": [101, 102, 103, 101],
243
+ "quantity": [1, 2, 1, 3],
244
+ "order_date": pd.to_datetime(['2024-01-15', '2024-01-16', '2024-01-16', '2024-01-17'])
245
+ })
246
+ pandas_df
247
+ return (pandas_df,)
248
+
249
+
250
+ @app.cell
251
+ def _(mo, pandas_df):
252
+ # Query the Pandas DataFrame in DuckDB
253
+ mo.sql(
254
+ f"""
255
+ SELECT order_date, SUM(quantity) as total_quantity
256
+ FROM pandas_df
257
+ GROUP BY order_date
258
+ ORDER BY order_date;
259
+ """
260
+ )
261
+ return
262
+
263
+
264
+ @app.cell(hide_code=True)
265
+ def _(mo):
266
+ mo.md(
267
+ r"""
268
+ ## 4. Advanced Example: Combining Multiple Data Sources
269
+
270
+ One of the most powerful features is the ability to join data from different sources (DuckDB tables, Arrow tables, Polars/Pandas DataFrames) in a single query:
271
+ """
272
+ )
273
+ return
274
+
275
+
276
+ @app.cell
277
+ def _(mo, pandas_df, polars_df):
278
+ # Join the DuckDB users table with the Polars products DataFrame and Pandas orders DataFrame
279
+ result = mo.sql(
280
+ f"""
281
+ SELECT
282
+ u.name as customer_name,
283
+ p.product_name,
284
+ o.quantity,
285
+ p.price,
286
+ (o.quantity * p.price) as total_amount
287
+ FROM users u
288
+ CROSS JOIN pandas_df o
289
+ JOIN polars_df p ON o.product_id = p.product_id
290
+ WHERE u.id = 1 -- Just for Alice
291
+ ORDER BY o.order_date;
292
+ """
293
+ )
294
+ result
295
+ return (result,)
296
+
297
+
298
+ @app.cell(hide_code=True)
299
+ def _(mo):
300
+ mo.md(
301
+ r"""
302
+ ## 5. Performance Benefits
303
+
304
+ The Arrow format provides several performance benefits:
305
+
306
+ - **Zero-copy data sharing**: Data can be shared between DuckDB and other Arrow-compatible systems without copying.
307
+ - **Columnar format**: Efficient for analytical queries that typically access a subset of columns.
308
+ - **Type safety**: Arrow's rich type system ensures data types are preserved across systems.
309
+ """
310
+ )
311
+ return
312
+
313
+
314
+ @app.cell(hide_code=True)
315
+ def _(mo):
316
+ mo.md(r"Let's create a larger dataset to demonstrate the performance:")
317
+ return
318
+
319
+
320
+ @app.cell
321
+ def _(pl):
322
+ import time
323
+
324
+ # Create a larger Polars DataFrame
325
+ large_polars_df = pl.DataFrame({
326
+ "id": range(1_000_000),
327
+ "value": pl.Series([i * 2.5 for i in range(1_000_000)]),
328
+ "category": pl.Series([f"cat_{i % 100}" for i in range(1_000_000)])
329
+ })
330
+
331
+ print(f"Created DataFrame with {len(large_polars_df):,} rows")
332
+ return large_polars_df, time
333
+
334
+
335
+ @app.cell
336
+ def _(large_polars_df, mo, time):
337
+ # Time a query on the large DataFrame
338
+ start_time = time.time()
339
+
340
+ result_large = mo.sql(
341
+ f"""
342
+ SELECT
343
+ category,
344
+ COUNT(*) as count,
345
+ AVG(value) as avg_value,
346
+ MIN(value) as min_value,
347
+ MAX(value) as max_value
348
+ FROM large_polars_df
349
+ GROUP BY category
350
+ ORDER BY count DESC
351
+ LIMIT 10;
352
+ """
353
+ )
354
+
355
+ query_time = time.time() - start_time
356
+ print(f"Query completed in {query_time:.3f} seconds")
357
+
358
+ result_large
359
+ return query_time, result_large, start_time
360
+
361
+
362
+ @app.cell(hide_code=True)
363
+ def _(mo):
364
+ mo.md(
365
+ r"""
366
+ ## Summary
367
+
368
+ In this notebook, we've explored:
369
+
370
+ 1. **Creating Arrow tables from DuckDB queries** using `.to_arrow()`
371
+ 2. **Loading Arrow tables into DuckDB** and querying them directly
372
+ 3. **Converting between DuckDB, Arrow, Polars, and Pandas** with zero-copy operations
373
+ 4. **Combining data from multiple sources** in a single SQL query
374
+ 5. **Performance benefits** of using Arrow's columnar format
375
+
376
+ The seamless integration between DuckDB and Arrow-compatible systems makes it easy to work with data across different tools while maintaining high performance.
377
+ """
378
+ )
379
+ return
380
 
381
 
382
  @app.cell
383
  def _():
384
+ import marimo as mo
385
  import pyarrow as pa
386
  import polars as pl
387
  import pandas as pd
388
+ return mo, pa, pd, pl
389
 
390
 
391
  if __name__ == "__main__":