Spaces:
Sleeping
Sleeping
# ============================================================================= | |
# Marimo Notebook Template: Lazy Load & Interactively View a Hugging Face Parquet Dataset | |
# ============================================================================= | |
# This template demonstrates how to: | |
# • Load a Hugging Face dataset from all directories using a recursive globbing
# pattern for Parquet files (pl.read_parquet reads eagerly; pl.scan_parquet is the lazy alternative).
# • Preview the loaded DataFrame along with metadata using a custom command. | |
# • Provide an interactive button to expand the DataFrame view. | |
# • (Optionally) Read local JSONL files (commented out). | |
# | |
# Note: According to the Polars documentation, you can read multiple files with: | |
# pl.read_parquet("hf://datasets/{username}/{dataset}/{path_to_file}") | |
# and globbing patterns such as "**/*.parquet" work to query all files recursively. | |
# | |
# Install dependencies with: | |
# pip install polars marimo | |
# ============================================================================= | |
import polars as pl | |
import marimo as mo # Marimo provides UI and lazy-loading decorators | |
# ------------------------------------------------------------------------------ | |
# 2. Load the Dataset
# | |
# Use the recursive globbing pattern "**/*.parquet" to read all Parquet files | |
# from all subdirectories on Hugging Face. | |
# ------------------------------------------------------------------------------ | |
# Recursive glob: matches every Parquet file in every subdirectory of the dataset repo.
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"


def load_dataset() -> pl.DataFrame:
    """Read every Parquet file matched by ``dataset_url`` into one DataFrame.

    ``pl.read_parquet`` is eager: the data is fetched and materialized as soon
    as this function is called. For a deferred query, swap in
    ``pl.scan_parquet(dataset_url)`` (which returns a ``LazyFrame``) and call
    ``.collect()`` at the point where the rows are actually needed.

    Returns:
        A DataFrame holding the rows of all matched Parquet files.
    """
    # To read local JSONL files instead, replace the call below with:
    #   pl.read_ndjson("/local/path/to/*.jsonl")
    return pl.read_parquet(dataset_url)


# Load once at module level; the preview and expand sections reuse this DataFrame.
df = load_dataset()
# ------------------------------------------------------------------------------ | |
# 3. Preview the DataFrame | |
# | |
# Define a helper function that previews the first rows of the DataFrame.
# mo.ui.table renders the result as an interactive table.
# ------------------------------------------------------------------------------ | |
def preview_dataframe(df: pl.DataFrame):
    """Return an interactive table of the first rows of ``df``, labelled with its shape.

    ``mo.ui.table`` does not accept a ``metadata`` keyword, so the row and
    column counts of the full DataFrame are passed through the table's
    ``label`` instead; column names appear in the table header.

    Args:
        df: The DataFrame to preview.

    Returns:
        A ``mo.ui.table`` element over ``df.head()``.
    """
    shape_summary = f"Preview of {df.height:,} rows x {df.width} columns"
    return mo.ui.table(df.head(), label=shape_summary)


# Obtain and render the preview (the trailing bare expression is what gets displayed).
preview = preview_dataframe(df)
preview
# ------------------------------------------------------------------------------ | |
# 4. Expand the DataFrame for Better Visualization | |
# | |
# Create an interactive button that, when clicked, renders the full DataFrame | |
# with expanded display options (e.g. full width). | |
# ------------------------------------------------------------------------------ | |
# The button's value starts as False and becomes True on the first click, so
# code that reads ``expand_option.value`` can tell whether it has been pressed.
expand_option = mo.ui.button(
    label="Expand Dataframe",
    value=False,
    on_click=lambda _previous: True,
)


def expand_dataframe():
    """Return an interactive table over the complete DataFrame.

    The table is returned (not just constructed) so that the caller can
    display it. ``mo.ui.table`` takes no ``width``/``height`` keywords, so the
    table uses its default layout.

    Returns:
        A ``mo.ui.table`` element over the module-level ``df``.
    """
    return mo.ui.table(df)


# Render the expand button.
# NOTE(review): in a marimo notebook the full table should be shown from a
# separate cell, e.g. `expand_dataframe() if expand_option.value else None`,
# because a cell only re-runs on clicks of UI elements defined in other cells.
expand_option
# ------------------------------------------------------------------------------ | |
# 5. Commented-Out Formulas for Column Selection | |
# | |
# The following examples (commented out) demonstrate different column selection techniques: | |
# | |
# Example 1: Select specific columns by name: | |
# selected_columns_df = df.select(["column1", "column2"]) | |
# | |
# Example 2: Select all columns except column 'a': | |
# all_except_a_df = df.select(pl.exclude("a")) | |
# | |
# Example 3: Select a range of columns (e.g., from the second to the fourth column): | |
# range_columns_df = df.select(pl.col(df.columns[1:4])) | |
# ------------------------------------------------------------------------------ |