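# Source: marimo / app.py (Hugging Face file viewer export)
# Author avatar: arthrod's picture
# Commit message: Update app.py
# Commit: d398a54 (verified)
# Viewer links: raw
# Viewer links: history blame
# File size: 3.81 kB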
# =============================================================================
# Marimo Notebook Template: Lazy Load & Interactively View a Hugging Face Parquet Dataset
# =============================================================================
# This template demonstrates how to:
# • Lazy load a Hugging Face dataset from all directories using a recursive globbing
# pattern for Parquet files.
# • Preview the loaded DataFrame along with metadata using a custom command.
# • Provide an interactive button to expand the DataFrame view.
# • (Optionally) Read local JSONL files (commented out).
#
# Note: According to the Polars documentation, you can read multiple files with:
# pl.read_parquet("hf://datasets/{username}/{dataset}/{path_to_file}")
# and globbing patterns such as "**/*.parquet" work to query all files recursively.
#
# Install dependencies with:
# pip install polars marimo
# =============================================================================
import polars as pl
import marimo as mo # Marimo provides UI and lazy-loading decorators
# ------------------------------------------------------------------------------
# 2. Load the Dataset
#
# Use the recursive globbing pattern "**/*.parquet" to read all Parquet files
# from all subdirectories of the Hugging Face dataset repository.
# ------------------------------------------------------------------------------
dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"


def load_dataset() -> pl.DataFrame:
    """Read every Parquet file matched by ``dataset_url`` into one DataFrame.

    This function is intentionally NOT decorated with ``mo.lazy``: that helper
    defers the *rendering* of a UI element, so using it as a decorator would
    rebind ``load_dataset`` to a UI element instead of a function and the
    call below could not return data.

    Returns:
        The DataFrame produced by ``pl.read_parquet`` (an eager read).
    """
    # To read local JSONL files instead, swap the return for:
    #     return pl.read_ndjson("/local/path/to/*.jsonl")
    # For a deferred query plan, ``pl.scan_parquet(dataset_url)`` returns a
    # LazyFrame; the rest of this script expects an eager DataFrame.
    return pl.read_parquet(dataset_url)


# The dataset is read eagerly at this point and bound to ``df``.
df = load_dataset()
# ------------------------------------------------------------------------------
# 3. Preview the DataFrame
#
# Define a helper that previews the DataFrame together with basic metadata.
# mo.ui.table renders the rows as an interactive table; the row and column
# counts are shown through the table's label.
# ------------------------------------------------------------------------------
def preview_dataframe(df: pl.DataFrame):
    """Build an interactive table showing the first rows of ``df``.

    ``mo.ui.table`` has no ``metadata`` keyword, so the row/column counts are
    passed through the supported ``label`` argument instead.

    Args:
        df: The DataFrame to preview.

    Returns:
        A ``mo.ui.table`` over ``df.head()`` whose label reports how many rows
        are shown, the total row count, and the column count.
    """
    head = df.head()
    summary = f"Showing {head.height} of {df.height} rows, {df.width} columns"
    return mo.ui.table(head, label=summary)


# Obtain and render the preview.
preview = preview_dataframe(df)
preview
# ------------------------------------------------------------------------------
# 4. Expand the DataFrame for Better Visualization
#
# Create an interactive button that, when clicked, builds a table over the
# full DataFrame. The table becomes the button's value.
# ------------------------------------------------------------------------------
def expand_dataframe(_value=None):
    """Build a table over the complete DataFrame.

    Used as the button's ``on_click`` callback: marimo calls it with the
    button's current value and stores the return value as the new value.

    Args:
        _value: The button's current value (ignored). Optional so the
            function can still be called with no arguments.

    Returns:
        A ``mo.ui.table`` over the module-level ``df``.
    """
    # mo.ui.table accepts no ``width``/``height`` keywords, so none are passed.
    return mo.ui.table(df)


# ``on_click`` is a constructor argument of mo.ui.button, not a decorator on
# the created element, so the callback is defined first and passed in here.
expand_option = mo.ui.button(
    label="Expand Dataframe",
    value=None,
    on_click=expand_dataframe,
)
# Render the expand button. After a click, ``expand_option.value`` holds the
# full table (None before the first click); display it from another cell.
expand_option
# ------------------------------------------------------------------------------
# 5. Commented-Out Formulas for Column Selection
#
# The following examples (commented out) demonstrate different column selection techniques:
#
# Example 1: Select specific columns by name:
# selected_columns_df = df.select(["column1", "column2"])
#
# Example 2: Select all columns except column 'a':
# all_except_a_df = df.select(pl.exclude("a"))
#
# Example 3: Select a range of columns (e.g., from the second to the fourth column):
# range_columns_df = df.select(pl.col(df.columns[1:4]))
# ------------------------------------------------------------------------------