Spaces:

cicero-im
/

marimo

Sleeping

App Files Files Community

marimo / app.py

arthrod

Update app.py

d398a54 verified 5 months ago

raw

history blame

3.81 kB

	# =============================================================================
	# Marimo Notebook Template: Lazy Load & Interactively View a Hugging Face Parquet Dataset
	# =============================================================================
	# This template demonstrates how to:
	# • Lazy load a Hugging Face dataset from all directories using a recursive globbing
	# pattern for Parquet files.
	# • Preview the loaded DataFrame along with metadata using a custom command.
	# • Provide an interactive button to expand the DataFrame view.
	# • (Optionally) Read local JSONL files (commented out).
	#
	# Note: According to the Polars documentation, you can read multiple files with:
	# pl.read_parquet("hf://datasets/{username}/{dataset}/{path_to_file}")
	# and globbing patterns such as "**/*.parquet" work to query all files recursively.
	#
	# Install dependencies with:
	# pip install polars marimo
	# =============================================================================

	import polars as pl
	import marimo as mo # Marimo provides UI and lazy-loading decorators

	# ------------------------------------------------------------------------------
	# 2. Lazy Load the Dataset
	#
	# Use the recursive globbing pattern "**/*.parquet" to read all Parquet files
	# from all subdirectories of the dataset repository on Hugging Face.
	# ("**" descends through directories; "*.parquet" selects the Parquet files.)
	# ------------------------------------------------------------------------------
	dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

	def load_dataset() -> pl.DataFrame:
	    """Read every Parquet file matched by ``dataset_url`` into one DataFrame.

	    The function is intentionally left undecorated: ``mo.lazy`` wraps an
	    element for deferred *rendering* and returns a UI element, so applying it
	    as a decorator would replace this function with a non-callable object and
	    break the ``load_dataset()`` call below.

	    Returns:
	        The DataFrame produced by ``pl.read_parquet`` for all matched files.
	    """
	    # Load all Parquet files matching the glob pattern in `dataset_url`.
	    df = pl.read_parquet(dataset_url)
	    # Uncomment the next line to read local JSONL files instead:
	    # df = pl.read_ndjson("/local/path/to/*.jsonl")
	    return df

	# Load the dataset once at module level; the preview and expand sections
	# below both read this shared `df`.
	df = load_dataset()

	# ------------------------------------------------------------------------------
	# 3. Preview the DataFrame
	#
	# Define a custom command to preview the DataFrame with metadata.
	# mo.ui.table is assumed to render a rich interactive table.
	# ------------------------------------------------------------------------------
	def preview_dataframe(df: pl.DataFrame):
	    """Return an interactive table showing the first rows of ``df``.

	    Args:
	        df: The DataFrame to preview; only ``df.head()`` is passed to the table.

	    Returns:
	        A ``mo.ui.table`` element with per-column summaries enabled.
	    """
	    # `mo.ui.table` has no `metadata` keyword; column summaries are the
	    # supported way to surface per-column metadata alongside the rows.
	    return mo.ui.table(df.head(), show_column_summaries=True)

	# Build the preview table from the shared DataFrame.
	preview = preview_dataframe(df)
	# Bare expression: presumably meant to be displayed as a marimo cell's
	# output — TODO confirm how this file is executed.
	preview

	# ------------------------------------------------------------------------------
	# 4. Expand the DataFrame for Better Visualization
	#
	# Create an interactive button that, when clicked, renders the full DataFrame
	# with expanded display options (e.g. full width).
	# ------------------------------------------------------------------------------
	# The click handler is passed to the constructor: a marimo button takes
	# `on_click` as a callable that receives the current value and returns the
	# new one. Here the value is simply a click counter (0 until first click).
	expand_option = mo.ui.button(
	    label="Expand Dataframe",
	    value=0,
	    on_click=lambda clicks: clicks + 1,
	)


	def expand_dataframe():
	    """Build the full-table view of the loaded dataset.

	    Returns:
	        A paginated ``mo.ui.table`` over the complete ``df``. The table is
	        returned (not discarded) so the caller can display it.
	    """
	    # `mo.ui.table` has no `width`/`height` keywords; pagination keeps the
	    # complete DataFrame browsable.
	    return mo.ui.table(df, pagination=True)


	# To show the full table after a click, evaluate the following in a cell
	# separate from the one that defines the button:
	#     expand_dataframe() if expand_option.value else None

	# Render the expand button.
	expand_option

	# ------------------------------------------------------------------------------
	# 5. Commented-Out Formulas for Column Selection
	#
	# The following examples (commented out) demonstrate different column selection techniques:
	#
	# Example 1: Select specific columns by name:
	# selected_columns_df = df.select(["column1", "column2"])
	#
	# Example 2: Select all columns except column 'a':
	# all_except_a_df = df.select(pl.exclude("a"))
	#
	# Example 3: Select a range of columns (e.g., from the second to the fourth column):
	# range_columns_df = df.select(pl.col(df.columns[1:4]))
	# ------------------------------------------------------------------------------