"""BioNeMo related operations | |
The intention is to showcase how BioNeMo can be integrated with LynxKite. This should be | |
considered as a reference implementation and not a production ready code. | |
The operations are quite specific for this example notebook: | |
https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/examples/bionemo-geneformer/geneformer-celltype-classification.ipynb | |
""" | |
from lynxkite.core import ops
import requests
import tarfile
import os
from collections import Counter
from . import core
import joblib
import numpy as np
import torch
from pathlib import Path
import random
from contextlib import contextmanager
import cellxgene_census  # TODO: This needs numpy < 2
import tempfile
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_validate
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from bionemo.scdl.io.single_cell_collection import SingleCellCollection
import scanpy

mem = joblib.Memory(".joblib-cache")
# `op` registers the functions below as LynxKite operations in this environment.
op = ops.op_registration(core.ENV)
DATA_PATH = Path("/workspace")

@contextmanager
def random_seed(seed: int):
    """Context manager that seeds the `random` module and restores the previous RNG state on exit."""
    state = random.getstate()
    random.seed(seed)
    try:
        yield
    finally:
        # Go back to the previous state.
        random.setstate(state)
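
# A minimal sketch of the intended use: draws inside the block are reproducible,
# and the global RNG continues undisturbed afterwards.
#
#     with random_seed(32):
#         reproducible = random.random()
#     unaffected = random.random()  # continues from the pre-existing RNG state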

def download_cellxgene_dataset(
    *,
    save_path: str,
    census_version: str = "2023-12-15",
    organism: str = "Homo sapiens",
    value_filter='dataset_id=="8e47ed12-c658-4252-b126-381df8d52a3d"',
    max_workers: int = 1,
    use_mp: bool = False,
) -> Path:
    """Downloads a CELLxGENE dataset and converts it to the SCDL format BioNeMo expects."""
    with cellxgene_census.open_soma(census_version=census_version) as census:
        adata = cellxgene_census.get_anndata(
            census,
            organism,
            obs_value_filter=value_filter,
        )
    with random_seed(32):
        indices = list(range(len(adata)))
        random.shuffle(indices)
    micro_batch_size: int = 32
    num_steps: int = 256
    selection = sorted(indices[: micro_batch_size * num_steps])
    # NOTE: there's a current constraint that predict_step needs to be a function of micro-batch-size.
    # This is something we are working on fixing. A quick hack is to set micro-batch-size=1, but this
    # is slow. In this notebook we are going to use mbs=32 and subsample the AnnData.
    adata = adata[selection].copy()  # so it's not a view
    h5ad_outfile = DATA_PATH / Path("hs-celltype-bench.h5ad")
    adata.write_h5ad(h5ad_outfile)
    with tempfile.TemporaryDirectory() as temp_dir:
        coll = SingleCellCollection(temp_dir)
        coll.load_h5ad_multi(
            h5ad_outfile.parent, max_workers=max_workers, use_processes=use_mp
        )
        coll.flatten(DATA_PATH / save_path, destroy_on_copy=True)
    return DATA_PATH / save_path
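
# Usage sketch (hypothetical save_path; assumes a writable /workspace volume,
# network access, and that the "2023-12-15" census release is still served):
#
#     dataset_path = download_cellxgene_dataset(save_path="celltype-bench-dataset")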

def import_h5ad(*, file_path: str):
    return scanpy.read_h5ad(DATA_PATH / Path(file_path))

def download_model(*, model_name: str) -> str:
    """Downloads a model."""
    model_download_parameters = {
        "geneformer_100m": {
            "name": "geneformer_100m",
            "version": "2.0",
            "path": "geneformer_106M_240530_nemo2",
        },
        "geneformer_10m": {
            "name": "geneformer_10m",
            "version": "2.0",
            "path": "geneformer_10M_240530_nemo2",
        },
        "geneformer_10m2": {
            "name": "geneformer_10m",
            "version": "2.1",
            "path": "geneformer_10M_241113_nemo2",
        },
    }
    # Define the URL and output file.
    url_template = "https://api.ngc.nvidia.com/v2/models/org/nvidia/team/clara/{name}/{version}/files?redirect=true&path={path}.tar.gz"
    url = url_template.format(**model_download_parameters[model_name])
    model_filename = f"{DATA_PATH}/{model_download_parameters[model_name]['path']}"
    output_file = f"{model_filename}.tar.gz"
    # Send the request.
    response = requests.get(url, allow_redirects=True, stream=True)
    response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx).
    # Save the file to disk.
    with open(output_file, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
    # Extract the tar.gz file.
    os.makedirs(model_filename, exist_ok=True)
    with tarfile.open(output_file, "r:gz") as tar:
        tar.extractall(path=model_filename)
    return model_filename
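
# Usage sketch: the keys of `model_download_parameters` above are the accepted
# model names (requires network access to NGC):
#
#     model_10m_path = download_model(model_name="geneformer_10m")
#     model_100m_path = download_model(model_name="geneformer_100m")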

def infer(
    dataset_path: str, model_path: str | None = None, *, results_path: str
) -> Path:
    """Runs Geneformer inference on a dataset and returns the results directory."""
    # This import is slow, so we only import it when we need it.
    from bionemo.geneformer.scripts.infer_geneformer import infer_model

    infer_model(
        data_path=dataset_path,
        checkpoint_path=model_path,
        results_path=DATA_PATH / results_path,
        include_hiddens=False,
        micro_batch_size=32,
        include_embeddings=True,
        include_logits=False,
        seq_length=2048,
        precision="bf16-mixed",
        devices=1,
        num_nodes=1,
        num_dataset_workers=10,
    )
    return DATA_PATH / results_path
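
# Usage sketch (hypothetical paths; devices=1 above assumes a single GPU):
#
#     results_dir = infer(str(dataset_path), model_10m_path, results_path="results-10m")
#     embeddings = load_results(results_dir)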

def load_results(results_path: str):
    """Loads the embeddings produced by `infer` as a NumPy array."""
    embeddings = (
        torch.load(f"{results_path}/predictions__rank_0.pt")["embeddings"]
        .float()
        .cpu()
        .numpy()
    )
    return embeddings

def get_labels(adata):
    """Encodes the cell type labels as integers, keeping the encoder for later decoding."""
    infer_metadata = adata.obs
    labels = infer_metadata["cell_type"].values
    label_encoder = LabelEncoder()
    integer_labels = label_encoder.fit_transform(labels)
    # Stash the encoded labels on the encoder so downstream ops get both in one object.
    label_encoder.integer_labels = integer_labels
    return label_encoder

def plot_labels(adata):
    """Builds an ECharts bar chart of the cell type counts in the dataset."""
    infer_metadata = adata.obs
    labels = infer_metadata["cell_type"].values
    label_counts = Counter(labels)
    labels = list(label_counts.keys())
    values = list(label_counts.values())
    options = {
        "title": {
            "text": "Cell type counts for classification dataset",
            "left": "center",
        },
        "tooltip": {"trigger": "axis", "axisPointer": {"type": "shadow"}},
        "xAxis": {
            "type": "category",
            "data": labels,
            "axisLabel": {"rotate": 45, "align": "right"},
        },
        "yAxis": {"type": "value"},
        "series": [
            {
                "name": "Count",
                "type": "bar",
                "data": values,
                "itemStyle": {"color": "#4285F4"},
            }
        ],
    }
    return options

def run_benchmark(data, labels, *, use_pca: bool = False):
    """Cross-validates a cell type classifier on the given features.

    data - the single cell expression (or any other feature) for each cell, shape (R, C).
    labels - a fitted LabelEncoder (see `get_labels`) carrying the integer label
             for each cell, shape (R,).
    """
    np.random.seed(1337)
    # Define the target dimension 'n_components'.
    n_components = 10  # for example, adjust based on your specific needs
    # Create a pipeline with an optional PCA projection in front of a RandomForestClassifier.
    if use_pca:
        pipeline = Pipeline(
            [
                ("projection", PCA(n_components=n_components)),
                ("classifier", RandomForestClassifier(class_weight="balanced")),
            ]
        )
    else:
        pipeline = Pipeline(
            [("classifier", RandomForestClassifier(class_weight="balanced"))]
        )
    # Set up StratifiedKFold to ensure each fold reflects the overall distribution of labels.
    cv = StratifiedKFold(n_splits=5)
    # Define the scoring functions.
    scoring = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(
            precision_score, average="macro"
        ),  # 'macro' averages over classes
        "recall": make_scorer(recall_score, average="macro"),
        "f1_score": make_scorer(f1_score, average="macro"),
        # roc_auc_score needs class probabilities for the multiclass ("ovr") case.
        # `response_method` requires scikit-learn >= 1.4; older versions use needs_proba=True.
        "roc_auc": make_scorer(
            roc_auc_score, multi_class="ovr", response_method="predict_proba"
        ),
    }
    labels = labels.integer_labels
    # Perform stratified cross-validation with multiple metrics using the pipeline.
    results = cross_validate(
        pipeline, data, labels, cv=cv, scoring=scoring, return_train_score=False
    )
    # Print the cross-validation results.
    print("Cross-validation metrics:")
    results_out = {}
    for metric, scores in results.items():
        if metric.startswith("test_"):
            results_out[metric] = (scores.mean(), scores.std())
            print(f"{metric[5:]}: {scores.mean():.3f} (+/- {scores.std():.3f})")
    predictions = cross_val_predict(pipeline, data, labels, cv=cv)
    # Return the per-metric (mean, std) pairs and the confusion matrix.
    conf_matrix = confusion_matrix(labels, predictions)
    return results_out, conf_matrix
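
# Usage sketch, mirroring the reference notebook: compare a PCA baseline on raw
# expression with the Geneformer embeddings (variable names are hypothetical;
# densify adata.X first if it is sparse):
#
#     encoder = get_labels(adata)
#     baseline = run_benchmark(adata.X.toarray(), encoder, use_pca=True)
#     geneformer = run_benchmark(load_results(results_dir), encoder, use_pca=False)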

def plot_confusion_matrix(benchmark_output, labels):
    """Builds an ECharts heatmap of the row-normalized confusion matrix."""
    cm = benchmark_output[1]
    labels = labels.classes_
    str_labels = [str(label) for label in labels]
    norm_cm = [[float(val / sum(row)) if sum(row) else 0 for val in row] for row in cm]
    # The heatmap has (0, 0) in the bottom left corner, so flip the rows.
    num_rows = len(str_labels)
    heatmap_data = [
        [j, num_rows - i - 1, norm_cm[i][j]]
        for i in range(len(labels))
        for j in range(len(labels))
    ]
    options = {
        "title": {"text": "Confusion Matrix", "left": "center"},
        "tooltip": {"position": "top"},
        "xAxis": {
            "type": "category",
            "data": str_labels,
            "splitArea": {"show": True},
            "axisLabel": {"rotate": 70, "align": "right"},
        },
        "yAxis": {
            "type": "category",
            "data": list(reversed(str_labels)),
            "splitArea": {"show": True},
        },
        "grid": {
            "height": "70%",
            "width": "70%",
            "left": "20%",
            "right": "10%",
            "bottom": "10%",
            "top": "10%",
        },
        "visualMap": {
            "min": 0,
            "max": 1,
            "calculable": True,
            "orient": "vertical",
            "right": 10,
            "top": "center",
            "inRange": {
                "color": ["#E0F7FA", "#81D4FA", "#29B6F6", "#0288D1", "#01579B"]
            },
        },
        "series": [
            {
                "name": "Confusion matrix",
                "type": "heatmap",
                "data": heatmap_data,
                "emphasis": {"itemStyle": {"borderColor": "#333", "borderWidth": 1}},
                "itemStyle": {"borderColor": "#D3D3D3", "borderWidth": 2},
            }
        ],
    }
    return options

def accuracy_comparison(benchmark_output10m, benchmark_output100m):
    """Builds an ECharts bar chart comparing the accuracy of the two models."""
    results_10m = benchmark_output10m[0]
    results_106M = benchmark_output100m[0]
    data = {
        "model": ["10M parameters", "106M parameters"],
        "accuracy_mean": [
            results_10m["test_accuracy"][0],
            results_106M["test_accuracy"][0],
        ],
        "accuracy_std": [
            results_10m["test_accuracy"][1],
            results_106M["test_accuracy"][1],
        ],
    }
    labels = data["model"]  # X-axis labels
    values = data["accuracy_mean"]  # Y-axis values
    error_bars = data["accuracy_std"]  # Standard deviation for error bars
    options = {
        "title": {
            "text": "Accuracy Comparison",
            "left": "center",
            "textStyle": {
                "fontSize": 20,  # Bigger font for the title.
                "fontWeight": "bold",
            },
        },
        "grid": {
            "height": "70%",
            "width": "70%",
            "left": "20%",
            "right": "10%",
            "bottom": "10%",
            "top": "10%",
        },
        "tooltip": {"trigger": "axis", "axisPointer": {"type": "shadow"}},
        "xAxis": {
            "type": "category",
            "data": labels,
            "axisLabel": {
                "rotate": 45,  # Rotate labels for better readability.
                "align": "right",
                "textStyle": {
                    "fontSize": 14,  # Bigger font for X-axis labels.
                    "fontWeight": "bold",
                },
            },
        },
        "yAxis": {
            "type": "value",
            "name": "Accuracy",
            "min": 0,
            "max": 1,
            "interval": 0.1,
            "axisLabel": {
                "textStyle": {
                    "fontSize": 14,  # Bigger font for Y-axis labels.
                    "fontWeight": "bold",
                }
            },
        },
        "series": [
            {
                "name": "Accuracy",
                "type": "bar",
                "data": values,
                "itemStyle": {
                    "color": "#440154"  # Viridis color palette (dark purple).
                },
            },
            # NOTE: ECharts has no built-in "errorbar" series type; rendering these
            # whiskers properly would need a "custom" series.
            {
                "name": "Error Bars",
                "type": "errorbar",
                "data": [
                    [val - err, val + err] for val, err in zip(values, error_bars)
                ],
                "itemStyle": {"color": "#1f77b4"},
            },
        ],
    }
    return options

def f1_comparison(benchmark_output10m, benchmark_output100m):
    """Builds an ECharts bar chart comparing the macro F1 score of the two models."""
    results_10m = benchmark_output10m[0]
    results_106M = benchmark_output100m[0]
    data = {
        "model": ["10M parameters", "106M parameters"],
        "f1_score_mean": [
            results_10m["test_f1_score"][0],
            results_106M["test_f1_score"][0],
        ],
        "f1_score_std": [
            results_10m["test_f1_score"][1],
            results_106M["test_f1_score"][1],
        ],
    }
    labels = data["model"]  # X-axis labels
    values = data["f1_score_mean"]  # Y-axis values
    error_bars = data["f1_score_std"]  # Standard deviation for error bars
    options = {
        "title": {
            "text": "F1 Score Comparison",
            "left": "center",
            "textStyle": {
                "fontSize": 20,  # Bigger font for the title.
                "fontWeight": "bold",
            },
        },
        "grid": {
            "height": "70%",
            "width": "70%",
            "left": "20%",
            "right": "10%",
            "bottom": "10%",
            "top": "10%",
        },
        "tooltip": {"trigger": "axis", "axisPointer": {"type": "shadow"}},
        "xAxis": {
            "type": "category",
            "data": labels,
            "axisLabel": {
                "rotate": 45,  # Rotate labels for better readability.
                "align": "right",
                "textStyle": {
                    "fontSize": 14,  # Bigger font for X-axis labels.
                    "fontWeight": "bold",
                },
            },
        },
        "yAxis": {
            "type": "value",
            "name": "F1 Score",
            "min": 0,
            "max": 1,
            "interval": 0.1,
            "axisLabel": {
                "textStyle": {
                    "fontSize": 14,  # Bigger font for Y-axis labels.
                    "fontWeight": "bold",
                }
            },
        },
        "series": [
            {
                "name": "F1 Score",
                "type": "bar",
                "data": values,
                "itemStyle": {
                    "color": "#440154"  # Viridis color palette (dark purple).
                },
            },
            # NOTE: ECharts has no built-in "errorbar" series type; rendering these
            # whiskers properly would need a "custom" series.
            {
                "name": "Error Bars",
                "type": "errorbar",
                "data": [
                    [val - err, val + err] for val, err in zip(values, error_bars)
                ],
                "itemStyle": {"color": "#1f77b4"},
            },
        ],
    }
    return options
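
# End-to-end sketch of how these operations chain together, in the order the
# reference notebook uses them. All paths and result names are hypothetical, and
# the infer() steps assume a GPU plus the BioNeMo container environment.
if __name__ == "__main__":
    dataset = download_cellxgene_dataset(save_path="celltype-bench-dataset")
    adata = import_h5ad(file_path="hs-celltype-bench.h5ad")
    encoder = get_labels(adata)
    bench = {}
    for name, results_path in [
        ("geneformer_10m", "results-10m"),
        ("geneformer_100m", "results-100m"),
    ]:
        model_path = download_model(model_name=name)
        results_dir = infer(str(dataset), model_path, results_path=results_path)
        bench[name] = run_benchmark(load_results(results_dir), encoder)
    print(accuracy_comparison(bench["geneformer_10m"], bench["geneformer_100m"]))
    print(f1_comparison(bench["geneformer_10m"], bench["geneformer_100m"]))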