# paccmann / app.py -- Hugging Face Space source file
# (author: jannisborn; commit 8aab0ae "update", unverified; 4.72 kB)
import logging
import os
import pathlib
import tempfile
from typing import List, Optional
import gradio as gr
import pandas as pd
from rdkit import Chem
from tqdm import tqdm
from configuration import GENE_EXPRESSION_METADATA
from submission import submission
# Module-level logger. The NullHandler means this module emits nothing on its
# own unless the hosting application configures logging.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Short display labels for long tissue-site names. Sites that are not listed
# here are looked up with ``site_mapper.get(x, x)`` and therefore kept as-is.
site_mapper = dict(
    central_nervous_system="CNS",
    haematopoietic_and_lymphoid_tissue="Haema_lymph",
    upper_aerodigestive_tract="digestive",
    autonomic_ganglia="ganglia",
)
def run_inference(
    smiles: Optional[str],
    smiles_path: Optional[str],
    omic_path: Optional[str],
    confidence: bool,
):
    """Run the IC50 prediction for one or several molecules and tabulate it.

    Args:
        smiles: A single SMILES string. A non-blank value takes precedence
            over ``smiles_path``; ``None`` or a blank string means "not given".
        smiles_path: Header-less, tab-separated file whose first column holds
            SMILES strings. Either a filesystem path or an uploaded-file
            object that exposes its path as ``.name``.
        omic_path: Forwarded unchanged to ``submission`` as ``omics_file``.
            Must be ``None`` or ``str``. With ``None`` the predictions are
            joined with ``GENE_EXPRESSION_METADATA``.
        confidence: Forwarded to ``submission`` as ``estimate_confidence``.
            When truthy, aleatoric and epistemic confidence columns are added
            for every molecule.

    Returns:
        A ``(csv_path, preview)`` tuple: the path of the CSV file holding the
        full result table and the first 25 rows of that table.

    Raises:
        TypeError: If neither a SMILES nor a SMILES file is given, or if
            ``omic_path`` is neither ``None`` nor ``str``.
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    # A text input left empty may arrive as "" instead of None. Treat blank
    # text as "not provided" so that it neither shadows an uploaded file nor
    # gets submitted as a molecule.
    if isinstance(smiles, str) and not smiles.strip():
        smiles = None

    # Read SMILES
    if smiles is None and smiles_path is None:
        raise TypeError("Pass either single SMILES or a file")
    if smiles is not None:
        smiles_list = [smiles]
    else:
        # Accept a plain path as well as an upload wrapper carrying `.name`.
        if isinstance(smiles_path, (str, os.PathLike)):
            smiles_file = os.fspath(smiles_path)
        else:
            smiles_file = smiles_path.name
        smiles_data = pd.read_csv(smiles_file, sep="\t", header=None)
        smiles_list = smiles_data[0].tolist()

    # Validate everything up front so no prediction runs on a bad batch.
    for smi in smiles_list:
        if Chem.MolFromSmiles(smi) is None:
            raise ValueError(f"Found invalid SMILES {smi}")

    # Read omics and otherwise load baseline
    # NOTE(review): an uploaded-file wrapper (non-str) is rejected here, while
    # smiles_path accepts one -- confirm what the UI delivers for this input.
    if not isinstance(omic_path, (str, type(None))):
        raise TypeError(f"Omics file pass has to be None or str, not {type(omic_path)}")

    # ToDo: Add progress bar for multiple smiles
    results = {}
    for smi in tqdm(smiles_list, total=len(smiles_list)):
        result = submission(
            drug={"smiles": smi},
            workspace_id="emulated_workspace_id",
            task_id="emulated_task_id",
            estimate_confidence=confidence,
            omics_file=omic_path,
        )

        # For the moment no attention analysis
        result.pop("gene_attention")
        result.pop("smiles_attention", None)
        result.pop("IC50")

        results[f"IC50_{smi}"] = result["log_micromolar_IC50"].squeeze().round(3)
        if confidence:
            results[f"aleatoric_confidence_{smi}"] = (
                result["aleatoric_confidence"].squeeze().round(3)
            )
            # Previously this column duplicated the aleatoric values.
            # Assumes `submission` returns an "epistemic_confidence" entry
            # next to "aleatoric_confidence" -- TODO confirm.
            results[f"epistemic_confidence_{smi}"] = (
                result["epistemic_confidence"].squeeze().round(3)
            )

    predicted_df = pd.DataFrame(results)

    # Prepare DF to visualize
    if omic_path is None:
        df = GENE_EXPRESSION_METADATA.copy()
        df.drop(
            [
                "histology",
                "cell_line_name",
                "IC50 (min/max scaled)",
                "IC50 (log(μmol))",
            ],
            axis=1,
            inplace=True,
        )
        df["site"] = df["site"].apply(lambda x: site_mapper.get(x, x))
        df["cell_line"] = df["cell_line"].apply(lambda x: x.split("_")[0])
        if confidence:
            # The metadata's own confidence columns are removed when fresh
            # per-molecule confidence columns are present.
            df.drop(
                ["aleatoric_confidence", "epistemic_confidence"], axis=1, inplace=True
            )
        result_df = pd.concat(
            [df["cell_line"], predicted_df, df.drop(["cell_line"], axis=1)], axis=1
        )
    else:
        # No metadata table is available for user-supplied omics; return the
        # bare predictions instead of failing on an unbound `df`.
        result_df = predicted_df

    # Save to temporary dir
    temp_path = os.path.join(tempfile.gettempdir(), "paccmann_result.csv")
    result_df.to_csv(temp_path)
    return temp_path, result_df.head(25)
if __name__ == "__main__":
    # Static assets (example molecules and page texts) live in ./model_cards
    # next to this script.
    metadata_root = pathlib.Path(__file__).parent / "model_cards"

    # One row per example, in the order of the four inputs below:
    # SMILES text, SMILES file, omics file, confidence flag.
    examples = [
        ["COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O", None, None, False],
        ["COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4", None, None, True],
        [None, metadata_root / "molecules.smi", None, False],
    ]

    # Markdown shown on the page.
    article = (metadata_root / "article.md").read_text()
    description = (metadata_root / "description.md").read_text()

    # Input widgets, created in the positional order of run_inference's
    # parameters.
    smiles_box = gr.Textbox(
        label="SMILES",
        placeholder="COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O",
        lines=1,
    )
    smiles_upload = gr.File(
        file_types=[".smi", ".tsv"],
        label="Multiple SMILES",
    )
    omics_upload = gr.File(
        file_types=[".csv"],
        label="Transcriptomics data file",
    )
    confidence_choice = gr.Radio(
        choices=[True, False], label="Estimate confidence", value=False
    )

    # Output widgets, matching the (csv_path, preview) pair that
    # run_inference returns.
    download_widget = gr.File(label="Download full results")
    preview_widget = gr.DataFrame(label="Preview of results for 25 cell lines")

    demo = gr.Interface(
        fn=run_inference,
        title="PaccMann",
        inputs=[smiles_box, smiles_upload, omics_upload, confidence_choice],
        outputs=[download_widget, preview_widget],
        article=article,
        description=description,
        examples=examples,
    )
    demo.launch(debug=True, show_error=True)