File size: 4,468 Bytes
8e66b23
 
ec53722
8e66b23
ec53722
 
8e66b23
ec53722
8e66b23
ec53722
8e66b23
 
 
 
ec53722
 
 
 
 
 
8e66b23
 
 
ec53722
 
 
 
8e66b23
ec53722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e66b23
ec53722
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e66b23
ec53722
8e66b23
ec53722
 
8e66b23
 
ec53722
8e66b23
 
ec53722
8e66b23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec53722
8e66b23
 
ec53722
 
8e66b23
 
ec53722
 
 
8e66b23
ec53722
 
 
8e66b23
ec53722
8e66b23
ec53722
8e66b23
 
ec53722
8e66b23
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import logging
import pathlib
from typing import List, Optional

from rdkit import Chem
from tqdm import tqdm
import gradio as gr
from submission import submission
import pandas as pd
from configuration import GENE_EXPRESSION_METADATA

# Module-level logger with a no-op handler attached.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Short display labels for verbose tissue-site names. Lookups elsewhere in
# this file use ``site_mapper.get(x, x)``, so unlisted sites stay unchanged.
site_mapper = dict(
    central_nervous_system="CNS",
    haematopoietic_and_lymphoid_tissue="Haema_lymph",
    upper_aerodigestive_tract="digestive",
    autonomic_ganglia="ganglia",
)


def run_inference(
    smiles: Optional[str],
    smiles_path: Optional[str],
    omic_path: Optional[str],
    confidence: bool,
):
    """Predict log-micromolar IC50 values for one or several SMILES.

    Args:
        smiles: A single SMILES string. When it is non-empty it takes
            precedence over ``smiles_path``. An empty or whitespace-only
            string (e.g. an untouched text box) is treated as "not given".
        smiles_path: Path to a tab-separated file without header row whose
            first column holds one SMILES per line. Only read when ``smiles``
            is not given.
        omic_path: Optional path to an omics file; forwarded unchanged to
            ``submission`` as ``omics_file``. When ``None`` the predictions
            are joined with ``GENE_EXPRESSION_METADATA`` for display.
        confidence: Forwarded to ``submission`` as ``estimate_confidence``.
            When true, aleatoric and epistemic confidence columns are added
            for every SMILES.

    Returns:
        A 2-tuple holding the same ``pandas.DataFrame`` twice (one value per
        output component of the interface). Prediction columns are named
        ``IC50_<smiles>``, ``aleatoric_confidence_<smiles>`` and
        ``epistemic_confidence_<smiles>``; values are rounded to 3 decimals.

    Raises:
        TypeError: If ``smiles_path`` or ``omic_path`` is neither ``None``
            nor ``str``, or if neither a SMILES nor a SMILES file is given.
        ValueError: If RDKit cannot parse one of the SMILES.
    """
    # Read SMILES
    if not isinstance(smiles_path, (str, type(None))):
        raise TypeError(
            f"SMILES file pass has to be None or str, not {type(smiles_path)}"
        )

    # An empty text box must not shadow an uploaded SMILES file.
    if isinstance(smiles, str) and not smiles.strip():
        smiles = None

    if smiles is not None:
        smiles_list = [smiles]
    elif smiles_path is not None:
        # ``header=None``: the file has no header row (``header=False`` is
        # rejected by pandas).
        smiles_data = pd.read_csv(smiles_path, sep="\t", header=None)
        smiles_list = smiles_data[0].tolist()
    else:
        raise TypeError("Pass either single SMILES or a file")

    # Validate every molecule, regardless of how it was passed in.
    for smi in smiles_list:
        if Chem.MolFromSmiles(smi) is None:
            raise ValueError(f"Found invalid SMILES {smi}")

    # Read omics and otherwise load baseline
    if not isinstance(omic_path, (str, type(None))):
        raise TypeError(f"Omics file pass has to be None or str, not {type(omic_path)}")

    # TODO: Add progress bar for multiple smiles (tqdm only reports to the console)
    results = {}
    for smi in tqdm(smiles_list, total=len(smiles_list)):
        result = submission(
            drug={"smiles": smi},
            workspace_id="emulated_workspace_id",
            task_id="emulated_task_id",
            estimate_confidence=confidence,
            omics_file=omic_path,
        )
        # For the moment no attention analysis: only the IC50 predictions and
        # (optionally) the confidence estimates are read from the result.
        results[f"IC50_{smi}"] = result["log_micromolar_IC50"].squeeze().round(3)
        if confidence:
            results[f"aleatoric_confidence_{smi}"] = (
                result["aleatoric_confidence"].squeeze().round(3)
            )
            results[f"epistemic_confidence_{smi}"] = (
                result["epistemic_confidence"].squeeze().round(3)
            )
    logger.debug("Predictions: %s", results)
    predicted_df = pd.DataFrame(results)

    # Prepare DF to visualize
    if omic_path is None:
        # Drop into a new frame: modifying the module-level metadata in place
        # would make every call after the first one fail on the missing
        # columns.
        df = GENE_EXPRESSION_METADATA.drop(
            [
                "histology",
                "cell_line_name",
                "IC50 (min/max scaled)",
                "IC50 (log(μmol))",
            ],
            axis=1,
        )
        df["site"] = df["site"].apply(lambda x: site_mapper.get(x, x))
        df["cell_line"] = df["cell_line"].apply(lambda x: x.split("_")[0])
        result_df = pd.concat(
            [df["cell_line"], predicted_df, df.drop(["cell_line"], axis=1)], axis=1
        )
    else:
        # No metadata is available for user-supplied omics data, so only the
        # predictions are returned.
        result_df = predicted_df

    # NOTE(review): the second value feeds a ``gr.File`` output, which
    # presumably expects a file path rather than a DataFrame - confirm against
    # the Gradio version in use before changing the return type.
    return result_df, result_df


if __name__ == "__main__":

    # Static assets are shipped in ``model_cards`` next to this script.
    cards_dir = pathlib.Path(__file__).parent / "model_cards"

    # Pipe-separated example rows without header; loaded here although the
    # ``examples`` argument of the interface below is currently commented out.
    examples = pd.read_csv(
        cards_dir / "examples.csv", header=None, sep="|"
    ).fillna("")

    # Markdown shown around the demo.
    article = (cards_dir / "article.md").read_text()
    description = (cards_dir / "description.md").read_text()

    # Input widgets, in the positional order of ``run_inference``'s parameters.
    smiles_box = gr.Textbox(
        label="SMILES",
        placeholder="COc1cc(O)c2c(c1)C=CCC(O)C(O)C(=O)C=CCC(C)OC2=O",
        lines=1,
    )
    smiles_file = gr.File(
        file_types=[".smi", ".tsv"],
        label="List of SMILES (tab-separated file with SMILES in first column)",
    )
    omics_file = gr.File(
        file_types=[".csv"],
        label="Transcriptomics data with cell lines in rows and genes in columns",
    )
    confidence_choice = gr.Radio(
        choices=[True, False], label="Estimate confidence", value=False
    )

    demo = gr.Interface(
        fn=run_inference,
        title="PaccMann",
        inputs=[smiles_box, smiles_file, omics_file, confidence_choice],
        outputs=[gr.DataFrame(label="Output"), gr.File()],
        article=article,
        description=description,
        # examples=examples.values.tolist(),
    )
    demo.launch(debug=True, show_error=True)