LTEnjoy committed on
Commit
2bd60c8
·
verified ·
1 Parent(s): 9606143

Delete demo

Browse files
demo/__init__.py DELETED
File without changes
demo/config.yaml DELETED
@@ -1,22 +0,0 @@
1
- model_dir: /data/ProTrek_650M_UniRef50
2
- faiss_config:
3
- IO_FLAG_MMAP: True
4
- sequence_index_dir:
5
- - name: Swiss-Prot
6
- index_dir: /data/ProTrek-faiss-index/ProTrek_650M_UniRef50/Swiss-Prot/sequence
7
- - name: UniRef50
8
- index_dir: /data/ProTrek-faiss-index/ProTrek_650M_UniRef50/UniRef50/sequence
9
- - name: Uncharacterized
10
- index_dir: /data/ProTrek-faiss-index/ProTrek_650M_UniRef50/Uncharacterized/sequence
11
- - name: PDB
12
- index_dir: /data/ProTrek-faiss-index/ProTrek_650M_UniRef50/PDB/sequence
13
-
14
- structure_index_dir:
15
- - name: Swiss-Prot
16
- index_dir: /data/ProTrek-faiss-index/ProTrek_650M_UniRef50/Swiss-Prot/structure
17
- - name: PDB
18
- index_dir: /data/ProTrek-faiss-index/ProTrek_650M_UniRef50/PDB/structure
19
-
20
- text_index_dir:
21
- - name: Swiss-Prot
22
- index_dir: /data/ProTrek-faiss-index/ProTrek_650M_UniRef50/Swiss-Prot/text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/modules/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- import sys
2
-
3
- sys.path += []
4
-
5
- import argparse
6
-
7
-
8
def main():
    """Entry point for the demo module CLI. Currently a no-op placeholder."""
    pass


def get_args():
    """Build the (currently empty) argument parser and parse sys.argv."""
    arg_parser = argparse.ArgumentParser()
    return arg_parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/modules/blocks.py DELETED
@@ -1,66 +0,0 @@
1
- import gradio as gr
2
-
3
- from utils.foldseek_util import get_struc_seq
4
-
5
-
6
####################################################
#                  gradio blocks                   #
####################################################
def upload_pdb_button(visible: bool = True, chain_visible: bool = True):
    """
    Create an upload button for structure files plus a chain-selector textbox.

    Args:
        visible: Whether the upload button is visible.
        chain_visible: Whether the chain textbox is visible.

    Returns:
        Tuple of (upload button, chain textbox) gradio components.
    """
    with gr.Column(scale=0):
        # Which chain should be pulled out of the uploaded structure
        chain_box = gr.Textbox(
            label="Chain (to be extracted from the pdb file)",
            value="A",
            visible=chain_visible,
            interactive=True,
        )
        upload_btn = gr.UploadButton(label="Upload .pdb/.cif file", visible=visible)

    return upload_btn, chain_box
25
-
26
-
27
####################################################
#                Trigger functions                 #
####################################################
def parse_pdb_file(input_type: str, file: str, chain: str) -> str:
    """
    Parse the uploaded structure file with foldseek.

    Args:
        input_type: Type of input. Must be one of ["protein sequence", "protein structure"]

        file: Path to the uploaded file

        chain: Chain to be extracted from the pdb file

    Returns:
        Protein sequence or Foldseek sequence (lower-cased 3Di string).
    """
    try:
        # get_struc_seq returns (aa_seq, foldseek_seq, ...) per chain
        parsed = get_struc_seq("/tmp/foldseek", file, [chain])[chain]
        if input_type == "sequence":
            return parsed[0]
        return parsed[1].lower()

    except Exception as e:
        # Surface any parsing failure to the UI instead of crashing the app
        raise gr.Error(f"{e}")
53
-
54
-
55
def set_upload_visible(visible: bool) -> gr.Interface:
    """
    Toggle the visibility of the upload button.

    Args:
        visible: Whether the block is visible or not.

    Returns:
        A gradio update applying the new visibility.
    """
    return gr.update(visible=visible)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/modules/compute_score.py DELETED
@@ -1,127 +0,0 @@
1
- import gradio as gr
2
- import torch
3
-
4
- from .init_model import model
5
- from .blocks import upload_pdb_button, parse_pdb_file
6
-
7
-
8
- input_types = ["sequence", "structure", "text"]
9
-
10
- input_examples = {
11
- "sequence": [
12
- "MQLQRLGAPLLKRLVGGCIRQSTAPIMPCVVVSGSGGFLTPVRTYMPLPNDQSDFSPYIEIDLPSESRIQSLHKSGLAAQEWVACEKVHGTNFGIYLINQGDHEVVRFAKRSGIMDPNENFFGYHILIDEFTAQIRILNDLLKQKYGLSRVGRLVLNGELFGAKYKHPLVPKSEKWCTLPNGKKFPIAGVQIQREPFPQYSPELHFFAFDIKYSVSGAEEDFVLLGYDEFVEFSSKVPNLLYARALVRGTLDECLAFDVENFMTPLPALLGLGNYPLEGNLAEGVVIRHVRRGDPAVEKHNVSTIIKLRCSSFMELKHPGKQKELKETFIDTVRSGALRRVRGNVTVISDSMLPQVEAAANDLLLNNVSDGRLSNVLSKIGREPLLSGEVSQVDVALMLAKDALKDFLKEVDSLVLNTTLAFRKLLITNVYFESKRLVEQKWKELMQEEAAAQSEAIPPLSPAAPTKGE",
13
- "MSLSTEQMLRDYPRSMQINGQIPKNAIHETYGNDGVDVFIAGSGPIGATYAKLCVEAGLRVVMVEIGAADSFYAVNAEEGTAVPYVPGYHKKNEIEFQKDIDRFVNVIKGALQQVSVPVRNQNVPTLDPGAWSAPPGSSAISNGKNPHQREFENLSAEAVTRGVGGMSTHWTCSTPRIHPPMESLPGIGRPKLSNDPAEDDKEWNELYSEAERLIGTSTKEFDESIRHTLVLRSLQDAYKDRQRIFRPLPLACHRLKNAPEYVEWHSAENLFHSIYNDDKQKKLFTLLTNHRCTRLALTGGYEKKIGAAEVRNLLATRNPSSQLDSYIMAKVYVLASGAIGNPQILYNSGFSGLQVTPRNDSLIPNLGRYITEQPMAFCQIVLRQEFVDSVRDDPYGLPWWKEAVAQHIAKNPTDALPIPFRDPEPQVTTPFTEEHPWHTQIHRDAFSYGAVGPEVDSRVIVDLRWFGATDPEANNLLVFQNDVQDGYSMPQPTFRYRPSTASNVRARKMMADMCEVASNLGGYLPTSPPQFMDPGLALHLAGTTRIGFDKATTVADNNSLVWDFANLYVAGNGTIRTGFGENPTLTSMCHAIKSARSIINTLKGGTDGKNTGEHRNL",
14
- "MGVHECPAWLWLLLSLLSLPLGLPVLGAPPRLICDSRVLERYLLEAKEAENITTGCAEHCSLNENITVPDTKVNFYAWKRMEVGQQAVEVWQGLALLSEAVLRGQALLVNSSQPWEPLQLHVDKAVSGLRSLTTLLRALGAQKEAISPPDAASAAPLRTITADTFRKLFRVYSNFLRGKLKLYTGEACRTGDR"
15
- ],
16
-
17
- "structure": [
18
- "ddddddddddddddddddddddddddddddddpdpddpddpqpdddfddpdqqlddadddfaaddpvqvvlcvvvvvlqakkfkwfdadffkkkwkwadpdpdidifidtnvgtdglqpddllclvcvvlsvqlvvllqvvvcvvvvapafrmkmfiwgkdalddpfppadadpdwhagsvgdidgsvpgdrdddpaqhahsdiaietewiwiarnsdpvriqtafqvvvcvsqvprpphhyidgqfmggnllnlldpqqpaaqlrnqqvvnqvgddpprggqfikmfrrpprppvvcvsvrhgihtdghlvnvcvvdppcsvvcccnrcvprnvvscvvvvndhdtdvlsrhhpvlsvllvqllvlldpvllvvldvvvdlpclqvvvqdllnsllsslvvsvvvsvvpddpvnvpgdpvsvvvssvsssvsssvvsvvcvvvvnvvsvvvvvvvddppdpdddpddd",
19
- "dpdplvvqppdddplqappppfaadpvcvlvdpvaaaeeeeaqallsllllllclvlvgfyeyefqaeqpdwdddpddvpdddftqtqfapcqppvclqpqqvllvvqvvfwdwqeaefdqpppvpddppddhddppdgdddqqhdppfdpqqdlgqatwgghrntcqnhdpqfddawadadpvahqgtfdaldpdpvvrvvlvvvllvvlcvqlvkdqclqvpflqqcllqvllcvvcvvppwhkgggtgswhadpvhsldirhttsssscvvqrvdpssvssydyhyskhqqewhaghdpfgetawtkiarnccvvpvpdrgihigghrfyeypralprvllrcvssvqalqdpggdprhnqdqffalkwfwwkkkfkfffdpvsqvcqcvppppdpssnvqlvvqcvvcvpdpgsgdssrakhfmwtdadpvqqktktwidghhndddddppddpsrmimimiihwafrdrqfgwgfdppgdhpvrttrihtrddgdpvsvvsvvvrlvvsvvssvstgdtdprgpididrrnsvnlieqrqaedddsvngqayqlqhgpsyphygyfdrnhrngigngdcvsvrssssvsnsvvsscvvvvdpdddppdddddd",
20
- "ddppppdcvvvvvvvvvppppppvppldplvvlldvvllvvqlvllvvllvvcvvpdpnfflqdwqkafdlddpvvvvvpddlllllqlllvrlvsllvrlvsslvslvpdpdrdvvnnvssvvlnvssvvvnvssvslvsvvsnppddppprdddgdididrgssvssvsvssnsvgsvvvssvvssvvvvd"
21
- ],
22
-
23
- "text": [
24
- "RNA-editing ligase in kinetoplastid mitochondrial.",
25
- "Oxidase which catalyzes the oxidation of various aldopyranoses and disaccharides.",
26
- "Erythropoietin for regulation of erythrocyte proliferation and differentiation."
27
- ]
28
- }
29
-
30
- samples = [[s1, s2] for s1, s2 in zip(input_examples["sequence"], input_examples["text"])]
31
-
32
-
33
def compute_score(input_type_1: str, input_1: str, input_type_2: str, input_2: str):
    """
    Compute the ProTrek similarity score between two inputs of given modalities.

    Args:
        input_type_1: Modality of the first input ("sequence", "structure" or "text").
        input_1: First input value.
        input_type_2: Modality of the second input.
        input_2: Second input value.

    Returns:
        Similarity score formatted to four decimal places.
    """
    with torch.no_grad():
        reprs = []
        for modality, value in ((input_type_1, input_1), (input_type_2, input_2)):
            if modality == "sequence":
                reprs.append(model.get_protein_repr([value]))
            elif modality == "structure":
                reprs.append(model.get_structure_repr([value]))
            else:
                reprs.append(model.get_text_repr([value]))

        # Temperature-scaled dot product between the two embeddings
        score = reprs[0] @ reprs[1].T / model.temperature

    return f"{score.item():.4f}"
50
-
51
-
52
def change_input_type(choice_1: str, choice_2: str):
    """Refresh the example pairs and upload-button visibility after a type change."""
    # Rebuild the shared example list for the newly selected modalities
    global samples
    samples = [list(pair) for pair in zip(input_examples[choice_1], input_examples[choice_2])]

    # Upload buttons only make sense for non-text inputs
    visible_1 = choice_1 != "text"
    visible_2 = choice_2 != "text"

    return (
        gr.update(samples=samples),
        "",
        "",
        gr.update(visible=visible_1),
        gr.update(visible=visible_1),
        gr.update(visible=visible_2),
        gr.update(visible=visible_2),
    )
73
-
74
-
75
# Load example from dataset
def load_example(example_id):
    """Return the example pair stored at *example_id* in the current samples."""
    return samples[example_id]
78
-
79
-
80
# Build the block for computing similarity between two modalities
def build_score_computation():
    """Assemble the gradio UI for pairwise similarity scoring."""
    gr.Markdown("# Compute similarity score between two modalities")
    with gr.Row(equal_height=True):
        with gr.Column():
            # First input and its modality selector
            with gr.Row():
                input_1 = gr.Textbox(label="Input 1")
                input_type_1 = gr.Dropdown(input_types, label="Input type", value="sequence",
                                           interactive=True, visible=True)

                # Structure upload for input 1
                upload_btn_1, chain_box_1 = upload_pdb_button(visible=True)
                upload_btn_1.upload(parse_pdb_file,
                                    inputs=[input_type_1, upload_btn_1, chain_box_1],
                                    outputs=[input_1])

            # Second input and its modality selector
            with gr.Row():
                input_2 = gr.Textbox(label="Input 2")
                input_type_2 = gr.Dropdown(input_types, label="Input type", value="text",
                                           interactive=True, visible=True)

                # Structure upload for input 2 (hidden while the default type is "text")
                upload_btn_2, chain_box_2 = upload_pdb_button(visible=False)
                upload_btn_2.upload(parse_pdb_file,
                                    inputs=[input_type_2, upload_btn_2, chain_box_2],
                                    outputs=[input_2])

            # Clickable examples
            examples = gr.Dataset(samples=samples, type="index",
                                  components=[input_1, input_2], label="Input examples")
            examples.click(fn=load_example, inputs=[examples], outputs=[input_1, input_2])

            compute_btn = gr.Button(value="Compute")

            # Swapping a modality refreshes examples and upload-button visibility
            type_change_outputs = [examples, input_1, input_2, upload_btn_1, chain_box_1,
                                   upload_btn_2, chain_box_2]
            input_type_1.change(fn=change_input_type, inputs=[input_type_1, input_type_2],
                                outputs=type_change_outputs)
            input_type_2.change(fn=change_input_type, inputs=[input_type_1, input_type_2],
                                outputs=type_change_outputs)

        similarity_score = gr.Label(label="similarity score")
        compute_btn.click(fn=compute_score,
                          inputs=[input_type_1, input_1, input_type_2, input_2],
                          outputs=[similarity_score])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/modules/init_model.py DELETED
@@ -1,118 +0,0 @@
1
- import faiss
2
- import numpy as np
3
- import pandas as pd
4
- import os
5
- import yaml
6
- import glob
7
-
8
- from easydict import EasyDict
9
- from utils.constants import sequence_level
10
- from model.ProTrek.protrek_trimodal_model import ProTrekTrimodalModel
11
- from tqdm import tqdm
12
-
13
# Debug aid: show what is mounted under /data. Guarded so a missing mount
# does not crash the whole module at import time.
if os.path.isdir("/data"):
    print(os.listdir("/data"))
14
def load_model():
    """
    Instantiate the ProTrek tri-modal model from the configured model directory.

    Sub-model paths (ESM2 protein encoder, PubMedBERT text encoder, foldseek
    structure encoder, checkpoint) are discovered inside ``config.model_dir``.

    Returns:
        ProTrekTrimodalModel in eval mode.
    """
    model_dir = config.model_dir
    model = ProTrekTrimodalModel(
        protein_config=glob.glob(f"{model_dir}/esm2_*")[0],
        text_config=f"{model_dir}/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        structure_config=glob.glob(f"{model_dir}/foldseek_*")[0],
        load_protein_pretrained=False,
        load_text_pretrained=False,
        from_checkpoint=glob.glob(f"{model_dir}/*.pt")[0],
    )
    model.eval()
    return model
27
-
28
-
29
def load_faiss_index(index_path: str):
    """
    Read a faiss index from disk, optionally memory-mapped per config,
    and force the inner-product metric used by ProTrek embeddings.
    """
    read_args = [index_path]
    if config.faiss_config.IO_FLAG_MMAP:
        # Memory-map large indexes instead of loading them fully into RAM
        read_args.append(faiss.IO_FLAG_MMAP)
    index = faiss.read_index(*read_args)

    index.metric_type = faiss.METRIC_INNER_PRODUCT
    return index
37
-
38
-
39
def load_index():
    """
    Load every faiss index declared in the config.

    Returns:
        all_index: Nested dict {"sequence"|"structure"|"text": {db_name: ...}}
            where each leaf holds {"index": faiss index, "ids": id array};
            text entries have one extra nesting level per subsection.
        valid_subsections: {db_name: sorted list of subsections that actually
            have an index on disk}.
    """

    def read_ids(id_path):
        # One id per row in a headerless tsv, flattened to a 1-D array
        return pd.read_csv(id_path, sep="\t", header=None).values.flatten()

    all_index = {"sequence": {}, "structure": {}, "text": {}}

    # Protein sequence indexes
    for db in tqdm(config.sequence_index_dir, desc="Loading sequence index..."):
        index_dir = db["index_dir"]
        all_index["sequence"][db["name"]] = {
            "index": load_faiss_index(f"{index_dir}/sequence.index"),
            "ids": read_ids(f"{index_dir}/ids.tsv"),
        }

    # Protein structure indexes
    print("Loading structure index...")
    for db in tqdm(config.structure_index_dir, desc="Loading structure index..."):
        index_dir = db["index_dir"]
        all_index["structure"][db["name"]] = {
            "index": load_faiss_index(f"{index_dir}/structure.index"),
            "ids": read_ids(f"{index_dir}/ids.tsv"),
        }

    # Text indexes: one per (db, subsection) pair that exists on disk
    valid_subsections = {}
    for db in tqdm(config.text_index_dir, desc="Loading text index..."):
        db_name = db["name"]
        text_dir = f"{db['index_dir']}/subsections"
        all_index["text"][db_name] = {}
        valid_subsections[db_name] = set()

        # "Global" embeddings are stored alongside the per-subsection indexes
        sequence_level.add("Global")
        for subsection in tqdm(sequence_level):
            stem = subsection.replace(' ', '_')
            index_path = f"{text_dir}/{stem}.index"
            # Skip subsections with no index on disk (e.g. "Taxonomic lineage")
            if not os.path.exists(index_path):
                continue

            all_index["text"][db_name][subsection] = {
                "index": load_faiss_index(index_path),
                "ids": read_ids(f"{text_dir}/{stem}_ids.tsv"),
            }
            valid_subsections[db_name].add(subsection)

    # Deterministic ordering for the UI dropdowns
    for db_name in valid_subsections:
        valid_subsections[db_name] = sorted(valid_subsections[db_name])

    return all_index, valid_subsections
101
-
102
-
103
# Load the demo config, then initialise the model and faiss indexes at import time
root_dir = __file__.rsplit("/", 3)[0]
config_path = f"{root_dir}/demo/config.yaml"
with open(config_path, 'r', encoding='utf-8') as fh:
    config = EasyDict(yaml.safe_load(fh))

# Target device for the model (moving the model to GPU is currently disabled)
device = "cuda"

print("Loading model...")
model = load_model()

all_index, valid_subsections = load_index()
print("Done...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/modules/search.py DELETED
@@ -1,301 +0,0 @@
1
- import gradio as gr
2
- import torch
3
- import pandas as pd
4
- import matplotlib.pyplot as plt
5
- import numpy as np
6
-
7
- from scipy.stats import norm
8
- from .init_model import model, all_index, valid_subsections
9
- from .blocks import upload_pdb_button, parse_pdb_file
10
-
11
-
12
- tmp_file_path = "/tmp/results.tsv"
13
- tmp_plot_path = "/tmp/histogram.svg"
14
-
15
- # Samples for input
16
- samples = {
17
- "sequence": [
18
- ["MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK"],
19
- ["MITLDWEKANGLITTVVQDATTKQVLMVAYMNQESLAKTMATGETWFWSRSRKTLWHKGATSGNIQTVKTIAVDCDADTLLVTVDPAGPACHTGHISCFYRHYPEGKDLT"],
20
- ["MDLKQYVSEVQDWPKPGVSFKDITTIMDNGEAYGYATDKIVEYAKDRDVDIVVGPEARGFIIGCPVAYSMGIGFAPVRKEGKLPREVIRYEYDLEYGTNVLTMHKDAIKPGQRVLITDDLLATGGTIEAAIKLVEKLGGIVVGIAFIIELKYLNGIEKIKDYDVMSLISYDE"]
21
- ],
22
-
23
- "structure": [
24
- ["dddddddddddddddpdpppvcppvnvvvvvvvvvvvvvvvvvvvvvvvvvvqdpqdedeqvrddpcqqpvqhkhkykafwappqwdddpqkiwtwghnppgiaieieghdappqddhrfikifiaghdpvrhtygdhidtdddpddddvvnvvvcvvvvndpdd"],
25
- ["dddadcpvpvqkakefeaeppprdtadiaiagpvqvvvcvvpqwhwgqdpvvrdidgqcpvpvqiwrwddwdaddnrryiytythtpahsdpvrhvhpppadvvgpddpd"],
26
- ["dplvvqwdwdaqpphhpdtdthcvscvvppvslvvqlvvvlvvcvvqvaqeeeeepdqrcsnrvsscvvvvhyywykyfpppddaawdwdwdddppgitiiithlpseaaageyeyegaeqalqprvlrvvvrcvvnnyddaeyeyqeyevcrvncvsvvvhhydyvyydpd"]
27
- ],
28
-
29
- "text": [
30
- ["Proteins with zinc bindings."],
31
- ["Proteins locating at cell membrane."],
32
- ["Protein that serves as an enzyme."]
33
- ],
34
- }
35
-
36
-
37
def clear_results():
    """Reset the results pane and hide the download button and histogram."""
    return "", gr.update(visible=False), gr.update(visible=False)
39
-
40
-
41
def plot(scores) -> None:
    """
    Plot the distribution of scores and fit a normal distribution.

    The figure is written to ``tmp_plot_path`` and the axes are cleared
    afterwards so module-level pyplot state stays clean between queries.

    Args:
        scores: List of scores
    """
    plt.hist(scores, bins=100, density=True, alpha=0.6)
    plt.title('Distribution of similarity scores in the database', fontsize=15)
    plt.xlabel('Similarity score', fontsize=15)
    plt.ylabel('Density', fontsize=15)

    # Leave a little room below zero so the note text fits inside the axes
    y_ed = plt.gca().get_ylim()[-1]
    plt.ylim(-0.05, y_ed)

    # Explanatory note in the lower-left corner
    x_st = plt.gca().get_xlim()[0]
    note = ("Note: For the \"UniRef50\" and \"Uncharacterized\" databases, the figure illustrates\n "
            "only top-ranked clusters (identified using Faiss), whereas for other databases, it\n "
            "displays the distribution across all samples.")
    plt.text(x_st, -0.04, note, fontsize=8)

    # Fit and overlay a Gaussian on the histogram
    mu, std = norm.fit(scores)
    xmin, xmax = plt.xlim()
    _, ymax = plt.ylim()
    xs = np.linspace(xmin, xmax, 100)
    plt.plot(xs, norm.pdf(xs, mu, std))

    # Total number of scores, top-right corner
    plt.text(xmax, 0.9 * ymax, f"Total number: {len(scores)}", ha='right', fontsize=12)

    # Persist as svg and clear the current axes
    plt.savefig(tmp_plot_path)
    plt.cla()
75
-
76
-
77
# Search from database
def search(input: str, nprobe: int, topk: int, input_type: str, query_type: str, subsection_type: str, db: str):
    """
    Retrieve the top-k database entries most similar to the query.

    Args:
        input: Query value (sequence / foldseek string / text).
        nprobe: Number of IVF clusters to probe (ignored for flat indexes).
        topk: Number of results to return.
        input_type: Query modality ("sequence", "structure" or "text").
        query_type: Result modality.
        subsection_type: Text subsection, used only when query_type is "text".
        db: Database name to search in.

    Returns:
        Markdown results table, download-button update, histogram update.
    """
    print(f"Input type: {input_type}\n Output type: {query_type}\nDatabase: {db}\nSubsection: {subsection_type}")

    # Embed the query with the encoder matching its modality
    input_modality = input_type.replace("sequence", "protein")
    with torch.no_grad():
        input_embedding = getattr(model, f"get_{input_modality}_repr")([input]).cpu().numpy()

    # Pick the faiss index and id table for the requested output modality
    if query_type == "text":
        entry = all_index["text"][db][subsection_type]
    else:
        entry = all_index[query_type][db]
    index, ids = entry["index"], entry["ids"]

    if hasattr(index, "nprobe"):
        if index.nlist < nprobe:
            raise gr.Error(f"The number of clusters to search must be less than or equal to the number of clusters in the index ({index.nlist}).")
        else:
            index.nprobe = nprobe

    if topk > index.ntotal:
        raise gr.Error(f"You cannot retrieve more than the database size ({index.ntotal}).")

    # Retrieve all scores to plot the distribution
    scores, ranks = index.search(input_embedding, index.ntotal)
    scores, ranks = scores[0], ranks[0]

    # Drop padding entries faiss fills in when fewer results exist
    keep = scores > -1
    scores, ranks = scores[keep], ranks[keep]
    scores = scores / model.temperature.item()
    plot(scores)

    top_scores = scores[:topk]
    top_ranks = ranks[:topk]

    # Write the results to a temporary file for downloading
    with open(tmp_file_path, "w") as w:
        w.write("Id\tMatching score\n")
        for rank, score in zip(top_ranks, top_scores):
            w.write(f"{ids[rank]}\t{score}\n")

    # Render ids, linking protein hits to UniProt / RCSB
    topk_ids = []
    for rank in top_ranks:
        now_id = ids[rank]
        if query_type == "text":
            # Escape pipes so ids don't break the markdown table
            topk_ids.append(now_id.replace("|", "\\|"))
        elif db != "PDB":
            # Provide link to uniprot website
            topk_ids.append(f"[{now_id}](https://www.uniprot.org/uniprotkb/{now_id})")
        else:
            # Provide link to pdb website
            pdb_id = now_id.split("-")[0]
            topk_ids.append(f"[{now_id}](https://www.rcsb.org/structure/{pdb_id})")

    # Only render the first 1000 rows in the UI; the download file has everything
    limit = 1000
    df = pd.DataFrame({"Id": topk_ids[:limit], "Matching score": top_scores[:limit]})
    if len(topk_ids) > limit:
        info_df = pd.DataFrame({"Id": ["Download the file to check all results"], "Matching score": ["..."]},
                               index=[1000])
        df = pd.concat([df, info_df], axis=0)

    output = df.to_markdown()
    return (output,
            gr.DownloadButton(label="Download results", value=tmp_file_path, visible=True, scale=0),
            gr.update(value=tmp_plot_path, visible=True))
153
-
154
-
155
def change_input_type(choice: str):
    """Swap the example set and toggle upload controls when the input type changes."""
    # Upload controls are meaningless for free-text queries
    visible = choice != "text"

    return gr.update(samples=samples[choice]), "", gr.update(visible=visible), gr.update(visible=visible)
166
-
167
-
168
# Load example from dataset
def load_example(example_id):
    """Return the single input value stored in a dataset example row."""
    return example_id[0]
171
-
172
-
173
# Change the visibility of subsection type
def change_output_type(query_type: str, subsection_type: str):
    """Adjust subsection/nprobe visibility and database choices for a new output type."""
    # Default to the first database available for the chosen modality
    db_type = next(iter(all_index[query_type]))
    nprobe_visible = check_index_ivf(query_type, db_type, subsection_type)

    return (
        gr.update(visible=query_type == "text"),
        gr.update(visible=nprobe_visible),
        gr.update(choices=list(all_index[query_type].keys()), value=db_type)
    )
184
-
185
-
186
def check_index_ivf(index_type: str, db: str, subsection_type: str = None) -> bool:
    """
    Check if the index is of IVF type.

    Args:
        index_type: Type of index.
        db: Database name.
        subsection_type: If the "index_type" is "text", get the index based on the subsection type.

    Returns:
        Whether the index is of IVF type or not. nprobe tuning is currently
        disabled, so this always reports False after resolving the index.
    """
    if index_type == "text":
        index = all_index["text"][db][subsection_type]["index"]
    elif index_type in ("sequence", "structure"):
        index = all_index[index_type][db]["index"]

    # IVF detection is intentionally disabled: always hide the nprobe slider.
    # (Would otherwise be: hasattr(index, "nprobe"))
    return False
208
-
209
-
210
def change_db_type(query_type: str, subsection_type: str, db_type: str):
    """
    Change the database to search.

    Args:
        query_type: The output type.
        subsection_type: Currently selected text subsection.
        db_type: The database to search.
    """
    # Text output exposes a per-database subsection list; otherwise hide it
    if query_type == "text":
        subsection_update = gr.update(choices=list(valid_subsections[db_type]), value="Function")
    else:
        subsection_update = gr.update(visible=False)

    return subsection_update, gr.update(visible=check_index_ivf(query_type, db_type, subsection_type))
224
-
225
-
226
# Build the searching block
def build_search_module():
    """Assemble the gradio UI for database retrieval."""
    gr.Markdown("# Search from database")
    with gr.Row(equal_height=True):
        with gr.Column():
            # Query modality
            input_type = gr.Radio(["sequence", "structure", "text"],
                                  label="Input type (e.g. 'text' means searching based on text descriptions)",
                                  value="text")

            with gr.Row():
                # Result modality
                query_type = gr.Radio(
                    ["sequence", "structure", "text"],
                    label="Output type (e.g. 'sequence' means returning qualified sequences)",
                    value="sequence",
                    scale=2,
                )

                # Subsection selector, shown only when the output type is "text"
                text_db = list(all_index["text"].keys())[0]
                sequence_db = list(all_index["sequence"].keys())[0]
                subsection_type = gr.Dropdown(valid_subsections[text_db], label="Subsection of text",
                                              value="Function", interactive=True, visible=False, scale=0)

                db_type = gr.Dropdown(all_index["sequence"].keys(), label="Database", value=sequence_db,
                                      interactive=True, visible=True, scale=0)

            with gr.Row():
                # Query input plus structure-file upload
                input = gr.Text(label="Input")
                upload_btn, chain_box = upload_pdb_button(visible=False, chain_visible=False)
                upload_btn.upload(parse_pdb_file, inputs=[input_type, upload_btn, chain_box], outputs=[input])

            # nprobe slider is only relevant for IVF indexes
            nprobe_visible = check_index_ivf(query_type.value, db_type.value)
            nprobe = gr.Slider(1, 1000000, 1000, step=1, visible=nprobe_visible,
                               label="Number of clusters to search (lower value for faster search and higher value for more accurate search)")

            # Keep dependent widgets in sync with the selected output type / database
            query_type.change(fn=change_output_type, inputs=[query_type, subsection_type],
                              outputs=[subsection_type, nprobe, db_type])
            db_type.change(fn=change_db_type, inputs=[query_type, subsection_type, db_type],
                           outputs=[subsection_type, nprobe])

            topk = gr.Slider(1, 1000000, 5, step=1, label="Retrieve top k results")

            # Clickable examples, refreshed whenever the input type changes
            examples = gr.Dataset(samples=samples["text"], components=[input], label="Input examples")
            examples.click(fn=load_example, inputs=[examples], outputs=input)
            input_type.change(fn=change_input_type, inputs=[input_type],
                              outputs=[examples, input, upload_btn, chain_box])

            with gr.Row():
                search_btn = gr.Button(value="Search")
                clear_btn = gr.Button(value="Clear")

        with gr.Row():
            with gr.Column():
                results = gr.Markdown(label="results", height=450)
                download_btn = gr.DownloadButton(label="Download results", visible=False)

            # Score distribution for the current query
            histogram = gr.Image(label="Histogram of matching scores", type="filepath", scale=1, visible=False)

    search_btn.click(fn=search, inputs=[input, nprobe, topk, input_type, query_type, subsection_type, db_type],
                     outputs=[results, download_btn, histogram])
    clear_btn.click(fn=clear_results, outputs=[results, download_btn, histogram])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
demo/modules/tmalign.py DELETED
@@ -1,78 +0,0 @@
1
- import gradio as gr
2
- import os
3
-
4
- from .blocks import upload_pdb_button
5
- from utils.downloader import download_pdb, download_af2
6
-
7
-
8
- root_dir = __file__.rsplit("/", 3)[0]
9
- structure_types = ["AlphaFoldDB", "PDB"]
10
-
11
-
12
def upload_structure(file: str):
    """Pass the uploaded file path straight through to the textbox."""
    return file
14
-
15
-
16
def get_structure_path(structure: str, structure_type: str) -> str:
    """
    Resolve a structure identifier to a local file path.

    Args:
        structure: Either an absolute path to a manually uploaded file, a
            Uniprot ID (for AlphaFoldDB) or a PDB ID (for PDB).
        structure_type: One of ["AlphaFoldDB", "PDB"]; ignored for uploads.

    Returns:
        Path to the structure file on disk.
    """
    # Manually uploaded files arrive as absolute paths and are passed through.
    # startswith() also safely handles an empty input, which would crash the
    # previous structure[0] check with an IndexError.
    if structure.startswith("/"):
        return structure

    # Make sure the download cache exists before writing into it
    cache_dir = f"{root_dir}/demo/cache"
    os.makedirs(cache_dir, exist_ok=True)

    # A Uniprot ID: download the predicted structure from AlphaFoldDB
    if structure_type == "AlphaFoldDB":
        save_path = f"{cache_dir}/{structure}.pdb"
        if not os.path.exists(save_path):
            download_af2(structure, "pdb", save_path)
        return save_path

    # A PDB ID: download the experimental structure from PDB
    elif structure_type == "PDB":
        save_path = f"{cache_dir}/{structure}.cif"
        if not os.path.exists(save_path):
            download_pdb(structure, "cif", save_path)
        return save_path
34
-
35
-
36
def tmalign(structure_1: str, structure_type_1: str, structure_2: str, structure_type_2: str):
    """
    Run TMalign on two structures and return its raw text output.

    Args:
        structure_1: Identifier or path of the first structure.
        structure_type_1: "AlphaFoldDB" or "PDB" (ignored for uploaded paths).
        structure_2: Identifier or path of the second structure.
        structure_type_2: "AlphaFoldDB" or "PDB" (ignored for uploaded paths).

    Returns:
        Full stdout of the TMalign binary.
    """
    import subprocess

    structure_path_1 = get_structure_path(structure_1, structure_type_1)
    structure_path_2 = get_structure_path(structure_2, structure_type_2)

    # Use an argument list without a shell so paths containing spaces or shell
    # metacharacters cannot break the command or inject into it (the previous
    # os.popen call interpolated the paths into a shell string).
    result = subprocess.run(
        ["/tmp/TMalign", structure_path_1, structure_path_2],
        capture_output=True,
        text=True,
    )
    return result.stdout
45
-
46
-
47
# Build the block for computing TM-score between two protein structures
def build_TMalign():
    """Assemble the gradio UI for pairwise structural alignment with TMalign."""
    gr.Markdown("# Calculate TM-score between two protein structures")
    with gr.Row(equal_height=True):
        with gr.Column():
            # First structure: free-text id or uploaded file
            with gr.Row():
                structure_1 = gr.Textbox(label="Protein structure 1 (input Uniprot ID or PDB ID or upload a pdb file)")
                structure_type_1 = gr.Dropdown(structure_types,
                                               label="Structure type (if the structure is manually uploaded, ignore this field)",
                                               value="AlphaFoldDB", interactive=True, visible=True)
                upload_btn_1, _ = upload_pdb_button(visible=True, chain_visible=False)
                upload_btn_1.upload(upload_structure, inputs=[upload_btn_1], outputs=[structure_1])

            # Second structure
            with gr.Row():
                structure_2 = gr.Textbox(label="Protein structure 2 (input Uniprot ID or PDB ID or upload a pdb file)")
                structure_type_2 = gr.Dropdown(structure_types,
                                               label="Structure type (if the structure is manually uploaded, ignore this field)",
                                               value="AlphaFoldDB", interactive=True, visible=True)
                upload_btn_2, _ = upload_pdb_button(visible=True, chain_visible=False)
                upload_btn_2.upload(upload_structure, inputs=[upload_btn_2], outputs=[structure_2])

            compute_btn = gr.Button(value="Compute TM-score")
            tmscore = gr.TextArea(label="TM-score", interactive=False)

            compute_btn.click(tmalign, inputs=[structure_1, structure_type_1, structure_2, structure_type_2],
                              outputs=[tmscore])
78
-