|
import streamlit as st |
|
from stmol import showmol |
|
import py3Dmol |
|
import requests |
|
import biotite.structure.io as bsio |
|
import random |
|
import hashlib |
|
import urllib3 |
|
from Bio.Blast import NCBIWWW, NCBIXML |
|
from Bio.Seq import Seq |
|
from Bio.SeqRecord import SeqRecord |
|
import time |
|
|
|
# update() posts to the ESMFold API with verify=False; silence the warning
# urllib3 would otherwise emit for each such request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

_SIDEBAR_TITLE = '🔮 GenPro2 Protein Generator & Structure Predictor'
_SIDEBAR_BLURB = (
    'GenPro2 is an end-to-end single sequence protein generator and structure predictor '
    'based [*ESMFold*](https://esmatlas.com/about) and the ESM-2 language model.'
)

# Page layout is configured before any other Streamlit element is drawn.
st.set_page_config(layout='wide')

with st.sidebar:
    st.title(_SIDEBAR_TITLE)
    st.write(_SIDEBAR_BLURB)
|
|
|
def generate_sequence_from_words(words, length):
    """Build a reproducible pseudo-random amino-acid sequence from seed words.

    The words are joined with single spaces, hashed with MD5, and the hex
    digest seeds a private random generator. The same words and length
    therefore always yield the same sequence.

    Args:
        words: Iterable of strings forming the seed phrase.
        length: Number of residues to generate; zero or less gives ``""``.

    Returns:
        A string of ``length`` characters drawn from the 20-letter alphabet
        ``ACDEFGHIKLMNPQRSTVWY``.
    """
    seed = ' '.join(words).encode('utf-8')
    # A dedicated Random instance produces the same stream that
    # random.seed(...) + random.choice(...) would, without clobbering the
    # process-wide generator that other code may rely on.
    rng = random.Random(hashlib.md5(seed).hexdigest())
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    return ''.join(rng.choice(amino_acids) for _ in range(length))
|
|
|
def render_mol(pdb):
    """Draw a structure in the page as a spinning, spectrum-coloured cartoon.

    Args:
        pdb: Structure text, handed to the viewer as a ``'pdb'`` model.
    """
    viewer = py3Dmol.view()
    viewer.addModel(pdb, 'pdb')

    cartoon_style = {'cartoon': {'color': 'spectrum'}}
    viewer.setStyle(cartoon_style)
    viewer.setBackgroundColor('white')

    # Frame the model first, then apply the extra zoom and start spinning.
    viewer.zoomTo()
    viewer.zoom(2, 800)
    viewer.spin(True)

    showmol(viewer, width=800, height=500)
|
|
|
def _parse_blast_title(title):
    """Split a BLAST hit title into ``(protein_name, organism)``.

    The protein name is the text after the last ``|``. The organism is taken
    from an ``OS=...`` field (cut at ``OX=``) when the title has one,
    otherwise from the first ``[...]`` group in the protein name. If neither
    is present the organism is ``'Unknown'``.
    """
    protein_name = title.split('|')[-1].strip()
    organism = ''
    if 'OS=' in title:
        organism = title.split('OS=')[-1].split('OX=')[0].strip()
    else:
        open_idx = protein_name.find('[')
        close_idx = protein_name.find(']', open_idx + 1)
        if open_idx != -1 and close_idx != -1:
            organism = protein_name[open_idx + 1:close_idx].strip()
    return protein_name, organism or 'Unknown'


def perform_blast_analysis(sequence):
    """Run a remote BLASTP search for *sequence* and show the top hit.

    Queries NCBI (``blastp`` against ``swissprot``) and writes the best
    alignment's name, organism, identity and E-value to the page. Any failure
    is reported in the page rather than raised.

    Args:
        sequence: Amino-acid sequence string to search with.
    """
    st.subheader('Protein Analysis')

    # Nothing to search for: skip the progress animation and the remote call.
    if not sequence:
        st.warning("No sequence available to analyze. Please generate a sequence first.")
        return

    with st.spinner("Analyzing generated protein... This may take a few minutes."):
        # Cosmetic only: the bar fills over ~10 s (100 x 0.1 s) before the
        # search starts and is not tied to actual BLAST progress.
        progress_bar = st.progress(0)
        for i in range(100):
            progress_bar.progress(i + 1)
            time.sleep(0.1)

        try:
            record = SeqRecord(Seq(sequence), id='random_protein')
            result_handle = NCBIWWW.qblast("blastp", "swissprot", record.seq)
            try:
                blast_record = NCBIXML.read(result_handle)
            finally:
                # Release the response buffer whether or not parsing worked.
                result_handle.close()

            if blast_record.alignments:
                alignment = blast_record.alignments[0]
                hsp = alignment.hsps[0]

                protein_name, organism = _parse_blast_title(alignment.title)

                # Identity is measured over the aligned region.
                # alignment.length is the length of the whole database hit,
                # so dividing by it under-reports identity for partial matches.
                align_length = hsp.align_length
                if align_length:
                    identity_percentage = (hsp.identities / align_length) * 100
                else:
                    identity_percentage = 0.0

                st.write(f"**Top Match:** {protein_name}")
                st.write(f"**Organism:** {organism}")
                st.write(f"**Sequence Identity:** {identity_percentage:.2f}%")
                st.write(f"**E-value:** {hsp.expect:.2e}")

                # Shown only when the record object carries a description.
                description = getattr(alignment, 'description', None)
                if description:
                    st.write(f"**Potential Function:** {description}")

                blast_link = "https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome"
                st.markdown(f"[View full BLAST results (may require re-running the search)]({blast_link})")
            else:
                st.write("No significant matches found. This might be a unique protein sequence!")
        except Exception as e:
            # UI boundary: network, parsing and record-shape errors all end up
            # as a message in the page instead of a traceback.
            st.error(f"An error occurred during protein analysis: {e}")
            st.write("Please try again later or contact support if the issue persists.")
|
|
|
def _reset_prediction_state():
    """Drop any stored prediction so the page never shows a stale structure."""
    st.session_state.structure_info = None
    st.session_state.show_analyze_button = False


def update(sequence, word1, word2, word3, sequence_length):
    """Fold *sequence* with the ESMFold API and store the result in session state.

    On success ``st.session_state.structure_info`` receives the PDB text, the
    mean B-factor rounded to two decimals, and the seed words and length, and
    ``st.session_state.show_analyze_button`` is set to True. On failure an
    error is shown in the page and any earlier prediction is cleared.

    Args:
        sequence: Amino-acid sequence posted to the API.
        word1: First seed word, stored for display.
        word2: Second seed word, stored for display.
        word3: Third seed word, stored for display.
        sequence_length: Requested sequence length, stored for display.
    """
    import os
    import tempfile

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    try:
        # NOTE(security): verify=False disables TLS certificate checking for
        # this request. Kept as-is to preserve behaviour; re-enable
        # verification once the endpoint's certificate chain validates.
        response = requests.post('https://api.esmatlas.com/foldSequence/v1/pdb/',
                                 headers=headers,
                                 data=sequence,
                                 verify=False,
                                 timeout=300)
        response.raise_for_status()
        pdb_string = response.content.decode('utf-8')

        # load_structure needs a path, so write to a private temporary
        # directory. A fixed 'predicted.pdb' in the working directory would be
        # shared between concurrent sessions.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdb_path = os.path.join(tmp_dir, 'predicted.pdb')
            with open(pdb_path, 'w', encoding='utf-8') as f:
                f.write(pdb_string)
            struct = bsio.load_structure(pdb_path, extra_fields=["b_factor"])
        b_value = round(struct.b_factor.mean(), 2)

        st.session_state.structure_info = {
            'pdb_string': pdb_string,
            'b_value': b_value,
            'word1': word1,
            'word2': word2,
            'word3': word3,
            'sequence_length': sequence_length
        }

        st.session_state.show_analyze_button = True

    except requests.exceptions.RequestException as e:
        # The caller has already stored the new sequence; keeping the previous
        # structure would pair it with the wrong sequence.
        _reset_prediction_state()
        st.error(f"An error occurred while calling the API: {str(e)}")
        st.write("Please try again later or contact support if the issue persists.")
    except Exception as e:
        # The response arrived but could not be decoded or parsed as a structure.
        _reset_prediction_state()
        st.error(f"An error occurred while processing the predicted structure: {e}")
        st.write("Please try again later or contact support if the issue persists.")
|
|
|
|
|
# Give every session-state key read further down a default on first run.
for _state_key, _state_default in (
    ('sequence', None),
    ('show_analyze_button', False),
    ('structure_info', None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
|
|
|
st.title("Word-Seeded Protein Sequence Generator and Structure Predictor")

# Seed words and target length are collected in the sidebar.
with st.sidebar:
    st.subheader("Generate Sequence from Words")
    word1 = st.text_input("Word 1")
    word2 = st.text_input("Word 2")
    word3 = st.text_input("Word 3")
    sequence_length = st.number_input(
        "Sequence Length",
        min_value=50,
        max_value=400,
        value=100,
        step=10,
    )
|
|
|
# Button handler: generate the sequence from the three words, then fold it.
if st.sidebar.button('Generate and Predict'):
    seed_words = [word1, word2, word3]
    if not all(seed_words):
        st.sidebar.warning("Please enter all three words to generate a sequence.")
    else:
        sequence = generate_sequence_from_words(seed_words, sequence_length)
        st.session_state.sequence = sequence
        st.sidebar.text_area("Generated Sequence", sequence, height=100)
        st.sidebar.info("Note: The same words and sequence length will always produce the same sequence.")

        with st.spinner("Predicting protein structure... This may take a few minutes."):
            update(sequence, word1, word2, word3, sequence_length)
|
|
|
|
|
# Results panel: drawn whenever a prediction is stored in session state, so
# it survives the reruns Streamlit performs on every widget interaction.
if st.session_state.structure_info:
    info = st.session_state.structure_info
    st.subheader(f'Predicted protein structure using seed: {info["word1"]}, {info["word2"]}, and {info["word3"]} + length {info["sequence_length"]}')
    render_mol(info['pdb_string'])

    st.subheader('plDDT Score')
    st.write('plDDT is a per-residue estimate of the confidence in prediction on a scale from 0-100%.')
    # Round rather than truncate: 0.57 * 100 is 56.99999999999999 in binary
    # floating point, which int() alone would display as 56%.
    plddt_percent = int(round(float(info["b_value"]) * 100))
    st.info(f'Average plDDT: {plddt_percent}%')

    st.download_button(
        label="Download PDB",
        data=info['pdb_string'],
        file_name='predicted.pdb',
        mime='text/plain',
    )

    # Built line by line so the markdown does not depend on source indentation.
    # The closing "**" after "caution." was missing, which left the asterisks
    # showing as literal text.
    st.markdown("\n".join([
        "## What to do next:",
        "If you find interesting results from the sequence folding, you can explore further:",
        "1. Learn more about protein structures and sequences.",
        "2. Visit the [Protein Data Bank (PDB)](https://www.rcsb.org/) for known protein structures.",
        "3. Compare your folded structure with known functional proteins by downloading your results.",
        "4. Read about similar proteins to gain insights into potential functions.",
        "5. Click the \"Analyze Protein\" button below to get more information about your generated protein.",
        "",
        "**Remember, this folding is based on randomly generated sequences. Interpret the results with caution.**",
        "Enjoy exploring the world of protein sequences! Share your high-confidence protein images with us on X [*@WandsAI*](https://x.com/wandsai)!",
    ]))
|
|
|
|
|
# The button is only drawn once update() has flagged a successful prediction;
# short-circuiting keeps st.button from being called before then.
if st.session_state.show_analyze_button and st.button('Analyze Protein'):
    perform_blast_analysis(st.session_state.sequence)