|
import streamlit as st |
|
import random |
|
import hashlib |
|
import py3Dmol |
|
import requests |
|
import io |
|
from Bio import PDB |
|
|
|
def generate_sequence_from_words(words, length):
    """Build a reproducible pseudo-random amino-acid sequence from seed words.

    The words are joined with single spaces, hashed with MD5, and the hex
    digest seeds a private random generator, so the same words and length
    always produce the same sequence.

    Args:
        words: Iterable of strings used to derive the seed.
        length: Number of residues to generate.

    Returns:
        A string of ``length`` one-letter codes drawn from
        ``"ACDEFGHIKLMNPQRSTVWY"``.
    """
    seed = ' '.join(words).encode('utf-8')
    # Use a dedicated Random instance rather than random.seed(): reseeding the
    # module-level generator would silently make every other user of `random`
    # in the process deterministic. Seeding a Random instance with the same
    # digest yields exactly the same sequence as the global generator would.
    rng = random.Random(hashlib.md5(seed).hexdigest())
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    return ''.join(rng.choice(amino_acids) for _ in range(length))
|
|
|
def predict_structure(sequence, timeout=300):
    """Submit a sequence to the remote prediction endpoint and return its JSON reply.

    Args:
        sequence: Amino-acid sequence string sent as the single query.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.post``.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``. Every failure path reports the problem to the
        page through ``st.error`` instead of raising.
    """
    # NOTE(review): the endpoint URL and payload schema are kept as-is;
    # confirm them against the prediction service's current API.
    url = "https://api.colabfold.com/batch"
    payload = {
        "queries": [["query", sequence]],
        "num_relax": 0,
        "use_templates": False,
        "num_models": 1,
    }

    try:
        # Without a timeout, requests waits forever on a stalled server.
        response = requests.post(url, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Connection failures and timeouts become a visible message rather
        # than an uncaught traceback.
        st.error(f"Error in structure prediction: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Error in structure prediction: {response.text}")
        return None

    try:
        return response.json()
    except ValueError:
        # A 200 reply can still carry a body that is not valid JSON.
        st.error("Error in structure prediction: the server returned a non-JSON response.")
        return None
|
|
|
def visualize_protein(pdb_string):
    """Create a py3Dmol viewer that displays the given PDB text as a cartoon.

    Args:
        pdb_string: Structure data in PDB format.

    Returns:
        An 800x400 ``py3Dmol.view`` with the model added, the
        spectrum-coloured cartoon style set, and ``zoomTo()`` applied.
    """
    cartoon_style = {'cartoon': {'color': 'spectrum'}}

    viewer = py3Dmol.view(width=800, height=400)
    viewer.addModel(pdb_string, 'pdb')
    viewer.setStyle(cartoon_style)
    viewer.zoomTo()
    return viewer
|
|
|
# Page header followed by the input widgets that drive sequence generation.
st.title("Protein Sequence Generator and Structure Predictor")

st.write("Enter three random words to seed your protein sequence:")

# One text box per seed word, created in label order.
word1, word2, word3 = [
    st.text_input(label) for label in ("Word 1", "Word 2", "Word 3")
]

# Requested sequence length: 50 to 200 in steps of 10, defaulting to 100.
sequence_length = st.number_input(
    "Enter desired sequence length",
    min_value=50,
    max_value=200,
    value=100,
    step=10,
)
|
|
|
# Main action: build a sequence from the seed words, request a structure
# prediction for it, and render the 3D view plus confidence summary.
if st.button("Generate Sequence and Predict Structure"):
    if word1 and word2 and word3:
        words = [word1, word2, word3]
        sequence = generate_sequence_from_words(words, sequence_length)
        st.write(f"Generated sequence inspired by '{word1}', '{word2}', and '{word3}' with length '{sequence_length}':")
        st.code(sequence)

        st.header("Protein Structure Prediction")
        with st.spinner("Predicting protein structure... This may take a few minutes."):
            prediction = predict_structure(sequence)

        # Only a non-empty list whose first entry is a mapping is usable.
        # Checking the shape first keeps an unexpected reply (a dict, an
        # empty list, a list of non-dicts) from raising on `prediction[0]`.
        first_result = None
        if isinstance(prediction, list) and prediction and isinstance(prediction[0], dict):
            first_result = prediction[0]

        if first_result is not None and 'pdb_string' in first_result:
            view = visualize_protein(first_result['pdb_string'])

            # NOTE(review): this replaces the earlier py3Dmol.show3d(...) /
            # .startjs call, which is not part of py3Dmol's documented API.
            # `_make_html()` is the viewer's (private) method that returns its
            # embeddable HTML; confirm against the installed py3Dmol version.
            st.components.v1.html(view._make_html(), width=800, height=400)

            plddt_scores = first_result.get('plddt', [])
            if plddt_scores:
                avg_plddt = sum(plddt_scores) / len(plddt_scores)
                st.write(f"Average pLDDT score: {avg_plddt:.2f}")
                st.write("pLDDT > 90: Very high confidence")
                st.write("90 > pLDDT > 70: Confident")
                st.write("70 > pLDDT > 50: Low confidence")
                st.write("pLDDT < 50: Very low confidence")
        else:
            st.error("Failed to predict structure. Please try again.")
    else:
        st.error("Please enter all three words.")
|
|
|
# Closing guidance, rendered unconditionally below the tool.
# The blank line before the final sentence ends the numbered list; without it
# Markdown treats that sentence as a continuation of item 4.
st.markdown("""
## What to do next:

1. Experiment with different seed words and sequence lengths.
2. Learn about how protein sequences relate to their predicted structures.
3. Remember that these are computational predictions and may not represent the actual biological structure.
4. For real protein structures, visit the [Protein Data Bank (PDB)](https://www.rcsb.org/).

Enjoy exploring the world of protein sequences and predicted structures!
""")