# Streamlit app: turn three seed words into a deterministic pseudo-random
# amino-acid sequence, request a structure prediction for it over HTTP, and
# render the returned PDB model with py3Dmol.

import hashlib
import io  # NOTE(review): appears unused in the visible code; kept as-is.
import random

import py3Dmol
import requests
import streamlit as st
import streamlit.components.v1 as components
from Bio import PDB  # NOTE(review): appears unused in the visible code; kept as-is.

# The 20 one-letter amino-acid codes sequences are drawn from.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

# NOTE(review): endpoint, request payload and response schema are taken
# unchanged from the original code; confirm them against the service docs.
PREDICTION_URL = "https://api.colabfold.com/batch"

# (connect, read) timeouts in seconds, so a stalled service cannot hang the app.
REQUEST_TIMEOUT = (10, 300)

VIEWER_WIDTH = 800
VIEWER_HEIGHT = 400


def generate_sequence_from_words(words, length):
    """Return a deterministic pseudo-random amino-acid sequence.

    The words are joined with single spaces, hashed with MD5, and the hex
    digest seeds a private random generator, so the same words and length
    always give the same sequence.

    Args:
        words: Iterable of strings used as the seed.
        length: Number of residues to generate.

    Returns:
        A string of ``length`` characters drawn from ``AMINO_ACIDS``.
    """
    seed = " ".join(words).encode("utf-8")
    # A private Random instance produces the same stream as seeding the
    # module-level generator, without clobbering global random state.
    rng = random.Random(hashlib.md5(seed).hexdigest())
    return "".join(rng.choice(AMINO_ACIDS) for _ in range(length))


def predict_structure(sequence, timeout=REQUEST_TIMEOUT):
    """POST the sequence to the prediction service and return its JSON reply.

    Args:
        sequence: Amino-acid sequence to submit.
        timeout: Timeout passed to ``requests.post``; either a number of
            seconds or a ``(connect, read)`` tuple.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise ``None``. Every
        failure path reports the problem with ``st.error`` before returning.
    """
    payload = {
        "queries": [["query", sequence]],
        "num_relax": 0,
        "use_templates": False,
        "num_models": 1,
    }
    try:
        response = requests.post(PREDICTION_URL, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Connection failures and timeouts would otherwise crash the app.
        st.error(f"Error in structure prediction: {exc}")
        return None

    if response.status_code != 200:
        st.error(f"Error in structure prediction: {response.text}")
        return None

    try:
        return response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        st.error("Error in structure prediction: the service returned a non-JSON response.")
        return None


def visualize_protein(pdb_string):
    """Build a py3Dmol view showing the PDB text as a spectrum-coloured cartoon."""
    view = py3Dmol.view(width=VIEWER_WIDTH, height=VIEWER_HEIGHT)
    view.addModel(pdb_string, "pdb")
    view.setStyle({"cartoon": {"color": "spectrum"}})
    view.zoomTo()
    return view


def _first_result(prediction):
    """Return the first result dict holding a 'pdb_string', or None."""
    if not isinstance(prediction, (list, tuple)) or not prediction:
        return None
    first = prediction[0]
    if isinstance(first, dict) and "pdb_string" in first:
        return first
    return None


def _show_prediction(result):
    """Render the 3D viewer and, when present, the pLDDT summary for a result."""
    view = visualize_protein(result["pdb_string"])
    # py3Dmol exposes no module-level show3d(); embed the HTML that the view
    # itself generates instead.
    components.html(view._make_html(), width=VIEWER_WIDTH, height=VIEWER_HEIGHT)

    # Display confidence scores
    plddt_scores = result.get("plddt", [])
    if plddt_scores:
        avg_plddt = sum(plddt_scores) / len(plddt_scores)
        st.write(f"Average pLDDT score: {avg_plddt:.2f}")
        st.write("pLDDT > 90: Very high confidence")
        st.write("90 > pLDDT > 70: Confident")
        st.write("70 > pLDDT > 50: Low confidence")
        st.write("pLDDT < 50: Very low confidence")


st.title("Protein Sequence Generator and Structure Predictor")

st.write("Enter three random words to seed your protein sequence:")
word1 = st.text_input("Word 1")
word2 = st.text_input("Word 2")
word3 = st.text_input("Word 3")

sequence_length = st.number_input(
    "Enter desired sequence length",
    min_value=50,
    max_value=200,
    value=100,
    step=10,
)

if st.button("Generate Sequence and Predict Structure"):
    if word1 and word2 and word3:
        words = [word1, word2, word3]
        sequence = generate_sequence_from_words(words, sequence_length)
        st.write(
            f"Generated sequence inspired by '{word1}', '{word2}', and '{word3}' "
            f"with length '{sequence_length}':"
        )
        st.code(sequence)

        st.header("Protein Structure Prediction")
        with st.spinner("Predicting protein structure... This may take a few minutes."):
            prediction = predict_structure(sequence)

        result = _first_result(prediction)
        if result is not None:
            _show_prediction(result)
        else:
            st.error("Failed to predict structure. Please try again.")
    else:
        st.error("Please enter all three words.")

st.markdown("""
## What to do next:
1. Experiment with different seed words and sequence lengths.
2. Learn about how protein sequences relate to their predicted structures.
3. Remember that these are computational predictions and may not represent the actual biological structure.
4. For real protein structures, visit the [Protein Data Bank (PDB)](https://www.rcsb.org/).

Enjoy exploring the world of protein sequences and predicted structures!
""")