|
import html
import os
import re
import shutil
import time
import warnings

import pandas as pd
import requests
from Bio import PDB
from Bio.PDB.PDBList import PDBList
from Bio.PDB.Polypeptide import protein_letters_3to1
from dotenv import load_dotenv
from itables.shiny import DT
from rcsbsearchapi import AttributeQuery
from rcsbsearchapi.search import SequenceQuery, SeqMotifQuery
from shiny import App, render, ui, reactive
from shinywidgets import output_widget, render_widget
from transformers import pipeline

warnings.filterwarnings('ignore')

load_dotenv()
|
|
|
class PDBSearchAssistant:
    """Turn free-text protein questions into RCSB PDB searches.

    A text2text language model extracts search parameters (protein name,
    resolution, sequence, PDB ID, method, organism, similarity) from the
    user's query.  Those parameters are converted into ``rcsbsearchapi``
    queries, and every hit is enriched with metadata from the RCSB data API.
    """

    # Shortest sequence for which a sequence search is attempted.
    _MIN_SEQUENCE_LENGTH = 25
    # Dict-style search hits must score strictly above this to be kept.
    _MIN_SCORE = 0.75
    # Upper bound on the number of structures returned by one search.
    _MAX_RESULTS = 500
    # Seconds to wait for any single RCSB data API request.
    _REQUEST_TIMEOUT = 30
    _ENTRY_URL = "https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    _ENTITY_URL = (
        "https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}"
    )
    _AMINO_ACIDS = frozenset("ACDEFGHIKLMNPQRSTVWY")
    # The prompt asks the model for short method names, but the search uses
    # an exact match on ``exptl.method``.
    # NOTE(review): the long forms below are believed to be the RCSB
    # controlled-vocabulary terms -- confirm against the search attribute docs.
    _METHOD_ALIASES = {
        "X-RAY": "X-RAY DIFFRACTION",
        "XRAY": "X-RAY DIFFRACTION",
        "EM": "ELECTRON MICROSCOPY",
        "CRYO-EM": "ELECTRON MICROSCOPY",
        "CRYOEM": "ELECTRON MICROSCOPY",
        "NMR": "SOLUTION NMR",
    }

    def __init__(self, model_name="google/flan-t5-large"):
        """Load the language model and prepare the temporary PDB directory.

        Args:
            model_name: Hugging Face model identifier passed to ``pipeline``.
        """
        self.pipe = pipeline(
            "text2text-generation",
            model=model_name,
            max_new_tokens=1024,
            temperature=0.1,
            torch_dtype="auto",
            device="cpu",
        )

        self.prompt_template = """
        Extract specific search parameters from the protein-related query:
        1. Protein name or type
        2. Resolution cutoff (in Å)
        3. Protein sequence information
        4. Specific PDB ID
        5. Experimental method (X-RAY, EM, NMR)
        6. Organism/Species information
        7. Sequence similarity (in %)

        Format:
        Protein: [protein name or type]
        Resolution: [maximum resolution in Å, if mentioned]
        Sequence: [any sequence mentioned]
        PDB_ID: [specific PDB ID if mentioned]
        Method: [experimental method if mentioned]
        Organism: [organism/species if mentioned]
        Similarity: [similarity percentage if mentioned]

        Examples:
        Query: "Find structures with sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN and resolution better than 2.5Å"
        Protein: none
        Resolution: 2.5
        Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN
        PDB_ID: none
        Method: none
        Organism: none
        Similarity: 100

        Query: "human insulin"
        Protein: insulin
        Resolution: none
        Sequence: none
        PDB_ID: none
        Method: none
        Organism: Homo sapiens
        Similarity: none

        Query: "mouse insulin"
        Protein: insulin
        Resolution: none
        Sequence: none
        PDB_ID: none
        Method: none
        Organism: Mus musculus
        Similarity: none

        Query: "Spike protein"
        Protein: Spike protein
        Resolution: none
        Sequence: none
        PDB_ID: none
        Method: none
        Organism: none
        Similarity: none

        Query: "Human hemoglobin C resolution better than 2.5Å"
        Protein: hemoglobin C
        Resolution: 2.5
        Sequence: none
        PDB_ID: none
        Method: none
        Organism: Homo sapiens
        Similarity: none

        Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
        Protein: none
        Resolution: none
        Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN
        PDB_ID: none
        Method: none
        Organism: none
        Similarity: 90

        Query: "Get sequence of PDB ID 8ET6"
        Protein: none
        Organism: none
        Resolution: none
        Sequence: none
        PDB_ID: 8ET6
        Method: none

        Now analyze:
        Query: {query}
        """

        self.pdb_dir = "pdb_tmp"
        os.makedirs(self.pdb_dir, exist_ok=True)
        self.pdbl = PDBList()

    # ------------------------------------------------------------------
    # Language-model output handling
    # ------------------------------------------------------------------

    def _generate_parameters(self, query):
        """Run the language model on *query* and return its raw text answer."""
        formatted_prompt = self.prompt_template.format(query=query)
        response = self.pipe(formatted_prompt)[0]['generated_text']
        print("Generated parameters:", response)
        return response

    @staticmethod
    def _parse_llm_response(response):
        """Split the model answer into ``(pairs, resolution)``.

        ``pairs`` maps lower-cased field names to their values, leaving out
        fields answered with ``none``/``n/a``.  ``resolution`` is the first
        positive number given after ``Resolution:``, or ``None``.
        """
        resolution = None
        for match in re.finditer(r'[Rr]esolution:\s*(\d+(?:\.\d+)?)', response):
            value = float(match.group(1))
            if value > 0:
                resolution = value
                break

        # Numeric resolution fields are removed first so that a trailing unit
        # cannot leak into the neighbouring key/value pair.
        cleaned = re.sub(
            r'[Rr]esolution:\s*\d+(?:\.\d+)?(?:\s*Å?)?\s*', '', response
        )

        pairs = {}
        for key, value in re.findall(r'(\w+):\s*([^:]+?)(?=\s+\w+:|$)', cleaned):
            value = value.strip()
            if value.lower() not in ('none', 'n/a'):
                pairs[key.lower()] = value

        # Unstructured answer that still mentions a protein: use it verbatim.
        if not pairs and 'protein' in response:
            pairs['protein'] = response
        return pairs, resolution

    @staticmethod
    def _extract_pdb_id(response):
        """Return the 4-character PDB ID given after ``PDB_ID:``, or ``None``.

        A regular expression is used instead of a per-line split because the
        model answer may put all fields on one line, in which case a split
        would capture the following fields as part of the ID.
        """
        match = re.search(
            r'PDB_ID:\s*([0-9][A-Za-z0-9]{3})\b', response, re.IGNORECASE
        )
        return match.group(1).upper() if match else None

    @classmethod
    def _normalise_method(cls, method):
        """Upper-case *method* and expand the short names listed in the prompt."""
        key = method.strip().upper()
        return cls._METHOD_ALIASES.get(key, key)

    @classmethod
    def _looks_like_sequence(cls, word):
        """Return True if *word* is a long, mostly upper-case amino-acid string."""
        if len(word) < cls._MIN_SEQUENCE_LENGTH:
            return False
        if not all(char in cls._AMINO_ACIDS for char in word.upper()):
            return False
        return sum(char.isupper() for char in word) / len(word) > 0.8

    @classmethod
    def _extract_parameters(cls, query, response):
        """Combine the model answer and the raw query into search parameters.

        Returns:
            dict with the keys ``protein``, ``sequence``, ``similarity``,
            ``pdb_id``, ``method``, ``organism``, ``resolution`` and
            ``resolution_direction`` (``"less"`` or ``"greater"``).  Missing
            values are ``None``.
        """
        pairs, resolution = cls._parse_llm_response(response)
        print("Parsed response pairs:", pairs)
        lowered_query = query.lower()

        similarity = None
        if 'similarity' in pairs:
            try:
                similarity = float(pairs['similarity'].replace('%', ''))
            except ValueError:
                similarity = None
        if similarity is None:
            similarity_match = re.search(
                r'similarity\s+(\d+(?:\.\d+)?)\s*%', lowered_query
            )
            if similarity_match:
                similarity = float(similarity_match.group(1))

        # The comparison word is only available in the query itself, so it is
        # read from there even when the model already supplied the number.
        direction = "less"
        resolution_match = re.search(
            r'resolution\s+(better|worse|less|greater)\s+than\s+(\d+\.?\d*)',
            lowered_query,
        )
        if resolution_match:
            if resolution_match.group(1) in ("worse", "greater"):
                direction = "greater"
            if resolution is None:
                resolution = float(resolution_match.group(2))

        # A sequence typed verbatim in the query wins over the model's copy.
        sequence = pairs.get('sequence')
        for word in query.split():
            if cls._looks_like_sequence(word):
                sequence = word
                break

        pdb_id = pairs['pdb_id'].upper() if 'pdb_id' in pairs else None
        method = cls._normalise_method(pairs['method']) if 'method' in pairs else None

        return {
            'protein': pairs.get('protein'),
            'sequence': sequence,
            'similarity': similarity,
            'pdb_id': pdb_id,
            'method': method,
            'organism': pairs.get('organism'),
            'resolution': resolution,
            'resolution_direction': direction,
        }

    # ------------------------------------------------------------------
    # Query construction and execution
    # ------------------------------------------------------------------

    def _build_query(self, params):
        """Build the combined search query, or return ``None`` if nothing applies.

        The two protein-name clauses are OR-ed together; that group and every
        other filter are then AND-ed.  A PDB ID replaces all clauses built
        before it, after which method and organism filters still apply.
        """
        protein_clause = None
        protein_name = params['protein']
        if protein_name:
            print(f"Adding protein name filter: {protein_name}")
            try:
                title_query = AttributeQuery(
                    attribute="struct.title",
                    operator="contains_words",
                    value=protein_name,
                )
                entity_query = AttributeQuery(
                    attribute="rcsb_entity_container_identifiers.entity_names.value",
                    operator="contains_words",
                    value=protein_name,
                )
                protein_clause = title_query | entity_query
            except Exception as e:
                print(f"Error creating protein queries: {str(e)}")

        filters = []

        sequence = params['sequence']
        if sequence:
            if len(sequence) < self._MIN_SEQUENCE_LENGTH:
                print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
            else:
                similarity = params['similarity']
                if similarity is None:
                    similarity = 100
                    print("No similarity specified, using default 100%")
                # The identity cutoff is a fraction; keep it within [0, 1].
                similarity = min(max(similarity, 0.0), 100.0)
                identity_cutoff = similarity / 100.0
                print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff})")
                filters.append(
                    SequenceQuery(
                        sequence,
                        identity_cutoff=identity_cutoff,
                        evalue_cutoff=1,
                        sequence_type="protein",
                    )
                )

        resolution = params['resolution']
        if resolution:
            if params['resolution_direction'] == "less":
                operator = "less_or_equal"
            else:
                operator = "greater_or_equal"
            print(f"Adding resolution filter: {operator} {resolution}Å")
            filters.append(
                AttributeQuery(
                    attribute="rcsb_entry_info.resolution_combined",
                    operator=operator,
                    value=resolution,
                )
            )

        pdb_id = params['pdb_id']
        if pdb_id:
            print(f"Searching for specific PDB ID: {pdb_id}")
            protein_clause = None
            filters = [
                AttributeQuery(
                    attribute="rcsb_id",
                    operator="exact_match",
                    value=pdb_id.upper(),
                )
            ]

        method = params['method']
        if method:
            print(f"Adding experimental method filter: {method}")
            filters.append(
                AttributeQuery(
                    attribute="exptl.method",
                    operator="exact_match",
                    value=method,
                )
            )

        organism = params['organism']
        if organism:
            print(f"Adding organism filter: {organism}")
            filters.append(
                AttributeQuery(
                    attribute="rcsb_entity_source_organism.taxonomy_lineage.name",
                    operator="exact_match",
                    value=organism,
                )
            )

        clauses = ([protein_clause] if protein_clause is not None else []) + filters
        if not clauses:
            return None
        final_query = clauses[0]
        for clause in clauses[1:]:
            final_query = final_query & clause
        return final_query

    def _fetch_entry_json(self, pdb_id):
        """Return the entry record for *pdb_id*, or ``{}`` on a non-200 answer."""
        response = requests.get(
            self._ENTRY_URL.format(pdb_id=pdb_id), timeout=self._REQUEST_TIMEOUT
        )
        return response.json() if response.status_code == 200 else {}

    @staticmethod
    def _entry_metadata(structure_data):
        """Pick title, resolution, method and release date from an entry record.

        Missing fields fall back to ``'N/A'``, ``0.0``, ``'Unknown'`` and
        ``'N/A'`` respectively.
        """
        entry_info = structure_data.get('rcsb_entry_info', {})
        resolutions = entry_info.get('resolution_combined') or [0.0]
        experiments = structure_data.get('exptl') or [{}]
        accession = structure_data.get('rcsb_accession_info', {})
        return {
            'title': structure_data.get('struct', {}).get('title', 'N/A'),
            'resolution': resolutions[0],
            'method': experiments[0].get('method', 'Unknown'),
            'release_date': accession.get('initial_release_date', 'N/A'),
        }

    def _fetch_entry_summary(self, pdb_id):
        """Return one result-table row for *pdb_id*, or ``None`` if unavailable."""
        response = requests.get(
            self._ENTRY_URL.format(pdb_id=pdb_id), timeout=self._REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return None
        structure_data = response.json()
        metadata = self._entry_metadata(structure_data)
        refine = (structure_data.get('refine_hist') or [{}])[0]
        return {
            'PDB ID': pdb_id,
            'Title': metadata['title'],
            '# of total residues': refine.get('pdbx_number_residues_total', 'N/A'),
            '# of atoms of protein': refine.get('pdbx_number_atoms_protein', 'N/A'),
            'Resolution': f"{metadata['resolution']:.2f}Å",
            'Method': metadata['method'],
            'Release Date': metadata['release_date'],
        }

    def _collect_results(self, session):
        """Turn search hits into result rows, keeping at most ``_MAX_RESULTS``.

        A failure on one hit is logged and skipped; a failure of the
        iteration itself ends collection and returns what was gathered.
        """
        results = []
        seen_ids = set()
        try:
            for entry in session:
                try:
                    if isinstance(entry, dict):
                        score = entry.get('score')
                        if score is None:
                            continue
                        if score <= self._MIN_SCORE:
                            # NOTE(review): assumes hits arrive in descending
                            # score order (the service default), so no later
                            # hit can pass the threshold -- confirm.
                            break
                        entry_id = entry.get('identifier')
                    elif hasattr(entry, 'identifier'):
                        entry_id = entry.identifier
                    else:
                        entry_id = str(entry)

                    entry_id = (entry_id or '').upper()
                    if len(entry_id) != 4 or entry_id in seen_ids:
                        continue
                    seen_ids.add(entry_id)

                    summary = self._fetch_entry_summary(entry_id)
                    if summary is None:
                        continue
                    results.append(summary)
                    if len(results) >= self._MAX_RESULTS:
                        break
                except Exception as e:
                    print(f"Error processing entry: {str(e)}")
                    continue
        except Exception as e:
            print(f"Error processing results: {str(e)}")
            print(f"Error type: {type(e)}")
        return results

    def search_pdb(self, query, llm_response=None):
        """Search the PDB for structures matching the free-text *query*.

        Args:
            query: The user's natural-language query.
            llm_response: Optional model answer already generated for
                *query*; when omitted the model is run here.

        Returns:
            A list of dicts with the keys ``PDB ID``, ``Title``,
            ``# of total residues``, ``# of atoms of protein``,
            ``Resolution``, ``Method`` and ``Release Date``.  Empty when no
            parameters could be extracted or an error occurred.
        """
        try:
            if llm_response is None:
                llm_response = self._generate_parameters(query)
            params = self._extract_parameters(query, llm_response)

            final_query = self._build_query(params)
            if final_query is None:
                return []
            print("Final query:", final_query)

            session = final_query.exec(results_verbosity="minimal")
            results = self._collect_results(session)
            print(f"Found {len(results)} structures")
            return results
        except Exception as e:
            print(f"Error during search: {str(e)}")
            print(f"Error type: {type(e)}")
            return []

    # ------------------------------------------------------------------
    # Sequence retrieval
    # ------------------------------------------------------------------

    def _sequences_from_rest(self, pdb_id, structure_data):
        """Build per-entity sequence records from the RCSB data API."""
        metadata = self._entry_metadata(structure_data)
        identifiers = structure_data.get('rcsb_entry_container_identifiers', {})
        entity_ids = identifiers.get('polymer_entity_ids') or []

        sequences = []
        for entity_id in entity_ids:
            seq_response = requests.get(
                self._ENTITY_URL.format(pdb_id=pdb_id, entity_id=entity_id),
                timeout=self._REQUEST_TIMEOUT,
            )
            # Check the entity request itself, not the earlier entry request.
            seq_data = seq_response.json() if seq_response.status_code == 200 else {}
            entity_poly = seq_data.get('entity_poly', {})
            sequence = entity_poly.get('pdbx_seq_one_letter_code_can', 'N/A')
            sequences.append({
                'chain_id': entity_poly.get('pdbx_strand_id', 'N/A'),
                'entity_id': entity_id,
                'description': metadata['title'],
                'sequence': sequence,
                # The 'N/A' placeholder is not a sequence; report zero length.
                'length': len(sequence) if sequence != 'N/A' else 0,
                'resolution': metadata['resolution'],
                'method': metadata['method'],
                'release_date': metadata['release_date'],
            })
        return sequences

    def _sequences_from_file(self, pdb_id, pdb_path, structure_data):
        """Build per-chain sequence records from a downloaded PDB file.

        Only the first model is read, so files holding several models do not
        report each chain once per model.
        """
        metadata = self._entry_metadata(structure_data)
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, pdb_path)
        first_model = next(iter(structure), None)
        if first_model is None:
            return []

        sequences = []
        for chain in first_model:
            letters = []
            for residue in chain:
                if not PDB.is_aa(residue, standard=True):
                    continue
                letter = protein_letters_3to1.get(residue.get_resname())
                if letter is not None:
                    letters.append(letter)
            sequence = "".join(letters)
            if not sequence:
                continue
            sequences.append({
                'chain_id': chain.id,
                # The file-based path does not resolve entities; fixed value.
                'entity_id': '1',
                'description': metadata['title'],
                'sequence': sequence,
                'length': len(sequence),
                'resolution': metadata['resolution'],
                'method': metadata['method'],
                'release_date': metadata['release_date'],
            })
        return sequences

    def get_sequences_by_pdb_id(self, pdb_id):
        """Get sequences for all chains in a PDB structure using Biopython.

        The PDB-format file is downloaded and parsed; if it cannot be
        downloaded, the sequences are read per polymer entity from the RCSB
        data API instead.

        Args:
            pdb_id: The PDB identifier; an empty value yields ``[]``.

        Returns:
            A list of dicts with the keys ``chain_id``, ``entity_id``,
            ``description``, ``sequence``, ``length``, ``resolution``,
            ``method`` and ``release_date``; ``[]`` on any error.
        """
        if not pdb_id:
            return []
        try:
            pdb_path = self.pdbl.retrieve_pdb_file(
                pdb_id,
                pdir=self.pdb_dir,
                file_format="pdb",
            )
            structure_data = self._fetch_entry_json(pdb_id)

            if not pdb_path or not os.path.exists(pdb_path):
                print(f"Failed to download PDB file for {pdb_id}")
                return self._sequences_from_rest(pdb_id, structure_data)

            try:
                return self._sequences_from_file(pdb_id, pdb_path, structure_data)
            finally:
                # Remove the download even when parsing fails.
                if os.path.exists(pdb_path):
                    os.remove(pdb_path)
        except Exception as e:
            print(f"Error getting sequences for PDB ID {pdb_id}: {str(e)}")
            return []

    def __del__(self):
        """Cleanup temporary directory on object destruction."""
        pdb_dir = getattr(self, 'pdb_dir', None)
        if not pdb_dir:
            return
        try:
            shutil.rmtree(pdb_dir, ignore_errors=True)
        except Exception:
            # Module globals may already be torn down at interpreter exit.
            pass

    def process_query(self, query):
        """Process query and return results.

        The model is run once.  If the query mentions a sequence and the
        model identified a PDB ID, that entry's sequences are returned;
        otherwise a structure search is performed with the same model answer.

        Returns:
            ``{"type": "sequence" | "structure", "results": list}``.
        """
        try:
            response = self._generate_parameters(query)
            pdb_id = self._extract_pdb_id(response)

            lowered_query = query.lower()
            is_sequence_query = any(
                keyword in lowered_query for keyword in ('sequence', 'seq')
            )

            if is_sequence_query and pdb_id:
                return {
                    "type": "sequence",
                    "results": self.get_sequences_by_pdb_id(pdb_id),
                }

            return {
                "type": "structure",
                "results": self.search_pdb(query, llm_response=response),
            }
        except Exception as e:
            print(f"Error processing query: {str(e)}")
            return {"type": "structure", "results": []}
|
|
|
def render_html(pdb_id, chain_count):
    """Return an ``<iframe>`` that shows 3Dmol.js viewers for a PDB entry.

    The embedded page holds one viewer for the whole structure followed by
    one viewer per chain.  Chains are addressed by consecutive letters
    starting at ``A``, so at most 26 per-chain viewers are produced.

    Args:
        pdb_id: PDB identifier to display; ``None`` yields an empty string.
        chain_count: Number of per-chain viewers; values ``<= 0`` yield an
            empty string.

    Returns:
        The iframe markup, with the embedded page HTML-escaped inside the
        single-quoted ``srcdoc`` attribute.
    """
    if pdb_id is None or chain_count <= 0:
        return ""

    # The identifier comes from user-controlled input, so it is escaped
    # before being placed into the embedded page.
    safe_id = html.escape(str(pdb_id), quote=True)
    chain_letters = [chr(ord("A") + index) for index in range(min(chain_count, 26))]

    viewer_attrs = (
        'data-backgroundcolor="0xffffff" '
        'data-style="cartoon:color=spectrum" '
        'data-spin="axis:y;speed:0.2"'
    )

    chain_html_blocks = "".join(
        f'<div>{safe_id} {chain}</div>'
        f'<div class="viewer_3Dmoljs" data-pdb="{safe_id}" '
        f'data-select="chain:{chain}" {viewer_attrs}></div>'
        for chain in chain_letters
    )

    html_content = (
        '<!DOCTYPE html>'
        '<html>'
        '<head>'
        '<script src="https://3Dmol.org/build/3Dmol-min.js"></script>'
        '<script src="https://3Dmol.org/build/3Dmol.ui-min.js"></script>'
        '<style>'
        '.viewer_3Dmoljs { width: 100%; height: 400px; position: relative; }'
        '</style>'
        '</head>'
        '<body>'
        f'<div>{safe_id}</div>'
        f'<div class="viewer_3Dmoljs" data-pdb="{safe_id}" {viewer_attrs}></div>'
        f'{chain_html_blocks}'
        '</body>'
        '</html>'
    )

    # Escaping with quote=True also covers the single quote that delimits
    # the srcdoc attribute below.
    escaped_content = html.escape(html_content, quote=True)

    return f'<iframe style="width: 100%; height: 480px; border: none;" srcdoc=\'{escaped_content}\'></iframe>'
|
|
|
def create_interactive_table(df):
    """Prepare search results for display in the results table.

    Duplicate rows are dropped, the columns are put into display order and
    ``Release Date`` is reduced to ``YYYY-MM-DD``.  Release dates that cannot
    be parsed (for example the ``'N/A'`` placeholder) are shown as ``'N/A'``.

    Args:
        df: DataFrame holding at least the seven display columns.

    Returns:
        A new DataFrame; the input is left unmodified.

    Raises:
        KeyError: If one of the display columns is missing from *df*.
    """
    column_order = [
        'PDB ID',
        'Resolution',
        'Title',
        '# of total residues',
        '# of atoms of protein',
        'Method',
        'Release Date',
    ]
    # Copy so the assignment below does not write into a slice of the input.
    table = df.drop_duplicates()[column_order].copy()

    # errors='coerce' turns unparsable dates into NaT; utc=True gives a
    # single datetime dtype even if the timestamps carry offsets.
    release_dates = pd.to_datetime(table['Release Date'], errors='coerce', utc=True)
    table['Release Date'] = release_dates.dt.strftime('%Y-%m-%d').fillna('N/A')
    return table
|
|
|
|
|
|
|
# Page layout: a search panel (query box, status line, example queries), the
# results table with a download button, and a details area with a PDB ID
# selector, a 3D viewer and the sequences of the selected entry.
#
# Stylesheet notes: rules that were repeated verbatim are kept once, at the
# position of their last occurrence, so the cascade is unchanged.  Rules with
# selectors starting with a digit (".3d-...") and declarations without a
# valid value were dropped because browsers discard them.
app_ui = ui.page_fluid(
    ui.tags.head(
        ui.tags.style("""
            .container-fluid {
                max-width: 1200px;
                margin: 0 auto;
                padding: 20px;
            }
            .table a {
                color: #0d6efd;
                text-decoration: none;
            }
            .table a:hover {
                color: #0a58ca;
                text-decoration: underline;
            }
            .dt-layout-cell {
                overflow-x: auto;
                max-width: 100%;
                max-height: 600px;
            }
            table colgroup col[data-dt-column="2"] {
                width: 450px !important;
                min-width: 450px !important;
            }
            .shiny-input-container {
                max-width: 100%;
                margin: 0 auto;
            }
            #query {
                height: 300px;
                font-size: 16px;
                padding: 15px;
                width: 80%;
                margin: 0 auto;
                display: block;
                white-space: pre-wrap;
                word-wrap: break-word;
                resize: vertical;
                overflow-y: auto;
            }
            .content-wrapper {
                text-align: center;
                max-width: 1000px;
                margin: 0 auto;
            }
            .search-button {
                margin: 20px 0;
            }
            h2, h4 {
                text-align: center;
                margin: 20px 0;
            }
            .example-box {
                height: 250px;
                margin: 0;
                background-color: white;
                border: 1px solid #dee2e6;
                padding: 20px;
                border-radius: 8px;
                overflow-y: auto;
                text-align: left;
            }
            .example-box p {
                font-weight: bold;
                margin-bottom: 10px;
                padding-left: 0;
            }
            .example-box ul {
                margin: 0;
                padding-left: 20px;
            }
            .example-box li {
                word-wrap: break-word;
                margin: 10px 0;
                line-height: 1.5;
                text-align: left;
            }
            .query-label {
                display: block;
                text-align: left;
                margin-bottom: 10px;
                margin-left: 10%;
                font-weight: bold;
            }
            .status-box {
                background-color: #f8f9fa;
                border-radius: 8px;
                padding: 15px;
                margin: 20px auto;
                width: 80%;
                text-align: left;
            }
            .status-label {
                font-weight: bold;
                margin-right: 10px;
            }
            .status-ready {
                color: #198754; /* Bootstrap success color */
                font-weight: bold;
            }
            .sequence-results {
                width: 80%;
                margin: 20px auto;
                text-align: left;
                font-family: monospace;
                white-space: pre-wrap;
                word-wrap: break-word;
                background-color: #f8f9fa;
                border-radius: 8px;
                padding: 20px;
                overflow-x: hidden;
            }
            .sequence-text {
                word-break: break-all;
                margin: 10px 0;
                line-height: 1.5;
            }
            .tool-description {
                text-align: center;
                color: #666;
                margin: 0 auto 30px;
                max-width: 800px;
                line-height: 1.6;
                font-size: 1.1em;
            }
            .main-content {
                display: flex;
                flex-direction: column;
                gap: 20px;
            }
            .search-section {
                background-color: #f8f9fa;
                border-radius: 12px;
                padding: 25px;
                margin-bottom: 20px;
            }
            .example-box {
                height: 100%;
                margin: 0;
                background-color: white;
                border: 1px solid #dee2e6;
                padding: 20px;
                border-radius: 8px;
            }
            .status-text {
                margin-top: 10px;
                color: #666;
                font-size: 0.9em;
            }
            .status-label {
                font-weight: bold;
                margin-right: 5px;
            }
            .results-section {
                background-color: #f8f9fa;
                border: 1px solid #dee2e6;
                border-radius: 8px;
                padding: 20px;
                margin: 20px 0;
            }
            .viewer-section, .sequence-section {
                background-color: #f8f9fa;
                border: 1px solid #dee2e6;
                border-radius: 8px;
                padding: 20px;
                margin: 20px 0;
                height: 100%;
            }
            .sequence-content {
                background-color: white;
                border-radius: 4px;
                padding: 15px;
                margin-top: 15px;
                max-height: 600px;
                overflow-y: auto;
                font-family: monospace;
                white-space: pre-wrap;
                word-wrap: break-word;
                overflow-x: hidden;
                text-align: left;
            }
            .sequence-text {
                word-break: break-all;
                margin: 10px 0;
                line-height: 1.5;
                text-align: left;
            }
            .status-spinner {
                display: none;
                margin-left: 10px;
                vertical-align: middle;
            }
            .status-spinner.active {
                display: inline-block;
            }
            .query-header {
                display: flex;
                justify-content: space-between;
                align-items: center;
                margin-bottom: 10px;
            }
            .query-label-group {
                display: flex;
                align-items: center;
                gap: 10px; /* gap between the label and the button */
            }
            .query-label {
                margin: 0;
                font-weight: bold;
            }
            .btn-primary {
                margin-left: 15px;
                padding: 5px 15px;
            }
            .viewer-section {
                background-color: #f8f9fa;
                border: 1px solid #dee2e6;
                border-radius: 8px;
                padding: 20px;
                margin: 20px 0;
            }
            .viewer-content {
                margin-top: 15px;
            }
            .viewer-content select {
                max-width: 200px;
                margin: 0 auto 15px;
                display: block;
            }
            .viewer-iframe {
                background-color: white;
                border-radius: 4px;
                padding: 10px;
            }
            h4 {
                margin: 0;
                color: #333;
            }
            .btn-info {
                margin-top: 15px;
            }
            .structure-details-section {
                margin-top: 20px;
                background-color: #f8f9fa;
                border: 1px solid #dee2e6;
                border-radius: 8px;
                padding: 20px;
            }

            .pdb-selector {
                display: flex;
                justify-content: flex-start;
                gap: 5px;
                margin-top: 20px;
                margin-bottom: 20px;
                margin-left: 20px;
            }
            .pdb-selector .form-group.shiny-input-container {
                margin-left: 250px;
            }
            .pdb-select-label {
                font-weight: bold;
                margin: 0;
                white-space: nowrap;
                display: inline-block;
                vertical-align: middle;
            }

            .pdb-selector select {
                margin-left: 0;
                display: inline-block;
            }

            .viewer-section, .sequence-section {
                background-color: white;
                border: 1px solid #dee2e6;
                border-radius: 8px;
                padding: 20px;
                margin-top: 20px;
                height: 100%;
            }
        """)
    ),
    ui.div(
        {"class": "content-wrapper"},
        ui.h2("Advanced PDB Structure Search Tool"),
        ui.div(
            {"class": "tool-description"},
            "An AI-powered search tool for exploring protein structures in the Protein Data Bank (PDB). ",
            "Search by protein name, sequence, resolution, experimental method, or organism to find relevant structures. ",
            "You can also retrieve amino acid sequences for specific PDB IDs."
        ),
        ui.div(
            {"class": "main-content"},
            # Search panel: query input on the left, example queries on the right.
            ui.div(
                {"class": "search-section"},
                ui.row(
                    ui.column(
                        8,
                        ui.div(
                            {"class": "query-header"},
                            ui.div(
                                {"class": "query-label-group"},
                                ui.tags.label(
                                    "Search Query",
                                    {"class": "query-label", "for": "query"}
                                ),
                                ui.input_action_button(
                                    "search", "Search", class_="btn-primary"
                                )
                            )
                        ),
                        ui.input_text_area(
                            "query",
                            "",
                            value="",
                            width="100%",
                            resize="vertical"
                        ),
                        ui.div(
                            {"class": "status-text"},
                            ui.tags.span("Status: ", class_="status-label"),
                            ui.output_text("search_status", inline=True),
                            ui.tags.i({"class": "fas fa-spinner fa-spin status-spinner"})
                        )
                    ),
                    ui.column(
                        4,
                        ui.div(
                            {"class": "example-box"},
                            ui.p("Example queries:"),
                            ui.tags.ul(
                                ui.tags.li("Sequence of PDB ID 8ET6"),
                                ui.tags.li("Spike protein"),
                                ui.tags.li("Membrane protein"),
                                ui.tags.li("Human insulin"),
                                ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
                                ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
                                ui.tags.li("Find structures with resolution better than 3 angstrom and sequence similarity 90% of FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
                            )
                        )
                    )
                ),
            ),
            # Results table and CSV download.
            ui.row(
                ui.column(
                    12,
                    ui.div(
                        {"class": "results-section"},
                        ui.h4("PDB Search Results"),
                        ui.output_ui("results_table"),
                        ui.download_button(
                            "download", "Download Results", class_="btn btn-info"
                        )
                    )
                )
            ),
            # Details for one selected entry: 3D viewer and sequences.
            ui.div(
                {"class": "structure-details-section"},
                ui.div(
                    {"class": "pdb-selector"},
                    ui.tags.label(
                        "Select PDB ID",
                        {"class": "pdb-select-label"}
                    ),
                    ui.input_selectize(
                        "selected_pdb",
                        "",
                        choices=[],
                        width="200px"
                    )
                ),
                ui.row(
                    ui.column(
                        6,
                        ui.div(
                            {"class": "viewer-section"},
                            ui.h4("3D Structure Viewer"),
                            ui.div(
                                {"class": "viewer-content"},
                                ui.div(
                                    {"class": "viewer-iframe"},
                                    ui.output_ui("output_iframe")
                                )
                            )
                        )
                    ),
                    ui.column(
                        6,
                        ui.div(
                            {"class": "sequence-section"},
                            ui.h4("Sequences"),
                            ui.div(
                                {"class": "sequence-content"},
                                ui.output_text("sequence_output")
                            )
                        )
                    )
                )
            )
        )
    )
)
|
|
|
def server(input, output, session):
    """Shiny server function: wire the search assistant to the UI.

    Args:
        input: Shiny inputs (``query``, ``search``, ``selected_pdb``).
        output: Shiny output registry.
        session: The Shiny session object.
    """
    assistant = PDBSearchAssistant()
    results_store = reactive.Value({"type": None, "results": []})
    status_store = reactive.Value("Ready")
    pdb_ids_store = reactive.Value([])
    # Last non-empty structure result; drives the results table.
    table_store = reactive.Value(None)

    @reactive.Effect
    @reactive.event(input.search)
    def _():
        # NOTE(review): this value is overwritten before the effect returns,
        # so it may never be shown in the browser -- confirm.
        status_store.set("Searching...")

        query_text = input.query()
        start_time = time.time()
        query_results = assistant.process_query(query_text)
        results_store.set(query_results)
        print(time.time() - start_time)

        pdb_ids = []
        if query_results["type"] == "sequence":
            if not query_results["results"]:
                status_store.set("No sequences found")
            else:
                status_store.set("Ready")
                # Collect PDB IDs typed in the query.  Requiring a leading
                # digit keeps ordinary four-letter words out of the selector.
                for token in query_text.split():
                    candidate = token.strip(".,;:()[]\"'")
                    if re.match(r'^[0-9][0-9A-Za-z]{3}$', candidate):
                        pdb_ids.append(candidate.upper())
        else:
            df = pd.DataFrame(query_results["results"])
            if df.empty:
                status_store.set("No structures found")
            else:
                status_store.set("Ready")
                pdb_ids = df['PDB ID'].tolist()
                table_store.set(df)

        pdb_ids_store.set(pdb_ids)
        ui.update_selectize(
            "selected_pdb",
            choices=pdb_ids,
            selected=pdb_ids[0] if pdb_ids else None
        )

    @reactive.Calc
    def selected_sequences():
        """Sequences of the selected entry, computed once per selection."""
        selected_pdb = input.selected_pdb()
        if not selected_pdb:
            return []
        return assistant.get_sequences_by_pdb_id(selected_pdb)

    @output
    @render.ui
    def results_table():
        df = table_store.get()
        if df is None:
            return None
        return ui.HTML(DT(create_interactive_table(df)))

    @output
    @render.text
    def search_status():
        return status_store.get()

    @output
    @render.text
    def sequence_output():
        selected_pdb = input.selected_pdb()
        if not selected_pdb:
            return "No PDB ID selected"

        sequences = selected_sequences()
        if not sequences:
            return f"No sequences found for PDB ID: {selected_pdb}"

        output_text = []
        for seq in sequences:
            output_text.append(f"\nChain {seq['chain_id']} (Entity {seq['entity_id']}):")
            output_text.append(f"Description: {seq['description']}")
            output_text.append(f"Length: {seq['length']} residues")
            output_text.append("Sequence:")

            # Groups of 10 residues separated by spaces, 66 characters per line.
            sequence = seq['sequence']
            sequence = ' '.join(sequence[i:i+10] for i in range(0, len(sequence), 10))
            formatted_sequence = '\n'.join(
                sequence[i:i+66] for i in range(0, len(sequence), 66)
            )
            output_text.append(formatted_sequence)
            output_text.append("-" * 60)

        return "\n".join(output_text)

    @output
    @render.ui
    def output_iframe():
        selected_pdb = input.selected_pdb()
        if not selected_pdb:
            return ui.HTML("")
        chain_cnt = len(selected_sequences())
        return ui.HTML(render_html(selected_pdb, chain_cnt))

    @output
    @render.download(filename="pdb_search_results.csv")
    def download():
        # Yield the CSV text directly rather than writing one fixed file in
        # the working directory, which would be shared by every session.
        current_results = results_store.get()
        df = pd.DataFrame(current_results["results"])
        yield df.to_csv(index=False)
|
|
|
# Shiny application object combining the page layout and the server logic.
app = App(app_ui, server)


if __name__ == "__main__":
    # nest_asyncio is imported only for direct script execution; it is
    # applied before the app is started.
    import nest_asyncio

    nest_asyncio.apply()
    # Listen on all interfaces on port 7862.
    app.run(host="0.0.0.0", port=7862)