|
import asyncio
import html
import os
import re
import shutil
import warnings

import pandas as pd
import plotly.graph_objects as go
import requests
from Bio import PDB
from Bio.PDB.PDBList import PDBList
from Bio.PDB.Polypeptide import protein_letters_3to1
from dotenv import load_dotenv
from rcsbsearchapi import TextQuery, AttributeQuery, Query
from rcsbsearchapi.search import Sort, SequenceQuery
from shiny import App, render, ui, reactive
from shinywidgets import output_widget, render_widget
from transformers import pipeline

from UniprotKB_P_Sequence_RCSB_API_test import ProteinQuery, ProteinSearchEngine
|
# Process-wide setup, executed at import time.
# Install a catch-all "ignore" filter so no warning category is reported.
warnings.filterwarnings(action="ignore")

# Load environment variables via python-dotenv.
load_dotenv()
|
|
|
|
|
|
|
|
|
class PDBSearchAssistant:
    """Turn free-text protein questions into RCSB PDB searches.

    A text2text-generation pipeline is prompted to emit labelled parameters
    (``Organism:``, ``Resolution:``, ``Sequence:``, ``PDB_ID:``, ``Method:``,
    ``Similarity:``).  Those labels, together with a few regex heuristics
    applied to the raw query, are turned into ``AttributeQuery`` /
    ``SequenceQuery`` objects, combined with ``&`` and executed.  Per-entry
    metadata is then fetched from the RCSB data REST endpoint.

    Attributes:
        pipe: the transformers pipeline used to extract parameters.
        prompt_template: prompt text containing a ``{query}`` placeholder.
        pdb_dir: scratch directory used for downloaded PDB files.
        pdbl: ``PDBList`` instance used to download PDB files.
    """

    _ENTRY_URL = "https://data.rcsb.org/rest/v1/core/entry"
    _HTTP_TIMEOUT = 15  # seconds; keeps a stalled request from hanging the app
    _MAX_RESULTS = 10
    _MIN_SEQUENCE_LENGTH = 25
    _AMINO_ACIDS = frozenset("ACDEFGHIKLMNPQRSTVWY")
    _EMPTY_VALUES = frozenset(("", "none", "n/a"))
    _ANGSTROM = "\u00e5"  # lower-case because queries are lower-cased before matching

    # Labels the prompt asks the model to produce.  Matching on the labels
    # (instead of on lines) also works when the model answers on one line.
    _FIELD_RE = re.compile(
        r"\b(protein|organism|resolution|sequence|pdb_id|method|similarity)\s*:",
        re.IGNORECASE,
    )
    # Four-character entry IDs: a non-zero digit followed by three alphanumerics.
    _PDB_ID_RE = re.compile(r"[1-9][0-9A-Za-z]{3}")
    _SIMILARITY_RE = re.compile(r"similarity\s+(\d+(?:\.\d+)?)\s*%")

    # A free-standing number, i.e. not glued to letters/digits ("p450", "8et6").
    _NUMBER = r"(?<![a-z0-9.])(\d+(?:\.\d+)?)(?![a-z0-9])"
    _RESOLUTION_BEFORE_RE = re.compile(
        _NUMBER + r"\s*" + _ANGSTROM + r"?.*resolution", re.DOTALL
    )
    _RESOLUTION_AFTER_RE = re.compile(r"resolution.*?" + _NUMBER, re.DOTALL)
    _ANGSTROM_RE = re.compile(r"\d\s*" + _ANGSTROM)
    _NUMBER_TOKEN_RE = re.compile(
        r"(?<![a-z0-9.])\d+(?:\.\d*)?\s*" + _ANGSTROM + r"?(?![a-z0-9])"
    )

    # Whole words that mark a resolution request.  The value is the comparison
    # direction the word implies (None = marks a request but implies nothing).
    # Iteration order matters: the last matching word with a direction wins.
    _RESOLUTION_TERMS = {
        'better': 'less',
        'best': 'less',
        'highest': 'less',
        'good': 'less',
        'fine': 'less',
        'worse': 'greater',
        'worst': 'greater',
        'lowest': 'greater',
        'poor': 'greater',
        'resolution': None,
        'angstrom': None,
        'angstroms': None,
        'than': None,
        'under': 'less',
        'below': 'less',
        'above': 'greater',
        'over': 'greater',
    }
    _RESOLUTION_WORDS_RE = re.compile(
        r"\b(?:" + "|".join(_RESOLUTION_TERMS) + r")\b"
    )

    # Short method names used in the prompt mapped to the full method names
    # sent in the exptl.method filter.
    # NOTE(review): confirm these strings against the RCSB search schema.
    _METHOD_ALIASES = {
        "X-RAY": "X-RAY DIFFRACTION",
        "XRAY": "X-RAY DIFFRACTION",
        "X-RAY CRYSTALLOGRAPHY": "X-RAY DIFFRACTION",
        "EM": "ELECTRON MICROSCOPY",
        "CRYO-EM": "ELECTRON MICROSCOPY",
        "CRYOEM": "ELECTRON MICROSCOPY",
        "NMR": "SOLUTION NMR",
    }

    _PROMPT_TEMPLATE = """
        Extract specific search parameters from the protein-related query:
        1. Protein name or type
        2. Resolution cutoff (in \u00c5)
        3. Protein sequence information
        4. Specific PDB ID
        5. Experimental method (X-RAY, EM, NMR)
        6. Organism/Species information
        7. Sequence similarity (in %)

        Format:
        Protein: [protein name or type]
        Organism: [organism/species if mentioned]
        Resolution: [maximum resolution in \u00c5, if mentioned]
        Sequence: [any sequence mentioned]
        PDB_ID: [specific PDB ID if mentioned]
        Method: [experimental method if mentioned]

        Examples:
        Query: "Find human insulin structures with X-ray better than 2.5\u00c5 resolution"
        Protein: insulin
        Organism: Homo sapiens
        Resolution: 2.5
        Sequence: none
        PDB_ID: none
        Method: X-RAY

        Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
        Protein: none
        Organism: none
        Resolution: none
        Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
        PDB_ID: none
        Method: none
        Similarity: 90

        Query: "Get sequence of PDB ID 8ET6"
        Protein: none
        Organism: none
        Resolution: none
        Sequence: none
        PDB_ID: 8ET6
        Method: none

        Query: "Find mouse lysozyme structures"
        Protein: lysozyme
        Organism: Mus musculus
        Resolution: none
        Sequence: none
        PDB_ID: none
        Method: none

        Now analyze:
        Query: {query}
        """

    def __init__(self, model_name="google/flan-t5-large"):
        """Load the extraction model and prepare the PDB scratch directory.

        Args:
            model_name: model identifier passed to ``transformers.pipeline``.
        """
        self.pipe = pipeline(
            "text2text-generation",
            model=model_name,
            max_new_tokens=1024,
            temperature=0.1,
            torch_dtype="auto",
            device="cpu"
        )

        self.prompt_template = self._PROMPT_TEMPLATE

        self.pdb_dir = "pdb_tmp"
        os.makedirs(self.pdb_dir, exist_ok=True)
        self.pdbl = PDBList()

    # ------------------------------------------------------------------
    # Model output parsing
    # ------------------------------------------------------------------

    def _generate_parameters(self, query):
        """Run the extraction prompt for *query* and return the raw model text."""
        formatted_prompt = self.prompt_template.format(query=query)
        response = self.pipe(formatted_prompt)[0]['generated_text']
        print("Generated parameters:", response)
        return response

    @classmethod
    def _parse_llm_fields(cls, response):
        """Split the model reply into ``{label_lowercase: value}``.

        A value runs from its label to the next label (or the end of the
        text), so both one-field-per-line and single-line replies parse.
        Values equal to "none"/"n/a" (any case) or empty are dropped; if a
        label is repeated the last usable value wins.
        """
        fields = {}
        labels = list(cls._FIELD_RE.finditer(response))
        for index, label in enumerate(labels):
            if index + 1 < len(labels):
                value_end = labels[index + 1].start()
            else:
                value_end = len(response)
            value = response[label.end():value_end].strip()
            if value.lower() in cls._EMPTY_VALUES:
                continue
            fields[label.group(1).lower()] = value
        return fields

    @staticmethod
    def _to_float(text):
        """Return the float spelled by the digits and dots in *text*, or None."""
        digits = ''.join(c for c in text if c.isdigit() or c == '.')
        try:
            return float(digits)
        except ValueError:
            return None

    @classmethod
    def _valid_pdb_id(cls, value):
        """Return *value* upper-cased if it looks like a 4-character PDB ID, else None."""
        if not value:
            return None
        candidate = value.strip().upper()
        if cls._PDB_ID_RE.fullmatch(candidate):
            return candidate
        print(f"Ignoring implausible PDB ID from model output: {value!r}")
        return None

    @classmethod
    def _normalize_method(cls, method):
        """Map a short method name (X-RAY, EM, NMR) to the full name used in the filter."""
        key = method.strip().upper()
        return cls._METHOD_ALIASES.get(key, key)

    # ------------------------------------------------------------------
    # Query-text heuristics
    # ------------------------------------------------------------------

    @classmethod
    def _detect_resolution(cls, query_lower):
        """Inspect a lower-cased query for a resolution request.

        Returns:
            ``(has_resolution_query, direction, limit)`` where *direction* is
            ``"less"`` or ``"greater"`` and *limit* is a float or None.
        """
        # Whole-word matching: "over" must not fire on "discover".
        words = set(re.findall(r"[a-z]+", query_lower))
        has_resolution_query = bool(cls._ANGSTROM_RE.search(query_lower))
        direction = "less"
        for term, term_direction in cls._RESOLUTION_TERMS.items():
            if term in words:
                has_resolution_query = True
                if term_direction:
                    direction = term_direction

        limit = None
        # Accept both "2.5 A resolution" and "resolution better than 2.5 A".
        match = (cls._RESOLUTION_BEFORE_RE.search(query_lower)
                 or cls._RESOLUTION_AFTER_RE.search(query_lower))
        if match:
            has_resolution_query = True
            limit = float(match.group(1))
        return has_resolution_query, direction, limit

    @classmethod
    def _strip_resolution_terms(cls, query_lower):
        """Remove resolution keywords and free-standing numbers from the query."""
        text = cls._RESOLUTION_WORDS_RE.sub(" ", query_lower)
        text = cls._NUMBER_TOKEN_RE.sub(" ", text)
        return ' '.join(text.split())

    @classmethod
    def _find_sequence_in_query(cls, query):
        """Return the first whitespace-separated word that looks like a protein sequence.

        A word qualifies when it has at least 25 characters, consists only of
        the 20 standard amino-acid letters and is more than 80% upper-case.
        """
        for word in query.split():
            if len(word) < cls._MIN_SEQUENCE_LENGTH:
                continue
            if not all(c in cls._AMINO_ACIDS for c in word.upper()):
                continue
            if sum(c.isupper() for c in word) / len(word) > 0.8:
                return word
        return None

    # ------------------------------------------------------------------
    # RCSB data access
    # ------------------------------------------------------------------

    @classmethod
    def _fetch_entry(cls, pdb_id):
        """GET the core entry record for *pdb_id*; return the JSON dict or None on non-200."""
        response = requests.get(f"{cls._ENTRY_URL}/{pdb_id}", timeout=cls._HTTP_TIMEOUT)
        if response.status_code != 200:
            return None
        return response.json()

    @staticmethod
    def _first(items, default):
        """Return ``items[0]``, or *default* when *items* is missing or empty."""
        return items[0] if items else default

    @staticmethod
    def _entry_identifier(entry):
        """Extract the identifier string from a search hit (dict, object or plain value)."""
        if isinstance(entry, dict):
            identifier = entry.get('identifier')
        elif hasattr(entry, 'identifier'):
            identifier = entry.identifier
        else:
            identifier = str(entry)
        return identifier.upper() if identifier else None

    @classmethod
    def _summarize_entry(cls, pdb_id, structure_data):
        """Build the result row shown in the table from an entry record."""
        refine = cls._first(structure_data.get('refine_hist'), {})
        entry_info = structure_data.get('rcsb_entry_info') or {}
        resolution = cls._first(entry_info.get('resolution_combined'), 0.0)
        exptl = cls._first(structure_data.get('exptl'), {})
        return {
            'PDB ID': pdb_id,
            'Title': (structure_data.get('struct') or {}).get('title', 'N/A'),
            '# of total residues': refine.get('pdbx_number_residues_total', 'N/A'),
            '# of atoms of protein': refine.get('pdbx_number_atoms_protein', 'N/A'),
            'Resolution': f"{resolution:.2f}\u00c5",
            'Method': exptl.get('method', 'Unknown'),
            'Release Date': (structure_data.get('rcsb_accession_info') or {}).get('initial_release_date', 'N/A'),
        }

    def _collect_results(self, session):
        """Fetch metadata for search hits until ``_MAX_RESULTS`` rows are built.

        A failure on one entry is logged and skipped; a failure while
        iterating the search session ends collection with what was gathered.
        """
        results = []
        try:
            for entry in session:
                try:
                    entry_id = self._entry_identifier(entry)
                    if not entry_id or len(entry_id) != 4:
                        continue
                    structure_data = self._fetch_entry(entry_id)
                    if structure_data is None:
                        continue
                    results.append(self._summarize_entry(entry_id, structure_data))
                    if len(results) >= self._MAX_RESULTS:
                        break
                except Exception as e:
                    print(f"Error processing entry: {str(e)}")
                    continue
        except Exception as e:
            print(f"Error processing results: {str(e)}")
            print(f"Error type: {type(e)}")
        return results

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def search_pdb(self, query, llm_response=None):
        """Search the PDB for structures matching a free-text *query*.

        Args:
            query: the user's question.
            llm_response: optional, already generated model output for this
                query; when omitted the model is invoked here.

        Returns:
            A list of at most 10 dicts with the keys ``'PDB ID'``, ``'Title'``,
            ``'# of total residues'``, ``'# of atoms of protein'``,
            ``'Resolution'``, ``'Method'`` and ``'Release Date'``.  Any error
            is printed and yields ``[]``.
        """
        try:
            if llm_response is None:
                llm_response = self._generate_parameters(query)
            fields = self._parse_llm_fields(llm_response)
            query_lower = query.lower()

            # Similarity: model output first, then "similarity NN%" in the query.
            similarity = None
            if "similarity" in fields:
                similarity = self._to_float(fields["similarity"])
                if similarity is not None:
                    print(f"Successfully extracted similarity: {similarity}%")
            if similarity is None:
                similarity_match = self._SIMILARITY_RE.search(query_lower)
                if similarity_match:
                    similarity = float(similarity_match.group(1))
                    print(f"Extracted similarity from query: {similarity}%")

            has_resolution_query, resolution_direction, resolution_limit = \
                self._detect_resolution(query_lower)
            # The model's number is only trusted when the query itself talks
            # about resolution; it then takes precedence over the regex value.
            if has_resolution_query and "resolution" in fields:
                llm_limit = self._to_float(fields["resolution"])
                if llm_limit is not None:
                    resolution_limit = llm_limit

            # A sequence typed in the query wins over whatever the model echoed.
            sequence = self._find_sequence_in_query(query) or fields.get("sequence")
            if sequence and len(sequence) < self._MIN_SEQUENCE_LENGTH:
                print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
                sequence = None  # fall through to the text search below

            queries = []

            if sequence:
                if similarity is None:
                    similarity = 100
                    print("No similarity specified, using default 100%")
                # identity_cutoff is a fraction; keep it inside [0, 1].
                similarity = min(max(similarity, 0), 100)
                identity_cutoff = similarity / 100.0
                print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff}) for sequence: {sequence}")
                sequence_query = SequenceQuery(
                    sequence,
                    identity_cutoff=identity_cutoff,
                    evalue_cutoff=1,
                    sequence_type="protein"
                )
                queries.append(sequence_query)
                print(f"Created sequence query with parameters: {sequence_query.params}")
            else:
                if has_resolution_query:
                    clean_query = self._strip_resolution_terms(query_lower)
                else:
                    clean_query = ' '.join(query_lower.split())
                print("Cleaned query:", clean_query)
                if clean_query:
                    queries.append(AttributeQuery(
                        attribute="struct.title",
                        operator="contains_phrase",
                        value=clean_query
                    ))

            if resolution_limit and has_resolution_query:
                operator = "less_or_equal" if resolution_direction == "less" else "greater_or_equal"
                print(f"Adding resolution filter: {operator} {resolution_limit}\u00c5")
                queries.append(AttributeQuery(
                    attribute="rcsb_entry_info.resolution_combined",
                    operator=operator,
                    value=resolution_limit
                ))

            pdb_id = self._valid_pdb_id(fields.get("pdb_id"))
            if pdb_id:
                print(f"Searching for specific PDB ID: {pdb_id}")
                # An explicit ID replaces every filter collected so far.
                queries = [AttributeQuery(
                    attribute="rcsb_id",
                    operator="exact_match",
                    value=pdb_id
                )]

            if "method" in fields:
                method = self._normalize_method(fields["method"])
                print(f"Adding experimental method filter: {method}")
                queries.append(AttributeQuery(
                    attribute="exptl.method",
                    operator="exact_match",
                    value=method
                ))

            organism = fields.get("organism")
            if organism:
                print(f"Adding organism filter: {organism}")
                queries.append(AttributeQuery(
                    attribute="rcsb_entity_source_organism.taxonomy_lineage.name",
                    operator="exact_match",
                    value=organism
                ))

            if not queries:
                return []

            final_query = queries[0]
            for extra in queries[1:]:
                final_query = final_query & extra
            print("Final query:", final_query)

            results = self._collect_results(final_query.exec())
            print(f"Found {len(results)} structures")
            return results

        except Exception as e:
            print(f"Error during search: {str(e)}")
            print(f"Error type: {type(e)}")
            return []

    def get_sequences_by_pdb_id(self, pdb_id):
        """Get sequences for all chains in a PDB structure using Biopython.

        The PDB file is downloaded into ``self.pdb_dir``, parsed and removed
        again.  Only the first model is read, so multi-model entries do not
        repeat every chain once per model.

        Returns:
            A list with one dict per non-empty chain (keys ``chain_id``,
            ``entity_id``, ``description``, ``sequence``, ``length``,
            ``resolution``, ``method``, ``release_date``); ``[]`` on failure.
        """
        pdb_path = None
        try:
            pdb_path = self.pdbl.retrieve_pdb_file(
                pdb_id,
                pdir=self.pdb_dir,
                file_format="pdb"
            )
            if not pdb_path or not os.path.exists(pdb_path):
                print(f"Failed to download PDB file for {pdb_id}")
                return []

            parser = PDB.PDBParser(QUIET=True)
            structure = parser.get_structure(pdb_id, pdb_path)

            # Metadata is decoration only: a failed lookup must not discard
            # the sequences that were parsed from the file.
            try:
                structure_data = self._fetch_entry(pdb_id) or {}
            except (requests.RequestException, ValueError) as e:
                print(f"Could not fetch metadata for {pdb_id}: {e}")
                structure_data = {}

            description = (structure_data.get('struct') or {}).get('title', 'N/A')
            entry_info = structure_data.get('rcsb_entry_info') or {}
            resolution = self._first(entry_info.get('resolution_combined'), 0.0)
            method = self._first(structure_data.get('exptl'), {}).get('method', 'Unknown')
            release_date = (structure_data.get('rcsb_accession_info') or {}).get('initial_release_date', 'N/A')

            sequences = []
            first_model = next(iter(structure), None)
            for chain in (first_model if first_model is not None else ()):
                sequence = "".join(
                    protein_letters_3to1[residue.get_resname()]
                    for residue in chain
                    if PDB.is_aa(residue, standard=True)
                    and residue.get_resname() in protein_letters_3to1
                )
                if not sequence:
                    continue
                sequences.append({
                    'chain_id': chain.id,
                    'entity_id': '1',
                    'description': description,
                    'sequence': sequence,
                    'length': len(sequence),
                    'resolution': resolution,
                    'method': method,
                    'release_date': release_date
                })
            return sequences

        except Exception as e:
            print(f"Error getting sequences for PDB ID {pdb_id}: {str(e)}")
            return []
        finally:
            # Remove the download even when parsing failed.
            if pdb_path and os.path.exists(pdb_path):
                os.remove(pdb_path)

    def __del__(self):
        """Cleanup temporary directory on object destruction."""
        pdb_dir = getattr(self, 'pdb_dir', None)
        if not pdb_dir:
            return
        try:
            shutil.rmtree(pdb_dir, ignore_errors=True)
        except Exception:
            # Module globals may already be torn down at interpreter exit.
            pass

    def process_query(self, query):
        """Process query and return results.

        Returns:
            ``{"type": "sequence", "results": [...]}`` when the query mentions
            "sequence"/"seq" and the model extracted a plausible PDB ID,
            otherwise ``{"type": "structure", "results": [...]}``.  Errors are
            printed and produce an empty structure result.
        """
        try:
            response = self._generate_parameters(query)
            fields = self._parse_llm_fields(response)
            pdb_id = self._valid_pdb_id(fields.get("pdb_id"))

            sequence_keywords = ('sequence', 'seq')
            query_lower = query.lower()
            is_sequence_query = any(keyword in query_lower for keyword in sequence_keywords)

            if is_sequence_query and pdb_id:
                return {
                    "type": "sequence",
                    "results": self.get_sequences_by_pdb_id(pdb_id)
                }

            # Hand the model output over so it is not generated a second time.
            return {
                "type": "structure",
                "results": self.search_pdb(query, llm_response=response)
            }

        except Exception as e:
            print(f"Error processing query: {str(e)}")
            return {"type": "structure", "results": []}
|
|
|
def pdbsummary(name):
    """Return a numbered plain-text report of structures found for *name*.

    The search is delegated to ``ProteinSearchEngine`` with a
    ``ProteinQuery`` limited to ``max_resolution=5.0``.  Each hit adds its
    PDB ID, resolution, method, title, release date, sequence length and
    sequence; no hits gives an empty string.
    """
    engine = ProteinSearchEngine()
    hits = engine.search(ProteinQuery(name, max_resolution=5.0))

    sections = []
    for rank, hit in enumerate(hits, 1):
        sections.append(f"\n{rank}. PDB ID : {hit.pdb_id}\n")
        sections.append(f"\nResolution : {hit.resolution:.2f} A \n")
        sections.append(f"Method : {hit.method}\n Title : {hit.title}\n")
        sections.append(f"Release Date : {hit.release_date}\n Sequence length: {len(hit.sequence)} aa\n")
        sections.append(f" Sequence:\n {hit.sequence}\n")

    return "".join(sections)
|
|
|
def render_html(pdb_id):
    """Return an ``<iframe>`` snippet that shows *pdb_id* in a 3Dmol.js viewer.

    The viewer page is a complete HTML document carried in the iframe's
    ``srcdoc`` attribute.  Two layers of escaping are applied:

    * *pdb_id* is escaped before it is placed in the ``data-pdb`` attribute
      of the inner document, because it arrives from a client-side input;
    * the whole document is escaped again (quotes included) so it cannot
      terminate the single-quoted ``srcdoc`` attribute.

    Args:
        pdb_id: identifier to display; ``None`` or empty yields ``""``.

    Returns:
        The iframe markup as a single-line string, or ``""``.
    """
    if not pdb_id:
        return ""

    safe_id = html.escape(str(pdb_id), quote=True)

    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <script src="https://3Dmol.org/build/3Dmol-min.js"></script>
        <script src="https://3Dmol.org/build/3Dmol.ui-min.js"></script>
        <style>
            .viewer_3Dmoljs {{
                width: 100%;
                height: 400px;
                position: relative;
            }}
        </style>
    </head>
    <body>
        <div class="viewer_3Dmoljs"
             data-pdb="{safe_id}"
             data-backgroundcolor="0xffffff"
             data-style="cartoon:color=spectrum"
             data-spin="axis:y;speed:0.2">
        </div>
    </body>
    </html>
    """

    # html.escape covers &, <, >, " and ' in one pass; newlines are dropped
    # so the attribute value stays on a single line.
    escaped_content = html.escape(html_content, quote=True).replace('\n', '')

    return f'<iframe style="width: 100%; height: 480px; border: none;" srcdoc=\'{escaped_content}\'></iframe>'
|
|
|
def create_interactive_table(df):
    """Render search results as a Plotly table figure.

    Args:
        df: DataFrame of result rows.  The columns listed in ``column_order``
            are shown in that order; a missing column is displayed empty
            rather than raising.

    Returns:
        A ``go.Figure`` holding one ``go.Table`` whose first column links
        each PDB ID to its RCSB structure page, or an empty ``go.Figure``
        when *df* has no rows.
    """
    if df.empty:
        return go.Figure()

    column_order = ['PDB ID', 'Resolution', 'Title', '# of total residues',
                    '# of atoms of protein', 'Method', 'Release Date']
    # reindex returns a new frame, so the caller's data is never modified.
    table_df = df.reindex(columns=column_order)

    # Release dates may be the placeholder 'N/A'; coerce instead of raising
    # and show the placeholder again after formatting.
    release_dates = pd.to_datetime(table_df['Release Date'], errors='coerce')
    table_df['Release Date'] = release_dates.dt.strftime('%Y-%m-%d').fillna('N/A')

    cell_values = []
    for position, column in enumerate(table_df.columns):
        if position == 0:
            # First column: wrap each ID in a link to its structure page.
            cell_values.append([
                f'<a href="https://www.rcsb.org/structure/{cell}">{cell}</a>'
                for cell in table_df[column]
            ])
        else:
            cell_values.append(table_df[column].tolist())

    row_count = len(table_df)
    table = go.Figure(data=[go.Table(
        header=dict(
            values=list(table_df.columns),
            fill_color='paleturquoise',
            align='center',
            font=dict(size=16),
        ),
        cells=dict(
            values=cell_values,
            align='center',
            font=dict(size=15),
            height=35
        ),
        columnwidth=[80, 80, 400, 100, 100, 100, 100],
        customdata=[['html'] * row_count if position == 0 else [''] * row_count
                    for position in range(len(table_df.columns))],
        hoverlabel=dict(bgcolor='white')
    )])

    table.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        height=450,
        autosize=True
    )

    return table
|
|
|
|
|
# Page stylesheet.  Each selector appears once and carries the declarations
# that were effective after the cascade of the earlier, repeated rules.
# Declarations with invalid values and rules whose class selector started
# with a digit (ignored by browsers) are omitted.
_APP_CSS = """
    .container-fluid {
        max-width: 1200px;
        margin: 0 auto;
        padding: 20px;
    }
    .table a {
        color: #0d6efd;
        text-decoration: none;
    }
    .table a:hover {
        color: #0a58ca;
        text-decoration: underline;
    }
    .shiny-input-container {
        max-width: 100%;
        margin: 0 auto;
    }
    #query {
        height: 300px;
        font-size: 16px;
        padding: 15px;
        width: 80%;
        margin: 0 auto;
        display: block;
        white-space: pre-wrap;
        word-wrap: break-word;
        resize: vertical;
        overflow-y: auto;
    }
    .content-wrapper {
        text-align: center;
        max-width: 1000px;
        margin: 0 auto;
    }
    .search-button {
        margin: 20px 0;
    }
    h2, h4 {
        text-align: center;
        margin: 20px 0;
    }
    /* must stay after the "h2, h4" rule: it overrides the h4 margin */
    h4 {
        margin: 0;
        color: #333;
    }
    .tool-description {
        text-align: center;
        color: #666;
        margin: 0 auto 30px;
        max-width: 800px;
        line-height: 1.6;
        font-size: 1.1em;
    }
    .main-content {
        display: flex;
        flex-direction: column;
        gap: 20px;
    }
    .search-section {
        background-color: #f8f9fa;
        border-radius: 12px;
        padding: 25px;
        margin-bottom: 20px;
    }
    .example-box {
        height: 100%;
        margin: 0;
        background-color: white;
        border: 1px solid #dee2e6;
        padding: 20px;
        border-radius: 8px;
        overflow-y: auto;
        text-align: left;
    }
    .example-box p {
        font-weight: bold;
        margin-bottom: 10px;
        padding-left: 0;
    }
    .example-box ul {
        margin: 0;
        padding-left: 20px;
    }
    .example-box li {
        word-wrap: break-word;
        margin: 10px 0;
        line-height: 1.5;
        text-align: left;
    }
    .query-header {
        display: flex;
        justify-content: space-between;
        align-items: center;
        margin-bottom: 10px;
    }
    .query-label-group {
        display: flex;
        align-items: center;
        gap: 10px; /* spacing between the label and the button */
    }
    .query-label {
        display: block;
        text-align: left;
        margin: 0;
        font-weight: bold;
    }
    .btn-primary {
        margin-left: 15px;
        padding: 5px 15px;
    }
    .btn-info {
        margin-top: 15px;
    }
    .status-box {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 15px;
        margin: 20px auto;
        width: 80%;
        text-align: left;
    }
    .status-text {
        margin-top: 10px;
        color: #666;
        font-size: 0.9em;
    }
    .status-label {
        font-weight: bold;
        margin-right: 5px;
    }
    .status-ready {
        color: #198754; /* Bootstrap success color */
        font-weight: bold;
    }
    .status-spinner {
        display: none;
        margin-left: 10px;
        vertical-align: middle;
    }
    .status-spinner.active {
        display: inline-block;
    }
    .results-section {
        background-color: #f8f9fa;
        border: 1px solid #dee2e6;
        border-radius: 8px;
        padding: 20px;
        margin: 20px 0;
    }
    .structure-details-section {
        margin-top: 20px;
        background-color: #f8f9fa;
        border: 1px solid #dee2e6;
        border-radius: 8px;
        padding: 20px;
    }
    .pdb-selector {
        display: flex;
        justify-content: flex-start;
        gap: 5px;
        margin-bottom: 20px;
        margin-left: 20px;
    }
    .pdb-select-label {
        font-weight: bold;
        margin: 0;
        white-space: nowrap;
        display: inline-block;
        vertical-align: middle;
    }
    .pdb-selector select {
        margin-left: 0;
        display: inline-block;
    }
    .viewer-section, .sequence-section {
        background-color: white;
        border: 1px solid #dee2e6;
        border-radius: 8px;
        padding: 20px;
        margin: 20px 0;
        height: 100%;
    }
    .viewer-content {
        margin-top: 15px;
    }
    .viewer-content select {
        max-width: 200px;
        margin: 0 auto 15px;
        display: block;
    }
    .viewer-iframe {
        background-color: white;
        border-radius: 4px;
        padding: 10px;
    }
    .sequence-results {
        width: 80%;
        margin: 20px auto;
        text-align: left;
        font-family: monospace;
        white-space: pre-wrap;
        word-wrap: break-word;
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 20px;
        overflow-x: hidden;
    }
    .sequence-content {
        background-color: white;
        border-radius: 4px;
        padding: 15px;
        margin-top: 15px;
        max-height: 600px;
        overflow-y: auto;
        font-family: monospace;
        white-space: pre-wrap;
        word-wrap: break-word;
        overflow-x: hidden;
        text-align: left;
    }
    .sequence-text {
        word-break: break-all;
        margin: 10px 0;
        line-height: 1.5;
        text-align: left;
    }
"""

# Page layout: search box with example queries, the results table with a
# download button, and a PDB selector driving the 3D viewer and the
# sequence panel.  Input/output IDs here are the ones `server` reads/writes.
app_ui = ui.page_fluid(
    ui.tags.head(
        ui.tags.style(_APP_CSS)
    ),
    ui.div(
        {"class": "content-wrapper"},
        ui.h2("Advanced PDB Structure Search Tool"),
        ui.div(
            {"class": "tool-description"},
            "An AI-powered search tool for exploring protein structures in the Protein Data Bank (PDB). ",
            "Search by protein name, sequence, resolution, experimental method, or organism to find relevant structures. ",
            "You can also retrieve amino acid sequences for specific PDB IDs."
        ),
        ui.div(
            {"class": "main-content"},
            ui.div(
                {"class": "search-section"},
                ui.row(
                    ui.column(
                        8,
                        ui.div(
                            {"class": "query-header"},
                            ui.div(
                                {"class": "query-label-group"},
                                ui.tags.label(
                                    "Search Query",
                                    {"class": "query-label", "for": "query"}
                                ),
                                ui.input_action_button("search", "Search",
                                                       class_="btn-primary")
                            )
                        ),
                        ui.input_text_area(
                            "query",
                            "",
                            value="Human insulin",
                            width="100%",
                            resize="vertical"
                        ),
                        ui.div(
                            {"class": "status-text"},
                            ui.tags.span("Status: ", class_="status-label"),
                            ui.output_text("search_status", inline=True),
                            ui.tags.i({"class": "fas fa-spinner fa-spin status-spinner"})
                        )
                    ),
                    ui.column(
                        4,
                        ui.div(
                            {"class": "example-box"},
                            ui.p("Example queries:"),
                            ui.tags.ul(
                                ui.tags.li("Human hemoglobin C resolution better than 2.5\u00c5"),
                                ui.tags.li("Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
                                ui.tags.li("Sequence of PDB ID 8ET6")
                            )
                        )
                    )
                ),
            ),
            ui.row(
                ui.column(
                    12,
                    ui.div(
                        {"class": "results-section"},
                        ui.h4("Top 10 PDBs Results"),
                        output_widget("results_table"),
                        ui.download_button("download", "Download Results",
                                           class_="btn btn-info")
                    )
                )
            ),
            ui.div(
                {"class": "structure-details-section"},
                ui.div(
                    {"class": "pdb-selector"},
                    ui.tags.label(
                        "Select PDB ID",
                        {"class": "pdb-select-label"}
                    ),
                    ui.input_select(
                        "selected_pdb",
                        "",
                        choices=[],
                        width="200px"
                    )
                ),
                ui.row(
                    ui.column(
                        6,
                        ui.div(
                            {"class": "viewer-section"},
                            ui.h4("3D Structure Viewer"),
                            ui.div(
                                {"class": "viewer-content"},
                                ui.div(
                                    {"class": "viewer-iframe"},
                                    ui.output_ui("output_iframe")
                                )
                            )
                        )
                    ),
                    ui.column(
                        6,
                        ui.div(
                            {"class": "sequence-section"},
                            ui.h4("Sequences"),
                            ui.div(
                                {"class": "sequence-content"},
                                ui.output_text("sequence_output")
                            )
                        )
                    )
                )
            )
        )
    )
)
|
|
|
def server(input, output, session):
    """Shiny server function: wire the search UI to ``PDBSearchAssistant``.

    Reactive state kept per session:
        results_store: last ``process_query`` result
            (``{"type": ..., "results": [...]}``).
        status_store: text shown next to "Status:".
        pdb_ids_store: IDs currently offered in the "selected_pdb" select.
    """
    assistant = PDBSearchAssistant()
    results_store = reactive.Value({"type": None, "results": []})
    status_store = reactive.Value("Ready")
    pdb_ids_store = reactive.Value([])

    # PDB IDs start with a non-zero digit; requiring that keeps ordinary
    # four-letter words of the query ("show", "with") out of the selector.
    pdb_id_token = re.compile(r'\b[1-9][0-9A-Za-z]{3}\b')

    @reactive.Effect
    @reactive.event(input.search)
    def _():
        # NOTE(review): this effect is synchronous, so "Searching..." is
        # presumably overwritten before the browser ever displays it.
        status_store.set("Searching...")

        query_text = input.query()
        query_results = assistant.process_query(query_text)
        results_store.set(query_results)

        if query_results["type"] == "sequence":
            status_store.set("Ready" if query_results["results"] else "No sequences found")
            found = (token.upper() for token in pdb_id_token.findall(query_text))
            pdb_ids = list(dict.fromkeys(found))  # de-duplicate, keep order
        else:
            pdb_ids = [row['PDB ID'] for row in query_results["results"]]
            status_store.set("Ready" if pdb_ids else "No structures found")

        pdb_ids_store.set(pdb_ids)
        ui.update_select(
            "selected_pdb",
            choices=pdb_ids,
            selected=pdb_ids[0] if pdb_ids else None
        )

    @output
    @render_widget
    def results_table():
        # Registered once and driven by results_store, so an empty or
        # sequence-type result clears the table instead of leaving the
        # previous search on screen.
        current_results = results_store.get()
        if current_results["type"] != "structure" or not current_results["results"]:
            return None
        return create_interactive_table(pd.DataFrame(current_results["results"]))

    @output
    @render.text
    def search_status():
        return status_store.get()

    @output
    @render.text
    def sequence_output():
        selected_pdb = input.selected_pdb()
        if not selected_pdb:
            return "No PDB ID selected"

        sequences = assistant.get_sequences_by_pdb_id(selected_pdb)
        if not sequences:
            return f"No sequences found for PDB ID: {selected_pdb}"

        output_text = []
        for seq in sequences:
            output_text.append(f"\nChain {seq['chain_id']} (Entity {seq['entity_id']}):")
            output_text.append(f"Description: {seq['description']}")
            output_text.append(f"Length: {seq['length']} residues")
            output_text.append("Sequence:")

            # Groups of 10 residues separated by spaces, then wrapped every
            # 66 characters (six groups plus their separators per line).
            residues = seq['sequence']
            grouped = ' '.join(residues[i:i+10] for i in range(0, len(residues), 10))
            wrapped = '\n'.join(grouped[i:i+66] for i in range(0, len(grouped), 66))
            output_text.append(wrapped)
            output_text.append("-" * 60)

        return "\n".join(output_text)

    @output
    @render.ui
    def output_iframe():
        selected_pdb = input.selected_pdb()
        if selected_pdb:
            return ui.HTML(render_html(selected_pdb))
        return ui.HTML("")

    @output
    @render.download(filename="pdb_search_results.csv")
    def download():
        # A download handler must yield the file content (or return a file
        # path); returning the CSV text would be treated as a path.
        current_results = results_store.get()
        yield pd.DataFrame(current_results["results"]).to_csv(index=False)
|
|
|
# Application object built from the page layout and the server function.
app = App(app_ui, server)


def _serve(host="0.0.0.0", port=7862):
    """Start the app directly: apply nest_asyncio, then run on *host*:*port*."""
    import nest_asyncio

    nest_asyncio.apply()
    app.run(host=host, port=port)


if __name__ == "__main__":
    _serve()