Upload app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,10 @@ import plotly.graph_objects as go
|
|
12 |
from shinywidgets import output_widget, render_widget
|
13 |
import requests
|
14 |
import asyncio
|
|
|
|
|
|
|
|
|
15 |
warnings.filterwarnings('ignore')
|
16 |
|
17 |
# Load environment variables from .env file
|
@@ -52,56 +56,70 @@ class PDBSearchAssistant:
|
|
52 |
Examples:
|
53 |
Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
|
54 |
Protein: insulin
|
55 |
-
Organism:
|
56 |
Resolution: 2.5
|
57 |
Sequence: none
|
58 |
PDB_ID: none
|
59 |
Method: X-RAY
|
60 |
|
61 |
-
Query: "
|
62 |
Protein: none
|
63 |
Organism: none
|
64 |
Resolution: none
|
65 |
-
Sequence:
|
66 |
-
PDB_ID:
|
67 |
Method: none
|
68 |
|
69 |
-
Query: "
|
70 |
Protein: none
|
71 |
Organism: none
|
72 |
Resolution: none
|
73 |
Sequence: none
|
74 |
-
PDB_ID:
|
75 |
Method: none
|
76 |
|
77 |
-
Query: "
|
78 |
-
Protein:
|
79 |
-
Organism:
|
80 |
Resolution: none
|
81 |
Sequence: none
|
82 |
-
PDB_ID:
|
83 |
Method: none
|
84 |
|
85 |
-
Query: "
|
86 |
Protein: none
|
87 |
-
Organism:
|
88 |
Resolution: none
|
89 |
Sequence: none
|
90 |
-
PDB_ID:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
Method: none
|
92 |
-
|
93 |
-
Query: "
|
94 |
Protein: none
|
95 |
Organism: none
|
96 |
Resolution: none
|
97 |
-
Sequence:
|
98 |
-
PDB_ID:
|
99 |
Method: none
|
100 |
|
|
|
|
|
101 |
Now analyze:
|
102 |
Query: {query}
|
103 |
"""
|
104 |
|
|
|
|
|
|
|
|
|
105 |
def search_pdb(self, query):
|
106 |
try:
|
107 |
# Get search parameters from LLM
|
@@ -114,6 +132,7 @@ class PDBSearchAssistant:
|
|
114 |
pdb_id = None
|
115 |
sequence = None
|
116 |
method = None
|
|
|
117 |
has_resolution_query = False
|
118 |
resolution_direction = "less"
|
119 |
|
@@ -175,6 +194,10 @@ class PDBSearchAssistant:
|
|
175 |
value = line.split('PDB_ID:')[1].strip()
|
176 |
if value.lower() not in ['none', 'n/a']:
|
177 |
pdb_id = value
|
|
|
|
|
|
|
|
|
178 |
|
179 |
# Build search query
|
180 |
queries = []
|
@@ -260,6 +283,16 @@ class PDBSearchAssistant:
|
|
260 |
)
|
261 |
queries.append(method_query)
|
262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
# Combine queries with AND operator
|
264 |
if queries:
|
265 |
final_query = queries[0]
|
@@ -334,91 +367,70 @@ class PDBSearchAssistant:
|
|
334 |
return []
|
335 |
|
336 |
def get_sequences_by_pdb_id(self, pdb_id):
|
337 |
-
"""Get sequences for all chains in a PDB structure"""
|
338 |
try:
|
339 |
-
#
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
name=pdb_id,
|
345 |
-
max_resolution=100.0  # set to a high value so that all structures are included
|
346 |
)
|
347 |
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
if not results:
|
352 |
return []
|
353 |
-
|
354 |
-
sequences = []
|
355 |
-
# Extract the sequence information from the results
|
356 |
-
for structure in results:
|
357 |
-
if structure.pdb_id.upper() == pdb_id.upper():
|
358 |
-
chain_info = {
|
359 |
-
'chain_id': 'ALL',  # chain information is merged into a single entry
|
360 |
-
'entity_id': '1',
|
361 |
-
'description': structure.title,
|
362 |
-
'sequence': structure.sequence,
|
363 |
-
'length': len(structure.sequence),
|
364 |
-
'resolution': structure.resolution,
|
365 |
-
'method': structure.method,
|
366 |
-
'release_date': structure.release_date
|
367 |
-
}
|
368 |
-
sequences.append(chain_info)
|
369 |
-
break  # stop once the exact PDB ID match has been found
|
370 |
|
371 |
-
#
|
372 |
-
|
373 |
-
|
374 |
-
return self._get_sequences_by_direct_api(pdb_id)
|
375 |
-
|
376 |
-
return sequences
|
377 |
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
def _get_sequences_by_direct_api(self, pdb_id):
|
384 |
-
"""Fallback method using direct API calls"""
|
385 |
-
# The body of the former get_sequences_by_pdb_id method was moved here
|
386 |
-
try:
|
387 |
-
url = f"https://data.rcsb.org/rest/v1/core/polymer_entity_instances/{pdb_id}"
|
388 |
-
response = requests.get(url)
|
389 |
|
390 |
-
if response.status_code != 200:
|
391 |
-
return []
|
392 |
-
|
393 |
-
chains_data = response.json()
|
394 |
sequences = []
|
395 |
-
|
396 |
-
for
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
402 |
|
403 |
-
if
|
404 |
-
entity_data = entity_response.json()
|
405 |
-
sequence = entity_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code', '')
|
406 |
-
description = entity_data.get('rcsb_polymer_entity', {}).get('pdbx_description', 'N/A')
|
407 |
-
|
408 |
chain_info = {
|
409 |
-
'chain_id':
|
410 |
-
'entity_id':
|
411 |
-
'description':
|
412 |
'sequence': sequence,
|
413 |
-
'length': len(sequence)
|
|
|
|
|
|
|
414 |
}
|
415 |
sequences.append(chain_info)
|
416 |
|
|
|
|
|
|
|
|
|
417 |
return sequences
|
418 |
|
419 |
except Exception as e:
|
420 |
-
print(f"Error
|
421 |
return []
|
|
|
|
|
|
|
|
|
|
|
422 |
|
423 |
def process_query(self, query):
|
424 |
"""Process query and return results"""
|
@@ -480,37 +492,6 @@ def pdbsummary(name):
|
|
480 |
|
481 |
return answer
|
482 |
|
483 |
-
|
484 |
-
def render_html(pdb_id, chain="A"):
    """Build an ``<iframe>`` snippet that shows a structure with 3Dmol.js.

    The iframe carries a small self-contained HTML document in its ``srcdoc``
    attribute. That document loads the 3Dmol.js scripts and declares one
    ``viewer_3Dmoljs`` div configured through ``data-*`` attributes.

    Args:
        pdb_id: Identifier written into the viewer's ``data-pdb`` attribute.
        chain: Chain identifier used for the ``data-select1`` and
            ``data-zoomto`` selections. Defaults to ``"A"``.

    Returns:
        The iframe markup as a string, or ``""`` when ``pdb_id`` or ``chain``
        is ``None``.
    """
    # Function-scope import so this block does not rely on a file-level import.
    import html

    if pdb_id is None or chain is None:
        return ""

    # Both values end up inside double-quoted attributes of the inner document;
    # escape them so a quote or angle bracket cannot break out of the attribute.
    safe_pdb_id = html.escape(str(pdb_id), quote=True)
    safe_chain = html.escape(str(chain), quote=True)

    # NOTE: the scripts belong in <head>; <header> is a body-level element.
    html_content = f"""
    <html>
    <head>
        <script src="https://3Dmol.org/build/3Dmol-min.js"></script>
        <script src="https://3Dmol.org/build/3Dmol.ui-min.js"></script>
    </head>
    <body>
        <div style="height: 400px; position: relative;" class="viewer_3Dmoljs"
             data-pdb="{safe_pdb_id}"
             data-backgroundalpha="0.0"
             data-style="cartoon:color=white"
             data-select1="chain:{safe_chain}"
             data-zoomto="chain:{safe_chain}"
             data-style1="cartoon:color=spectrum"
             data-spin="axis:y;speed:0.2">
        </div>
    </body>
    </html>
    """

    # srcdoc is itself an attribute value, so the whole inner document must be
    # escaped once more; the browser decodes it back before parsing the frame.
    srcdoc = html.escape(html_content, quote=True)
    return f"""
    <iframe style="width: 100%; height: 480px; border: none;"
            srcdoc='{srcdoc}'>
    </iframe>
    """
|
512 |
-
|
513 |
-
|
514 |
def create_interactive_table(df):
|
515 |
if df.empty:
|
516 |
return go.Figure()
|
@@ -578,12 +559,16 @@ app_ui = ui.page_fluid(
|
|
578 |
margin: 0 auto;
|
579 |
}
|
580 |
#query {
|
581 |
-
height:
|
582 |
font-size: 16px;
|
583 |
padding: 15px;
|
584 |
width: 80%;
|
585 |
margin: 0 auto;
|
586 |
display: block;
|
|
|
|
|
|
|
|
|
587 |
}
|
588 |
.content-wrapper {
|
589 |
text-align: center;
|
@@ -678,11 +663,12 @@ app_ui = ui.page_fluid(
|
|
678 |
"Search Query",
|
679 |
{"class": "query-label", "for": "query"}
|
680 |
),
|
681 |
-
ui.
|
682 |
"query",
|
683 |
"",
|
684 |
value="Human insulin",
|
685 |
-
width="100%"
|
|
|
686 |
),
|
687 |
)
|
688 |
),
|
@@ -738,15 +724,6 @@ app_ui = ui.page_fluid(
|
|
738 |
ui.output_text("sequence_output")
|
739 |
)
|
740 |
)
|
741 |
-
),
|
742 |
-
ui.row(
|
743 |
-
ui.column(12,
|
744 |
-
ui.div(
|
745 |
-
{"class": "3d-iframe", "id": "3d-iframe"},  # no CSS defined for this yet
|
746 |
-
ui.h4("3D Rendering"),
|
747 |
-
ui.output_ui("output_iframe")
|
748 |
-
)
|
749 |
-
)
|
750 |
)
|
751 |
)
|
752 |
)
|
@@ -759,10 +736,8 @@ def server(input, output, session):
|
|
759 |
@reactive.Effect
|
760 |
@reactive.event(input.search)
|
761 |
def _():
|
762 |
-
# Update the status when a search starts
|
763 |
status_store.set("Searching...")
|
764 |
|
765 |
-
# Process the prompt
|
766 |
query_results = assistant.process_query(input.query())
|
767 |
results_store.set(query_results)
|
768 |
|
@@ -770,16 +745,18 @@ def server(input, output, session):
|
|
770 |
if not query_results["results"]:
|
771 |
status_store.set("No sequences found")
|
772 |
else:
|
773 |
-
status_store.set("Ready")
|
774 |
else:
|
775 |
df = pd.DataFrame(query_results["results"])
|
776 |
-
|
777 |
-
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
-
|
782 |
-
|
|
|
|
|
783 |
@output
|
784 |
@render.text
|
785 |
def search_status():
|
@@ -819,23 +796,10 @@ def server(input, output, session):
|
|
819 |
|
820 |
return "\n".join(output_text)
|
821 |
return ""
|
822 |
-
|
823 |
-
@output
|
824 |
-
@render.text
|
825 |
-
def output_iframe():
|
826 |
-
current_results = results_store.get()
|
827 |
-
if current_results["type"] == "structure":
|
828 |
-
pdb_id = current_results["results"][0]['PDB ID']
|
829 |
-
# Fetching the actual chain is not implemented yet
|
830 |
-
return render_html(pdb_id, "A")
|
831 |
-
else:
|
832 |
-
return ""
|
833 |
|
834 |
app = App(app_ui, server)
|
835 |
|
836 |
if __name__ == "__main__":
|
837 |
import nest_asyncio
|
838 |
nest_asyncio.apply()
|
839 |
-
app.run(host="0.0.0.0", port=7862)
|
840 |
-
|
841 |
-
|
|
|
12 |
from shinywidgets import output_widget, render_widget
|
13 |
import requests
|
14 |
import asyncio
|
15 |
+
from Bio import PDB
|
16 |
+
from Bio.PDB.PDBList import PDBList
|
17 |
+
from Bio.PDB.Polypeptide import protein_letters_3to1
|
18 |
+
import shutil
|
19 |
warnings.filterwarnings('ignore')
|
20 |
|
21 |
# Load environment variables from .env file
|
|
|
56 |
Examples:
|
57 |
Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
|
58 |
Protein: insulin
|
59 |
+
Organism: Homo sapiens
|
60 |
Resolution: 2.5
|
61 |
Sequence: none
|
62 |
PDB_ID: none
|
63 |
Method: X-RAY
|
64 |
|
65 |
+
Query: "Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
|
66 |
Protein: none
|
67 |
Organism: none
|
68 |
Resolution: none
|
69 |
+
Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
|
70 |
+
PDB_ID: none
|
71 |
Method: none
|
72 |
|
73 |
+
Query: "Get sequence of PDB ID 8ET6"
|
74 |
Protein: none
|
75 |
Organism: none
|
76 |
Resolution: none
|
77 |
Sequence: none
|
78 |
+
PDB_ID: 8ET6
|
79 |
Method: none
|
80 |
|
81 |
+
Query: "Find mouse lysozyme structures"
|
82 |
+
Protein: lysozyme
|
83 |
+
Organism: Mus musculus
|
84 |
Resolution: none
|
85 |
Sequence: none
|
86 |
+
PDB_ID: none
|
87 |
Method: none
|
88 |
|
89 |
+
Query: "Show me E. coli protein structures solved by Cryo-EM"
|
90 |
Protein: none
|
91 |
+
Organism: Escherichia coli
|
92 |
Resolution: none
|
93 |
Sequence: none
|
94 |
+
PDB_ID: none
|
95 |
+
Method: EM
|
96 |
+
|
97 |
+
Query: "Find S. cerevisiae structures with resolution better than 1.8Å"
|
98 |
+
Protein: none
|
99 |
+
Organism: Saccharomyces cerevisiae
|
100 |
+
Resolution: 1.8
|
101 |
+
Sequence: none
|
102 |
+
PDB_ID: none
|
103 |
Method: none
|
104 |
+
|
105 |
+
Query: "Sequence of 7BZ5"
|
106 |
Protein: none
|
107 |
Organism: none
|
108 |
Resolution: none
|
109 |
+
Sequence: none
|
110 |
+
PDB_ID: 7BZ5
|
111 |
Method: none
|
112 |
|
113 |
+
|
114 |
+
|
115 |
Now analyze:
|
116 |
Query: {query}
|
117 |
"""
|
118 |
|
119 |
+
self.pdb_dir = "pdb_tmp"  # directory for temporarily stored PDB files
|
120 |
+
os.makedirs(self.pdb_dir, exist_ok=True)
|
121 |
+
self.pdbl = PDBList()
|
122 |
+
|
123 |
def search_pdb(self, query):
|
124 |
try:
|
125 |
# Get search parameters from LLM
|
|
|
132 |
pdb_id = None
|
133 |
sequence = None
|
134 |
method = None
|
135 |
+
organism = None
|
136 |
has_resolution_query = False
|
137 |
resolution_direction = "less"
|
138 |
|
|
|
194 |
value = line.split('PDB_ID:')[1].strip()
|
195 |
if value.lower() not in ['none', 'n/a']:
|
196 |
pdb_id = value
|
197 |
+
elif 'Organism:' in line:
|
198 |
+
value = line.split('Organism:')[1].strip()
|
199 |
+
if value.lower() not in ['none', 'n/a']:
|
200 |
+
organism = value
|
201 |
|
202 |
# Build search query
|
203 |
queries = []
|
|
|
283 |
)
|
284 |
queries.append(method_query)
|
285 |
|
286 |
+
# Add organism filter if specified
|
287 |
+
if organism:
|
288 |
+
print(f"Adding organism filter: {organism}")
|
289 |
+
organism_query = AttributeQuery(
|
290 |
+
attribute="rcsb_entity_source_organism.taxonomy_lineage.name",
|
291 |
+
operator="exact_match",
|
292 |
+
value=organism
|
293 |
+
)
|
294 |
+
queries.append(organism_query)
|
295 |
+
|
296 |
# Combine queries with AND operator
|
297 |
if queries:
|
298 |
final_query = queries[0]
|
|
|
367 |
return []
|
368 |
|
369 |
def get_sequences_by_pdb_id(self, pdb_id):
    """Get the sequence of every chain in a PDB structure using Biopython.

    The entry is downloaded in PDB format into ``self.pdb_dir`` with
    ``self.pdbl``, parsed, and each chain of the first model is converted to
    a one-letter sequence built from its standard amino-acid residues.
    Entry-level metadata (title, resolution, method, release date) is read
    from the RCSB ``core/entry`` REST endpoint and copied onto every chain
    record. The downloaded file is removed again before returning.

    Args:
        pdb_id: PDB identifier of the entry to fetch.

    Returns:
        A list of dicts with the keys ``chain_id``, ``entity_id``,
        ``description``, ``sequence``, ``length``, ``resolution``, ``method``
        and ``release_date``; one dict per chain with a non-empty sequence.
        An empty list is returned when the download fails or any other
        error occurs.
    """
    pdb_path = None
    try:
        # Download PDB file
        pdb_path = self.pdbl.retrieve_pdb_file(
            pdb_id,
            pdir=self.pdb_dir,
            file_format="pdb"
        )

        if not pdb_path or not os.path.exists(pdb_path):
            print(f"Failed to download PDB file for {pdb_id}")
            return []

        # Parse structure
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, pdb_path)

        # Entry metadata is only decoration for the chain records, so a
        # failed or slow request must not discard the parsed sequences.
        structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
        try:
            response = requests.get(structure_url, timeout=30)
            structure_data = response.json() if response.status_code == 200 else {}
        except (requests.RequestException, ValueError) as e:
            print(f"Could not fetch entry metadata for {pdb_id}: {str(e)}")
            structure_data = {}

        # The metadata is identical for every chain: resolve it once, and
        # tolerate fields that are present but null or empty.
        title = (structure_data.get('struct') or {}).get('title', 'N/A')
        resolutions = (structure_data.get('rcsb_entry_info') or {}).get('resolution_combined') or [0.0]
        experiments = structure_data.get('exptl') or [{}]
        method = experiments[0].get('method', 'Unknown')
        release_date = (structure_data.get('rcsb_accession_info') or {}).get('initial_release_date', 'N/A')

        # Only the first model is read: walking every model of a multi-model
        # entry (e.g. an NMR ensemble) would report each chain once per model.
        first_model = next(iter(structure), None)
        chains = first_model if first_model is not None else ()

        sequences = []
        for chain in chains:
            # Convert three-letter residue names to one-letter codes,
            # skipping anything that is not a standard amino acid.
            sequence = "".join(
                protein_letters_3to1[residue.get_resname()]
                for residue in chain
                if PDB.is_aa(residue, standard=True)
                and residue.get_resname() in protein_letters_3to1
            )

            if not sequence:  # Only add if sequence is not empty
                continue

            sequences.append({
                'chain_id': chain.id,
                'entity_id': '1',  # Default entity ID
                'description': title,
                'sequence': sequence,
                'length': len(sequence),
                'resolution': resolutions[0],
                'method': method,
                'release_date': release_date
            })

        return sequences

    except Exception as e:
        print(f"Error getting sequences for PDB ID {pdb_id}: {str(e)}")
        return []

    finally:
        # Cleanup downloaded file on every exit path. A failed removal is
        # reported but must not replace the result computed above.
        if pdb_path and os.path.exists(pdb_path):
            try:
                os.remove(pdb_path)
            except OSError as e:
                print(f"Could not remove temporary file {pdb_path}: {str(e)}")
|
429 |
+
|
430 |
+
def __del__(self):
    """Cleanup temporary directory on object destruction.

    Removes ``self.pdb_dir`` and everything in it. The removal is best
    effort: a finalizer has no caller that could handle an error, so
    failures are swallowed instead of being raised.
    """
    # pdb_dir is missing when __init__ failed before assigning it.
    pdb_dir = getattr(self, 'pdb_dir', None)
    if not pdb_dir:
        return
    try:
        if os.path.exists(pdb_dir):
            shutil.rmtree(pdb_dir, ignore_errors=True)
    except Exception:
        # During interpreter shutdown the os/shutil globals may already be
        # torn down; there is nothing useful left to do in that case.
        pass
|
434 |
|
435 |
def process_query(self, query):
|
436 |
"""Process query and return results"""
|
|
|
492 |
|
493 |
return answer
|
494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
495 |
def create_interactive_table(df):
|
496 |
if df.empty:
|
497 |
return go.Figure()
|
|
|
559 |
margin: 0 auto;
|
560 |
}
|
561 |
#query {
|
562 |
+
height: 150px;
|
563 |
font-size: 16px;
|
564 |
padding: 15px;
|
565 |
width: 80%;
|
566 |
margin: 0 auto;
|
567 |
display: block;
|
568 |
+
white-space: pre-wrap;
|
569 |
+
word-wrap: break-word;
|
570 |
+
resize: vertical;
|
571 |
+
overflow-y: auto;
|
572 |
}
|
573 |
.content-wrapper {
|
574 |
text-align: center;
|
|
|
663 |
"Search Query",
|
664 |
{"class": "query-label", "for": "query"}
|
665 |
),
|
666 |
+
ui.input_text_area(
|
667 |
"query",
|
668 |
"",
|
669 |
value="Human insulin",
|
670 |
+
width="100%",
|
671 |
+
resize="vertical"
|
672 |
),
|
673 |
)
|
674 |
),
|
|
|
724 |
ui.output_text("sequence_output")
|
725 |
)
|
726 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
727 |
)
|
728 |
)
|
729 |
)
|
|
|
736 |
@reactive.Effect
|
737 |
@reactive.event(input.search)
|
738 |
def _():
|
|
|
739 |
status_store.set("Searching...")
|
740 |
|
|
|
741 |
query_results = assistant.process_query(input.query())
|
742 |
results_store.set(query_results)
|
743 |
|
|
|
745 |
if not query_results["results"]:
|
746 |
status_store.set("No sequences found")
|
747 |
else:
|
748 |
+
status_store.set("Ready")
|
749 |
else:
|
750 |
df = pd.DataFrame(query_results["results"])
|
751 |
+
if df.empty:
|
752 |
+
status_store.set("No structures found")
|
753 |
+
else:
|
754 |
+
status_store.set("Ready")
|
755 |
+
@output
|
756 |
+
@render_widget
|
757 |
+
def results_table():
|
758 |
+
return create_interactive_table(df)
|
759 |
+
|
760 |
@output
|
761 |
@render.text
|
762 |
def search_status():
|
|
|
796 |
|
797 |
return "\n".join(output_text)
|
798 |
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
799 |
|
800 |
app = App(app_ui, server)
|
801 |
|
802 |
if __name__ == "__main__":
|
803 |
import nest_asyncio
|
804 |
nest_asyncio.apply()
|
805 |
+
app.run(host="0.0.0.0", port=7862)
|
|
|
|