lkjjj26 committed on
Commit
0763a52
·
verified ·
1 Parent(s): 6619e16

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -151
app.py CHANGED
@@ -12,6 +12,10 @@ import plotly.graph_objects as go
12
  from shinywidgets import output_widget, render_widget
13
  import requests
14
  import asyncio
 
 
 
 
15
  warnings.filterwarnings('ignore')
16
 
17
  # Load environment variables from .env file
@@ -52,56 +56,70 @@ class PDBSearchAssistant:
52
  Examples:
53
  Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
54
  Protein: insulin
55
- Organism: human
56
  Resolution: 2.5
57
  Sequence: none
58
  PDB_ID: none
59
  Method: X-RAY
60
 
61
- Query: "Get sequence of PDB ID 8ET6"
62
  Protein: none
63
  Organism: none
64
  Resolution: none
65
- Sequence: none
66
- PDB_ID: 8ET6
67
  Method: none
68
 
69
- Query: "Sequence of 7BZ5"
70
  Protein: none
71
  Organism: none
72
  Resolution: none
73
  Sequence: none
74
- PDB_ID: 7BZ5
75
  Method: none
76
 
77
- Query: "7BZ5"
78
- Protein: none
79
- Organism: none
80
  Resolution: none
81
  Sequence: none
82
- PDB_ID: 7BZ5
83
  Method: none
84
 
85
- Query: "6KAO"
86
  Protein: none
87
- Organism: none
88
  Resolution: none
89
  Sequence: none
90
- PDB_ID: 6KAO
 
 
 
 
 
 
 
 
91
  Method: none
92
-
93
- Query: "Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
94
  Protein: none
95
  Organism: none
96
  Resolution: none
97
- Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
98
- PDB_ID: none
99
  Method: none
100
 
 
 
101
  Now analyze:
102
  Query: {query}
103
  """
104
 
 
 
 
 
105
  def search_pdb(self, query):
106
  try:
107
  # Get search parameters from LLM
@@ -114,6 +132,7 @@ class PDBSearchAssistant:
114
  pdb_id = None
115
  sequence = None
116
  method = None
 
117
  has_resolution_query = False
118
  resolution_direction = "less"
119
 
@@ -175,6 +194,10 @@ class PDBSearchAssistant:
175
  value = line.split('PDB_ID:')[1].strip()
176
  if value.lower() not in ['none', 'n/a']:
177
  pdb_id = value
 
 
 
 
178
 
179
  # Build search query
180
  queries = []
@@ -260,6 +283,16 @@ class PDBSearchAssistant:
260
  )
261
  queries.append(method_query)
262
 
 
 
 
 
 
 
 
 
 
 
263
  # Combine queries with AND operator
264
  if queries:
265
  final_query = queries[0]
@@ -334,91 +367,70 @@ class PDBSearchAssistant:
334
  return []
335
 
336
def get_sequences_by_pdb_id(self, pdb_id):
    """Look up the sequence record for a single PDB entry.

    Runs a ``ProteinSearchEngine`` search that uses the PDB ID as the query
    name and returns a one-element list describing the first hit whose ID
    matches ``pdb_id`` case-insensitively.

    Returns an empty list when the search yields nothing.  When hits exist
    but none matches, or when anything in the search path raises, the
    result of ``self._get_sequences_by_direct_api(pdb_id)`` is returned
    instead.
    """
    try:
        # The resolution limit is set very high so that no structure is
        # filtered out of the search.
        hits = ProteinSearchEngine().search(
            ProteinQuery(
                name=pdb_id,
                max_resolution=100.0
            )
        )
        if not hits:
            return []

        # Only an exact (case-insensitive) ID match counts; take the first.
        wanted = pdb_id.upper()
        match = next(
            (hit for hit in hits if hit.pdb_id.upper() == wanted),
            None,
        )

        if match is None:
            # Nothing matched: try the direct API call instead.
            print(f"No results found using ProteinSearchEngine, trying direct API call...")
            return self._get_sequences_by_direct_api(pdb_id)

        return [{
            'chain_id': 'ALL',  # chain information is merged into one record
            'entity_id': '1',
            'description': match.title,
            'sequence': match.sequence,
            'length': len(match.sequence),
            'resolution': match.resolution,
            'method': match.method,
            'release_date': match.release_date
        }]

    except Exception as e:
        print(f"Error in ProteinSearchEngine search for PDB ID {pdb_id}: {str(e)}")
        # On error, fall back to the direct API call.
        return self._get_sequences_by_direct_api(pdb_id)
382
-
383
def _get_sequences_by_direct_api(self, pdb_id):
    """Fallback: fetch per-chain sequences straight from the RCSB Data API.

    Requests the polymer-entity-instance listing for ``pdb_id``, then, for
    each chain that reports an ``entity_id``, requests that polymer entity
    to read its one-letter sequence and description.

    Args:
        pdb_id: PDB entry identifier interpolated into the request URLs.

    Returns:
        A list of dicts with keys ``chain_id``, ``entity_id``,
        ``description``, ``sequence`` and ``length``.  An empty list is
        returned when the first request is not HTTP 200 or when any
        exception is raised.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    timeout = 10
    base_url = "https://data.rcsb.org/rest/v1/core"

    try:
        # NOTE(review): the RCSB REST reference appears to address chain
        # instances as /polymer_entity_instance/{entry_id}/{asym_id};
        # confirm that this collection-style URL really returns a mapping
        # of chain id -> instance data.
        response = requests.get(
            f"{base_url}/polymer_entity_instances/{pdb_id}",
            timeout=timeout,
        )
        if response.status_code != 200:
            return []

        # Several chains can share one entity; fetch each entity only once.
        # A failed lookup is remembered as None.
        entity_cache = {}
        sequences = []

        for chain_id, chain_data in response.json().items():
            entity_id = chain_data.get(
                'rcsb_polymer_entity_instance_container_identifiers', {}
            ).get('entity_id')
            if not entity_id:
                continue

            if entity_id not in entity_cache:
                entity_response = requests.get(
                    f"{base_url}/polymer_entity/{pdb_id}/{entity_id}",
                    timeout=timeout,
                )
                entity_cache[entity_id] = (
                    entity_response.json()
                    if entity_response.status_code == 200
                    else None
                )

            entity_data = entity_cache[entity_id]
            if entity_data is None:
                continue

            # `or ''` guards against an explicit null in the payload, which
            # would otherwise make len() raise and discard every chain.
            sequence = entity_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code', '') or ''
            description = entity_data.get('rcsb_polymer_entity', {}).get('pdbx_description', 'N/A')

            sequences.append({
                'chain_id': chain_id,
                'entity_id': entity_id,
                'description': description,
                'sequence': sequence,
                'length': len(sequence)
            })

        return sequences

    except Exception as e:
        print(f"Error in direct API call for PDB ID {pdb_id}: {str(e)}")
        return []
 
 
 
 
 
422
 
423
  def process_query(self, query):
424
  """Process query and return results"""
@@ -480,37 +492,6 @@ def pdbsummary(name):
480
 
481
  return answer
482
 
483
-
484
def render_html(pdb_id, chain="A"):
    """Build an ``<iframe>`` snippet embedding a 3Dmol.js viewer.

    Args:
        pdb_id: PDB identifier placed in the viewer's ``data-pdb``
            attribute.
        chain: Chain identifier used for the viewer's selection and zoom
            target.

    Returns:
        An HTML string containing one ``<iframe>`` whose ``srcdoc`` holds
        the viewer page, or ``""`` when ``pdb_id`` or ``chain`` is ``None``.
    """
    import html  # local import keeps the block self-contained

    if pdb_id is None or chain is None:
        return ""

    # Escape the interpolated values so that a quote or angle bracket in
    # them cannot break out of the data-* attributes.
    safe_pdb_id = html.escape(str(pdb_id), quote=True)
    safe_chain = html.escape(str(chain), quote=True)

    # Scripts belong in <head>; <header> is a body-level sectioning element.
    html_content = f"""
<html>
<head>
<script src="https://3Dmol.org/build/3Dmol-min.js"></script>
<script src="https://3Dmol.org/build/3Dmol.ui-min.js"></script>
</head>
<body>
<div style="height: 400px; position: relative;" class="viewer_3Dmoljs"
     data-pdb="{safe_pdb_id}"
     data-backgroundalpha="0.0"
     data-style="cartoon:color=white"
     data-select1="chain:{safe_chain}"
     data-zoomto="chain:{safe_chain}"
     data-style1="cartoon:color=spectrum"
     data-spin="axis:y;speed:0.2">
</div>
</body>
</html>
"""

    # The whole document is an attribute value, so it is escaped once more;
    # the browser decodes it back to the markup above when loading srcdoc.
    srcdoc = html.escape(html_content, quote=True)
    iframe = f"""
<iframe style="width: 100%; height: 480px; border: none;"
        srcdoc="{srcdoc}">
</iframe>
"""
    return iframe
512
-
513
-
514
  def create_interactive_table(df):
515
  if df.empty:
516
  return go.Figure()
@@ -578,12 +559,16 @@ app_ui = ui.page_fluid(
578
  margin: 0 auto;
579
  }
580
  #query {
581
- height: 100px;
582
  font-size: 16px;
583
  padding: 15px;
584
  width: 80%;
585
  margin: 0 auto;
586
  display: block;
 
 
 
 
587
  }
588
  .content-wrapper {
589
  text-align: center;
@@ -678,11 +663,12 @@ app_ui = ui.page_fluid(
678
  "Search Query",
679
  {"class": "query-label", "for": "query"}
680
  ),
681
- ui.input_text(
682
  "query",
683
  "",
684
  value="Human insulin",
685
- width="100%"
 
686
  ),
687
  )
688
  ),
@@ -738,15 +724,6 @@ app_ui = ui.page_fluid(
738
  ui.output_text("sequence_output")
739
  )
740
  )
741
- ),
742
- ui.row(
743
- ui.column(12,
744
- ui.div(
745
- {"class": "3d-iframe", "id": "3d-iframe"}, # css ๋ฏธ์„ค์ •
746
- ui.h4("3D Rendering"),
747
- ui.output_ui("output_iframe")
748
- )
749
- )
750
  )
751
  )
752
  )
@@ -759,10 +736,8 @@ def server(input, output, session):
759
  @reactive.Effect
760
  @reactive.event(input.search)
761
  def _():
762
- # ๊ฒ€์ƒ‰ ์‹œ์ž‘ ์‹œ ์ƒํƒœ ๋ณ€๊ฒฝ
763
  status_store.set("Searching...")
764
 
765
- # ํ”„๋กฌํ”„ํŠธ ์ฒ˜๋ฆฌ
766
  query_results = assistant.process_query(input.query())
767
  results_store.set(query_results)
768
 
@@ -770,16 +745,18 @@ def server(input, output, session):
770
  if not query_results["results"]:
771
  status_store.set("No sequences found")
772
  else:
773
- status_store.set("Ready") # ๊ฒ€์ƒ‰ ์™„๋ฃŒ ์‹œ Ready๋กœ ๋ณ€๊ฒฝ
774
  else:
775
  df = pd.DataFrame(query_results["results"])
776
- status_store.set("Ready") # ๊ฒ€์ƒ‰ ์™„๋ฃŒ ์‹œ Ready๋กœ ๋ณ€๊ฒฝ
777
-
778
- @output
779
- @render_widget
780
- def results_table():
781
- return create_interactive_table(df)
782
-
 
 
783
  @output
784
  @render.text
785
  def search_status():
@@ -819,23 +796,10 @@ def server(input, output, session):
819
 
820
  return "\n".join(output_text)
821
  return ""
822
-
823
@output
@render.text
def output_iframe():
    """Return the 3D viewer markup for the first structure search result.

    Reads the current value of ``results_store``.  Returns ``""`` when the
    store is empty, when the stored result is not of type ``"structure"``,
    or when the structure search produced no rows; otherwise returns
    ``render_html`` for the first row's ``'PDB ID'``.
    """
    current_results = results_store.get()

    # Nothing has been stored yet, or the last query was not a structure search.
    if not current_results or current_results["type"] != "structure":
        return ""

    rows = current_results["results"]
    # A structure search can legitimately return zero rows; indexing [0]
    # on an empty result would raise instead of rendering nothing.
    if not rows:
        return ""

    pdb_id = rows[0]['PDB ID']
    # Chain lookup is not implemented yet, so chain "A" is always shown.
    return render_html(pdb_id, "A")
833
 
834
  app = App(app_ui, server)
835
 
836
  if __name__ == "__main__":
837
  import nest_asyncio
838
  nest_asyncio.apply()
839
- app.run(host="0.0.0.0", port=7862)
840
-
841
-
 
12
  from shinywidgets import output_widget, render_widget
13
  import requests
14
  import asyncio
15
+ from Bio import PDB
16
+ from Bio.PDB.PDBList import PDBList
17
+ from Bio.PDB.Polypeptide import protein_letters_3to1
18
+ import shutil
19
  warnings.filterwarnings('ignore')
20
 
21
  # Load environment variables from .env file
 
56
  Examples:
57
  Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
58
  Protein: insulin
59
+ Organism: Homo sapiens
60
  Resolution: 2.5
61
  Sequence: none
62
  PDB_ID: none
63
  Method: X-RAY
64
 
65
+ Query: "Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
66
  Protein: none
67
  Organism: none
68
  Resolution: none
69
+ Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
70
+ PDB_ID: none
71
  Method: none
72
 
73
+ Query: "Get sequence of PDB ID 8ET6"
74
  Protein: none
75
  Organism: none
76
  Resolution: none
77
  Sequence: none
78
+ PDB_ID: 8ET6
79
  Method: none
80
 
81
+ Query: "Find mouse lysozyme structures"
82
+ Protein: lysozyme
83
+ Organism: Mus musculus
84
  Resolution: none
85
  Sequence: none
86
+ PDB_ID: none
87
  Method: none
88
 
89
+ Query: "Show me E. coli protein structures solved by Cryo-EM"
90
  Protein: none
91
+ Organism: Escherichia coli
92
  Resolution: none
93
  Sequence: none
94
+ PDB_ID: none
95
+ Method: EM
96
+
97
+ Query: "Find S. cerevisiae structures with resolution better than 1.8Å"
98
+ Protein: none
99
+ Organism: Saccharomyces cerevisiae
100
+ Resolution: 1.8
101
+ Sequence: none
102
+ PDB_ID: none
103
  Method: none
104
+
105
+ Query: "Sequence of 7BZ5"
106
  Protein: none
107
  Organism: none
108
  Resolution: none
109
+ Sequence: none
110
+ PDB_ID: 7BZ5
111
  Method: none
112
 
113
+
114
+
115
  Now analyze:
116
  Query: {query}
117
  """
118
 
119
+ self.pdb_dir = "pdb_tmp" # ์ž„์‹œ PDB ํŒŒ์ผ ์ €์žฅ ๋””๋ ‰ํ† ๋ฆฌ
120
+ os.makedirs(self.pdb_dir, exist_ok=True)
121
+ self.pdbl = PDBList()
122
+
123
  def search_pdb(self, query):
124
  try:
125
  # Get search parameters from LLM
 
132
  pdb_id = None
133
  sequence = None
134
  method = None
135
+ organism = None
136
  has_resolution_query = False
137
  resolution_direction = "less"
138
 
 
194
  value = line.split('PDB_ID:')[1].strip()
195
  if value.lower() not in ['none', 'n/a']:
196
  pdb_id = value
197
+ elif 'Organism:' in line:
198
+ value = line.split('Organism:')[1].strip()
199
+ if value.lower() not in ['none', 'n/a']:
200
+ organism = value
201
 
202
  # Build search query
203
  queries = []
 
283
  )
284
  queries.append(method_query)
285
 
286
+ # Add organism filter if specified
287
+ if organism:
288
+ print(f"Adding organism filter: {organism}")
289
+ organism_query = AttributeQuery(
290
+ attribute="rcsb_entity_source_organism.taxonomy_lineage.name",
291
+ operator="exact_match",
292
+ value=organism
293
+ )
294
+ queries.append(organism_query)
295
+
296
  # Combine queries with AND operator
297
  if queries:
298
  final_query = queries[0]
 
367
  return []
368
 
369
def get_sequences_by_pdb_id(self, pdb_id):
    """Get the sequence of every chain in a PDB structure using Biopython.

    Downloads the entry in PDB format into ``self.pdb_dir`` through
    ``self.pdbl``, parses it, and builds a one-letter sequence per chain
    from the standard amino-acid residues present in the file.  Entry-level
    details (title, resolution, method, release date) are read from the
    RCSB entry endpoint and attached to every chain record.

    Args:
        pdb_id: PDB entry identifier.

    Returns:
        A list of dicts with keys ``chain_id``, ``entity_id``,
        ``description``, ``sequence``, ``length``, ``resolution``,
        ``method`` and ``release_date``; chains with no standard residues
        are omitted.  An empty list is returned when the file cannot be
        downloaded or any other error occurs.
    """
    pdb_path = None
    try:
        # Download PDB file
        pdb_path = self.pdbl.retrieve_pdb_file(
            pdb_id,
            pdir=self.pdb_dir,
            file_format="pdb"
        )

        if not pdb_path or not os.path.exists(pdb_path):
            print(f"Failed to download PDB file for {pdb_id}")
            return []

        # Parse structure
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, pdb_path)

        # Entry metadata is optional decoration: a slow or failing API must
        # not throw away sequences that were already parsed from the file.
        structure_data = {}
        try:
            response = requests.get(
                f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}",
                timeout=10,  # requests never times out unless told to
            )
            if response.status_code == 200:
                structure_data = response.json() or {}
        except (requests.RequestException, ValueError) as e:
            print(f"Could not fetch entry metadata for {pdb_id}: {str(e)}")

        # The `or` fallbacks cover fields that are missing, null or empty,
        # any of which would otherwise raise on the [0] lookups.
        description = (structure_data.get('struct') or {}).get('title', 'N/A')
        resolutions = (structure_data.get('rcsb_entry_info') or {}).get('resolution_combined') or [0.0]
        experiments = structure_data.get('exptl') or [{}]
        method = experiments[0].get('method', 'Unknown')
        release_date = (structure_data.get('rcsb_accession_info') or {}).get('initial_release_date', 'N/A')

        # Use the first model only: iterating every model would emit each
        # chain once per model for multi-model files.
        first_model = next(iter(structure), None)
        chains = first_model if first_model is not None else ()

        sequences = []
        for chain in chains:
            # Convert three-letter residue names to one-letter codes,
            # skipping anything that is not a standard amino acid.
            sequence = "".join(
                protein_letters_3to1[residue.get_resname()]
                for residue in chain
                if PDB.is_aa(residue, standard=True)
                and residue.get_resname() in protein_letters_3to1
            )

            if not sequence:  # Only add if sequence is not empty
                continue

            sequences.append({
                'chain_id': chain.id,
                'entity_id': '1',  # Default entity ID
                'description': description,
                'sequence': sequence,
                'length': len(sequence),
                'resolution': resolutions[0],
                'method': method,
                'release_date': release_date
            })

        return sequences

    except Exception as e:
        print(f"Error getting sequences for PDB ID {pdb_id}: {str(e)}")
        return []

    finally:
        # Remove the downloaded file on every exit path, including failures.
        if pdb_path and os.path.exists(pdb_path):
            try:
                os.remove(pdb_path)
            except OSError as e:
                print(f"Could not remove temporary file {pdb_path}: {str(e)}")
429
+
430
def __del__(self):
    """Best-effort removal of the temporary PDB directory on destruction.

    Does nothing when ``pdb_dir`` was never set on the instance.
    """
    pdb_dir = getattr(self, 'pdb_dir', None)
    if pdb_dir:
        # ignore_errors covers a directory that is already gone or still
        # in use, so the finalizer never raises from rmtree.
        shutil.rmtree(pdb_dir, ignore_errors=True)
434
 
435
  def process_query(self, query):
436
  """Process query and return results"""
 
492
 
493
  return answer
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  def create_interactive_table(df):
496
  if df.empty:
497
  return go.Figure()
 
559
  margin: 0 auto;
560
  }
561
  #query {
562
+ height: 150px;
563
  font-size: 16px;
564
  padding: 15px;
565
  width: 80%;
566
  margin: 0 auto;
567
  display: block;
568
+ white-space: pre-wrap;
569
+ word-wrap: break-word;
570
+ resize: vertical;
571
+ overflow-y: auto;
572
  }
573
  .content-wrapper {
574
  text-align: center;
 
663
  "Search Query",
664
  {"class": "query-label", "for": "query"}
665
  ),
666
+ ui.input_text_area(
667
  "query",
668
  "",
669
  value="Human insulin",
670
+ width="100%",
671
+ resize="vertical"
672
  ),
673
  )
674
  ),
 
724
  ui.output_text("sequence_output")
725
  )
726
  )
 
 
 
 
 
 
 
 
 
727
  )
728
  )
729
  )
 
736
  @reactive.Effect
737
  @reactive.event(input.search)
738
  def _():
 
739
  status_store.set("Searching...")
740
 
 
741
  query_results = assistant.process_query(input.query())
742
  results_store.set(query_results)
743
 
 
745
  if not query_results["results"]:
746
  status_store.set("No sequences found")
747
  else:
748
+ status_store.set("Ready")
749
  else:
750
  df = pd.DataFrame(query_results["results"])
751
+ if df.empty:
752
+ status_store.set("No structures found")
753
+ else:
754
+ status_store.set("Ready")
755
+ @output
756
+ @render_widget
757
+ def results_table():
758
+ return create_interactive_table(df)
759
+
760
  @output
761
  @render.text
762
  def search_status():
 
796
 
797
  return "\n".join(output_text)
798
  return ""
 
 
 
 
 
 
 
 
 
 
 
799
 
800
  app = App(app_ui, server)
801
 
802
  if __name__ == "__main__":
803
  import nest_asyncio
804
  nest_asyncio.apply()
805
+ app.run(host="0.0.0.0", port=7862)