lkjjj26 committed on
Commit
b4f6595
·
verified ·
1 Parent(s): 2c254d5

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -65
app.py CHANGED
@@ -62,15 +62,33 @@ class PDBSearchAssistant:
62
  Organism: none
63
  Similarity: 100
64
 
65
- Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
66
  Protein: insulin
67
- Resolution: 2.5
68
  Sequence: none
69
  PDB_ID: none
70
- Method: X-RAY
71
  Organism: Homo sapiens
72
  Similarity: none
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
75
  Protein: none
76
  Resolution: none
@@ -191,9 +209,37 @@ class PDBSearchAssistant:
191
  has_resolution_query = True
192
  print(f"Extracted resolution from query: {resolution_limit}Å")
193
 
 
 
 
 
 
 
194
  # Build queries list
195
  queries = []
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  # Add sequence query if present
198
  query_words = query.split()
199
  for word in query_words:
@@ -264,74 +310,93 @@ class PDBSearchAssistant:
264
  )
265
  queries.append(organism_query)
266
 
267
- # Combine queries with AND operator
268
  if queries:
269
- final_query = queries[0]
270
- for q in queries[1:]:
271
- final_query = final_query & q
272
-
273
- print("Final query:", final_query)
274
-
275
- # Execute search
276
- session = final_query.exec()
277
- results = []
278
-
279
- # Process results with additional information
280
- # search_engine = ProteinSearchEngine()
281
-
282
  try:
283
- for entry in session:
284
- try:
285
- # PDB ID 추출 방식 개선
286
- if isinstance(entry, dict):
287
- pdb_id = entry.get('identifier')
288
- elif hasattr(entry, 'identifier'):
289
- pdb_id = entry.identifier
290
- else:
291
- pdb_id = str(entry)
292
-
293
- pdb_id = pdb_id.upper() # PDB ID는 항상 대문자
294
-
295
- if not pdb_id or len(pdb_id) != 4: # PDB ID는 항상 4자리
296
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
- # RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
299
- structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
300
- response = requests.get(structure_url)
301
-
302
- if response.status_code != 200:
303
- continue
304
 
305
- structure_data = response.json()
306
- # 결과 구성
307
- result = {
308
- 'PDB ID': pdb_id,
309
- 'Title': structure_data.get('struct', {}).get('title', 'N/A'),
310
- '# of total residues': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_residues_total', 'N/A'),
311
- '# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
312
- 'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
313
- 'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
314
- 'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
- }
318
-
319
- results.append(result)
320
-
321
- # Limit to top 10 results
322
- if len(results) >= 10:
323
- break
324
 
325
- except Exception as e:
326
- print(f"Error processing entry: {str(e)}")
327
- continue
328
-
329
- except Exception as e:
330
- print(f"Error processing results: {str(e)}")
331
- print(f"Error type: {type(e)}")
 
 
 
 
 
 
 
332
 
333
- print(f"Found {len(results)} structures")
334
- return results
 
 
335
 
336
  return []
337
 
@@ -954,7 +1019,7 @@ app_ui = ui.page_fluid(
954
  ui.input_text_area(
955
  "query",
956
  "",
957
- value="Human insulin",
958
  width="100%",
959
  resize="vertical"
960
  ),
@@ -970,10 +1035,13 @@ app_ui = ui.page_fluid(
970
  {"class": "example-box"},
971
  ui.p("Example queries:"),
972
  ui.tags.ul(
 
 
 
973
  ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
974
  ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
975
  ui.tags.li("Find structures with resolution better than 3 angstrom and sequence similarity 90% of FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
976
- ui.tags.li("Sequence of PDB ID 8ET6")
977
  )
978
  )
979
  )
 
62
  Organism: none
63
  Similarity: 100
64
 
65
+ Query: "human insulin"
66
  Protein: insulin
67
+ Resolution: none
68
  Sequence: none
69
  PDB_ID: none
70
+ Method: none
71
  Organism: Homo sapiens
72
  Similarity: none
73
 
74
+ Query: "Spike protein"
75
+ Protein: Spike protein
76
+ Resolution: none
77
+ Sequence: none
78
+ PDB_ID: none
79
+ Method: none
80
+ Organism: none
81
+ Similarity: none
82
+
83
+ Query: "Human hemoglobin C resolution better than 2.5Å"
84
+ Protein: hemoglobin C
85
+ Resolution: 2.5
86
+ Sequence: none
87
+ PDB_ID: none
88
+ Method: none
89
+ Organism: Homo sapiens
90
+ Similarity: none
91
+
92
  Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
93
  Protein: none
94
  Resolution: none
 
209
  has_resolution_query = True
210
  print(f"Extracted resolution from query: {resolution_limit}Å")
211
 
212
+ # Add protein name extraction from response pairs
213
+ protein_name = None
214
+ if 'protein' in response_pairs:
215
+ protein_name = response_pairs['protein']
216
+ print(f"Extracted protein name: {protein_name}")
217
+
218
  # Build queries list
219
  queries = []
220
 
221
+ # Add protein name query if specified
222
+ if protein_name:
223
+ print(f"Adding protein name filter: {protein_name}")
224
+ try:
225
+ protein_query = AttributeQuery(
226
+ attribute="struct.title",
227
+ operator="contains_words",
228
+ value=protein_name
229
+ )
230
+ queries.append(protein_query)
231
+
232
+ protein_entity_query = AttributeQuery(
233
+ attribute="rcsb_entity_container_identifiers.entity_names.value",
234
+ operator="contains_words",
235
+ value=protein_name
236
+ )
237
+ queries.append(protein_entity_query)
238
+
239
+ print(f"Created protein queries successfully: {protein_query}, {protein_entity_query}")
240
+ except Exception as e:
241
+ print(f"Error creating protein queries: {str(e)}")
242
+
243
  # Add sequence query if present
244
  query_words = query.split()
245
  for word in query_words:
 
310
  )
311
  queries.append(organism_query)
312
 
313
+ # Combine queries with improved error handling
314
  if queries:
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  try:
316
+ if protein_name and len(queries) >= 2:
317
+ print("Combining protein queries with OR")
318
+ protein_queries = queries[0] | queries[1]
319
+ print("Successfully combined protein queries")
320
+
321
+ if len(queries) > 2:
322
+ print("Combining with additional queries using AND")
323
+ final_query = protein_queries
324
+ for q in queries[2:]:
325
+ final_query = final_query & q
326
+ else:
327
+ final_query = protein_queries
328
+ else:
329
+ final_query = queries[0]
330
+ for q in queries[1:]:
331
+ final_query = final_query & q
332
+
333
+ print("Final query:", final_query)
334
+
335
+ # Execute search
336
+ session = final_query.exec()
337
+ results = []
338
+
339
+ # Process results with additional information
340
+ # search_engine = ProteinSearchEngine()
341
+
342
+ try:
343
+ for entry in session:
344
+ try:
345
+ # PDB ID 추출 방식 개선
346
+ if isinstance(entry, dict):
347
+ pdb_id = entry.get('identifier')
348
+ elif hasattr(entry, 'identifier'):
349
+ pdb_id = entry.identifier
350
+ else:
351
+ pdb_id = str(entry)
352
 
353
+ pdb_id = pdb_id.upper() # PDB ID는 항상 대문자
 
 
 
 
 
354
 
355
+ if not pdb_id or len(pdb_id) != 4: # PDB ID는 항상 4자리
356
+ continue
357
+
358
+ # RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
359
+ structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
360
+ response = requests.get(structure_url)
 
 
 
 
361
 
362
+ if response.status_code != 200:
363
+ continue
364
+
365
+ structure_data = response.json()
366
+ # 결과 구성
367
+ result = {
368
+ 'PDB ID': pdb_id,
369
+ 'Title': structure_data.get('struct', {}).get('title', 'N/A'),
370
+ '# of total residues': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_residues_total', 'N/A'),
371
+ '# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
372
+ 'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
373
+ 'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
374
+ 'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
375
+
376
+
377
+ }
378
 
379
+ results.append(result)
 
 
 
 
 
 
380
 
381
+ # Limit to top 10 results
382
+ if len(results) >= 10:
383
+ break
384
+
385
+ except Exception as e:
386
+ print(f"Error processing entry: {str(e)}")
387
+ continue
388
+
389
+ except Exception as e:
390
+ print(f"Error processing results: {str(e)}")
391
+ print(f"Error type: {type(e)}")
392
+
393
+ print(f"Found {len(results)} structures")
394
+ return results
395
 
396
+ except Exception as e:
397
+ print(f"Error combining queries: {str(e)}")
398
+ print(f"Query state: {queries}")
399
+ return []
400
 
401
  return []
402
 
 
1019
  ui.input_text_area(
1020
  "query",
1021
  "",
1022
+ value="",
1023
  width="100%",
1024
  resize="vertical"
1025
  ),
 
1035
  {"class": "example-box"},
1036
  ui.p("Example queries:"),
1037
  ui.tags.ul(
1038
+ ui.tags.li("Sequence of PDB ID 8ET6"),
1039
+ ui.tags.li("Spike protein"),
1040
+ ui.tags.li("Human insulin"),
1041
  ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
1042
  ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
1043
  ui.tags.li("Find structures with resolution better than 3 angstrom and sequence similarity 90% of FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
1044
+
1045
  )
1046
  )
1047
  )