Upload app.py
Browse files
app.py
CHANGED
@@ -62,15 +62,33 @@ class PDBSearchAssistant:
|
|
62 |
Organism: none
|
63 |
Similarity: 100
|
64 |
|
65 |
-
Query: "
|
66 |
Protein: insulin
|
67 |
-
Resolution:
|
68 |
Sequence: none
|
69 |
PDB_ID: none
|
70 |
-
Method:
|
71 |
Organism: Homo sapiens
|
72 |
Similarity: none
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
|
75 |
Protein: none
|
76 |
Resolution: none
|
@@ -191,9 +209,37 @@ class PDBSearchAssistant:
|
|
191 |
has_resolution_query = True
|
192 |
print(f"Extracted resolution from query: {resolution_limit}Å")
|
193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
# Build queries list
|
195 |
queries = []
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
# Add sequence query if present
|
198 |
query_words = query.split()
|
199 |
for word in query_words:
|
@@ -264,74 +310,93 @@ class PDBSearchAssistant:
|
|
264 |
)
|
265 |
queries.append(organism_query)
|
266 |
|
267 |
-
# Combine queries with
|
268 |
if queries:
|
269 |
-
final_query = queries[0]
|
270 |
-
for q in queries[1:]:
|
271 |
-
final_query = final_query & q
|
272 |
-
|
273 |
-
print("Final query:", final_query)
|
274 |
-
|
275 |
-
# Execute search
|
276 |
-
session = final_query.exec()
|
277 |
-
results = []
|
278 |
-
|
279 |
-
# Process results with additional information
|
280 |
-
# search_engine = ProteinSearchEngine()
|
281 |
-
|
282 |
try:
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
|
298 |
-
|
299 |
-
structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
300 |
-
response = requests.get(structure_url)
|
301 |
-
|
302 |
-
if response.status_code != 200:
|
303 |
-
continue
|
304 |
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
'# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
|
312 |
-
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
313 |
-
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
314 |
-
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
|
317 |
-
|
318 |
-
|
319 |
-
results.append(result)
|
320 |
-
|
321 |
-
# Limit to top 10 results
|
322 |
-
if len(results) >= 10:
|
323 |
-
break
|
324 |
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
|
333 |
-
|
334 |
-
|
|
|
|
|
335 |
|
336 |
return []
|
337 |
|
@@ -954,7 +1019,7 @@ app_ui = ui.page_fluid(
|
|
954 |
ui.input_text_area(
|
955 |
"query",
|
956 |
"",
|
957 |
-
value="
|
958 |
width="100%",
|
959 |
resize="vertical"
|
960 |
),
|
@@ -970,10 +1035,13 @@ app_ui = ui.page_fluid(
|
|
970 |
{"class": "example-box"},
|
971 |
ui.p("Example queries:"),
|
972 |
ui.tags.ul(
|
|
|
|
|
|
|
973 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
974 |
ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
975 |
ui.tags.li("Find structures with resolution better than 3 angstrom and sequence similarity 90% of FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
976 |
-
|
977 |
)
|
978 |
)
|
979 |
)
|
|
|
62 |
Organism: none
|
63 |
Similarity: 100
|
64 |
|
65 |
+
Query: "human insulin"
|
66 |
Protein: insulin
|
67 |
+
Resolution: none
|
68 |
Sequence: none
|
69 |
PDB_ID: none
|
70 |
+
Method: none
|
71 |
Organism: Homo sapiens
|
72 |
Similarity: none
|
73 |
|
74 |
+
Query: "Spike protein"
|
75 |
+
Protein: Spike protein
|
76 |
+
Resolution: none
|
77 |
+
Sequence: none
|
78 |
+
PDB_ID: none
|
79 |
+
Method: none
|
80 |
+
Organism: none
|
81 |
+
Similarity: none
|
82 |
+
|
83 |
+
Query: "Human hemoglobin C resolution better than 2.5Å"
|
84 |
+
Protein: hemoglobin C
|
85 |
+
Resolution: 2.5
|
86 |
+
Sequence: none
|
87 |
+
PDB_ID: none
|
88 |
+
Method: none
|
89 |
+
Organism: Homo sapiens
|
90 |
+
Similarity: none
|
91 |
+
|
92 |
Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
|
93 |
Protein: none
|
94 |
Resolution: none
|
|
|
209 |
has_resolution_query = True
|
210 |
print(f"Extracted resolution from query: {resolution_limit}Å")
|
211 |
|
212 |
+
# Add protein name extraction from response pairs
|
213 |
+
protein_name = None
|
214 |
+
if 'protein' in response_pairs:
|
215 |
+
protein_name = response_pairs['protein']
|
216 |
+
print(f"Extracted protein name: {protein_name}")
|
217 |
+
|
218 |
# Build queries list
|
219 |
queries = []
|
220 |
|
221 |
+
# Add protein name query if specified
|
222 |
+
if protein_name:
|
223 |
+
print(f"Adding protein name filter: {protein_name}")
|
224 |
+
try:
|
225 |
+
protein_query = AttributeQuery(
|
226 |
+
attribute="struct.title",
|
227 |
+
operator="contains_words",
|
228 |
+
value=protein_name
|
229 |
+
)
|
230 |
+
queries.append(protein_query)
|
231 |
+
|
232 |
+
protein_entity_query = AttributeQuery(
|
233 |
+
attribute="rcsb_entity_container_identifiers.entity_names.value",
|
234 |
+
operator="contains_words",
|
235 |
+
value=protein_name
|
236 |
+
)
|
237 |
+
queries.append(protein_entity_query)
|
238 |
+
|
239 |
+
print(f"Created protein queries successfully: {protein_query}, {protein_entity_query}")
|
240 |
+
except Exception as e:
|
241 |
+
print(f"Error creating protein queries: {str(e)}")
|
242 |
+
|
243 |
# Add sequence query if present
|
244 |
query_words = query.split()
|
245 |
for word in query_words:
|
|
|
310 |
)
|
311 |
queries.append(organism_query)
|
312 |
|
313 |
+
# Combine queries with improved error handling
|
314 |
if queries:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
try:
|
316 |
+
if protein_name and len(queries) >= 2:
|
317 |
+
print("Combining protein queries with OR")
|
318 |
+
protein_queries = queries[0] | queries[1]
|
319 |
+
print("Successfully combined protein queries")
|
320 |
+
|
321 |
+
if len(queries) > 2:
|
322 |
+
print("Combining with additional queries using AND")
|
323 |
+
final_query = protein_queries
|
324 |
+
for q in queries[2:]:
|
325 |
+
final_query = final_query & q
|
326 |
+
else:
|
327 |
+
final_query = protein_queries
|
328 |
+
else:
|
329 |
+
final_query = queries[0]
|
330 |
+
for q in queries[1:]:
|
331 |
+
final_query = final_query & q
|
332 |
+
|
333 |
+
print("Final query:", final_query)
|
334 |
+
|
335 |
+
# Execute search
|
336 |
+
session = final_query.exec()
|
337 |
+
results = []
|
338 |
+
|
339 |
+
# Process results with additional information
|
340 |
+
# search_engine = ProteinSearchEngine()
|
341 |
+
|
342 |
+
try:
|
343 |
+
for entry in session:
|
344 |
+
try:
|
345 |
+
# PDB ID 추출 방식 개선
|
346 |
+
if isinstance(entry, dict):
|
347 |
+
pdb_id = entry.get('identifier')
|
348 |
+
elif hasattr(entry, 'identifier'):
|
349 |
+
pdb_id = entry.identifier
|
350 |
+
else:
|
351 |
+
pdb_id = str(entry)
|
352 |
|
353 |
+
pdb_id = pdb_id.upper() # PDB ID는 항상 대문자
|
|
|
|
|
|
|
|
|
|
|
354 |
|
355 |
+
if not pdb_id or len(pdb_id) != 4: # PDB ID는 항상 4자리
|
356 |
+
continue
|
357 |
+
|
358 |
+
# RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
|
359 |
+
structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
360 |
+
response = requests.get(structure_url)
|
|
|
|
|
|
|
|
|
361 |
|
362 |
+
if response.status_code != 200:
|
363 |
+
continue
|
364 |
+
|
365 |
+
structure_data = response.json()
|
366 |
+
# 결과 구성
|
367 |
+
result = {
|
368 |
+
'PDB ID': pdb_id,
|
369 |
+
'Title': structure_data.get('struct', {}).get('title', 'N/A'),
|
370 |
+
'# of total residues': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_residues_total', 'N/A'),
|
371 |
+
'# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
|
372 |
+
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
373 |
+
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
374 |
+
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
375 |
+
|
376 |
+
|
377 |
+
}
|
378 |
|
379 |
+
results.append(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
380 |
|
381 |
+
# Limit to top 10 results
|
382 |
+
if len(results) >= 10:
|
383 |
+
break
|
384 |
+
|
385 |
+
except Exception as e:
|
386 |
+
print(f"Error processing entry: {str(e)}")
|
387 |
+
continue
|
388 |
+
|
389 |
+
except Exception as e:
|
390 |
+
print(f"Error processing results: {str(e)}")
|
391 |
+
print(f"Error type: {type(e)}")
|
392 |
+
|
393 |
+
print(f"Found {len(results)} structures")
|
394 |
+
return results
|
395 |
|
396 |
+
except Exception as e:
|
397 |
+
print(f"Error combining queries: {str(e)}")
|
398 |
+
print(f"Query state: {queries}")
|
399 |
+
return []
|
400 |
|
401 |
return []
|
402 |
|
|
|
1019 |
ui.input_text_area(
|
1020 |
"query",
|
1021 |
"",
|
1022 |
+
value="",
|
1023 |
width="100%",
|
1024 |
resize="vertical"
|
1025 |
),
|
|
|
1035 |
{"class": "example-box"},
|
1036 |
ui.p("Example queries:"),
|
1037 |
ui.tags.ul(
|
1038 |
+
ui.tags.li("Sequence of PDB ID 8ET6"),
|
1039 |
+
ui.tags.li("Spike protein"),
|
1040 |
+
ui.tags.li("Human insulin"),
|
1041 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
1042 |
ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
1043 |
ui.tags.li("Find structures with resolution better than 3 angstrom and sequence similarity 90% of FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
1044 |
+
|
1045 |
)
|
1046 |
)
|
1047 |
)
|