Upload app.py
Browse files
app.py
CHANGED
|
@@ -62,15 +62,33 @@ class PDBSearchAssistant:
|
|
| 62 |
Organism: none
|
| 63 |
Similarity: 100
|
| 64 |
|
| 65 |
-
Query: "
|
| 66 |
Protein: insulin
|
| 67 |
-
Resolution:
|
| 68 |
Sequence: none
|
| 69 |
PDB_ID: none
|
| 70 |
-
Method:
|
| 71 |
Organism: Homo sapiens
|
| 72 |
Similarity: none
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
|
| 75 |
Protein: none
|
| 76 |
Resolution: none
|
|
@@ -191,9 +209,37 @@ class PDBSearchAssistant:
|
|
| 191 |
has_resolution_query = True
|
| 192 |
print(f"Extracted resolution from query: {resolution_limit}Å")
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
# Build queries list
|
| 195 |
queries = []
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
# Add sequence query if present
|
| 198 |
query_words = query.split()
|
| 199 |
for word in query_words:
|
|
@@ -264,74 +310,93 @@ class PDBSearchAssistant:
|
|
| 264 |
)
|
| 265 |
queries.append(organism_query)
|
| 266 |
|
| 267 |
-
# Combine queries with
|
| 268 |
if queries:
|
| 269 |
-
final_query = queries[0]
|
| 270 |
-
for q in queries[1:]:
|
| 271 |
-
final_query = final_query & q
|
| 272 |
-
|
| 273 |
-
print("Final query:", final_query)
|
| 274 |
-
|
| 275 |
-
# Execute search
|
| 276 |
-
session = final_query.exec()
|
| 277 |
-
results = []
|
| 278 |
-
|
| 279 |
-
# Process results with additional information
|
| 280 |
-
# search_engine = ProteinSearchEngine()
|
| 281 |
-
|
| 282 |
try:
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
|
| 298 |
-
|
| 299 |
-
structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
| 300 |
-
response = requests.get(structure_url)
|
| 301 |
-
|
| 302 |
-
if response.status_code != 200:
|
| 303 |
-
continue
|
| 304 |
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
'# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
|
| 312 |
-
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
| 313 |
-
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
| 314 |
-
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
results.append(result)
|
| 320 |
-
|
| 321 |
-
# Limit to top 10 results
|
| 322 |
-
if len(results) >= 10:
|
| 323 |
-
break
|
| 324 |
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
|
|
|
|
|
|
| 335 |
|
| 336 |
return []
|
| 337 |
|
|
@@ -954,7 +1019,7 @@ app_ui = ui.page_fluid(
|
|
| 954 |
ui.input_text_area(
|
| 955 |
"query",
|
| 956 |
"",
|
| 957 |
-
value="
|
| 958 |
width="100%",
|
| 959 |
resize="vertical"
|
| 960 |
),
|
|
@@ -970,10 +1035,13 @@ app_ui = ui.page_fluid(
|
|
| 970 |
{"class": "example-box"},
|
| 971 |
ui.p("Example queries:"),
|
| 972 |
ui.tags.ul(
|
|
|
|
|
|
|
|
|
|
| 973 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
| 974 |
ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
| 975 |
ui.tags.li("Find structures with resolution better than 3 angstrom and sequence similarity 90% of FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
| 976 |
-
|
| 977 |
)
|
| 978 |
)
|
| 979 |
)
|
|
|
|
| 62 |
Organism: none
|
| 63 |
Similarity: 100
|
| 64 |
|
| 65 |
+
Query: "human insulin"
|
| 66 |
Protein: insulin
|
| 67 |
+
Resolution: none
|
| 68 |
Sequence: none
|
| 69 |
PDB_ID: none
|
| 70 |
+
Method: none
|
| 71 |
Organism: Homo sapiens
|
| 72 |
Similarity: none
|
| 73 |
|
| 74 |
+
Query: "Spike protein"
|
| 75 |
+
Protein: Spike protein
|
| 76 |
+
Resolution: none
|
| 77 |
+
Sequence: none
|
| 78 |
+
PDB_ID: none
|
| 79 |
+
Method: none
|
| 80 |
+
Organism: none
|
| 81 |
+
Similarity: none
|
| 82 |
+
|
| 83 |
+
Query: "Human hemoglobin C resolution better than 2.5Å"
|
| 84 |
+
Protein: hemoglobin C
|
| 85 |
+
Resolution: 2.5
|
| 86 |
+
Sequence: none
|
| 87 |
+
PDB_ID: none
|
| 88 |
+
Method: none
|
| 89 |
+
Organism: Homo sapiens
|
| 90 |
+
Similarity: none
|
| 91 |
+
|
| 92 |
Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
|
| 93 |
Protein: none
|
| 94 |
Resolution: none
|
|
|
|
| 209 |
has_resolution_query = True
|
| 210 |
print(f"Extracted resolution from query: {resolution_limit}Å")
|
| 211 |
|
| 212 |
+
# Add protein name extraction from response pairs
|
| 213 |
+
protein_name = None
|
| 214 |
+
if 'protein' in response_pairs:
|
| 215 |
+
protein_name = response_pairs['protein']
|
| 216 |
+
print(f"Extracted protein name: {protein_name}")
|
| 217 |
+
|
| 218 |
# Build queries list
|
| 219 |
queries = []
|
| 220 |
|
| 221 |
+
# Add protein name query if specified
|
| 222 |
+
if protein_name:
|
| 223 |
+
print(f"Adding protein name filter: {protein_name}")
|
| 224 |
+
try:
|
| 225 |
+
protein_query = AttributeQuery(
|
| 226 |
+
attribute="struct.title",
|
| 227 |
+
operator="contains_words",
|
| 228 |
+
value=protein_name
|
| 229 |
+
)
|
| 230 |
+
queries.append(protein_query)
|
| 231 |
+
|
| 232 |
+
protein_entity_query = AttributeQuery(
|
| 233 |
+
attribute="rcsb_entity_container_identifiers.entity_names.value",
|
| 234 |
+
operator="contains_words",
|
| 235 |
+
value=protein_name
|
| 236 |
+
)
|
| 237 |
+
queries.append(protein_entity_query)
|
| 238 |
+
|
| 239 |
+
print(f"Created protein queries successfully: {protein_query}, {protein_entity_query}")
|
| 240 |
+
except Exception as e:
|
| 241 |
+
print(f"Error creating protein queries: {str(e)}")
|
| 242 |
+
|
| 243 |
# Add sequence query if present
|
| 244 |
query_words = query.split()
|
| 245 |
for word in query_words:
|
|
|
|
| 310 |
)
|
| 311 |
queries.append(organism_query)
|
| 312 |
|
| 313 |
+
# Combine queries with improved error handling
|
| 314 |
if queries:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
try:
|
| 316 |
+
if protein_name and len(queries) >= 2:
|
| 317 |
+
print("Combining protein queries with OR")
|
| 318 |
+
protein_queries = queries[0] | queries[1]
|
| 319 |
+
print("Successfully combined protein queries")
|
| 320 |
+
|
| 321 |
+
if len(queries) > 2:
|
| 322 |
+
print("Combining with additional queries using AND")
|
| 323 |
+
final_query = protein_queries
|
| 324 |
+
for q in queries[2:]:
|
| 325 |
+
final_query = final_query & q
|
| 326 |
+
else:
|
| 327 |
+
final_query = protein_queries
|
| 328 |
+
else:
|
| 329 |
+
final_query = queries[0]
|
| 330 |
+
for q in queries[1:]:
|
| 331 |
+
final_query = final_query & q
|
| 332 |
+
|
| 333 |
+
print("Final query:", final_query)
|
| 334 |
+
|
| 335 |
+
# Execute search
|
| 336 |
+
session = final_query.exec()
|
| 337 |
+
results = []
|
| 338 |
+
|
| 339 |
+
# Process results with additional information
|
| 340 |
+
# search_engine = ProteinSearchEngine()
|
| 341 |
+
|
| 342 |
+
try:
|
| 343 |
+
for entry in session:
|
| 344 |
+
try:
|
| 345 |
+
# PDB ID 추출 방식 개선
|
| 346 |
+
if isinstance(entry, dict):
|
| 347 |
+
pdb_id = entry.get('identifier')
|
| 348 |
+
elif hasattr(entry, 'identifier'):
|
| 349 |
+
pdb_id = entry.identifier
|
| 350 |
+
else:
|
| 351 |
+
pdb_id = str(entry)
|
| 352 |
|
| 353 |
+
pdb_id = pdb_id.upper() # PDB ID는 항상 대문자
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
+
if not pdb_id or len(pdb_id) != 4: # PDB ID는 항상 4자리
|
| 356 |
+
continue
|
| 357 |
+
|
| 358 |
+
# RCSB PDB REST API를 직접 사용하여 구조 정보 가져오기
|
| 359 |
+
structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
| 360 |
+
response = requests.get(structure_url)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
+
if response.status_code != 200:
|
| 363 |
+
continue
|
| 364 |
+
|
| 365 |
+
structure_data = response.json()
|
| 366 |
+
# 결과 구성
|
| 367 |
+
result = {
|
| 368 |
+
'PDB ID': pdb_id,
|
| 369 |
+
'Title': structure_data.get('struct', {}).get('title', 'N/A'),
|
| 370 |
+
'# of total residues': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_residues_total', 'N/A'),
|
| 371 |
+
'# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
|
| 372 |
+
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
| 373 |
+
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
| 374 |
+
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
}
|
| 378 |
|
| 379 |
+
results.append(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
+
# Limit to top 10 results
|
| 382 |
+
if len(results) >= 10:
|
| 383 |
+
break
|
| 384 |
+
|
| 385 |
+
except Exception as e:
|
| 386 |
+
print(f"Error processing entry: {str(e)}")
|
| 387 |
+
continue
|
| 388 |
+
|
| 389 |
+
except Exception as e:
|
| 390 |
+
print(f"Error processing results: {str(e)}")
|
| 391 |
+
print(f"Error type: {type(e)}")
|
| 392 |
+
|
| 393 |
+
print(f"Found {len(results)} structures")
|
| 394 |
+
return results
|
| 395 |
|
| 396 |
+
except Exception as e:
|
| 397 |
+
print(f"Error combining queries: {str(e)}")
|
| 398 |
+
print(f"Query state: {queries}")
|
| 399 |
+
return []
|
| 400 |
|
| 401 |
return []
|
| 402 |
|
|
|
|
| 1019 |
ui.input_text_area(
|
| 1020 |
"query",
|
| 1021 |
"",
|
| 1022 |
+
value="",
|
| 1023 |
width="100%",
|
| 1024 |
resize="vertical"
|
| 1025 |
),
|
|
|
|
| 1035 |
{"class": "example-box"},
|
| 1036 |
ui.p("Example queries:"),
|
| 1037 |
ui.tags.ul(
|
| 1038 |
+
ui.tags.li("Sequence of PDB ID 8ET6"),
|
| 1039 |
+
ui.tags.li("Spike protein"),
|
| 1040 |
+
ui.tags.li("Human insulin"),
|
| 1041 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
| 1042 |
ui.tags.li("Find structures containing sequence with similarity 90% FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
| 1043 |
ui.tags.li("Find structures with resolution better than 3 angstrom and sequence similarity 90% of FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKR"),
|
| 1044 |
+
|
| 1045 |
)
|
| 1046 |
)
|
| 1047 |
)
|