update
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from transformers import pipeline
|
| 2 |
-
from rcsbsearchapi import TextQuery, AttributeQuery
|
| 3 |
from rcsbsearchapi.search import Sort, SequenceQuery
|
| 4 |
import os
|
| 5 |
from dotenv import load_dotenv
|
|
@@ -31,7 +31,7 @@ class PDBSearchAssistant:
|
|
| 31 |
"text2text-generation",
|
| 32 |
model=model_name,
|
| 33 |
max_new_tokens=1024,
|
| 34 |
-
temperature=0.
|
| 35 |
torch_dtype="auto",
|
| 36 |
device="cpu"
|
| 37 |
)
|
|
@@ -44,6 +44,7 @@ class PDBSearchAssistant:
|
|
| 44 |
4. Specific PDB ID
|
| 45 |
5. Experimental method (X-RAY, EM, NMR)
|
| 46 |
6. Organism/Species information
|
|
|
|
| 47 |
|
| 48 |
Format:
|
| 49 |
Protein: [protein name or type]
|
|
@@ -62,13 +63,14 @@ class PDBSearchAssistant:
|
|
| 62 |
PDB_ID: none
|
| 63 |
Method: X-RAY
|
| 64 |
|
| 65 |
-
Query: "Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
|
| 66 |
Protein: none
|
| 67 |
Organism: none
|
| 68 |
Resolution: none
|
| 69 |
Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
|
| 70 |
PDB_ID: none
|
| 71 |
Method: none
|
|
|
|
| 72 |
|
| 73 |
Query: "Get sequence of PDB ID 8ET6"
|
| 74 |
Protein: none
|
|
@@ -86,32 +88,6 @@ class PDBSearchAssistant:
|
|
| 86 |
PDB_ID: none
|
| 87 |
Method: none
|
| 88 |
|
| 89 |
-
Query: "Show me E. coli protein structures solved by Cryo-EM"
|
| 90 |
-
Protein: none
|
| 91 |
-
Organism: Escherichia coli
|
| 92 |
-
Resolution: none
|
| 93 |
-
Sequence: none
|
| 94 |
-
PDB_ID: none
|
| 95 |
-
Method: EM
|
| 96 |
-
|
| 97 |
-
Query: "Find S. cerevisiae structures with resolution better than 1.8Å"
|
| 98 |
-
Protein: none
|
| 99 |
-
Organism: Saccharomyces cerevisiae
|
| 100 |
-
Resolution: 1.8
|
| 101 |
-
Sequence: none
|
| 102 |
-
PDB_ID: none
|
| 103 |
-
Method: none
|
| 104 |
-
|
| 105 |
-
Query: "Sequence of 7BZ5"
|
| 106 |
-
Protein: none
|
| 107 |
-
Organism: none
|
| 108 |
-
Resolution: none
|
| 109 |
-
Sequence: none
|
| 110 |
-
PDB_ID: 7BZ5
|
| 111 |
-
Method: none
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
Now analyze:
|
| 116 |
Query: {query}
|
| 117 |
"""
|
|
@@ -135,6 +111,32 @@ class PDBSearchAssistant:
|
|
| 135 |
organism = None
|
| 136 |
has_resolution_query = False
|
| 137 |
resolution_direction = "less"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
# Check if query contains resolution-related terms
|
| 140 |
resolution_terms = {
|
|
@@ -166,45 +168,47 @@ class PDBSearchAssistant:
|
|
| 166 |
has_resolution_query = True
|
| 167 |
if direction: # if not None
|
| 168 |
resolution_direction = direction
|
| 169 |
-
|
| 170 |
# Also check for numerical values with Å
|
| 171 |
-
|
|
|
|
| 172 |
has_resolution_query = True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
# Clean and parse LLM response
|
| 175 |
for line in response.split('\n'):
|
| 176 |
if 'Resolution:' in line:
|
| 177 |
-
value = line.split('Resolution:')[1].strip()
|
| 178 |
if value.lower() not in ['none', 'n/a'] and has_resolution_query:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
# pass
|
| 187 |
-
|
| 188 |
-
if 'Method:' in line:
|
| 189 |
value = line.split('Method:')[1].strip()
|
| 190 |
if value.lower() not in ['none', 'n/a']:
|
| 191 |
method = value.upper()
|
| 192 |
-
|
| 193 |
value = line.split('Sequence:')[1].strip()
|
| 194 |
if value.lower() not in ['none', 'n/a']:
|
| 195 |
sequence = value
|
| 196 |
-
|
| 197 |
-
value = line.split('PDB_ID:')[1].strip()
|
| 198 |
if value.lower() not in ['none', 'n/a']:
|
| 199 |
pdb_id = value
|
| 200 |
-
|
| 201 |
-
value = line.split('
|
| 202 |
if value.lower() not in ['none', 'n/a']:
|
| 203 |
organism = value
|
| 204 |
|
| 205 |
# Build search query
|
| 206 |
queries = []
|
| 207 |
-
|
| 208 |
# Check if the query contains a protein sequence pattern
|
| 209 |
# Check for amino acid sequence (minimum 25 residues)
|
| 210 |
query_words = query.split()
|
|
@@ -222,14 +226,21 @@ class PDBSearchAssistant:
|
|
| 222 |
print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
|
| 223 |
sequence = None
|
| 224 |
else:
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
sequence_query = SequenceQuery(
|
| 227 |
sequence,
|
| 228 |
-
identity_cutoff=
|
| 229 |
evalue_cutoff=1,
|
| 230 |
sequence_type="protein"
|
| 231 |
)
|
| 232 |
queries.append(sequence_query)
|
|
|
|
| 233 |
# If no sequence, proceed with text search
|
| 234 |
else:
|
| 235 |
# Clean the original query and add text search
|
|
@@ -298,7 +309,7 @@ class PDBSearchAssistant:
|
|
| 298 |
|
| 299 |
# Combine queries with AND operator
|
| 300 |
if queries:
|
| 301 |
-
final_query = queries[
|
| 302 |
for q in queries[1:]:
|
| 303 |
final_query = final_query & q
|
| 304 |
|
|
@@ -335,14 +346,17 @@ class PDBSearchAssistant:
|
|
| 335 |
continue
|
| 336 |
|
| 337 |
structure_data = response.json()
|
| 338 |
-
|
| 339 |
# Build the result record
|
| 340 |
result = {
|
| 341 |
'PDB ID': pdb_id,
|
|
|
|
|
|
|
|
|
|
| 342 |
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
| 343 |
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
| 344 |
-
'Title': structure_data.get('struct', {}).get('title', 'N/A'),
|
| 345 |
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
|
|
|
|
|
|
| 346 |
}
|
| 347 |
|
| 348 |
results.append(result)
|
|
@@ -378,34 +392,10 @@ class PDBSearchAssistant:
|
|
| 378 |
pdir=self.pdb_dir,
|
| 379 |
file_format="pdb"
|
| 380 |
)
|
| 381 |
-
|
| 382 |
if not pdb_path or not os.path.exists(pdb_path):
|
| 383 |
print(f"Failed to download PDB file for {pdb_id}")
|
| 384 |
-
|
| 385 |
-
structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
| 386 |
-
response = requests.get(structure_url)
|
| 387 |
-
structure_data = response.json() if response.status_code == 200 else {}
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
sequence_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1"
|
| 391 |
-
seq_response = requests.get(sequence_url)
|
| 392 |
-
seq_data = seq_response.json() if response.status_code == 200 else {}
|
| 393 |
-
sequence = seq_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code', 'N/A')
|
| 394 |
-
|
| 395 |
-
sequences = []
|
| 396 |
-
|
| 397 |
-
chain_info = {
|
| 398 |
-
'chain_id': "A", # chain.id, 임의 설정 api 3개써서 가져오기는 가능
|
| 399 |
-
'entity_id': '1', # Default entity ID
|
| 400 |
-
'description': structure_data.get('struct', {}).get('title', 'N/A'),
|
| 401 |
-
'sequence': sequence,
|
| 402 |
-
'length': len(sequence),
|
| 403 |
-
'resolution': structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0],
|
| 404 |
-
'method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
| 405 |
-
'release_date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
| 406 |
-
}
|
| 407 |
-
sequences.append(chain_info)
|
| 408 |
-
return sequences
|
| 409 |
|
| 410 |
# Parse structure
|
| 411 |
parser = PDB.PDBParser(QUIET=True)
|
|
@@ -462,17 +452,7 @@ class PDBSearchAssistant:
|
|
| 462 |
def process_query(self, query):
|
| 463 |
"""Process query and return results"""
|
| 464 |
try:
|
| 465 |
-
#
|
| 466 |
-
query_cleaned = query.strip().upper()
|
| 467 |
-
if re.match(r'^[0-9A-Za-z]{4}$', query_cleaned):
|
| 468 |
-
# Direct PDB ID query
|
| 469 |
-
sequences = self.get_sequences_by_pdb_id(query_cleaned)
|
| 470 |
-
return {
|
| 471 |
-
"type": "sequence",
|
| 472 |
-
"results": sequences
|
| 473 |
-
}
|
| 474 |
-
|
| 475 |
-
# If not a direct PDB ID, proceed with LLM processing
|
| 476 |
formatted_prompt = self.prompt_template.format(query=query)
|
| 477 |
response = self.pipe(formatted_prompt)[0]['generated_text']
|
| 478 |
print("Generated parameters:", response)
|
|
@@ -491,6 +471,7 @@ class PDBSearchAssistant:
|
|
| 491 |
is_sequence_query = any(keyword in query.lower() for keyword in sequence_keywords)
|
| 492 |
|
| 493 |
if is_sequence_query and pdb_id:
|
|
|
|
| 494 |
sequences = self.get_sequences_by_pdb_id(pdb_id)
|
| 495 |
return {
|
| 496 |
"type": "sequence",
|
|
@@ -507,6 +488,26 @@ class PDBSearchAssistant:
|
|
| 507 |
print(f"Error processing query: {str(e)}")
|
| 508 |
return {"type": "structure", "results": []}
|
| 509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
|
| 511 |
def render_html(pdb_id):
|
| 512 |
if pdb_id is None:
|
|
@@ -550,8 +551,8 @@ def create_interactive_table(df):
|
|
| 550 |
if df.empty:
|
| 551 |
return go.Figure()
|
| 552 |
|
| 553 |
-
# Reorder columns
|
| 554 |
-
column_order = ['PDB ID', 'Resolution', '
|
| 555 |
df = df[column_order]
|
| 556 |
|
| 557 |
# Reformat Release Date (YYYY-MM-DD)
|
|
@@ -562,8 +563,8 @@ def create_interactive_table(df):
|
|
| 562 |
header=dict(
|
| 563 |
values=list(df.columns),
|
| 564 |
fill_color='paleturquoise',
|
| 565 |
-
align='center',
|
| 566 |
-
font=dict(size=16),
|
| 567 |
),
|
| 568 |
cells=dict(
|
| 569 |
values=[
|
|
@@ -572,11 +573,11 @@ def create_interactive_table(df):
|
|
| 572 |
for cell in df[col]]
|
| 573 |
for i, col in enumerate(df.columns)
|
| 574 |
],
|
| 575 |
-
align='center',
|
| 576 |
-
font=dict(size=15),
|
| 577 |
-
height=35
|
| 578 |
),
|
| 579 |
-
columnwidth=[80, 80, 100,
|
| 580 |
customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
|
| 581 |
for i in range(len(df.columns))],
|
| 582 |
hoverlabel=dict(bgcolor='white')
|
|
@@ -585,7 +586,7 @@ def create_interactive_table(df):
|
|
| 585 |
# Update table layout
|
| 586 |
table.update_layout(
|
| 587 |
margin=dict(l=20, r=20, t=20, b=20),
|
| 588 |
-
height=450,
|
| 589 |
autosize=True
|
| 590 |
)
|
| 591 |
|
|
@@ -932,7 +933,7 @@ app_ui = ui.page_fluid(
|
|
| 932 |
|
| 933 |
.pdb-selector {
|
| 934 |
display: flex;
|
| 935 |
-
align-items:
|
| 936 |
justify-content: flex-start;
|
| 937 |
gap: 5px;
|
| 938 |
margin-bottom: 20px;
|
|
@@ -949,7 +950,7 @@ app_ui = ui.page_fluid(
|
|
| 949 |
|
| 950 |
.pdb-selector select {
|
| 951 |
margin-left: 0;
|
| 952 |
-
vertical-align:
|
| 953 |
display: inline-block;
|
| 954 |
}
|
| 955 |
|
|
@@ -1010,7 +1011,7 @@ app_ui = ui.page_fluid(
|
|
| 1010 |
ui.p("Example queries:"),
|
| 1011 |
ui.tags.ul(
|
| 1012 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
| 1013 |
-
ui.tags.li("Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
|
| 1014 |
ui.tags.li("Sequence of PDB ID 8ET6")
|
| 1015 |
)
|
| 1016 |
)
|
|
@@ -1097,11 +1098,6 @@ def server(input, output, session):
|
|
| 1097 |
for line in input.query().split():
|
| 1098 |
if re.match(r'^[0-9A-Za-z]{4}$', line):
|
| 1099 |
pdb_ids.append(line.upper())
|
| 1100 |
-
# Clear the results table for sequence queries
|
| 1101 |
-
@output
|
| 1102 |
-
@render_widget
|
| 1103 |
-
def results_table():
|
| 1104 |
-
return create_interactive_table(pd.DataFrame())
|
| 1105 |
else:
|
| 1106 |
df = pd.DataFrame(query_results["results"])
|
| 1107 |
if df.empty:
|
|
@@ -1116,6 +1112,7 @@ def server(input, output, session):
|
|
| 1116 |
|
| 1117 |
if pdb_ids:
|
| 1118 |
pdb_ids_store.set(pdb_ids)
|
|
|
|
| 1119 |
ui.update_select(
|
| 1120 |
"selected_pdb",
|
| 1121 |
choices=pdb_ids,
|
|
|
|
| 1 |
from transformers import pipeline
|
| 2 |
+
from rcsbsearchapi import TextQuery, AttributeQuery, Query
|
| 3 |
from rcsbsearchapi.search import Sort, SequenceQuery
|
| 4 |
import os
|
| 5 |
from dotenv import load_dotenv
|
|
|
|
| 31 |
"text2text-generation",
|
| 32 |
model=model_name,
|
| 33 |
max_new_tokens=1024,
|
| 34 |
+
temperature=0.1,
|
| 35 |
torch_dtype="auto",
|
| 36 |
device="cpu"
|
| 37 |
)
|
|
|
|
| 44 |
4. Specific PDB ID
|
| 45 |
5. Experimental method (X-RAY, EM, NMR)
|
| 46 |
6. Organism/Species information
|
| 47 |
+
7. Sequence similarity (in %)
|
| 48 |
|
| 49 |
Format:
|
| 50 |
Protein: [protein name or type]
|
|
|
|
| 63 |
PDB_ID: none
|
| 64 |
Method: X-RAY
|
| 65 |
|
| 66 |
+
Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
|
| 67 |
Protein: none
|
| 68 |
Organism: none
|
| 69 |
Resolution: none
|
| 70 |
Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
|
| 71 |
PDB_ID: none
|
| 72 |
Method: none
|
| 73 |
+
Similarity: 90
|
| 74 |
|
| 75 |
Query: "Get sequence of PDB ID 8ET6"
|
| 76 |
Protein: none
|
|
|
|
| 88 |
PDB_ID: none
|
| 89 |
Method: none
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
Now analyze:
|
| 92 |
Query: {query}
|
| 93 |
"""
|
|
|
|
| 111 |
organism = None
|
| 112 |
has_resolution_query = False
|
| 113 |
resolution_direction = "less"
|
| 114 |
+
similarity = None # Initialize similarity
|
| 115 |
+
print("Raw LLM response:", response) # Debug print
|
| 116 |
+
|
| 117 |
+
# Parse LLM response first to get similarity value
|
| 118 |
+
for line in response.split('\n'):
|
| 119 |
+
line = line.strip().lower() # Convert to lowercase
|
| 120 |
+
if 'similarity:' in line:
|
| 121 |
+
try:
|
| 122 |
+
similarity_str = line.split('similarity:')[1].strip()
|
| 123 |
+
if similarity_str.lower() not in ['none', 'n/a']:
|
| 124 |
+
similarity = float(similarity_str)
|
| 125 |
+
print(f"Successfully extracted similarity: {similarity}%")
|
| 126 |
+
except (ValueError, IndexError) as e:
|
| 127 |
+
print(f"Error parsing similarity: {e}")
|
| 128 |
+
continue
|
| 129 |
+
|
| 130 |
+
# If similarity is still None, try to extract from original query
|
| 131 |
+
if similarity is None:
|
| 132 |
+
# Case insensitive search for similarity pattern
|
| 133 |
+
similarity_match = re.search(r'similarity\s+(\d+(?:\.\d+)?)\s*%', query.lower())
|
| 134 |
+
if similarity_match:
|
| 135 |
+
try:
|
| 136 |
+
similarity = float(similarity_match.group(1))
|
| 137 |
+
print(f"Extracted similarity from query: {similarity}%")
|
| 138 |
+
except ValueError as e:
|
| 139 |
+
print(f"Error parsing similarity from query: {e}")
|
| 140 |
|
| 141 |
# Check if query contains resolution-related terms
|
| 142 |
resolution_terms = {
|
|
|
|
| 168 |
has_resolution_query = True
|
| 169 |
if direction: # if not None
|
| 170 |
resolution_direction = direction
|
| 171 |
+
|
| 172 |
# Also check for numerical values with Å
|
| 173 |
+
resolution_match = re.search(r'(\d+\.?\d*)\s*å?.*resolution', query_lower)
|
| 174 |
+
if resolution_match:
|
| 175 |
has_resolution_query = True
|
| 176 |
+
try:
|
| 177 |
+
resolution_limit = float(resolution_match.group(1))
|
| 178 |
+
except ValueError:
|
| 179 |
+
pass
|
| 180 |
|
| 181 |
# Clean and parse LLM response
|
| 182 |
for line in response.split('\n'):
|
| 183 |
if 'Resolution:' in line:
|
| 184 |
+
value = line.split('Resolution:')[1].strip()
|
| 185 |
if value.lower() not in ['none', 'n/a'] and has_resolution_query:
|
| 186 |
+
try:
|
| 187 |
+
# Extract just the number
|
| 188 |
+
res_value = ''.join(c for c in value if c.isdigit() or c == '.')
|
| 189 |
+
resolution_limit = float(res_value)
|
| 190 |
+
except ValueError:
|
| 191 |
+
pass
|
| 192 |
+
elif 'Method:' in line:
|
|
|
|
|
|
|
|
|
|
| 193 |
value = line.split('Method:')[1].strip()
|
| 194 |
if value.lower() not in ['none', 'n/a']:
|
| 195 |
method = value.upper()
|
| 196 |
+
elif 'Sequence:' in line:
|
| 197 |
value = line.split('Sequence:')[1].strip()
|
| 198 |
if value.lower() not in ['none', 'n/a']:
|
| 199 |
sequence = value
|
| 200 |
+
elif 'PDB_ID:' in line:
|
| 201 |
+
value = line.split('PDB_ID:')[1].strip()
|
| 202 |
if value.lower() not in ['none', 'n/a']:
|
| 203 |
pdb_id = value
|
| 204 |
+
elif 'Organism:' in line:
|
| 205 |
+
value = line.split('Organism:')[1].strip()
|
| 206 |
if value.lower() not in ['none', 'n/a']:
|
| 207 |
organism = value
|
| 208 |
|
| 209 |
# Build search query
|
| 210 |
queries = []
|
| 211 |
+
|
| 212 |
# Check if the query contains a protein sequence pattern
|
| 213 |
# Check for amino acid sequence (minimum 25 residues)
|
| 214 |
query_words = query.split()
|
|
|
|
| 226 |
print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
|
| 227 |
sequence = None
|
| 228 |
else:
|
| 229 |
+
# Use the previously extracted similarity value
|
| 230 |
+
if similarity is None:
|
| 231 |
+
similarity = 100 # default value
|
| 232 |
+
print("No similarity specified, using default 100%")
|
| 233 |
+
|
| 234 |
+
identity_cutoff = similarity / 100.0 # Convert percentage to decimal
|
| 235 |
+
print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff}) for sequence: {sequence}")
|
| 236 |
sequence_query = SequenceQuery(
|
| 237 |
sequence,
|
| 238 |
+
identity_cutoff=identity_cutoff,
|
| 239 |
evalue_cutoff=1,
|
| 240 |
sequence_type="protein"
|
| 241 |
)
|
| 242 |
queries.append(sequence_query)
|
| 243 |
+
print(f"Created sequence query with parameters: {sequence_query.params}")
|
| 244 |
# If no sequence, proceed with text search
|
| 245 |
else:
|
| 246 |
# Clean the original query and add text search
|
|
|
|
| 309 |
|
| 310 |
# Combine queries with AND operator
|
| 311 |
if queries:
|
| 312 |
+
final_query = queries[0]
|
| 313 |
for q in queries[1:]:
|
| 314 |
final_query = final_query & q
|
| 315 |
|
|
|
|
| 346 |
continue
|
| 347 |
|
| 348 |
structure_data = response.json()
|
|
|
|
| 349 |
# Build the result record
|
| 350 |
result = {
|
| 351 |
'PDB ID': pdb_id,
|
| 352 |
+
'Title': structure_data.get('struct', {}).get('title', 'N/A'),
|
| 353 |
+
'# of total residues': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_residues_total', 'N/A'),
|
| 354 |
+
'# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
|
| 355 |
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
| 356 |
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
|
|
|
| 357 |
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
| 358 |
+
|
| 359 |
+
|
| 360 |
}
|
| 361 |
|
| 362 |
results.append(result)
|
|
|
|
| 392 |
pdir=self.pdb_dir,
|
| 393 |
file_format="pdb"
|
| 394 |
)
|
| 395 |
+
|
| 396 |
if not pdb_path or not os.path.exists(pdb_path):
|
| 397 |
print(f"Failed to download PDB file for {pdb_id}")
|
| 398 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
|
| 400 |
# Parse structure
|
| 401 |
parser = PDB.PDBParser(QUIET=True)
|
|
|
|
| 452 |
def process_query(self, query):
|
| 453 |
"""Process query and return results"""
|
| 454 |
try:
|
| 455 |
+
# Get search parameters from LLM
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
formatted_prompt = self.prompt_template.format(query=query)
|
| 457 |
response = self.pipe(formatted_prompt)[0]['generated_text']
|
| 458 |
print("Generated parameters:", response)
|
|
|
|
| 471 |
is_sequence_query = any(keyword in query.lower() for keyword in sequence_keywords)
|
| 472 |
|
| 473 |
if is_sequence_query and pdb_id:
|
| 474 |
+
# Get sequences for the PDB ID
|
| 475 |
sequences = self.get_sequences_by_pdb_id(pdb_id)
|
| 476 |
return {
|
| 477 |
"type": "sequence",
|
|
|
|
| 488 |
print(f"Error processing query: {str(e)}")
|
| 489 |
return {"type": "structure", "results": []}
|
| 490 |
|
| 491 |
+
def pdbsummary(name):
    """Build a plain-text summary of the structures found for ``name``.

    Runs a ``ProteinQuery`` for ``name`` with ``max_resolution=5.0`` through
    a ``ProteinSearchEngine`` and formats every returned structure as a
    numbered entry (PDB ID, resolution, method, title, release date,
    sequence length and sequence).

    Args:
        name: Search term passed as the first argument of ``ProteinQuery``.

    Returns:
        str: All formatted entries concatenated in result order, or an
        empty string when the search returns no structures.
    """
    search_engine = ProteinSearchEngine()
    query = ProteinQuery(name, max_resolution=5.0)
    results = search_engine.search(query)

    # Collect the pieces and join once instead of growing a string with
    # repeated ``+=`` inside the loop.
    parts = []
    for i, structure in enumerate(results, 1):
        parts.append(f"\n{i}. PDB ID : {structure.pdb_id}\n")
        parts.append(f"\nResolution : {structure.resolution:.2f} A \n")
        parts.append(f"Method : {structure.method}\n Title : {structure.title}\n")
        parts.append(
            f"Release Date : {structure.release_date}\n Sequence length: {len(structure.sequence)} aa\n"
        )
        parts.append(f" Sequence:\n {structure.sequence}\n")

    return "".join(parts)
|
| 511 |
|
| 512 |
def render_html(pdb_id):
|
| 513 |
if pdb_id is None:
|
|
|
|
| 551 |
if df.empty:
|
| 552 |
return go.Figure()
|
| 553 |
|
| 554 |
+
# Reorder columns - Add '# of atoms of protein' to the column order
|
| 555 |
+
column_order = ['PDB ID', 'Resolution', 'Title','# of total residues', '# of atoms of protein', 'Method','Release Date']
|
| 556 |
df = df[column_order]
|
| 557 |
|
| 558 |
# Reformat Release Date (YYYY-MM-DD)
|
|
|
|
| 563 |
header=dict(
|
| 564 |
values=list(df.columns),
|
| 565 |
fill_color='paleturquoise',
|
| 566 |
+
align='center',
|
| 567 |
+
font=dict(size=16),
|
| 568 |
),
|
| 569 |
cells=dict(
|
| 570 |
values=[
|
|
|
|
| 573 |
for cell in df[col]]
|
| 574 |
for i, col in enumerate(df.columns)
|
| 575 |
],
|
| 576 |
+
align='center',
|
| 577 |
+
font=dict(size=15),
|
| 578 |
+
height=35
|
| 579 |
),
|
| 580 |
+
columnwidth=[80, 80, 400, 100, 100, 100, 100], # Updated columnwidth to include new column
|
| 581 |
customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
|
| 582 |
for i in range(len(df.columns))],
|
| 583 |
hoverlabel=dict(bgcolor='white')
|
|
|
|
| 586 |
# Update table layout
|
| 587 |
table.update_layout(
|
| 588 |
margin=dict(l=20, r=20, t=20, b=20),
|
| 589 |
+
height=450,
|
| 590 |
autosize=True
|
| 591 |
)
|
| 592 |
|
|
|
|
| 933 |
|
| 934 |
.pdb-selector {
|
| 935 |
display: flex;
|
| 936 |
+
align-items: ;
|
| 937 |
justify-content: flex-start;
|
| 938 |
gap: 5px;
|
| 939 |
margin-bottom: 20px;
|
|
|
|
| 950 |
|
| 951 |
.pdb-selector select {
|
| 952 |
margin-left: 0;
|
| 953 |
+
vertical-align: left;
|
| 954 |
display: inline-block;
|
| 955 |
}
|
| 956 |
|
|
|
|
| 1011 |
ui.p("Example queries:"),
|
| 1012 |
ui.tags.ul(
|
| 1013 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
| 1014 |
+
ui.tags.li("Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
|
| 1015 |
ui.tags.li("Sequence of PDB ID 8ET6")
|
| 1016 |
)
|
| 1017 |
)
|
|
|
|
| 1098 |
for line in input.query().split():
|
| 1099 |
if re.match(r'^[0-9A-Za-z]{4}$', line):
|
| 1100 |
pdb_ids.append(line.upper())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1101 |
else:
|
| 1102 |
df = pd.DataFrame(query_results["results"])
|
| 1103 |
if df.empty:
|
|
|
|
| 1112 |
|
| 1113 |
if pdb_ids:
|
| 1114 |
pdb_ids_store.set(pdb_ids)
|
| 1115 |
+
# Update only one dropdown
|
| 1116 |
ui.update_select(
|
| 1117 |
"selected_pdb",
|
| 1118 |
choices=pdb_ids,
|