update
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from transformers import pipeline
|
2 |
-
from rcsbsearchapi import TextQuery, AttributeQuery
|
3 |
from rcsbsearchapi.search import Sort, SequenceQuery
|
4 |
import os
|
5 |
from dotenv import load_dotenv
|
@@ -31,7 +31,7 @@ class PDBSearchAssistant:
|
|
31 |
"text2text-generation",
|
32 |
model=model_name,
|
33 |
max_new_tokens=1024,
|
34 |
-
temperature=0.
|
35 |
torch_dtype="auto",
|
36 |
device="cpu"
|
37 |
)
|
@@ -44,6 +44,7 @@ class PDBSearchAssistant:
|
|
44 |
4. Specific PDB ID
|
45 |
5. Experimental method (X-RAY, EM, NMR)
|
46 |
6. Organism/Species information
|
|
|
47 |
|
48 |
Format:
|
49 |
Protein: [protein name or type]
|
@@ -62,13 +63,14 @@ class PDBSearchAssistant:
|
|
62 |
PDB_ID: none
|
63 |
Method: X-RAY
|
64 |
|
65 |
-
Query: "Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
|
66 |
Protein: none
|
67 |
Organism: none
|
68 |
Resolution: none
|
69 |
Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
|
70 |
PDB_ID: none
|
71 |
Method: none
|
|
|
72 |
|
73 |
Query: "Get sequence of PDB ID 8ET6"
|
74 |
Protein: none
|
@@ -86,32 +88,6 @@ class PDBSearchAssistant:
|
|
86 |
PDB_ID: none
|
87 |
Method: none
|
88 |
|
89 |
-
Query: "Show me E. coli protein structures solved by Cryo-EM"
|
90 |
-
Protein: none
|
91 |
-
Organism: Escherichia coli
|
92 |
-
Resolution: none
|
93 |
-
Sequence: none
|
94 |
-
PDB_ID: none
|
95 |
-
Method: EM
|
96 |
-
|
97 |
-
Query: "Find S. cerevisiae structures with resolution better than 1.8Å"
|
98 |
-
Protein: none
|
99 |
-
Organism: Saccharomyces cerevisiae
|
100 |
-
Resolution: 1.8
|
101 |
-
Sequence: none
|
102 |
-
PDB_ID: none
|
103 |
-
Method: none
|
104 |
-
|
105 |
-
Query: "Sequence of 7BZ5"
|
106 |
-
Protein: none
|
107 |
-
Organism: none
|
108 |
-
Resolution: none
|
109 |
-
Sequence: none
|
110 |
-
PDB_ID: 7BZ5
|
111 |
-
Method: none
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
Now analyze:
|
116 |
Query: {query}
|
117 |
"""
|
@@ -135,6 +111,32 @@ class PDBSearchAssistant:
|
|
135 |
organism = None
|
136 |
has_resolution_query = False
|
137 |
resolution_direction = "less"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
# Check if query contains resolution-related terms
|
140 |
resolution_terms = {
|
@@ -166,45 +168,47 @@ class PDBSearchAssistant:
|
|
166 |
has_resolution_query = True
|
167 |
if direction: # if not None
|
168 |
resolution_direction = direction
|
169 |
-
|
170 |
# Also check for numerical values with Å
|
171 |
-
|
|
|
172 |
has_resolution_query = True
|
|
|
|
|
|
|
|
|
173 |
|
174 |
# Clean and parse LLM response
|
175 |
for line in response.split('\n'):
|
176 |
if 'Resolution:' in line:
|
177 |
-
value = line.split('Resolution:')[1].strip()
|
178 |
if value.lower() not in ['none', 'n/a'] and has_resolution_query:
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
# pass
|
187 |
-
|
188 |
-
if 'Method:' in line:
|
189 |
value = line.split('Method:')[1].strip()
|
190 |
if value.lower() not in ['none', 'n/a']:
|
191 |
method = value.upper()
|
192 |
-
|
193 |
value = line.split('Sequence:')[1].strip()
|
194 |
if value.lower() not in ['none', 'n/a']:
|
195 |
sequence = value
|
196 |
-
|
197 |
-
value = line.split('PDB_ID:')[1].strip()
|
198 |
if value.lower() not in ['none', 'n/a']:
|
199 |
pdb_id = value
|
200 |
-
|
201 |
-
value = line.split('
|
202 |
if value.lower() not in ['none', 'n/a']:
|
203 |
organism = value
|
204 |
|
205 |
# Build search query
|
206 |
queries = []
|
207 |
-
|
208 |
# Check if the query contains a protein sequence pattern
|
209 |
# Check for amino acid sequence (minimum 25 residues)
|
210 |
query_words = query.split()
|
@@ -222,14 +226,21 @@ class PDBSearchAssistant:
|
|
222 |
print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
|
223 |
sequence = None
|
224 |
else:
|
225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
sequence_query = SequenceQuery(
|
227 |
sequence,
|
228 |
-
identity_cutoff=
|
229 |
evalue_cutoff=1,
|
230 |
sequence_type="protein"
|
231 |
)
|
232 |
queries.append(sequence_query)
|
|
|
233 |
# If no sequence, proceed with text search
|
234 |
else:
|
235 |
# Clean the original query and add text search
|
@@ -298,7 +309,7 @@ class PDBSearchAssistant:
|
|
298 |
|
299 |
# Combine queries with AND operator
|
300 |
if queries:
|
301 |
-
final_query = queries[
|
302 |
for q in queries[1:]:
|
303 |
final_query = final_query & q
|
304 |
|
@@ -335,14 +346,17 @@ class PDBSearchAssistant:
|
|
335 |
continue
|
336 |
|
337 |
structure_data = response.json()
|
338 |
-
|
339 |
# Build the result record
|
340 |
result = {
|
341 |
'PDB ID': pdb_id,
|
|
|
|
|
|
|
342 |
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
343 |
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
344 |
-
'Title': structure_data.get('struct', {}).get('title', 'N/A'),
|
345 |
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
|
|
|
|
346 |
}
|
347 |
|
348 |
results.append(result)
|
@@ -378,34 +392,10 @@ class PDBSearchAssistant:
|
|
378 |
pdir=self.pdb_dir,
|
379 |
file_format="pdb"
|
380 |
)
|
381 |
-
|
382 |
if not pdb_path or not os.path.exists(pdb_path):
|
383 |
print(f"Failed to download PDB file for {pdb_id}")
|
384 |
-
|
385 |
-
structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
386 |
-
response = requests.get(structure_url)
|
387 |
-
structure_data = response.json() if response.status_code == 200 else {}
|
388 |
-
|
389 |
-
|
390 |
-
sequence_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1"
|
391 |
-
seq_response = requests.get(sequence_url)
|
392 |
-
seq_data = seq_response.json() if response.status_code == 200 else {}
|
393 |
-
sequence = seq_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code', 'N/A')
|
394 |
-
|
395 |
-
sequences = []
|
396 |
-
|
397 |
-
chain_info = {
|
398 |
-
'chain_id': "A", # chain.id, 임의 설정 api 3개써서 가져오기는 가능
|
399 |
-
'entity_id': '1', # Default entity ID
|
400 |
-
'description': structure_data.get('struct', {}).get('title', 'N/A'),
|
401 |
-
'sequence': sequence,
|
402 |
-
'length': len(sequence),
|
403 |
-
'resolution': structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0],
|
404 |
-
'method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
405 |
-
'release_date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
406 |
-
}
|
407 |
-
sequences.append(chain_info)
|
408 |
-
return sequences
|
409 |
|
410 |
# Parse structure
|
411 |
parser = PDB.PDBParser(QUIET=True)
|
@@ -462,17 +452,7 @@ class PDBSearchAssistant:
|
|
462 |
def process_query(self, query):
|
463 |
"""Process query and return results"""
|
464 |
try:
|
465 |
-
#
|
466 |
-
query_cleaned = query.strip().upper()
|
467 |
-
if re.match(r'^[0-9A-Za-z]{4}$', query_cleaned):
|
468 |
-
# Direct PDB ID query
|
469 |
-
sequences = self.get_sequences_by_pdb_id(query_cleaned)
|
470 |
-
return {
|
471 |
-
"type": "sequence",
|
472 |
-
"results": sequences
|
473 |
-
}
|
474 |
-
|
475 |
-
# If not a direct PDB ID, proceed with LLM processing
|
476 |
formatted_prompt = self.prompt_template.format(query=query)
|
477 |
response = self.pipe(formatted_prompt)[0]['generated_text']
|
478 |
print("Generated parameters:", response)
|
@@ -491,6 +471,7 @@ class PDBSearchAssistant:
|
|
491 |
is_sequence_query = any(keyword in query.lower() for keyword in sequence_keywords)
|
492 |
|
493 |
if is_sequence_query and pdb_id:
|
|
|
494 |
sequences = self.get_sequences_by_pdb_id(pdb_id)
|
495 |
return {
|
496 |
"type": "sequence",
|
@@ -507,6 +488,26 @@ class PDBSearchAssistant:
|
|
507 |
print(f"Error processing query: {str(e)}")
|
508 |
return {"type": "structure", "results": []}
|
509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
510 |
|
511 |
def render_html(pdb_id):
|
512 |
if pdb_id is None:
|
@@ -550,8 +551,8 @@ def create_interactive_table(df):
|
|
550 |
if df.empty:
|
551 |
return go.Figure()
|
552 |
|
553 |
-
# Reorder columns
|
554 |
-
column_order = ['PDB ID', 'Resolution', '
|
555 |
df = df[column_order]
|
556 |
|
557 |
# Reformat Release Date (YYYY-MM-DD)
|
@@ -562,8 +563,8 @@ def create_interactive_table(df):
|
|
562 |
header=dict(
|
563 |
values=list(df.columns),
|
564 |
fill_color='paleturquoise',
|
565 |
-
align='center',
|
566 |
-
font=dict(size=16),
|
567 |
),
|
568 |
cells=dict(
|
569 |
values=[
|
@@ -572,11 +573,11 @@ def create_interactive_table(df):
|
|
572 |
for cell in df[col]]
|
573 |
for i, col in enumerate(df.columns)
|
574 |
],
|
575 |
-
align='center',
|
576 |
-
font=dict(size=15),
|
577 |
-
height=35
|
578 |
),
|
579 |
-
columnwidth=[80, 80, 100,
|
580 |
customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
|
581 |
for i in range(len(df.columns))],
|
582 |
hoverlabel=dict(bgcolor='white')
|
@@ -585,7 +586,7 @@ def create_interactive_table(df):
|
|
585 |
# Update table layout
|
586 |
table.update_layout(
|
587 |
margin=dict(l=20, r=20, t=20, b=20),
|
588 |
-
height=450,
|
589 |
autosize=True
|
590 |
)
|
591 |
|
@@ -932,7 +933,7 @@ app_ui = ui.page_fluid(
|
|
932 |
|
933 |
.pdb-selector {
|
934 |
display: flex;
|
935 |
-
align-items:
|
936 |
justify-content: flex-start;
|
937 |
gap: 5px;
|
938 |
margin-bottom: 20px;
|
@@ -949,7 +950,7 @@ app_ui = ui.page_fluid(
|
|
949 |
|
950 |
.pdb-selector select {
|
951 |
margin-left: 0;
|
952 |
-
vertical-align:
|
953 |
display: inline-block;
|
954 |
}
|
955 |
|
@@ -1010,7 +1011,7 @@ app_ui = ui.page_fluid(
|
|
1010 |
ui.p("Example queries:"),
|
1011 |
ui.tags.ul(
|
1012 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
1013 |
-
ui.tags.li("Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
|
1014 |
ui.tags.li("Sequence of PDB ID 8ET6")
|
1015 |
)
|
1016 |
)
|
@@ -1097,11 +1098,6 @@ def server(input, output, session):
|
|
1097 |
for line in input.query().split():
|
1098 |
if re.match(r'^[0-9A-Za-z]{4}$', line):
|
1099 |
pdb_ids.append(line.upper())
|
1100 |
-
# Clear the results table for sequence queries
|
1101 |
-
@output
|
1102 |
-
@render_widget
|
1103 |
-
def results_table():
|
1104 |
-
return create_interactive_table(pd.DataFrame())
|
1105 |
else:
|
1106 |
df = pd.DataFrame(query_results["results"])
|
1107 |
if df.empty:
|
@@ -1116,6 +1112,7 @@ def server(input, output, session):
|
|
1116 |
|
1117 |
if pdb_ids:
|
1118 |
pdb_ids_store.set(pdb_ids)
|
|
|
1119 |
ui.update_select(
|
1120 |
"selected_pdb",
|
1121 |
choices=pdb_ids,
|
|
|
1 |
from transformers import pipeline
|
2 |
+
from rcsbsearchapi import TextQuery, AttributeQuery, Query
|
3 |
from rcsbsearchapi.search import Sort, SequenceQuery
|
4 |
import os
|
5 |
from dotenv import load_dotenv
|
|
|
31 |
"text2text-generation",
|
32 |
model=model_name,
|
33 |
max_new_tokens=1024,
|
34 |
+
temperature=0.1,
|
35 |
torch_dtype="auto",
|
36 |
device="cpu"
|
37 |
)
|
|
|
44 |
4. Specific PDB ID
|
45 |
5. Experimental method (X-RAY, EM, NMR)
|
46 |
6. Organism/Species information
|
47 |
+
7. Sequence similarity (in %)
|
48 |
|
49 |
Format:
|
50 |
Protein: [protein name or type]
|
|
|
63 |
PDB_ID: none
|
64 |
Method: X-RAY
|
65 |
|
66 |
+
Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
|
67 |
Protein: none
|
68 |
Organism: none
|
69 |
Resolution: none
|
70 |
Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
|
71 |
PDB_ID: none
|
72 |
Method: none
|
73 |
+
Similarity: 90
|
74 |
|
75 |
Query: "Get sequence of PDB ID 8ET6"
|
76 |
Protein: none
|
|
|
88 |
PDB_ID: none
|
89 |
Method: none
|
90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
Now analyze:
|
92 |
Query: {query}
|
93 |
"""
|
|
|
111 |
organism = None
|
112 |
has_resolution_query = False
|
113 |
resolution_direction = "less"
|
114 |
+
similarity = None # Initialize similarity
|
115 |
+
print("Raw LLM response:", response) # Debug print
|
116 |
+
|
117 |
+
# Parse LLM response first to get similarity value
|
118 |
+
for line in response.split('\n'):
|
119 |
+
line = line.strip().lower() # Convert to lowercase
|
120 |
+
if 'similarity:' in line:
|
121 |
+
try:
|
122 |
+
similarity_str = line.split('similarity:')[1].strip()
|
123 |
+
if similarity_str.lower() not in ['none', 'n/a']:
|
124 |
+
similarity = float(similarity_str)
|
125 |
+
print(f"Successfully extracted similarity: {similarity}%")
|
126 |
+
except (ValueError, IndexError) as e:
|
127 |
+
print(f"Error parsing similarity: {e}")
|
128 |
+
continue
|
129 |
+
|
130 |
+
# If similarity is still None, try to extract from original query
|
131 |
+
if similarity is None:
|
132 |
+
# Case insensitive search for similarity pattern
|
133 |
+
similarity_match = re.search(r'similarity\s+(\d+(?:\.\d+)?)\s*%', query.lower())
|
134 |
+
if similarity_match:
|
135 |
+
try:
|
136 |
+
similarity = float(similarity_match.group(1))
|
137 |
+
print(f"Extracted similarity from query: {similarity}%")
|
138 |
+
except ValueError as e:
|
139 |
+
print(f"Error parsing similarity from query: {e}")
|
140 |
|
141 |
# Check if query contains resolution-related terms
|
142 |
resolution_terms = {
|
|
|
168 |
has_resolution_query = True
|
169 |
if direction: # if not None
|
170 |
resolution_direction = direction
|
171 |
+
|
172 |
# Also check for numerical values with Å
|
173 |
+
resolution_match = re.search(r'(\d+\.?\d*)\s*å?.*resolution', query_lower)
|
174 |
+
if resolution_match:
|
175 |
has_resolution_query = True
|
176 |
+
try:
|
177 |
+
resolution_limit = float(resolution_match.group(1))
|
178 |
+
except ValueError:
|
179 |
+
pass
|
180 |
|
181 |
# Clean and parse LLM response
|
182 |
for line in response.split('\n'):
|
183 |
if 'Resolution:' in line:
|
184 |
+
value = line.split('Resolution:')[1].strip()
|
185 |
if value.lower() not in ['none', 'n/a'] and has_resolution_query:
|
186 |
+
try:
|
187 |
+
# Extract just the number
|
188 |
+
res_value = ''.join(c for c in value if c.isdigit() or c == '.')
|
189 |
+
resolution_limit = float(res_value)
|
190 |
+
except ValueError:
|
191 |
+
pass
|
192 |
+
elif 'Method:' in line:
|
|
|
|
|
|
|
193 |
value = line.split('Method:')[1].strip()
|
194 |
if value.lower() not in ['none', 'n/a']:
|
195 |
method = value.upper()
|
196 |
+
elif 'Sequence:' in line:
|
197 |
value = line.split('Sequence:')[1].strip()
|
198 |
if value.lower() not in ['none', 'n/a']:
|
199 |
sequence = value
|
200 |
+
elif 'PDB_ID:' in line:
|
201 |
+
value = line.split('PDB_ID:')[1].strip()
|
202 |
if value.lower() not in ['none', 'n/a']:
|
203 |
pdb_id = value
|
204 |
+
elif 'Organism:' in line:
|
205 |
+
value = line.split('Organism:')[1].strip()
|
206 |
if value.lower() not in ['none', 'n/a']:
|
207 |
organism = value
|
208 |
|
209 |
# Build search query
|
210 |
queries = []
|
211 |
+
|
212 |
# Check if the query contains a protein sequence pattern
|
213 |
# Check for amino acid sequence (minimum 25 residues)
|
214 |
query_words = query.split()
|
|
|
226 |
print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
|
227 |
sequence = None
|
228 |
else:
|
229 |
+
# Use the previously extracted similarity value
|
230 |
+
if similarity is None:
|
231 |
+
similarity = 100 # default value
|
232 |
+
print("No similarity specified, using default 100%")
|
233 |
+
|
234 |
+
identity_cutoff = similarity / 100.0 # Convert percentage to decimal
|
235 |
+
print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff}) for sequence: {sequence}")
|
236 |
sequence_query = SequenceQuery(
|
237 |
sequence,
|
238 |
+
identity_cutoff=identity_cutoff,
|
239 |
evalue_cutoff=1,
|
240 |
sequence_type="protein"
|
241 |
)
|
242 |
queries.append(sequence_query)
|
243 |
+
print(f"Created sequence query with parameters: {sequence_query.params}")
|
244 |
# If no sequence, proceed with text search
|
245 |
else:
|
246 |
# Clean the original query and add text search
|
|
|
309 |
|
310 |
# Combine queries with AND operator
|
311 |
if queries:
|
312 |
+
final_query = queries[0]
|
313 |
for q in queries[1:]:
|
314 |
final_query = final_query & q
|
315 |
|
|
|
346 |
continue
|
347 |
|
348 |
structure_data = response.json()
|
|
|
349 |
# Build the result record
|
350 |
result = {
|
351 |
'PDB ID': pdb_id,
|
352 |
+
'Title': structure_data.get('struct', {}).get('title', 'N/A'),
|
353 |
+
'# of total residues': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_residues_total', 'N/A'),
|
354 |
+
'# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
|
355 |
'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
|
356 |
'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
|
|
|
357 |
'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
|
358 |
+
|
359 |
+
|
360 |
}
|
361 |
|
362 |
results.append(result)
|
|
|
392 |
pdir=self.pdb_dir,
|
393 |
file_format="pdb"
|
394 |
)
|
395 |
+
|
396 |
if not pdb_path or not os.path.exists(pdb_path):
|
397 |
print(f"Failed to download PDB file for {pdb_id}")
|
398 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
|
400 |
# Parse structure
|
401 |
parser = PDB.PDBParser(QUIET=True)
|
|
|
452 |
def process_query(self, query):
|
453 |
"""Process query and return results"""
|
454 |
try:
|
455 |
+
# Get search parameters from LLM
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
456 |
formatted_prompt = self.prompt_template.format(query=query)
|
457 |
response = self.pipe(formatted_prompt)[0]['generated_text']
|
458 |
print("Generated parameters:", response)
|
|
|
471 |
is_sequence_query = any(keyword in query.lower() for keyword in sequence_keywords)
|
472 |
|
473 |
if is_sequence_query and pdb_id:
|
474 |
+
# Get sequences for the PDB ID
|
475 |
sequences = self.get_sequences_by_pdb_id(pdb_id)
|
476 |
return {
|
477 |
"type": "sequence",
|
|
|
488 |
print(f"Error processing query: {str(e)}")
|
489 |
return {"type": "structure", "results": []}
|
490 |
|
491 |
+
def pdbsummary(name):
    """Return a plain-text summary of the structures found for *name*.

    A ``ProteinQuery`` is built from ``name`` with ``max_resolution=5.0`` and
    passed to ``ProteinSearchEngine().search``. Every returned structure
    contributes one numbered section (numbering starts at 1) listing its PDB
    ID, resolution (two decimals), method, title, release date, sequence
    length and the sequence itself.

    NOTE(review): ``ProteinSearchEngine`` and ``ProteinQuery`` are not
    defined in the visible part of this file -- presumably imported
    elsewhere; confirm before relying on this helper.

    Returns:
        The concatenated sections, or an empty string when the search
        yields nothing.
    """
    engine = ProteinSearchEngine()
    protein_query = ProteinQuery(name, max_resolution=5.0)
    hits = engine.search(protein_query)

    # Collect one formatted section per hit and join once at the end.
    sections = []
    for rank, entry in enumerate(hits, start=1):
        sections.append(
            f"\n{rank}. PDB ID : {entry.pdb_id}\n"
            f"\nResolution : {entry.resolution:.2f} A \n"
            f"Method : {entry.method}\n Title : {entry.title}\n"
            f"Release Date : {entry.release_date}\n Sequence length: {len(entry.sequence)} aa\n"
            f" Sequence:\n {entry.sequence}\n"
        )

    return "".join(sections)
|
511 |
|
512 |
def render_html(pdb_id):
|
513 |
if pdb_id is None:
|
|
|
551 |
if df.empty:
|
552 |
return go.Figure()
|
553 |
|
554 |
+
# Reorder columns - Add '# of atoms of protein' to the column order
|
555 |
+
column_order = ['PDB ID', 'Resolution', 'Title','# of total residues', '# of atoms of protein', 'Method','Release Date']
|
556 |
df = df[column_order]
|
557 |
|
558 |
# Reformat Release Date (YYYY-MM-DD)
|
|
|
563 |
header=dict(
|
564 |
values=list(df.columns),
|
565 |
fill_color='paleturquoise',
|
566 |
+
align='center',
|
567 |
+
font=dict(size=16),
|
568 |
),
|
569 |
cells=dict(
|
570 |
values=[
|
|
|
573 |
for cell in df[col]]
|
574 |
for i, col in enumerate(df.columns)
|
575 |
],
|
576 |
+
align='center',
|
577 |
+
font=dict(size=15),
|
578 |
+
height=35
|
579 |
),
|
580 |
+
columnwidth=[80, 80, 400, 100, 100, 100, 100], # Updated columnwidth to include new column
|
581 |
customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
|
582 |
for i in range(len(df.columns))],
|
583 |
hoverlabel=dict(bgcolor='white')
|
|
|
586 |
# Update table layout
|
587 |
table.update_layout(
|
588 |
margin=dict(l=20, r=20, t=20, b=20),
|
589 |
+
height=450,
|
590 |
autosize=True
|
591 |
)
|
592 |
|
|
|
933 |
|
934 |
.pdb-selector {
|
935 |
display: flex;
|
936 |
+
align-items: ;
|
937 |
justify-content: flex-start;
|
938 |
gap: 5px;
|
939 |
margin-bottom: 20px;
|
|
|
950 |
|
951 |
.pdb-selector select {
|
952 |
margin-left: 0;
|
953 |
+
vertical-align: left;
|
954 |
display: inline-block;
|
955 |
}
|
956 |
|
|
|
1011 |
ui.p("Example queries:"),
|
1012 |
ui.tags.ul(
|
1013 |
ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
|
1014 |
+
ui.tags.li("Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
|
1015 |
ui.tags.li("Sequence of PDB ID 8ET6")
|
1016 |
)
|
1017 |
)
|
|
|
1098 |
for line in input.query().split():
|
1099 |
if re.match(r'^[0-9A-Za-z]{4}$', line):
|
1100 |
pdb_ids.append(line.upper())
|
|
|
|
|
|
|
|
|
|
|
1101 |
else:
|
1102 |
df = pd.DataFrame(query_results["results"])
|
1103 |
if df.empty:
|
|
|
1112 |
|
1113 |
if pdb_ids:
|
1114 |
pdb_ids_store.set(pdb_ids)
|
1115 |
+
# Update only one dropdown
|
1116 |
ui.update_select(
|
1117 |
"selected_pdb",
|
1118 |
choices=pdb_ids,
|