lkjjj26 committed on
Commit
a783038
·
verified ·
1 Parent(s): 52245fc

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -135
app.py CHANGED
@@ -45,28 +45,39 @@ class PDBSearchAssistant:
45
 
46
  Format:
47
  Protein: [protein name or type]
48
- Organism: [organism/species if mentioned]
49
  Resolution: [maximum resolution in Å, if mentioned]
50
  Sequence: [any sequence mentioned]
51
  PDB_ID: [specific PDB ID if mentioned]
52
  Method: [experimental method if mentioned]
 
 
53
 
54
  Examples:
 
 
 
 
 
 
 
 
 
55
  Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
56
  Protein: insulin
57
- Organism: Homo sapiens
58
  Resolution: 2.5
59
  Sequence: none
60
  PDB_ID: none
61
  Method: X-RAY
 
 
62
 
63
- Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
64
  Protein: none
65
- Organism: none
66
  Resolution: none
67
- Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
68
  PDB_ID: none
69
  Method: none
 
70
  Similarity: 90
71
 
72
  Query: "Get sequence of PDB ID 8ET6"
@@ -76,15 +87,7 @@ class PDBSearchAssistant:
76
  Sequence: none
77
  PDB_ID: 8ET6
78
  Method: none
79
-
80
- Query: "Find mouse lysozyme structures"
81
- Protein: lysozyme
82
- Organism: Mus musculus
83
- Resolution: none
84
- Sequence: none
85
- PDB_ID: none
86
- Method: none
87
-
88
  Now analyze:
89
  Query: {query}
90
  """
@@ -108,128 +111,108 @@ class PDBSearchAssistant:
108
  organism = None
109
  has_resolution_query = False
110
  resolution_direction = "less"
111
- similarity = None # Initialize similarity
112
- print("Raw LLM response:", response) # Debug print
113
 
114
- # Parse LLM response first to get similarity value
115
- for line in response.split('\n'):
116
- line = line.strip().lower() # Convert to lowercase
117
- if 'similarity:' in line:
118
- try:
119
- similarity_str = line.split('similarity:')[1].strip()
120
- if similarity_str.lower() not in ['none', 'n/a']:
121
- similarity = float(similarity_str)
122
- print(f"Successfully extracted similarity: {similarity}%")
123
- except (ValueError, IndexError) as e:
124
- print(f"Error parsing similarity: {e}")
125
- continue
126
-
127
- # If similarity is still None, try to extract from original query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  if similarity is None:
129
- # Case insensitive search for similarity pattern
130
  similarity_match = re.search(r'similarity\s+(\d+(?:\.\d+)?)\s*%', query.lower())
131
  if similarity_match:
132
  try:
133
  similarity = float(similarity_match.group(1))
134
  print(f"Extracted similarity from query: {similarity}%")
135
- except ValueError as e:
136
- print(f"Error parsing similarity from query: {e}")
137
-
138
- # Check if query contains resolution-related terms
139
- resolution_terms = {
140
- 'better': 'less',
141
- 'best': 'less',
142
- 'highest': 'less',
143
- 'good': 'less',
144
- 'fine': 'less',
145
- 'worse': 'greater',
146
- 'worst': 'greater',
147
- 'lowest': 'greater',
148
- 'poor': 'greater',
149
- 'resolution': None,
150
- 'å': None,
151
- 'angstrom': None,
152
- 'than': None,
153
- 'under': 'less',
154
- 'below': 'less',
155
- 'above': 'greater',
156
- 'over': 'greater'
157
- }
158
 
159
- # Check if the original query mentions resolution
160
- query_lower = query.lower()
161
-
162
- # Determine resolution direction from query
163
- for term, direction in resolution_terms.items():
164
- if term in query_lower:
165
- has_resolution_query = True
166
- if direction: # if not None
167
- resolution_direction = direction
168
 
169
- # Also check for numerical values with Å
170
- resolution_match = re.search(r'(\d+\.?\d*)\s*å?.*resolution', query_lower)
171
- if resolution_match:
172
- has_resolution_query = True
173
- try:
174
  resolution_limit = float(resolution_match.group(1))
175
- except ValueError:
176
- pass
177
-
178
- # Clean and parse LLM response
179
- for line in response.split('\n'):
180
- if 'Resolution:' in line:
181
- value = line.split('Resolution:')[1].strip()
182
- if value.lower() not in ['none', 'n/a'] and has_resolution_query:
183
- try:
184
- # Extract just the number
185
- res_value = ''.join(c for c in value if c.isdigit() or c == '.')
186
- resolution_limit = float(res_value)
187
- except ValueError:
188
- pass
189
- elif 'Method:' in line:
190
- value = line.split('Method:')[1].strip()
191
- if value.lower() not in ['none', 'n/a']:
192
- method = value.upper()
193
- elif 'Sequence:' in line:
194
- value = line.split('Sequence:')[1].strip()
195
- if value.lower() not in ['none', 'n/a']:
196
- sequence = value
197
- elif 'PDB_ID:' in line:
198
- value = line.split('PDB_ID:')[1].strip()
199
- if value.lower() not in ['none', 'n/a']:
200
- pdb_id = value
201
- elif 'Organism:' in line:
202
- value = line.split('Organism:')[1].strip()
203
- if value.lower() not in ['none', 'n/a']:
204
- organism = value
205
-
206
- # Build search query
207
  queries = []
208
 
209
- # Check if the query contains a protein sequence pattern
210
- # Check for amino acid sequence (minimum 25 residues)
211
  query_words = query.split()
212
  for word in query_words:
213
- # Check if the word consists of valid amino acid letters
214
- if (len(word) >= 25 and # minimum 25 residues requirement
215
  all(c in 'ACDEFGHIKLMNPQRSTVWY' for c in word.upper()) and
216
  sum(c.isupper() for c in word) / len(word) > 0.8):
217
  sequence = word
218
  break
219
 
220
- # If sequence is found, use SequenceQuery
221
  if sequence:
222
  if len(sequence) < 25:
223
  print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
224
- sequence = None
225
  else:
226
- # Use the previously extracted similarity value
227
  if similarity is None:
228
- similarity = 100 # default value
229
  print("No similarity specified, using default 100%")
230
 
231
- identity_cutoff = similarity / 100.0 # Convert percentage to decimal
232
- print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff}) for sequence: {sequence}")
233
  sequence_query = SequenceQuery(
234
  sequence,
235
  identity_cutoff=identity_cutoff,
@@ -238,32 +221,8 @@ class PDBSearchAssistant:
238
  )
239
  queries.append(sequence_query)
240
  print(f"Created sequence query with parameters: {sequence_query.params}")
241
- # If no sequence, proceed with text search
242
- else:
243
- # Clean the original query and add text search
244
- clean_query = query.lower()
245
-
246
- # Remove resolution numbers and terms if they exist
247
- if has_resolution_query:
248
- clean_query = re.sub(r'\d+\.?\d*\s*å?', '', clean_query)
249
- for term in resolution_terms:
250
- clean_query = clean_query.replace(term, '')
251
-
252
- # Clean up extra spaces and trim
253
- clean_query = ' '.join(clean_query.split())
254
-
255
- print("Cleaned query:", clean_query)
256
-
257
- # Add text search if query is not empty
258
- if clean_query.strip():
259
- text_query = AttributeQuery(
260
- attribute="struct.title",
261
- operator="contains_phrase",
262
- value=clean_query
263
- )
264
- queries.append(text_query)
265
 
266
- # Add resolution filter if specified
267
  if resolution_limit and has_resolution_query:
268
  operator = "less_or_equal" if resolution_direction == "less" else "greater_or_equal"
269
  print(f"Adding resolution filter: {operator} {resolution_limit}Å")
@@ -273,6 +232,7 @@ class PDBSearchAssistant:
273
  value=resolution_limit
274
  )
275
  queries.append(resolution_query)
 
276
 
277
  # Add PDB ID search if specified
278
  if pdb_id:
 
45
 
46
  Format:
47
  Protein: [protein name or type]
 
48
  Resolution: [maximum resolution in Å, if mentioned]
49
  Sequence: [any sequence mentioned]
50
  PDB_ID: [specific PDB ID if mentioned]
51
  Method: [experimental method if mentioned]
52
+ Organism: [organism/species if mentioned]
53
+ Similarity: [similarity percentage if mentioned]
54
 
55
  Examples:
56
+ Query: "Find structures with sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN and resolution better than 2.5Å"
57
+ Protein: none
58
+ Resolution: 2.5
59
+ Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN
60
+ PDB_ID: none
61
+ Method: none
62
+ Organism: none
63
+ Similarity: 100
64
+
65
  Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
66
  Protein: insulin
 
67
  Resolution: 2.5
68
  Sequence: none
69
  PDB_ID: none
70
  Method: X-RAY
71
+ Organism: Homo sapiens
72
+ Similarity: none
73
 
74
+ Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
75
  Protein: none
 
76
  Resolution: none
77
+ Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN
78
  PDB_ID: none
79
  Method: none
80
+ Organism: none
81
  Similarity: 90
82
 
83
  Query: "Get sequence of PDB ID 8ET6"
 
87
  Sequence: none
88
  PDB_ID: 8ET6
89
  Method: none
90
+
 
 
 
 
 
 
 
 
91
  Now analyze:
92
  Query: {query}
93
  """
 
111
  organism = None
112
  has_resolution_query = False
113
  resolution_direction = "less"
114
+ similarity = None
 
115
 
116
+ print("Raw LLM response:", response)
117
+
118
+ # Extract resolution with improved pattern matching
119
+ # Look for the first valid resolution value (non-zero)
120
+ resolution_matches = re.finditer(r'[Rr]esolution:\s*(\d+(?:\.\d+)?)', response)
121
+ for match in resolution_matches:
122
+ try:
123
+ value = float(match.group(1))
124
+ if value > 0: # Only accept positive resolution values
125
+ resolution_limit = value
126
+ has_resolution_query = True
127
+ print(f"Extracted resolution: {resolution_limit}Å")
128
+ break # Stop after finding the first valid resolution
129
+ except ValueError:
130
+ continue
131
+
132
+ # Clean and normalize remaining response
133
+ # Remove all resolution entries to avoid confusion
134
+ cleaned_response = re.sub(r'[Rr]esolution:\s*\d+(?:\.\d+)?(?:\s*Å?)?\s*', '', response)
135
+
136
+ # Split remaining response into clean key-value pairs
137
+ response_pairs = {}
138
+ for pair in re.finditer(r'(\w+):\s*([^:]+?)(?=\s+\w+:|$)', cleaned_response):
139
+ key, value = pair.groups()
140
+ key = key.lower()
141
+ value = value.strip()
142
+ if value.lower() not in ['none', 'n/a']:
143
+ response_pairs[key] = value
144
+
145
+ print("Parsed response pairs:", response_pairs) # Debug print
146
+
147
+ # Extract sequence and similarity from cleaned pairs
148
+ if 'sequence' in response_pairs:
149
+ sequence = response_pairs['sequence']
150
+ if len(sequence) >= 25:
151
+ print(f"Extracted sequence: {sequence}")
152
+
153
+ if 'similarity' in response_pairs:
154
+ try:
155
+ similarity_str = response_pairs['similarity'].replace('%', '')
156
+ similarity = float(similarity_str)
157
+ print(f"Extracted similarity: {similarity}%")
158
+ except ValueError:
159
+ pass
160
+
161
+ if 'pdb_id' in response_pairs:
162
+ pdb_id = response_pairs['pdb_id'].upper()
163
+
164
+ if 'method' in response_pairs:
165
+ method = response_pairs['method'].upper()
166
+
167
+ if 'organism' in response_pairs:
168
+ organism = response_pairs['organism']
169
+
170
+ # If similarity not found in LLM response, try query
171
  if similarity is None:
 
172
  similarity_match = re.search(r'similarity\s+(\d+(?:\.\d+)?)\s*%', query.lower())
173
  if similarity_match:
174
  try:
175
  similarity = float(similarity_match.group(1))
176
  print(f"Extracted similarity from query: {similarity}%")
177
+ except ValueError:
178
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
+ # If still no similarity specified and sequence exists, use default
181
+ if similarity is None and sequence:
182
+ similarity = 100
183
+ print("No similarity specified, using default 100%")
 
 
 
 
 
184
 
185
+ # Parse resolution from query if not found in LLM response
186
+ if not has_resolution_query:
187
+ resolution_pattern = r'resolution (?:better|worse|less|greater) than (\d+\.?\d*)(?:\s*Å|A)?'
188
+ resolution_match = re.search(resolution_pattern, query.lower())
189
+ if resolution_match:
190
  resolution_limit = float(resolution_match.group(1))
191
+ has_resolution_query = True
192
+ print(f"Extracted resolution from query: {resolution_limit}Å")
193
+
194
+ # Build queries list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  queries = []
196
 
197
+ # Add sequence query if present
 
198
  query_words = query.split()
199
  for word in query_words:
200
+ if (len(word) >= 25 and
 
201
  all(c in 'ACDEFGHIKLMNPQRSTVWY' for c in word.upper()) and
202
  sum(c.isupper() for c in word) / len(word) > 0.8):
203
  sequence = word
204
  break
205
 
 
206
  if sequence:
207
  if len(sequence) < 25:
208
  print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
 
209
  else:
 
210
  if similarity is None:
211
+ similarity = 100
212
  print("No similarity specified, using default 100%")
213
 
214
+ identity_cutoff = similarity / 100.0
215
+ print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff})")
216
  sequence_query = SequenceQuery(
217
  sequence,
218
  identity_cutoff=identity_cutoff,
 
221
  )
222
  queries.append(sequence_query)
223
  print(f"Created sequence query with parameters: {sequence_query.params}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ # Add resolution query if present
226
  if resolution_limit and has_resolution_query:
227
  operator = "less_or_equal" if resolution_direction == "less" else "greater_or_equal"
228
  print(f"Adding resolution filter: {operator} {resolution_limit}Å")
 
232
  value=resolution_limit
233
  )
234
  queries.append(resolution_query)
235
+ print(f"Created resolution query with cutoff: {resolution_limit}Å")
236
 
237
  # Add PDB ID search if specified
238
  if pdb_id: