Spaces:

lkjjj26
/

query

Sleeping

App Files Files Community

lkjjj26 committed on Jan 10

Commit

a783038

verified ·

1 Parent(s): 52245fc

Upload app.py

Browse files

Files changed (1) hide show

app.py +95 -135

app.py CHANGED Viewed

@@ -45,28 +45,39 @@ class PDBSearchAssistant:
             Format:
             Protein: [protein name or type]
-            Organism: [organism/species if mentioned]
             Resolution: [maximum resolution in Å, if mentioned]
             Sequence: [any sequence mentioned]
             PDB_ID: [specific PDB ID if mentioned]
             Method: [experimental method if mentioned]
             Examples:
             Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
             Protein: insulin
-            Organism: Homo sapiens
             Resolution: 2.5
             Sequence: none
             PDB_ID: none
             Method: X-RAY
-            Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
             Protein: none
-            Organism: none
             Resolution: none
-            Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
             PDB_ID: none
             Method: none
             Similarity: 90
             Query: "Get sequence of PDB ID 8ET6"
@@ -76,15 +87,7 @@ class PDBSearchAssistant:
             Sequence: none
             PDB_ID: 8ET6
             Method: none
-            Query: "Find mouse lysozyme structures"
-            Protein: lysozyme
-            Organism: Mus musculus
-            Resolution: none
-            Sequence: none
-            PDB_ID: none
-            Method: none
             Now analyze:
             Query: {query}
             """
@@ -108,128 +111,108 @@ class PDBSearchAssistant:
             organism = None
             has_resolution_query = False
             resolution_direction = "less"
-            similarity = None  # Initialize similarity
-            print("Raw LLM response:", response)  # Debug print
-            # Parse LLM response first to get similarity value
-            for line in response.split('\n'):
-                line = line.strip().lower()  # Convert to lowercase
-                if 'similarity:' in line:
-                    try:
-                        similarity_str = line.split('similarity:')[1].strip()
-                        if similarity_str.lower() not in ['none', 'n/a']:
-                            similarity = float(similarity_str)
-                            print(f"Successfully extracted similarity: {similarity}%")
-                    except (ValueError, IndexError) as e:
-                        print(f"Error parsing similarity: {e}")
-                        continue
-            # If similarity is still None, try to extract from original query
             if similarity is None:
-                # Case insensitive search for similarity pattern
                 similarity_match = re.search(r'similarity\s+(\d+(?:\.\d+)?)\s*%', query.lower())
                 if similarity_match:
                     try:
                         similarity = float(similarity_match.group(1))
                         print(f"Extracted similarity from query: {similarity}%")
-                    except ValueError as e:
-                        print(f"Error parsing similarity from query: {e}")
-            # Check if query contains resolution-related terms
-            resolution_terms = {
-                'better': 'less',
-                'best': 'less',
-                'highest': 'less',
-                'good': 'less',
-                'fine': 'less',
-                'worse': 'greater',
-                'worst': 'greater',
-                'lowest': 'greater',
-                'poor': 'greater',
-                'resolution': None,
-                'å': None,
-                'angstrom': None,
-                'than': None,
-                'under': 'less',
-                'below': 'less',
-                'above': 'greater',
-                'over': 'greater'
-            }
-            # Check if the original query mentions resolution
-            query_lower = query.lower()
-            # Determine resolution direction from query
-            for term, direction in resolution_terms.items():
-                if term in query_lower:
-                    has_resolution_query = True
-                    if direction:  # if not None
-                        resolution_direction = direction
-            # Also check for numerical values with Å
-            resolution_match = re.search(r'(\d+\.?\d*)\s*å?.*resolution', query_lower)
-            if resolution_match:
-                has_resolution_query = True
-                try:
                     resolution_limit = float(resolution_match.group(1))
-                except ValueError:
-                    pass
-            # Clean and parse LLM response
-            for line in response.split('\n'):
-                if 'Resolution:' in line:
-                    value = line.split('Resolution:')[1].strip()
-                    if value.lower() not in ['none', 'n/a'] and has_resolution_query:
-                        try:
-                            # Extract just the number
-                            res_value = ''.join(c for c in value if c.isdigit() or c == '.')
-                            resolution_limit = float(res_value)
-                        except ValueError:
-                            pass
-                elif 'Method:' in line:
-                    value = line.split('Method:')[1].strip()
-                    if value.lower() not in ['none', 'n/a']:
-                        method = value.upper()
-                elif 'Sequence:' in line:
-                    value = line.split('Sequence:')[1].strip()
-                    if value.lower() not in ['none', 'n/a']:
-                        sequence = value
-                elif 'PDB_ID:' in line:
-                    value = line.split('PDB_ID:')[1].strip()
-                    if value.lower() not in ['none', 'n/a']:
-                        pdb_id = value
-                elif 'Organism:' in line:
-                    value = line.split('Organism:')[1].strip()
-                    if value.lower() not in ['none', 'n/a']:
-                        organism = value
-            # Build search query
             queries = []
-            # Check if the query contains a protein sequence pattern
-            # Check for amino acid sequence (minimum 25 residues)
             query_words = query.split()
             for word in query_words:
-                # Check if the word consists of valid amino acid letters
-                if (len(word) >= 25 and  # minimum 25 residues requirement
                     all(c in 'ACDEFGHIKLMNPQRSTVWY' for c in word.upper()) and
                     sum(c.isupper() for c in word) / len(word) > 0.8):
                     sequence = word
                     break
-            # If sequence is found, use SequenceQuery
             if sequence:
                 if len(sequence) < 25:
                     print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
-                    sequence = None
                 else:
-                    # Use the previously extracted similarity value
                     if similarity is None:
-                        similarity = 100  # default value
                         print("No similarity specified, using default 100%")
-                    identity_cutoff = similarity / 100.0  # Convert percentage to decimal
-                    print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff}) for sequence: {sequence}")
                     sequence_query = SequenceQuery(
                         sequence,
                         identity_cutoff=identity_cutoff,
@@ -238,32 +221,8 @@ class PDBSearchAssistant:
                     )
                     queries.append(sequence_query)
                     print(f"Created sequence query with parameters: {sequence_query.params}")
-            # If no sequence, proceed with text search
-            else:
-                # Clean the original query and add text search
-                clean_query = query.lower()
-                # Remove resolution numbers and terms if they exist
-                if has_resolution_query:
-                    clean_query = re.sub(r'\d+\.?\d*\s*å?', '', clean_query)
-                    for term in resolution_terms:
-                        clean_query = clean_query.replace(term, '')
-                # Clean up extra spaces and trim
-                clean_query = ' '.join(clean_query.split())
-                print("Cleaned query:", clean_query)
-                # Add text search if query is not empty
-                if clean_query.strip():
-                    text_query = AttributeQuery(
-                        attribute="struct.title",
-                        operator="contains_phrase",
-                        value=clean_query
-                    )
-                    queries.append(text_query)
-            # Add resolution filter if specified
             if resolution_limit and has_resolution_query:
                 operator = "less_or_equal" if resolution_direction == "less" else "greater_or_equal"
                 print(f"Adding resolution filter: {operator} {resolution_limit}Å")
@@ -273,6 +232,7 @@ class PDBSearchAssistant:
                     value=resolution_limit
                 )
                 queries.append(resolution_query)
             # Add PDB ID search if specified
             if pdb_id:

             Format:
             Protein: [protein name or type]
             Resolution: [maximum resolution in Å, if mentioned]
             Sequence: [any sequence mentioned]
             PDB_ID: [specific PDB ID if mentioned]
             Method: [experimental method if mentioned]
+            Organism: [organism/species if mentioned]
+            Similarity: [similarity percentage if mentioned]
             Examples:
+            Query: "Find structures with sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN and resolution better than 2.5Å"
+            Protein: none
+            Resolution: 2.5
+            Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN
+            PDB_ID: none
+            Method: none
+            Organism: none
+            Similarity: 100
             Query: "Find human insulin structures with X-ray better than 2.5Å resolution"
             Protein: insulin
             Resolution: 2.5
             Sequence: none
             PDB_ID: none
             Method: X-RAY
+            Organism: Homo sapiens
+            Similarity: none
+            Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN"
             Protein: none
             Resolution: none
+            Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRN
             PDB_ID: none
             Method: none
+            Organism: none
             Similarity: 90
             Query: "Get sequence of PDB ID 8ET6"
             Sequence: none
             PDB_ID: 8ET6
             Method: none
             Now analyze:
             Query: {query}
             """
             organism = None
             has_resolution_query = False
             resolution_direction = "less"
+            similarity = None
+            print("Raw LLM response:", response)
+            # Extract resolution with improved pattern matching
+            # Look for the first valid resolution value (non-zero)
+            resolution_matches = re.finditer(r'[Rr]esolution:\s*(\d+(?:\.\d+)?)', response)
+            for match in resolution_matches:
+                try:
+                    value = float(match.group(1))
+                    if value > 0:  # Only accept positive resolution values
+                        resolution_limit = value
+                        has_resolution_query = True
+                        print(f"Extracted resolution: {resolution_limit}Å")
+                        break  # Stop after finding the first valid resolution
+                except ValueError:
+                    continue
+            # Clean and normalize remaining response
+            # Remove all resolution entries to avoid confusion
+            cleaned_response = re.sub(r'[Rr]esolution:\s*\d+(?:\.\d+)?(?:\s*Å?)?\s*', '', response)
+            # Split remaining response into clean key-value pairs
+            response_pairs = {}
+            for pair in re.finditer(r'(\w+):\s*([^:]+?)(?=\s+\w+:|$)', cleaned_response):
+                key, value = pair.groups()
+                key = key.lower()
+                value = value.strip()
+                if value.lower() not in ['none', 'n/a']:
+                    response_pairs[key] = value
+            print("Parsed response pairs:", response_pairs)  # Debug print
+            # Extract sequence and similarity from cleaned pairs
+            if 'sequence' in response_pairs:
+                sequence = response_pairs['sequence']
+                if len(sequence) >= 25:
+                    print(f"Extracted sequence: {sequence}")
+            if 'similarity' in response_pairs:
+                try:
+                    similarity_str = response_pairs['similarity'].replace('%', '')
+                    similarity = float(similarity_str)
+                    print(f"Extracted similarity: {similarity}%")
+                except ValueError:
+                    pass
+            if 'pdb_id' in response_pairs:
+                pdb_id = response_pairs['pdb_id'].upper()
+            if 'method' in response_pairs:
+                method = response_pairs['method'].upper()
+            if 'organism' in response_pairs:
+                organism = response_pairs['organism']
+            # If similarity not found in LLM response, try query
             if similarity is None:
                 similarity_match = re.search(r'similarity\s+(\d+(?:\.\d+)?)\s*%', query.lower())
                 if similarity_match:
                     try:
                         similarity = float(similarity_match.group(1))
                         print(f"Extracted similarity from query: {similarity}%")
+                    except ValueError:
+                        pass
+            # If still no similarity specified and sequence exists, use default
+            if similarity is None and sequence:
+                similarity = 100
+                print("No similarity specified, using default 100%")
+            # Parse resolution from query if not found in LLM response
+            if not has_resolution_query:
+                resolution_pattern = r'resolution (?:better|worse|less|greater) than (\d+\.?\d*)(?:\s*Å|A)?'
+                resolution_match = re.search(resolution_pattern, query.lower())
+                if resolution_match:
                     resolution_limit = float(resolution_match.group(1))
+                    has_resolution_query = True
+                    print(f"Extracted resolution from query: {resolution_limit}Å")
+            # Build queries list
             queries = []
+            # Add sequence query if present
             query_words = query.split()
             for word in query_words:
+                if (len(word) >= 25 and
                     all(c in 'ACDEFGHIKLMNPQRSTVWY' for c in word.upper()) and
                     sum(c.isupper() for c in word) / len(word) > 0.8):
                     sequence = word
                     break
             if sequence:
                 if len(sequence) < 25:
                     print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
                 else:
                     if similarity is None:
+                        similarity = 100
                         print("No similarity specified, using default 100%")
+                    identity_cutoff = similarity / 100.0
+                    print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff})")
                     sequence_query = SequenceQuery(
                         sequence,
                         identity_cutoff=identity_cutoff,
                     )
                     queries.append(sequence_query)
                     print(f"Created sequence query with parameters: {sequence_query.params}")
+            # Add resolution query if present
             if resolution_limit and has_resolution_query:
                 operator = "less_or_equal" if resolution_direction == "less" else "greater_or_equal"
                 print(f"Adding resolution filter: {operator} {resolution_limit}Å")
                     value=resolution_limit
                 )
                 queries.append(resolution_query)
+                print(f"Created resolution query with cutoff: {resolution_limit}Å")
             # Add PDB ID search if specified
             if pdb_id: