Spaces:

lkjjj26
/

query

Sleeping

App Files Community

lkjjj26 committed on Jan 10

Commit

727ecf1

1 Parent(s): a51bfce

update

Browse files

Files changed (1) hide show

app.py +104 -107

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from transformers import pipeline
-from rcsbsearchapi import TextQuery, AttributeQuery
 from rcsbsearchapi.search import Sort, SequenceQuery
 import os
 from dotenv import load_dotenv
@@ -31,7 +31,7 @@ class PDBSearchAssistant:
             "text2text-generation",
             model=model_name,
             max_new_tokens=1024,
-            temperature=0.3,
             torch_dtype="auto",
             device="cpu"
         )
@@ -44,6 +44,7 @@ class PDBSearchAssistant:
             4. Specific PDB ID
             5. Experimental method (X-RAY, EM, NMR)
             6. Organism/Species information
             Format:
             Protein: [protein name or type]
@@ -62,13 +63,14 @@ class PDBSearchAssistant:
             PDB_ID: none
             Method: X-RAY
-            Query: "Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
             Protein: none
             Organism: none
             Resolution: none
             Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
             PDB_ID: none
             Method: none
             Query: "Get sequence of PDB ID 8ET6"
             Protein: none
@@ -86,32 +88,6 @@ class PDBSearchAssistant:
             PDB_ID: none
             Method: none
-            Query: "Show me E. coli protein structures solved by Cryo-EM"
-            Protein: none
-            Organism: Escherichia coli
-            Resolution: none
-            Sequence: none
-            PDB_ID: none
-            Method: EM
-            Query: "Find S. cerevisiae structures with resolution better than 1.8Å"
-            Protein: none
-            Organism: Saccharomyces cerevisiae
-            Resolution: 1.8
-            Sequence: none
-            PDB_ID: none
-            Method: none
-            Query: "Sequence of 7BZ5"
-            Protein: none
-            Organism: none
-            Resolution: none
-            Sequence: none
-            PDB_ID: 7BZ5
-            Method: none
             Now analyze:
             Query: {query}
             """
@@ -135,6 +111,32 @@ class PDBSearchAssistant:
             organism = None
             has_resolution_query = False
             resolution_direction = "less"
             # Check if query contains resolution-related terms
             resolution_terms = {
@@ -166,45 +168,47 @@ class PDBSearchAssistant:
                     has_resolution_query = True
                     if direction:  # if not None
                         resolution_direction = direction
             # Also check for numerical values with Å
-            if re.search(r'\d+\.?\d*\s*å?', query_lower):
                 has_resolution_query = True
             # Clean and parse LLM response
             for line in response.split('\n'):
                 if 'Resolution:' in line:
-                    value = line.split('Resolution:')[1].strip().split(" ")[0].strip()
                     if value.lower() not in ['none', 'n/a'] and has_resolution_query:
-                        resolution_limit = float(value)
-                        # try:
-                        #     # Extract just the number
-                        #     res_value = ''.join(c for c in value if c.isdigit() or c == '.')
-                        #     resolution_limit = float(res_value)
-                        # except ValueError:
-                        #     pass
-                if 'Method:' in line:
                     value = line.split('Method:')[1].strip()
                     if value.lower() not in ['none', 'n/a']:
                         method = value.upper()
-                if 'Sequence:' in line:
                     value = line.split('Sequence:')[1].strip()
                     if value.lower() not in ['none', 'n/a']:
                         sequence = value
-                if 'PDB_ID:' in line:
-                    value = line.split('PDB_ID:')[1].strip().split(" ")[0].strip()
                     if value.lower() not in ['none', 'n/a']:
                         pdb_id = value
-                if 'Protein:' in line:
-                    value = line.split('Protein:')[1].split('Resolution:')[0].strip()
                     if value.lower() not in ['none', 'n/a']:
                         organism = value
             # Build search query
             queries = []
-            print(organism)
             # Check if the query contains a protein sequence pattern
             # Check for amino acid sequence (minimum 25 residues)
             query_words = query.split()
@@ -222,14 +226,21 @@ class PDBSearchAssistant:
                     print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
                     sequence = None
                 else:
-                    print(f"Adding sequence search with identity 100% for sequence: {sequence}")
                     sequence_query = SequenceQuery(
                         sequence,
-                        identity_cutoff=1.0,  # 100% identity
                         evalue_cutoff=1,
                         sequence_type="protein"
                     )
                     queries.append(sequence_query)
             # If no sequence, proceed with text search
             else:
                 # Clean the original query and add text search
@@ -298,7 +309,7 @@ class PDBSearchAssistant:
             # Combine queries with AND operator
             if queries:
-                final_query = queries[1]
                 for q in queries[1:]:
                     final_query = final_query & q
@@ -335,14 +346,17 @@ class PDBSearchAssistant:
                                 continue
                             structure_data = response.json()
                             # 결과 구성
                             result = {
                                 'PDB ID': pdb_id,
                                 'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
                                 'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
-                                'Title': structure_data.get('struct', {}).get('title', 'N/A'),
                                 'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
                             }
                             results.append(result)
@@ -378,34 +392,10 @@ class PDBSearchAssistant:
                 pdir=self.pdb_dir,
                 file_format="pdb"
             )
-            print(pdb_path)
             if not pdb_path or not os.path.exists(pdb_path):
                 print(f"Failed to download PDB file for {pdb_id}")
-                structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
-                response = requests.get(structure_url)
-                structure_data = response.json() if response.status_code == 200 else {}
-                sequence_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1"
-                seq_response = requests.get(sequence_url)
-                seq_data = seq_response.json() if response.status_code == 200 else {}
-                sequence = seq_data.get('entity_poly', {}).get('pdbx_seq_one_letter_code', 'N/A')
-                sequences = []
-                chain_info = {
-                            'chain_id': "A", # chain.id, 임의 설정 api 3개써서 가져오기는 가능
-                            'entity_id': '1',  # Default entity ID
-                            'description': structure_data.get('struct', {}).get('title', 'N/A'),
-                            'sequence': sequence,
-                            'length': len(sequence),
-                            'resolution': structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0],
-                            'method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
-                            'release_date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
-                        }
-                sequences.append(chain_info)
-                return sequences
             # Parse structure
             parser = PDB.PDBParser(QUIET=True)
@@ -462,17 +452,7 @@ class PDBSearchAssistant:
     def process_query(self, query):
         """Process query and return results"""
         try:
-            # First check if the query is just a PDB ID
-            query_cleaned = query.strip().upper()
-            if re.match(r'^[0-9A-Za-z]{4}$', query_cleaned):
-                # Direct PDB ID query
-                sequences = self.get_sequences_by_pdb_id(query_cleaned)
-                return {
-                    "type": "sequence",
-                    "results": sequences
-                }
-            # If not a direct PDB ID, proceed with LLM processing
             formatted_prompt = self.prompt_template.format(query=query)
             response = self.pipe(formatted_prompt)[0]['generated_text']
             print("Generated parameters:", response)
@@ -491,6 +471,7 @@ class PDBSearchAssistant:
             is_sequence_query = any(keyword in query.lower() for keyword in sequence_keywords)
             if is_sequence_query and pdb_id:
                 sequences = self.get_sequences_by_pdb_id(pdb_id)
                 return {
                     "type": "sequence",
@@ -507,6 +488,26 @@ class PDBSearchAssistant:
             print(f"Error processing query: {str(e)}")
             return {"type": "structure", "results": []}
 def render_html(pdb_id):
     if pdb_id is None:
@@ -550,8 +551,8 @@ def create_interactive_table(df):
     if df.empty:
         return go.Figure()
-    # Reorder columns
-    column_order = ['PDB ID', 'Resolution', 'Method', 'Title', 'Release Date']
     df = df[column_order]
     # Release Date 형식 변경 (YYYY-MM-DD)
@@ -562,8 +563,8 @@ def create_interactive_table(df):
         header=dict(
             values=list(df.columns),
             fill_color='paleturquoise',
-            align='center',  # 헤더 중앙 정렬
-            font=dict(size=16),  # 헤더 글자 크기 증가s
         ),
         cells=dict(
             values=[
@@ -572,11 +573,11 @@ def create_interactive_table(df):
                  for cell in df[col]]
                 for i, col in enumerate(df.columns)
             ],
-            align='center',  # 셀 내용 중앙 정렬
-            font=dict(size=15),  # 셀 글자 크기 증가
-            height=35  # 셀 높이 증가
         ),
-        columnwidth=[80, 80, 100, 400, 100],
         customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
                    for i in range(len(df.columns))],
         hoverlabel=dict(bgcolor='white')
@@ -585,7 +586,7 @@ def create_interactive_table(df):
     # Update table layout
     table.update_layout(
         margin=dict(l=20, r=20, t=20, b=20),
-        height=450,  # 테이블 전체 높이 증가
         autosize=True
     )
@@ -932,7 +933,7 @@ app_ui = ui.page_fluid(
             .pdb-selector {
                 display: flex;
-                align-items: center;
                 justify-content: flex-start;
                 gap: 5px;
                 margin-bottom: 20px;
@@ -949,7 +950,7 @@ app_ui = ui.page_fluid(
             .pdb-selector select {
                 margin-left: 0;
-                vertical-align: middle;
                 display: inline-block;
             }
@@ -1010,7 +1011,7 @@ app_ui = ui.page_fluid(
                             ui.p("Example queries:"),
                             ui.tags.ul(
                                 ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
-                                ui.tags.li("Find structures containing sequence MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
                                 ui.tags.li("Sequence of PDB ID 8ET6")
                             )
                         )
@@ -1097,11 +1098,6 @@ def server(input, output, session):
                 for line in input.query().split():
                     if re.match(r'^[0-9A-Za-z]{4}$', line):
                         pdb_ids.append(line.upper())
-            # Clear the results table for sequence queries
-            @output
-            @render_widget
-            def results_table():
-                return create_interactive_table(pd.DataFrame())
         else:
             df = pd.DataFrame(query_results["results"])
             if df.empty:
@@ -1116,6 +1112,7 @@ def server(input, output, session):
         if pdb_ids:
             pdb_ids_store.set(pdb_ids)
             ui.update_select(
                 "selected_pdb",
                 choices=pdb_ids,

 from transformers import pipeline
+from rcsbsearchapi import TextQuery, AttributeQuery, Query
 from rcsbsearchapi.search import Sort, SequenceQuery
 import os
 from dotenv import load_dotenv
             "text2text-generation",
             model=model_name,
             max_new_tokens=1024,
+            temperature=0.1,
             torch_dtype="auto",
             device="cpu"
         )
             4. Specific PDB ID
             5. Experimental method (X-RAY, EM, NMR)
             6. Organism/Species information
+            7. Sequence similarity (in %)
             Format:
             Protein: [protein name or type]
             PDB_ID: none
             Method: X-RAY
+            Query: "Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"
             Protein: none
             Organism: none
             Resolution: none
             Sequence: MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL
             PDB_ID: none
             Method: none
+            Similarity: 90
             Query: "Get sequence of PDB ID 8ET6"
             Protein: none
             PDB_ID: none
             Method: none
             Now analyze:
             Query: {query}
             """
             organism = None
             has_resolution_query = False
             resolution_direction = "less"
+            similarity = None  # Initialize similarity
+            print("Raw LLM response:", response)  # Debug print
+            # Parse LLM response first to get similarity value
+            for line in response.split('\n'):
+                line = line.strip().lower()  # Convert to lowercase
+                if 'similarity:' in line:
+                    try:
+                        similarity_str = line.split('similarity:')[1].strip()
+                        if similarity_str.lower() not in ['none', 'n/a']:
+                            similarity = float(similarity_str)
+                            print(f"Successfully extracted similarity: {similarity}%")
+                    except (ValueError, IndexError) as e:
+                        print(f"Error parsing similarity: {e}")
+                        continue
+            # If similarity is still None, try to extract from original query
+            if similarity is None:
+                # Case insensitive search for similarity pattern
+                similarity_match = re.search(r'similarity\s+(\d+(?:\.\d+)?)\s*%', query.lower())
+                if similarity_match:
+                    try:
+                        similarity = float(similarity_match.group(1))
+                        print(f"Extracted similarity from query: {similarity}%")
+                    except ValueError as e:
+                        print(f"Error parsing similarity from query: {e}")
             # Check if query contains resolution-related terms
             resolution_terms = {
                     has_resolution_query = True
                     if direction:  # if not None
                         resolution_direction = direction
             # Also check for numerical values with Å
+            resolution_match = re.search(r'(\d+\.?\d*)\s*å?.*resolution', query_lower)
+            if resolution_match:
                 has_resolution_query = True
+                try:
+                    resolution_limit = float(resolution_match.group(1))
+                except ValueError:
+                    pass
             # Clean and parse LLM response
             for line in response.split('\n'):
                 if 'Resolution:' in line:
+                    value = line.split('Resolution:')[1].strip()
                     if value.lower() not in ['none', 'n/a'] and has_resolution_query:
+                        try:
+                            # Extract just the number
+                            res_value = ''.join(c for c in value if c.isdigit() or c == '.')
+                            resolution_limit = float(res_value)
+                        except ValueError:
+                            pass
+                elif 'Method:' in line:
                     value = line.split('Method:')[1].strip()
                     if value.lower() not in ['none', 'n/a']:
                         method = value.upper()
+                elif 'Sequence:' in line:
                     value = line.split('Sequence:')[1].strip()
                     if value.lower() not in ['none', 'n/a']:
                         sequence = value
+                elif 'PDB_ID:' in line:
+                    value = line.split('PDB_ID:')[1].strip()
                     if value.lower() not in ['none', 'n/a']:
                         pdb_id = value
+                elif 'Organism:' in line:
+                    value = line.split('Organism:')[1].strip()
                     if value.lower() not in ['none', 'n/a']:
                         organism = value
             # Build search query
             queries = []
             # Check if the query contains a protein sequence pattern
             # Check for amino acid sequence (minimum 25 residues)
             query_words = query.split()
                     print("Warning: Sequence must be at least 25 residues long. Skipping sequence search.")
                     sequence = None
                 else:
+                    # Use the previously extracted similarity value
+                    if similarity is None:
+                        similarity = 100  # default value
+                        print("No similarity specified, using default 100%")
+                    identity_cutoff = similarity / 100.0  # Convert percentage to decimal
+                    print(f"Adding sequence search with identity {similarity}% (cutoff: {identity_cutoff}) for sequence: {sequence}")
                     sequence_query = SequenceQuery(
                         sequence,
+                        identity_cutoff=identity_cutoff,
                         evalue_cutoff=1,
                         sequence_type="protein"
                     )
                     queries.append(sequence_query)
+                    print(f"Created sequence query with parameters: {sequence_query.params}")
             # If no sequence, proceed with text search
             else:
                 # Clean the original query and add text search
             # Combine queries with AND operator
             if queries:
+                final_query = queries[0]
                 for q in queries[1:]:
                     final_query = final_query & q
                                 continue
                             structure_data = response.json()
                             # 결과 구성
                             result = {
                                 'PDB ID': pdb_id,
+                                'Title': structure_data.get('struct', {}).get('title', 'N/A'),
+                                '# of total residues': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_residues_total', 'N/A'),
+                                '# of atoms of protein': structure_data.get('refine_hist', [{}])[0].get('pdbx_number_atoms_protein', 'N/A'),
                                 'Resolution': f"{structure_data.get('rcsb_entry_info', {}).get('resolution_combined', [0.0])[0]:.2f}Å",
                                 'Method': structure_data.get('exptl', [{}])[0].get('method', 'Unknown'),
                                 'Release Date': structure_data.get('rcsb_accession_info', {}).get('initial_release_date', 'N/A')
                             }
                             results.append(result)
                 pdir=self.pdb_dir,
                 file_format="pdb"
             )
             if not pdb_path or not os.path.exists(pdb_path):
                 print(f"Failed to download PDB file for {pdb_id}")
+                return []
             # Parse structure
             parser = PDB.PDBParser(QUIET=True)
     def process_query(self, query):
         """Process query and return results"""
         try:
+            # Get search parameters from LLM
             formatted_prompt = self.prompt_template.format(query=query)
             response = self.pipe(formatted_prompt)[0]['generated_text']
             print("Generated parameters:", response)
             is_sequence_query = any(keyword in query.lower() for keyword in sequence_keywords)
             if is_sequence_query and pdb_id:
+                # Get sequences for the PDB ID
                 sequences = self.get_sequences_by_pdb_id(pdb_id)
                 return {
                     "type": "sequence",
             print(f"Error processing query: {str(e)}")
             return {"type": "structure", "results": []}
+def pdbsummary(name):
+    search_engine = ProteinSearchEngine()
+    query = ProteinQuery(
+        name,
+        max_resolution= 5.0
+    )
+    results = search_engine.search(query)
+    answer = ""
+    for i, structure in enumerate(results, 1):
+        answer += f"\n{i}. PDB ID : {structure.pdb_id}\n"
+        answer += f"\nResolution : {structure.resolution:.2f} A \n"
+        answer += f"Method : {structure.method}\n Title : {structure.title}\n"
+        answer += f"Release Date : {structure.release_date}\n Sequence length: {len(structure.sequence)} aa\n"
+        answer += f"    Sequence:\n {structure.sequence}\n"
+    return answer
 def render_html(pdb_id):
     if pdb_id is None:
     if df.empty:
         return go.Figure()
+    # Reorder columns - Add '# of atoms of protein' to the column order
+    column_order = ['PDB ID', 'Resolution', 'Title','# of total residues', '# of atoms of protein', 'Method','Release Date']
     df = df[column_order]
     # Release Date 형식 변경 (YYYY-MM-DD)
         header=dict(
             values=list(df.columns),
             fill_color='paleturquoise',
+            align='center',
+            font=dict(size=16),
         ),
         cells=dict(
             values=[
                  for cell in df[col]]
                 for i, col in enumerate(df.columns)
             ],
+            align='center',
+            font=dict(size=15),
+            height=35
         ),
+        columnwidth=[80, 80, 400, 100, 100, 100, 100],  # Updated columnwidth to include new column
         customdata=[['html'] * len(df) if i == 0 else [''] * len(df)
                    for i in range(len(df.columns))],
         hoverlabel=dict(bgcolor='white')
     # Update table layout
     table.update_layout(
         margin=dict(l=20, r=20, t=20, b=20),
+        height=450,
         autosize=True
     )
             .pdb-selector {
                 display: flex;
+                align-items: ;
                 justify-content: flex-start;
                 gap: 5px;
                 margin-bottom: 20px;
             .pdb-selector select {
                 margin-left: 0;
+                vertical-align: left;
                 display: inline-block;
             }
                             ui.p("Example queries:"),
                             ui.tags.ul(
                                 ui.tags.li("Human hemoglobin C resolution better than 2.5Å"),
+                                ui.tags.li("Find structures containing sequence with similarity 90% MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKNL"),
                                 ui.tags.li("Sequence of PDB ID 8ET6")
                             )
                         )
                 for line in input.query().split():
                     if re.match(r'^[0-9A-Za-z]{4}$', line):
                         pdb_ids.append(line.upper())
         else:
             df = pd.DataFrame(query_results["results"])
             if df.empty:
         if pdb_ids:
             pdb_ids_store.set(pdb_ids)
+            # Update only one dropdown
             ui.update_select(
                 "selected_pdb",
                 choices=pdb_ids,