# Spaces: lkjjj26 / query (Sleeping)
# File size: 7,556 Bytes
# f60f277

import requests
from typing import List, Dict, Optional
from dataclasses import dataclass

@dataclass
class ProteinQuery:
    """Search criteria handed to ``ProteinSearchEngine.search``.

    Only ``name`` is required; every other field defaults to ``None``.
    """

    # Protein name used as the quoted UniProt search term.
    name: str
    # Optional organism restriction appended to the UniProt query.
    organism: Optional[str] = None
    # Optional list of mutation labels (not read by the search engine in this file).
    mutations: Optional[List[str]] = None
    # Optional lower bound applied to a structure's resolution.
    min_resolution: Optional[float] = None
    # Optional upper bound applied to a structure's resolution.
    max_resolution: Optional[float] = None

@dataclass
class ProteinStructure:
    """One PDB entry as returned by ``ProteinSearchEngine``.

    All fields are required and are filled in from the RCSB entry and
    polymer-entity responses.
    """

    # PDB entry identifier taken from the search hit.
    pdb_id: str
    # Resolution value from the entry info; the engine stores 0.0 when the key is absent.
    resolution: float
    # One-letter sequence of the entry's first polymer entity ("" if unavailable).
    sequence: str
    # Entry title.
    title: str
    # Experimental method reported for the entry.
    method: str
    # Initial release date string as delivered by the API.
    release_date: str

class ProteinSearchEngine:
    """Look up a protein in UniProt and collect the PDB structures linked to it.

    The engine queries the UniProt REST API for entries matching a
    ``ProteinQuery``, asks the RCSB search service for PDB entries that
    reference each UniProt accession, and then fetches per-entry details
    from the RCSB data API.

    Network failures raised by ``requests`` (connection errors, timeouts)
    are not caught and propagate to the caller.
    """

    # Seconds to wait for any single HTTP request. Without an explicit
    # timeout ``requests`` may block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    # Only this many of the top UniProt hits are expanded into PDB lookups.
    MAX_UNIPROT_ENTRIES = 5

    def __init__(self, debug=False):
        """Create an engine.

        Args:
            debug: When true, progress and error messages are printed to
                stdout via ``_debug_print``.
        """
        self.debug = debug
        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
        # Not used by the methods of this class; kept so that code reading
        # the attribute keeps working.
        self.pdb_api = "https://data.rcsb.org/graphql"

    def _debug_print(self, message: str) -> None:
        """Print ``message`` only when the engine was created with ``debug`` set."""
        if self.debug:
            print(message)

    def _get_uniprot_data(self, query: ProteinQuery) -> Dict:
        """Search UniProt for basic protein information.

        Args:
            query: Search criteria; ``name`` is used as a quoted search term
                and ``organism``, when set, is added as an extra condition.

        Returns:
            The decoded JSON response, or an empty dict when the service
            answers with a non-200 status.
        """
        search_query = f'"{query.name}"'
        if query.organism:
            # The UniProt REST API names this field ``organism_name``; the
            # legacy ``organism`` field is rejected as an invalid search field.
            search_query += f' AND organism_name:"{query.organism}"'

        params = {
            "query": search_query,
            "format": "json",
        }

        self._debug_print(f"UniProt search query: {search_query}")
        response = requests.get(
            f"{self.uniprot_api}/search",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            # Same best-effort policy as the PDB lookup: report and return
            # "no results" rather than trying to decode an error body.
            self._debug_print(
                f"Error querying UniProt (HTTP {response.status_code}): {response.text}"
            )
            return {}

        data = response.json()
        self._debug_print(f"UniProt results count: {len(data.get('results', []))}")
        return data

    def _fetch_structure(self, pdb_id: str) -> Optional[ProteinStructure]:
        """Fetch entry details for one PDB ID; return None if the entry request fails."""
        structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
        structure_response = requests.get(structure_url, timeout=self.REQUEST_TIMEOUT)
        if structure_response.status_code != 200:
            self._debug_print(
                f"Skipping {pdb_id}: entry request returned HTTP {structure_response.status_code}"
            )
            return None
        structure_data = structure_response.json()

        # Sequence lookup. Only polymer entity "1" is requested.
        # NOTE(review): entity 1 is not necessarily the chain that matched
        # the UniProt accession - confirm whether that matters to callers.
        entity_url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1"
        entity_response = requests.get(entity_url, timeout=self.REQUEST_TIMEOUT)
        sequence = ""
        if entity_response.status_code == 200:
            entity_data = entity_response.json()
            sequence = (entity_data.get("entity_poly") or {}).get(
                "pdbx_seq_one_letter_code", ""
            ) or ""

        # ``or`` fallbacks cover keys that are present but null/empty, which
        # a plain ``.get(key, default)`` would not.
        entry_info = structure_data.get("rcsb_entry_info") or {}
        # 0.0 is the placeholder used when the entry reports no resolution.
        resolutions = entry_info.get("resolution_combined") or [0.0]
        experiments = structure_data.get("exptl") or [{}]

        return ProteinStructure(
            pdb_id=pdb_id,
            resolution=float(resolutions[0]),
            sequence=sequence,
            method=experiments[0].get("method", ""),
            title=(structure_data.get("struct") or {}).get("title", ""),
            release_date=(structure_data.get("rcsb_accession_info") or {}).get(
                "initial_release_date", ""
            ),
        )

    def _get_pdb_structures(self, uniprot_id: str, uniprot_sequence: Optional[str] = None) -> List[ProteinStructure]:
        """Find PDB structures that reference a UniProt accession.

        Args:
            uniprot_id: UniProt accession to look for.
            uniprot_sequence: Accepted for interface compatibility; not used.

        Returns:
            One ``ProteinStructure`` per search hit whose entry details could
            be fetched. Empty when the search answers with a non-200 status.
        """
        url = "https://search.rcsb.org/rcsbsearch/v2/query"

        # Match entries whose reference-sequence identifier is this accession
        # AND whose reference database is UniProt.
        # NOTE(review): no ``request_options`` are sent, so the service's
        # default paging applies - confirm whether all hits are wanted.
        query = {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                            "operator": "exact_match",
                            "value": uniprot_id,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name",
                            "operator": "exact_match",
                            "value": "UniProt",
                        },
                    },
                ],
            },
            "return_type": "entry",
        }

        response = requests.post(url, json=query, timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            self._debug_print(
                f"PDB search for {uniprot_id} returned HTTP {response.status_code}: {response.text}"
            )
            return []

        data = response.json()
        structures = []
        for hit in data.get("result_set") or []:
            structure = self._fetch_structure(hit["identifier"])
            if structure is not None:
                structures.append(structure)
        return structures

    def search(self, query: ProteinQuery) -> List[ProteinStructure]:
        """Search for protein structures matching ``query``.

        Args:
            query: Search criteria. ``min_resolution`` / ``max_resolution``
                are inclusive bounds and are applied whenever they are not
                ``None`` (so a bound of ``0.0`` is honoured).

        Returns:
            Unique structures (by PDB ID) within the resolution bounds,
            sorted by ascending resolution. Structures without a reported
            resolution carry the placeholder ``0.0`` and therefore sort first.
        """
        # 1. Look up the basic entries in UniProt.
        uniprot_data = self._get_uniprot_data(query)
        uniprot_entries = uniprot_data.get('results')
        if not uniprot_entries:
            self._debug_print("No UniProt results found")
            return []

        # 2. Collect PDB structures for the top UniProt entries. The same PDB
        #    entry can reference several accessions, so de-duplicate by ID.
        all_structures = []
        seen_pdb_ids = set()
        for entry in uniprot_entries[:self.MAX_UNIPROT_ENTRIES]:
            uniprot_id = entry['primaryAccession']
            sequence = entry.get('sequence', {}).get('value', '')
            self._debug_print(f"Processing UniProt ID: {uniprot_id}")

            for structure in self._get_pdb_structures(uniprot_id, sequence):
                if structure.pdb_id in seen_pdb_ids:
                    continue
                seen_pdb_ids.add(structure.pdb_id)
                all_structures.append(structure)

        self._debug_print(f"Total structures found: {len(all_structures)}")

        # 3. Filter by resolution. Compare against None explicitly: a
        #    truthiness test would silently ignore a bound of 0.0.
        lower = query.min_resolution
        upper = query.max_resolution
        filtered_structures = [
            structure
            for structure in all_structures
            if (lower is None or structure.resolution >= lower)
            and (upper is None or structure.resolution <= upper)
        ]

        self._debug_print(f"Structures after resolution filter: {len(filtered_structures)}")

        # 4. Sort by resolution (ascending).
        filtered_structures.sort(key=lambda x: x.resolution)

        return filtered_structures

def main():
    """Run a sample hemoglobin search and save the results to a text file.

    Searches for "human hemoglobin A" with a maximum resolution of 5.0,
    writes one section per structure to ``protein_search_results.txt`` in the
    current working directory, and prints a confirmation message.
    """
    # Initialise the search engine.
    search_engine = ProteinSearchEngine(debug=True)

    # Full search (resolution 5 or below).
    query = ProteinQuery(
        name="human hemoglobin A",
        max_resolution=5.0  # relaxed resolution limit
    )

    # Run the search.
    results = search_engine.search(query)

    # Write the results to a file. The report contains the non-ASCII "Å"
    # character, so the encoding is pinned to UTF-8 instead of relying on the
    # platform default, which may be unable to encode it.
    with open('protein_search_results.txt', 'w', encoding='utf-8') as f:
        f.write(f"Search Query: {query.name}\n")
        if query.organism:
            f.write(f"Organism: {query.organism}\n")
        f.write(f"Resolution Filter: <= {query.max_resolution} Å\n\n")

        f.write(f"Found {len(results)} structures matching the criteria:\n")
        for i, structure in enumerate(results, 1):
            f.write(f"\n{i}. PDB ID: {structure.pdb_id}\n")
            f.write(f"   Resolution: {structure.resolution:.2f} Å\n")
            f.write(f"   Method: {structure.method}\n")
            f.write(f"   Title: {structure.title}\n")
            f.write(f"   Release Date: {structure.release_date}\n")
            f.write(f"   Sequence Length: {len(structure.sequence)} aa\n")
            f.write(f"   Sequence:\n{structure.sequence}\n")
            f.write("-" * 80 + "\n")

    print("Results have been saved to 'protein_search_results.txt'")

# Run the sample search only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()