File size: 7,556 Bytes
f60f277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import requests
from typing import List, Dict, Optional
from dataclasses import dataclass

@dataclass
class ProteinQuery:
    """Search criteria consumed by ``ProteinSearchEngine.search``."""
    # Protein name; sent to UniProt wrapped in double quotes.
    name: str
    # Optional organism restriction appended to the UniProt query string.
    organism: Optional[str] = None
    # Optional list of mutation labels. NOTE(review): no code visible in this
    # file reads this field -- confirm whether it is used elsewhere.
    mutations: Optional[List[str]] = None
    # Optional lower bound: structures whose resolution is below it are dropped.
    min_resolution: Optional[float] = None
    # Optional upper bound: structures whose resolution is above it are dropped.
    max_resolution: Optional[float] = None

@dataclass
class ProteinStructure:
    """Summary of one PDB entry as assembled by ``ProteinSearchEngine``."""
    # Entry identifier taken from the RCSB search hit.
    pdb_id: str
    # First value of the entry's ``resolution_combined`` list; 0.0 when the
    # key is missing. ``main`` reports this value in Å.
    resolution: float
    # One-letter sequence of polymer entity 1 of the entry ("" if that lookup
    # does not return HTTP 200).
    sequence: str
    # ``struct.title`` of the entry ("" when absent).
    title: str
    # ``method`` of the first ``exptl`` record ("" when absent).
    method: str
    # ``rcsb_accession_info.initial_release_date`` exactly as returned ("" when absent).
    release_date: str

class ProteinSearchEngine:
    """Find PDB structures for a protein by chaining UniProt and RCSB lookups.

    The flow is: text search in UniProt -> for each of the first few UniProt
    accessions, search RCSB for entries referencing that accession -> fetch
    per-entry details -> filter and sort by resolution.
    """

    # Seconds to wait on any single HTTP call. ``requests`` has no default
    # timeout, so without this a stalled server would block forever.
    REQUEST_TIMEOUT = 30
    # Only this many leading UniProt hits are expanded into PDB lookups.
    MAX_UNIPROT_ENTRIES = 5
    PDB_SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query"
    PDB_ENTRY_URL = "https://data.rcsb.org/rest/v1/core/entry"
    PDB_POLYMER_ENTITY_URL = "https://data.rcsb.org/rest/v1/core/polymer_entity"

    def __init__(self, debug=False):
        """Create an engine.

        Args:
            debug: When true, progress and error messages are printed to
                stdout via ``_debug_print``.
        """
        self.debug = debug
        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
        self.pdb_api = "https://data.rcsb.org/graphql"

    def _debug_print(self, message: str) -> None:
        """Print ``message`` only when the engine was created with ``debug=True``."""
        # getattr keeps this safe for instances built without __init__.
        if getattr(self, "debug", False):
            print(f"[DEBUG] {message}")

    def _get_uniprot_data(self, query: ProteinQuery) -> Dict:
        """Search UniProtKB for the protein described by ``query``.

        Args:
            query: Supplies the protein name and, optionally, the organism.

        Returns:
            The decoded JSON response, or an empty dict when UniProt answers
            with a non-200 status.
        """
        search_query = f'"{query.name}"'
        if query.organism:
            # The current UniProt REST API names this field ``organism_name``;
            # the legacy ``organism`` field is not accepted by rest.uniprot.org.
            search_query += f' AND organism_name:"{query.organism}"'

        params = {
            "query": search_query,
            "format": "json",
        }

        self._debug_print(f"UniProt search query: {search_query}")
        response = requests.get(
            f"{self.uniprot_api}/search",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            # Mirror _get_pdb_structures: an HTTP error means "no results"
            # instead of trying to decode an error body as search data.
            self._debug_print(f"Error querying UniProt: {response.text}")
            return {}

        data = response.json()
        self._debug_print(f"UniProt results count: {len(data.get('results', []))}")
        return data

    @staticmethod
    def _build_pdb_search_query(uniprot_id: str) -> Dict:
        """Build the RCSB search payload matching entries that cite ``uniprot_id``."""
        identifiers = "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"
        return {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{identifiers}.database_accession",
                            "operator": "exact_match",
                            "value": uniprot_id,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{identifiers}.database_name",
                            "operator": "exact_match",
                            "value": "UniProt",
                        },
                    },
                ],
            },
            "return_type": "entry",
        }

    def _fetch_structure(self, pdb_id: str) -> Optional[ProteinStructure]:
        """Fetch entry details and the first polymer entity's sequence.

        Returns:
            A ``ProteinStructure``, or ``None`` when the entry lookup does not
            return HTTP 200.
        """
        entry_response = requests.get(
            f"{self.PDB_ENTRY_URL}/{pdb_id}", timeout=self.REQUEST_TIMEOUT
        )
        if entry_response.status_code != 200:
            self._debug_print(f"Skipping {pdb_id}: entry lookup returned {entry_response.status_code}")
            return None
        entry = entry_response.json()

        # Only polymer entity 1 is consulted for the sequence.
        entity_response = requests.get(
            f"{self.PDB_POLYMER_ENTITY_URL}/{pdb_id}/1", timeout=self.REQUEST_TIMEOUT
        )
        sequence = ""
        if entity_response.status_code == 200:
            entity_poly = entity_response.json().get("entity_poly") or {}
            sequence = entity_poly.get("pdbx_seq_one_letter_code") or ""

        # ``or`` fallbacks cover keys that are present but null/empty, which
        # would otherwise raise TypeError/IndexError on the [0] lookups.
        resolutions = (entry.get("rcsb_entry_info") or {}).get("resolution_combined") or [0.0]
        experiments = entry.get("exptl") or [{}]

        return ProteinStructure(
            pdb_id=pdb_id,
            # 0.0 is the placeholder for "no resolution reported".
            resolution=float(resolutions[0] or 0.0),
            sequence=sequence,
            method=experiments[0].get("method", ""),
            title=(entry.get("struct") or {}).get("title", ""),
            release_date=(entry.get("rcsb_accession_info") or {}).get("initial_release_date", ""),
        )

    def _get_pdb_structures(self, uniprot_id: str, uniprot_sequence: Optional[str] = None) -> List[ProteinStructure]:
        """Look up PDB entries that reference ``uniprot_id`` via the RCSB REST APIs.

        Args:
            uniprot_id: UniProt accession to match exactly.
            uniprot_sequence: Accepted for call compatibility; currently unused.

        Returns:
            One ``ProteinStructure`` per search hit whose entry lookup
            succeeds; an empty list when the search returns a non-200 status.
        """
        # NOTE(review): no pagination options are sent, so only the hits in
        # the service's first page are processed -- confirm whether all hits
        # are wanted.
        response = requests.post(
            self.PDB_SEARCH_URL,
            json=self._build_pdb_search_query(uniprot_id),
            timeout=self.REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            self._debug_print(f"Error querying PDB: {response.text}")
            return []

        hits = response.json().get("result_set") or []
        structures = []
        for hit in hits:
            structure = self._fetch_structure(hit["identifier"])
            if structure is not None:
                structures.append(structure)
        return structures

    @staticmethod
    def _within_resolution(structure: ProteinStructure, query: ProteinQuery) -> bool:
        """Return True when ``structure`` satisfies the query's resolution bounds."""
        # Compare against None explicitly so that a bound of 0.0 still applies.
        if query.min_resolution is not None and structure.resolution < query.min_resolution:
            return False
        if query.max_resolution is not None and structure.resolution > query.max_resolution:
            return False
        return True

    def search(self, query: ProteinQuery) -> List[ProteinStructure]:
        """Search for protein structures matching ``query``.

        Args:
            query: Name/organism for the UniProt lookup plus optional
                resolution bounds.

        Returns:
            Unique structures (by PDB ID) within the resolution bounds, sorted
            by ascending resolution. Entries with no reported resolution carry
            0.0 and therefore sort first.
        """
        # 1. Look up candidate proteins in UniProt.
        uniprot_data = self._get_uniprot_data(query)
        entries = uniprot_data.get("results")
        if not entries:
            self._debug_print("No UniProt results found")
            return []

        # 2. Collect structures for the leading UniProt entries. Several
        #    accessions can map to the same PDB entry, so keep the first
        #    occurrence of each PDB ID.
        unique_structures = {}
        for entry in entries[:self.MAX_UNIPROT_ENTRIES]:
            uniprot_id = entry["primaryAccession"]
            sequence = (entry.get("sequence") or {}).get("value", "")
            self._debug_print(f"Processing UniProt ID: {uniprot_id} ({len(sequence)} aa)")

            for structure in self._get_pdb_structures(uniprot_id, sequence):
                unique_structures.setdefault(structure.pdb_id, structure)

        self._debug_print(f"Total structures found: {len(unique_structures)}")

        # 3. Filter by resolution.
        filtered_structures = [
            structure
            for structure in unique_structures.values()
            if self._within_resolution(structure, query)
        ]
        self._debug_print(f"Structures after resolution filter: {len(filtered_structures)}")

        # 4. Sort by resolution (stable, so discovery order breaks ties).
        filtered_structures.sort(key=lambda structure: structure.resolution)

        return filtered_structures

def main():
    """Run an example hemoglobin search and write a text report.

    The report goes to ``protein_search_results.txt`` in the current working
    directory; a confirmation line is printed afterwards.
    """
    # Initialise the search engine.
    search_engine = ProteinSearchEngine(debug=True)

    # Whole-database search, keeping structures with resolution <= 5.
    query = ProteinQuery(
        name="human hemoglobin A",
        max_resolution=5.0,  # deliberately loose resolution limit
    )

    # Run the search.
    results = search_engine.search(query)

    # Write the results to a file. The report contains the non-ASCII "Å", so
    # the encoding is pinned instead of relying on the locale default, which
    # may be unable to encode it.
    with open('protein_search_results.txt', 'w', encoding='utf-8') as f:
        f.write(f"Search Query: {query.name}\n")
        if query.organism:
            f.write(f"Organism: {query.organism}\n")
        f.write(f"Resolution Filter: <= {query.max_resolution} Å\n\n")

        f.write(f"Found {len(results)} structures matching the criteria:\n")
        for i, structure in enumerate(results, 1):
            f.write(f"\n{i}. PDB ID: {structure.pdb_id}\n")
            f.write(f"   Resolution: {structure.resolution:.2f} Å\n")
            f.write(f"   Method: {structure.method}\n")
            f.write(f"   Title: {structure.title}\n")
            f.write(f"   Release Date: {structure.release_date}\n")
            f.write(f"   Sequence Length: {len(structure.sequence)} aa\n")
            f.write(f"   Sequence:\n{structure.sequence}\n")
            f.write("-" * 80 + "\n")

    print("Results have been saved to 'protein_search_results.txt'")

# Run the example search only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()