lkjjj26 committed on
Commit
f60f277
·
1 Parent(s): d8b5f68

add UnitprotApi.py

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -3
  2. UniprotKB_P_Sequence_RCSB_API_test.py +194 -0
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  FROM python:3.12
2
 
3
- WORKDIR /code
4
 
5
  COPY ./requirements.txt /code/requirements.txt
6
 
@@ -10,5 +10,4 @@ COPY . .
10
 
11
  EXPOSE 8000
12
 
13
- CMD ["python", "app.py"]
14
-
 
1
  FROM python:3.12
2
 
3
+ WORKDIR /
4
 
5
  COPY ./requirements.txt /code/requirements.txt
6
 
 
10
 
11
  EXPOSE 8000
12
 
13
+ CMD ["python", "app.py"]
 
UniprotKB_P_Sequence_RCSB_API_test.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from typing import List, Dict, Optional
3
+ from dataclasses import dataclass
4
+ import re
5
+ from Bio import pairwise2
6
+ from Bio.Seq import Seq
7
+ import json
8
+
9
@dataclass
class ProteinQuery:
    """Search criteria handed to ``ProteinSearchEngine.search``.

    Only ``name`` is required; every other criterion defaults to ``None``.
    """

    # Protein name; it is wrapped in double quotes and sent to UniProt.
    name: str
    # Optional organism, appended to the UniProt query as an extra clause.
    organism: Optional[str] = None
    # Optional mutation labels. NOTE(review): nothing in this file reads
    # this field yet -- confirm the intended format before relying on it.
    mutations: Optional[List[str]] = None
    # Structures whose resolution value is below this are dropped.
    min_resolution: Optional[float] = None
    # Structures whose resolution value is above this are dropped.
    max_resolution: Optional[float] = None
16
+
17
@dataclass
class ProteinStructure:
    """One PDB entry as reported by the RCSB REST services.

    All six fields are required and are filled in by
    ``ProteinSearchEngine._get_pdb_structures``.
    """

    # PDB entry identifier taken from the search hit.
    pdb_id: str
    # First value of ``rcsb_entry_info.resolution_combined`` for the entry.
    resolution: float
    # One-letter sequence of polymer entity 1 ("" when it cannot be fetched).
    sequence: str
    # ``struct.title`` of the entry.
    title: str
    # ``method`` of the first ``exptl`` record of the entry.
    method: str
    # ``rcsb_accession_info.initial_release_date`` of the entry.
    release_date: str
25
+
26
class ProteinSearchEngine:
    """Find PDB structures for a protein by chaining UniProt and RCSB lookups.

    The flow is: search UniProtKB by name (and optional organism), take the
    first few accessions, ask the RCSB search service for entries that
    reference each accession, fetch per-entry details, then filter and sort
    the structures by resolution.
    """

    # Seconds to wait on any single HTTP call; ``requests`` has no default
    # timeout, so without this a stalled server would block forever.
    REQUEST_TIMEOUT = 30

    # How many of the top UniProt hits are expanded into PDB searches.
    MAX_UNIPROT_ENTRIES = 5

    def __init__(self, debug=False):
        """Create an engine.

        Args:
            debug: When true, progress messages are printed to stdout.
        """
        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
        self.pdb_api = "https://data.rcsb.org/graphql"
        self.debug = debug

    def _debug_print(self, message: str) -> None:
        """Print *message* only when the engine was created with ``debug=True``."""
        if self.debug:
            print(message)

    @staticmethod
    def _extract_resolution(structure_data: Dict) -> float:
        """Return the first ``resolution_combined`` value of an entry.

        Falls back to ``0.0`` when the field is absent, null, empty, or its
        first element is null, instead of raising.
        """
        entry_info = structure_data.get("rcsb_entry_info") or {}
        values = entry_info.get("resolution_combined") or []
        first = values[0] if values else None
        return float(first) if first is not None else 0.0

    @staticmethod
    def _extract_method(structure_data: Dict) -> str:
        """Return ``method`` of the first ``exptl`` record, or ``""``."""
        records = structure_data.get("exptl") or [{}]
        return (records[0] or {}).get("method", "")

    @staticmethod
    def _filter_and_sort(structures, query):
        """Deduplicate, apply the resolution bounds, and sort ascending.

        Args:
            structures: Objects exposing ``pdb_id`` and ``resolution``.
            query: Object exposing ``min_resolution`` and ``max_resolution``
                (either may be ``None`` to disable that bound).

        Returns:
            A new list ordered by ascending ``resolution``.
        """
        # The same PDB entry can be reached through several UniProt
        # accessions; keep only the first occurrence of each id.
        seen_ids = set()
        unique = []
        for structure in structures:
            if structure.pdb_id in seen_ids:
                continue
            seen_ids.add(structure.pdb_id)
            unique.append(structure)

        lower = query.min_resolution
        upper = query.max_resolution
        # Compare against None explicitly so a bound of 0.0 is honoured.
        kept = [
            structure
            for structure in unique
            if (lower is None or structure.resolution >= lower)
            and (upper is None or structure.resolution <= upper)
        ]
        return sorted(kept, key=lambda structure: structure.resolution)

    def _get_uniprot_data(self, query: "ProteinQuery") -> Dict:
        """Search UniProtKB for basic protein information.

        Args:
            query: Search criteria; ``name`` and ``organism`` are used here.

        Returns:
            The decoded JSON response, or ``{}`` when UniProt answers with a
            non-200 status.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        search_query = f'"{query.name}"'
        if query.organism:
            # NOTE(review): confirm "organism" is still an accepted UniProt
            # query field name for this endpoint.
            search_query += f' AND organism:"{query.organism}"'

        params = {
            "query": search_query,
            "format": "json",
        }

        self._debug_print(f"UniProt search query: {search_query}")
        response = requests.get(
            f"{self.uniprot_api}/search",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            self._debug_print(f"Error querying UniProt: {response.text}")
            return {}

        data = response.json()
        self._debug_print(f"UniProt results count: {len(data.get('results', []))}")
        return data

    def _get_pdb_structures(
        self, uniprot_id: str, uniprot_sequence: Optional[str] = None
    ) -> List["ProteinStructure"]:
        """Look up PDB entries that reference a UniProt accession.

        Args:
            uniprot_id: UniProt accession to match exactly.
            uniprot_sequence: Accepted for interface compatibility; it is
                not used by this method.

        Returns:
            One ``ProteinStructure`` per search hit whose entry details could
            be fetched; an empty list when the search itself does not return
            HTTP 200.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        url = "https://search.rcsb.org/rcsbsearch/v2/query"
        identifiers = (
            "rcsb_polymer_entity_container_identifiers"
            ".reference_sequence_identifiers"
        )

        query = {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{identifiers}.database_accession",
                            "operator": "exact_match",
                            "value": uniprot_id,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{identifiers}.database_name",
                            "operator": "exact_match",
                            "value": "UniProt",
                        },
                    },
                ],
            },
            "return_type": "entry",
        }

        response = requests.post(url, json=query, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            self._debug_print(f"Error querying PDB: {response.text}")
            return []

        data = response.json()
        structures = []

        for hit in data.get("result_set", []):
            pdb_id = hit["identifier"]

            # Fetch the entry-level details from the RCSB data API.
            structure_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
            structure_response = requests.get(
                structure_url, timeout=self.REQUEST_TIMEOUT
            )
            if structure_response.status_code != 200:
                continue
            structure_data = structure_response.json()

            # Fetch the sequence. Only the first polymer entity is read.
            # NOTE(review): entity 1 is not necessarily the chain that
            # matched ``uniprot_id`` -- confirm this is acceptable.
            entity_url = (
                f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1"
            )
            entity_response = requests.get(
                entity_url, timeout=self.REQUEST_TIMEOUT
            )
            sequence = ""
            if entity_response.status_code == 200:
                entity_data = entity_response.json()
                entity_poly = entity_data.get("entity_poly") or {}
                sequence = entity_poly.get("pdbx_seq_one_letter_code", "")

            structures.append(
                ProteinStructure(
                    pdb_id=pdb_id,
                    resolution=self._extract_resolution(structure_data),
                    sequence=sequence,
                    method=self._extract_method(structure_data),
                    title=(structure_data.get("struct") or {}).get("title", ""),
                    release_date=(
                        structure_data.get("rcsb_accession_info") or {}
                    ).get("initial_release_date", ""),
                )
            )

        return structures

    def search(self, query: "ProteinQuery") -> List["ProteinStructure"]:
        """Search for protein structures matching *query*.

        Args:
            query: Name, optional organism, and optional resolution bounds.

        Returns:
            Unique structures within the resolution bounds, ordered by
            ascending resolution; an empty list when UniProt yields nothing.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        # 1. Look up the protein in UniProt.
        uniprot_data = self._get_uniprot_data(query)
        if not uniprot_data.get("results"):
            self._debug_print("No UniProt results found")
            return []

        # 2. Collect structures for the top UniProt entries only.
        all_structures = []
        for entry in uniprot_data["results"][: self.MAX_UNIPROT_ENTRIES]:
            uniprot_id = entry["primaryAccession"]
            sequence = (entry.get("sequence") or {}).get("value", "")
            self._debug_print(f"Processing UniProt ID: {uniprot_id}")
            all_structures.extend(self._get_pdb_structures(uniprot_id, sequence))

        self._debug_print(f"Total structures found: {len(all_structures)}")

        # 3./4. Filter by resolution, then sort by resolution.
        filtered_structures = self._filter_and_sort(all_structures, query)
        self._debug_print(
            f"Structures after resolution filter: {len(filtered_structures)}"
        )
        return filtered_structures
159
+
160
def main():
    """Run a sample search and write the results to a text file.

    Searches for "human hemoglobin A" with a maximum resolution of 5.0 and
    writes a report to ``protein_search_results.txt`` in the current
    working directory.
    """
    # Initialise the search engine.
    search_engine = ProteinSearchEngine(debug=True)

    # Full search (resolution of 5 or below).
    query = ProteinQuery(
        name="human hemoglobin A",
        max_resolution=5.0,  # relaxed resolution limit
    )

    # Run the search.
    results = search_engine.search(query)

    # Write the results to a file. The report contains the non-ASCII
    # character "Å", so the encoding is fixed rather than left to the
    # platform default.
    with open('protein_search_results.txt', 'w', encoding='utf-8') as f:
        f.write(f"Search Query: {query.name}\n")
        if query.organism:
            f.write(f"Organism: {query.organism}\n")
        if query.max_resolution is not None:
            f.write(f"Resolution Filter: <= {query.max_resolution} Å\n")
        f.write("\n")

        f.write(f"Found {len(results)} structures matching the criteria:\n")
        for i, structure in enumerate(results, 1):
            f.write(f"\n{i}. PDB ID: {structure.pdb_id}\n")
            f.write(f"   Resolution: {structure.resolution:.2f} Å\n")
            f.write(f"   Method: {structure.method}\n")
            f.write(f"   Title: {structure.title}\n")
            f.write(f"   Release Date: {structure.release_date}\n")
            f.write(f"   Sequence Length: {len(structure.sequence)} aa\n")
            f.write(f"   Sequence:\n{structure.sequence}\n")
            f.write("-" * 80 + "\n")

    print("Results have been saved to 'protein_search_results.txt'")


if __name__ == "__main__":
    main()