Add UniprotApi.py
Browse files- Dockerfile +2 -3
- UniprotKB_P_Sequence_RCSB_API_test.py +194 -0
Dockerfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
FROM python:3.12
|
2 |
|
3 |
-
WORKDIR /
|
4 |
|
5 |
COPY ./requirements.txt /code/requirements.txt
|
6 |
|
@@ -10,5 +10,4 @@ COPY . .
|
|
10 |
|
11 |
EXPOSE 8000
|
12 |
|
13 |
-
CMD ["python", "app.py"]
|
14 |
-
|
|
|
1 |
FROM python:3.12
|
2 |
|
3 |
+
WORKDIR /
|
4 |
|
5 |
COPY ./requirements.txt /code/requirements.txt
|
6 |
|
|
|
10 |
|
11 |
EXPOSE 8000
|
12 |
|
13 |
+
CMD ["python", "app.py"]
|
|
UniprotKB_P_Sequence_RCSB_API_test.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from typing import List, Dict, Optional
|
3 |
+
from dataclasses import dataclass
|
4 |
+
import re
|
5 |
+
from Bio import pairwise2
|
6 |
+
from Bio.Seq import Seq
|
7 |
+
import json
|
8 |
+
|
9 |
+
@dataclass
class ProteinQuery:
    """Search criteria handed to ``ProteinSearchEngine.search``.

    Only ``name`` is required; every other field defaults to ``None``,
    which the search code treats as "no constraint".
    """

    # Free-text protein name; sent to UniProt as a quoted search term.
    name: str
    # Optional organism name used to narrow the UniProt query.
    organism: Optional[str] = None
    # Optional list of mutation labels.
    # NOTE(review): no code visible in this file reads this field yet.
    mutations: Optional[List[str]] = None
    # Lower bound for the structure resolution (reported in Å by main()).
    min_resolution: Optional[float] = None
    # Upper bound for the structure resolution (reported in Å by main()).
    max_resolution: Optional[float] = None
|
16 |
+
|
17 |
+
@dataclass
class ProteinStructure:
    """One PDB entry returned by the structure search.

    All fields are required and positional order is part of the interface.
    """

    # PDB entry identifier.
    pdb_id: str
    # Resolution of the entry; the search code stores 0.0 when the entry
    # reports none.
    resolution: float
    # One-letter-code sequence of a polymer entity of the entry
    # (empty string when it could not be fetched).
    sequence: str
    # Entry title.
    title: str
    # Experimental method reported for the entry.
    method: str
    # Initial release date, kept as the string returned by the API.
    release_date: str
|
25 |
+
|
26 |
+
class ProteinSearchEngine:
    """Look up PDB structures for a protein by going through UniProt.

    The flow is: free-text search in UniProtKB -> take the first few
    accessions -> ask the RCSB search service which polymer entities are
    mapped to each accession -> fetch entry and entity details -> filter and
    sort by resolution.

    All HTTP calls are synchronous and use ``self.timeout``; network errors
    raised by ``requests`` (including timeouts) propagate to the caller.
    """

    # How many UniProt hits are expanded into PDB look-ups per search.
    MAX_UNIPROT_ENTRIES = 5

    # Value stored in ``ProteinStructure.resolution`` when an entry does not
    # report one. NOTE: such entries sort first and pass any upper bound.
    MISSING_RESOLUTION = 0.0

    def __init__(self, debug=False, timeout=30.0):
        """Configure the engine.

        Args:
            debug: When true, progress and error messages are printed.
            timeout: Seconds passed to every ``requests`` call so that a
                stalled server cannot block the program forever.
        """
        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
        self.pdb_api = "https://data.rcsb.org/graphql"
        self.debug = debug
        self.timeout = timeout

    def _debug_print(self, message: str) -> None:
        """Print ``message`` only when the engine was created with debug on."""
        if self.debug:
            print(f"[DEBUG] {message}")

    @staticmethod
    def _build_uniprot_query(query: "ProteinQuery") -> str:
        """Return the UniProtKB query string for ``query``.

        The name is sent as a quoted phrase. The organism, when given, is
        matched through the ``organism_name`` field, which is the field name
        used by the current UniProt REST API.
        """
        search_query = f'"{query.name}"'
        if query.organism:
            search_query += f' AND organism_name:"{query.organism}"'
        return search_query

    @staticmethod
    def _split_entity_identifier(identifier: str) -> tuple:
        """Split a polymer-entity identifier such as ``"4HHB_2"``.

        Returns ``(entry_id, entity_id)``. An identifier without an
        underscore is treated as a bare entry id and mapped to entity
        ``"1"``.
        """
        entry_id, _, entity_id = identifier.partition("_")
        return entry_id, entity_id or "1"

    @staticmethod
    def _extract_resolution(structure_data: Dict) -> float:
        """Return the first combined resolution of an entry payload.

        Falls back to ``MISSING_RESOLUTION`` when the field is absent, null,
        empty, or its first element is null.
        """
        entry_info = structure_data.get("rcsb_entry_info") or {}
        values = entry_info.get("resolution_combined") or []
        if not values or values[0] is None:
            return ProteinSearchEngine.MISSING_RESOLUTION
        return float(values[0])

    @staticmethod
    def _filter_by_resolution(structures, query: "ProteinQuery"):
        """Keep structures inside the query's resolution bounds, best first.

        Bounds are compared with ``is not None`` so that an explicit ``0.0``
        bound is honoured instead of being treated as "no bound".
        """
        lower = query.min_resolution
        upper = query.max_resolution
        kept = [
            structure
            for structure in structures
            if (lower is None or structure.resolution >= lower)
            and (upper is None or structure.resolution <= upper)
        ]
        kept.sort(key=lambda structure: structure.resolution)
        return kept

    def _get_uniprot_data(self, query: "ProteinQuery") -> Dict:
        """Search UniProtKB for basic protein information.

        Returns the decoded JSON payload, or an empty dict when the service
        answers with a non-200 status (mirroring how the PDB look-up reports
        failure with an empty result).
        """
        search_query = self._build_uniprot_query(query)
        params = {
            "query": search_query,
            "format": "json",
        }

        self._debug_print(f"UniProt search query: {search_query}")
        response = requests.get(
            f"{self.uniprot_api}/search",
            params=params,
            timeout=self.timeout,
        )
        if response.status_code != 200:
            self._debug_print(
                f"Error querying UniProt ({response.status_code}): {response.text}"
            )
            return {}

        data = response.json()
        self._debug_print(f"UniProt results count: {len(data.get('results', []))}")
        return data

    def _get_pdb_structures(
        self, uniprot_id: str, uniprot_sequence: Optional[str] = None
    ) -> "List[ProteinStructure]":
        """Fetch PDB structures mapped to ``uniprot_id`` via the RCSB REST APIs.

        Args:
            uniprot_id: UniProt accession to look up.
            uniprot_sequence: Accepted for interface compatibility; not used.

        Returns:
            One ``ProteinStructure`` per PDB entry, carrying the sequence of
            the polymer entity that is mapped to the accession. Empty when
            the search service does not answer with status 200.
        """
        url = "https://search.rcsb.org/rcsbsearch/v2/query"
        ref_ids = (
            "rcsb_polymer_entity_container_identifiers"
            ".reference_sequence_identifiers"
        )

        # Ask for polymer entities rather than entries so the identifier
        # tells us WHICH entity of the entry carries this accession; always
        # reading entity 1 returns the wrong chain for multi-chain entries.
        # NOTE(review): no pagination options are sent, so only the
        # service's default first page of hits is processed - confirm this
        # is intended.
        query = {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{ref_ids}.database_accession",
                            "operator": "exact_match",
                            "value": uniprot_id,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": f"{ref_ids}.database_name",
                            "operator": "exact_match",
                            "value": "UniProt",
                        },
                    },
                ],
            },
            "return_type": "polymer_entity",
        }

        response = requests.post(url, json=query, timeout=self.timeout)
        if response.status_code != 200:
            self._debug_print(
                f"Error querying PDB ({response.status_code}): {response.text}"
            )
            return []

        data = response.json()
        structures = []
        seen_entries = set()

        for hit in data.get("result_set", []):
            pdb_id, entity_id = self._split_entity_identifier(hit["identifier"])
            # Keep a single result per entry even if several of its entities
            # map to the same accession.
            if pdb_id in seen_entries:
                continue
            seen_entries.add(pdb_id)

            # Entry-level details (resolution, method, title, release date).
            structure_response = requests.get(
                f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}",
                timeout=self.timeout,
            )
            if structure_response.status_code != 200:
                continue
            structure_data = structure_response.json()

            # Sequence of the entity that matched the accession.
            entity_response = requests.get(
                f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}",
                timeout=self.timeout,
            )
            sequence = ""
            if entity_response.status_code == 200:
                entity_poly = entity_response.json().get("entity_poly") or {}
                sequence = entity_poly.get("pdbx_seq_one_letter_code", "")

            # ``or`` guards cover keys that are present but null/empty.
            experiments = structure_data.get("exptl") or [{}]
            structures.append(
                ProteinStructure(
                    pdb_id=pdb_id,
                    resolution=self._extract_resolution(structure_data),
                    sequence=sequence,
                    method=experiments[0].get("method", ""),
                    title=(structure_data.get("struct") or {}).get("title", ""),
                    release_date=(
                        structure_data.get("rcsb_accession_info") or {}
                    ).get("initial_release_date", ""),
                )
            )

        return structures

    def search(self, query: "ProteinQuery") -> "List[ProteinStructure]":
        """Search protein structures for ``query``.

        Returns the structures found for the first ``MAX_UNIPROT_ENTRIES``
        UniProt hits, restricted to the query's resolution bounds and sorted
        by ascending resolution. Returns an empty list when UniProt yields
        no results.
        """
        # 1. Basic information from UniProt.
        uniprot_data = self._get_uniprot_data(query)
        entries = uniprot_data.get("results")
        if not entries:
            self._debug_print("No UniProt results found")
            return []

        # 2. Expand the top UniProt entries into PDB structures.
        all_structures = []
        for entry in entries[: self.MAX_UNIPROT_ENTRIES]:
            uniprot_id = entry["primaryAccession"]
            sequence = entry.get("sequence", {}).get("value", "")
            self._debug_print(f"Processing UniProt ID: {uniprot_id}")
            all_structures.extend(self._get_pdb_structures(uniprot_id, sequence))

        self._debug_print(f"Total structures found: {len(all_structures)}")

        # 3./4. Filter by resolution, then sort by resolution.
        filtered_structures = self._filter_by_resolution(all_structures, query)
        self._debug_print(
            f"Structures after resolution filter: {len(filtered_structures)}"
        )
        return filtered_structures
|
159 |
+
|
160 |
+
def main():
    """Run a sample search and write a plain-text report.

    Searches for "human hemoglobin A" with an upper resolution bound of
    5.0 and writes the matches to ``protein_search_results.txt`` in the
    current working directory, then prints a confirmation line.
    """
    # Initialise the search engine.
    search_engine = ProteinSearchEngine(debug=True)

    # Whole-database search limited to resolution <= 5 (a relaxed limit).
    query = ProteinQuery(
        name="human hemoglobin A",
        max_resolution=5.0,
    )

    # Run the search.
    results = search_engine.search(query)

    # Build the report in memory so the file is written in one call.
    lines = [f"Search Query: {query.name}\n"]
    if query.organism:
        lines.append(f"Organism: {query.organism}\n")
    lines.append(f"Resolution Filter: <= {query.max_resolution} Å\n\n")
    lines.append(f"Found {len(results)} structures matching the criteria:\n")

    for i, structure in enumerate(results, 1):
        lines.extend(
            (
                f"\n{i}. PDB ID: {structure.pdb_id}\n",
                f" Resolution: {structure.resolution:.2f} Å\n",
                f" Method: {structure.method}\n",
                f" Title: {structure.title}\n",
                f" Release Date: {structure.release_date}\n",
                f" Sequence Length: {len(structure.sequence)} aa\n",
                f" Sequence:\n{structure.sequence}\n",
                "-" * 80 + "\n",
            )
        )

    # The report contains the non-ASCII character "Å"; pin the encoding so
    # writing does not depend on the platform's default locale encoding.
    with open('protein_search_results.txt', 'w', encoding='utf-8') as f:
        f.writelines(lines)

    print("Results have been saved to 'protein_search_results.txt'")


if __name__ == "__main__":
    main()
|