query / UniprotKB_P_Sequence_RCSB_API_test.py
lkjjj26's picture
updata uniprot.py
ae69c23
raw
history blame
7.56 kB
import requests
from typing import List, Dict, Optional
from dataclasses import dataclass
@dataclass
class ProteinQuery:
    """Search criteria describing which protein structures to look up."""

    # Free-text protein name; the only required field.
    name: str

    # Optional organism used to narrow the UniProt text search.
    organism: Optional[str] = None

    # Optional list of mutations; not read anywhere in the visible code.
    mutations: Optional[List[str]] = None

    # Inclusive resolution bounds (reported in Å); None means "no bound".
    min_resolution: Optional[float] = None
    max_resolution: Optional[float] = None
@dataclass
class ProteinStructure:
    """One PDB entry together with the metadata collected for it."""

    pdb_id: str        # PDB entry identifier returned by the RCSB search
    resolution: float  # first "resolution_combined" value of the entry
    sequence: str      # one-letter sequence fetched for polymer entity 1
    title: str         # entry title ("struct.title")
    method: str        # experimental method of the first "exptl" record
    release_date: str  # initial release date as delivered by the API
class ProteinSearchEngine:
    """Look up PDB structures for a protein via UniProt and RCSB PDB.

    The search runs in three steps: a UniProt text search resolves the
    protein name to accessions, the RCSB search service maps each accession
    to PDB entries, and the RCSB data service supplies per-entry metadata.
    """

    # Endpoints of the RCSB services used by ``_get_pdb_structures``.
    PDB_SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query"
    PDB_CORE_URL = "https://data.rcsb.org/rest/v1/core"

    # Only this many of the top UniProt hits are followed up in PDB.
    MAX_UNIPROT_ENTRIES = 5

    def __init__(self, debug=False, timeout=30.0):
        """Create a search engine.

        Args:
            debug: When true, progress messages are printed to stdout.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.debug = debug
        self.timeout = timeout
        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
        self.pdb_api = "https://data.rcsb.org/graphql"

    def _debug_print(self, message: str) -> None:
        """Print *message* only when the engine was created with debug=True."""
        if self.debug:
            print(f"[DEBUG] {message}")

    @staticmethod
    def _build_uniprot_query(name: str, organism: Optional[str] = None) -> str:
        """Return the UniProt query string for a protein name and organism."""
        search_query = f'"{name}"'
        if organism:
            # The current UniProt REST API names this field "organism_name";
            # the legacy "organism" field is rejected as an invalid field.
            search_query += f' AND organism_name:"{organism}"'
        return search_query

    @staticmethod
    def _extract_resolution(structure_data: Dict) -> float:
        """Return the first combined resolution of an entry, or 0.0 if absent.

        The key may be missing or explicitly null; both cases map to 0.0 so
        such entries keep sorting first, as they always have.
        """
        entry_info = structure_data.get("rcsb_entry_info") or {}
        values = entry_info.get("resolution_combined")
        if not values or values[0] is None:
            return 0.0
        return float(values[0])

    @staticmethod
    def _within_resolution(resolution: float,
                           min_resolution: Optional[float],
                           max_resolution: Optional[float]) -> bool:
        """Tell whether *resolution* lies inside the inclusive bounds.

        A bound of None is ignored. Bounds are compared against None rather
        than by truthiness so that an explicit 0.0 is honoured.
        """
        if min_resolution is not None and resolution < min_resolution:
            return False
        if max_resolution is not None and resolution > max_resolution:
            return False
        return True

    def _get_uniprot_data(self, query: "ProteinQuery") -> Dict:
        """Search UniProtKB for the protein described by *query*.

        Returns:
            The decoded JSON response, or an empty dict when UniProt answers
            with a non-200 status.
        """
        search_query = self._build_uniprot_query(query.name, query.organism)
        params = {
            "query": search_query,
            "format": "json",
        }
        self._debug_print(f"UniProt search query: {search_query}")
        response = requests.get(
            f"{self.uniprot_api}/search", params=params, timeout=self.timeout
        )
        if response.status_code != 200:
            self._debug_print(f"Error querying UniProt: {response.text}")
            return {}
        data = response.json()
        self._debug_print(f"UniProt results count: {len(data.get('results', []))}")
        return data

    def _get_pdb_structures(self, uniprot_id: str,
                            uniprot_sequence: Optional[str] = None) -> List["ProteinStructure"]:
        """Fetch the PDB entries that reference a UniProt accession.

        Args:
            uniprot_id: UniProt accession to look up.
            uniprot_sequence: Accepted for interface compatibility; unused.

        Returns:
            One ProteinStructure per entry whose details could be fetched,
            or an empty list when the search request does not return 200.
        """
        search_request = {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                            "operator": "exact_match",
                            "value": uniprot_id,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name",
                            "operator": "exact_match",
                            "value": "UniProt",
                        },
                    },
                ],
            },
            "return_type": "entry",
        }
        response = requests.post(
            self.PDB_SEARCH_URL, json=search_request, timeout=self.timeout
        )
        if response.status_code != 200:
            self._debug_print(f"Error querying PDB: {response.text}")
            return []

        structures = []
        for hit in response.json().get("result_set", []):
            pdb_id = hit["identifier"]

            # Entry-level metadata; entries that cannot be fetched are skipped.
            structure_response = requests.get(
                f"{self.PDB_CORE_URL}/entry/{pdb_id}", timeout=self.timeout
            )
            if structure_response.status_code != 200:
                continue
            structure_data = structure_response.json()

            # The sequence is always taken from polymer entity 1 of the entry.
            sequence = ""
            entity_response = requests.get(
                f"{self.PDB_CORE_URL}/polymer_entity/{pdb_id}/1",
                timeout=self.timeout,
            )
            if entity_response.status_code == 200:
                entity_poly = entity_response.json().get("entity_poly") or {}
                sequence = entity_poly.get("pdbx_seq_one_letter_code", "")

            # Tolerate a missing, null or empty "exptl" list.
            exptl = structure_data.get("exptl") or [{}]
            structures.append(
                ProteinStructure(
                    pdb_id=pdb_id,
                    resolution=self._extract_resolution(structure_data),
                    sequence=sequence,
                    method=exptl[0].get("method", ""),
                    title=(structure_data.get("struct") or {}).get("title", ""),
                    release_date=(structure_data.get("rcsb_accession_info") or {}).get(
                        "initial_release_date", ""
                    ),
                )
            )
        return structures

    def search(self, query: "ProteinQuery") -> List["ProteinStructure"]:
        """Search protein structures matching *query*.

        Returns:
            The matching structures, each PDB entry at most once, filtered by
            the query's resolution bounds and sorted by ascending resolution.
        """
        # 1. Resolve the protein name to UniProt entries.
        uniprot_data = self._get_uniprot_data(query)
        entries = uniprot_data.get("results")
        if not entries:
            self._debug_print("No UniProt results found")
            return []

        # 2. Collect PDB structures for the top UniProt entries. One PDB entry
        #    can reference several accessions, so de-duplicate by PDB id.
        seen_ids = set()
        all_structures = []
        for entry in entries[: self.MAX_UNIPROT_ENTRIES]:
            uniprot_id = entry["primaryAccession"]
            sequence = entry.get("sequence", {}).get("value", "")
            self._debug_print(f"Processing UniProt ID: {uniprot_id}")
            for structure in self._get_pdb_structures(uniprot_id, sequence):
                if structure.pdb_id in seen_ids:
                    continue
                seen_ids.add(structure.pdb_id)
                all_structures.append(structure)
        self._debug_print(f"Total structures found: {len(all_structures)}")

        # 3. Keep only structures inside the requested resolution window.
        filtered_structures = [
            structure
            for structure in all_structures
            if self._within_resolution(
                structure.resolution, query.min_resolution, query.max_resolution
            )
        ]
        self._debug_print(
            f"Structures after resolution filter: {len(filtered_structures)}"
        )

        # 4. Best (lowest) resolution first.
        filtered_structures.sort(key=lambda structure: structure.resolution)
        return filtered_structures
def main():
    """Run a sample search and write the results to a text file.

    Searches for "human hemoglobin A" with a maximum resolution of 5.0 and
    writes a report to 'protein_search_results.txt' in the working directory.
    """
    # Initialise the search engine.
    search_engine = ProteinSearchEngine(debug=True)

    # Full search, keeping structures with a resolution of 5 or better.
    query = ProteinQuery(
        name="human hemoglobin A",
        max_resolution=5.0,  # relaxed resolution limit
    )

    results = search_engine.search(query)

    # The report contains the non-ASCII "Å", so the encoding is pinned instead
    # of being left to the platform's locale default.
    with open('protein_search_results.txt', 'w', encoding='utf-8') as f:
        f.write(f"Search Query: {query.name}\n")
        if query.organism:
            f.write(f"Organism: {query.organism}\n")
        f.write(f"Resolution Filter: <= {query.max_resolution} Å\n\n")
        f.write(f"Found {len(results)} structures matching the criteria:\n")
        for i, structure in enumerate(results, 1):
            f.write(f"\n{i}. PDB ID: {structure.pdb_id}\n")
            f.write(f" Resolution: {structure.resolution:.2f} Å\n")
            f.write(f" Method: {structure.method}\n")
            f.write(f" Title: {structure.title}\n")
            f.write(f" Release Date: {structure.release_date}\n")
            f.write(f" Sequence Length: {len(structure.sequence)} aa\n")
            f.write(f" Sequence:\n{structure.sequence}\n")
            f.write("-" * 80 + "\n")

    print("Results have been saved to 'protein_search_results.txt'")
# Run the sample search only when this file is executed as a script.
if __name__ == "__main__":
    main()