|
import requests |
|
from typing import List, Dict, Optional |
|
from dataclasses import dataclass |
|
|
|
@dataclass
class ProteinQuery:
    """Search criteria handed to ``ProteinSearchEngine.search``."""

    # Free-text protein name; sent to UniProt as a quoted phrase.
    name: str
    # Optional organism name used to narrow the UniProt query.
    organism: Optional[str] = None
    # Optional list of mutation labels. Carried on the query but not
    # consulted by the search code visible in this module.
    mutations: Optional[List[str]] = None
    # Lower bound on structure resolution; structures with a smaller value
    # are dropped. ``None`` disables the bound.
    min_resolution: Optional[float] = None
    # Upper bound on structure resolution; structures with a larger value
    # are dropped. ``None`` disables the bound.
    max_resolution: Optional[float] = None
|
|
|
@dataclass
class ProteinStructure:
    """Metadata for one PDB entry returned by a search."""

    # PDB entry identifier as returned by the RCSB search service.
    pdb_id: str
    # First reported resolution of the entry; 0.0 when the entry reports none.
    resolution: float
    # One-letter sequence read from the entry's polymer entity 1
    # ("" when that lookup fails).
    sequence: str
    # Entry title ("" when absent).
    title: str
    # Experimental method of the first ``exptl`` record ("" when absent).
    method: str
    # Initial release date string exactly as delivered by the data API.
    release_date: str
|
|
|
class ProteinSearchEngine:
    """Find PDB structures for a protein by way of a UniProt name search.

    ``search`` queries UniProt for matching entries, asks the RCSB search
    service for PDB entries cross-referenced to each accession, fetches
    per-entry metadata from the RCSB data API, and returns the structures
    filtered and ordered by resolution.
    """

    # Seconds allowed for each HTTP call. Without a timeout ``requests``
    # waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    # Number of leading UniProt hits whose structures are collected.
    MAX_UNIPROT_ENTRIES = 5

    def __init__(self, debug=False):
        """Create an engine.

        Args:
            debug: Flag kept on the instance as ``self.debug`` so callers and
                subclasses can consult it; this class does not read it.
        """
        self.debug = debug
        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
        self.pdb_api = "https://data.rcsb.org/graphql"

    def _get_uniprot_data(self, query: "ProteinQuery") -> Dict:
        """Look up basic protein information through the UniProt API.

        Args:
            query: Supplies the protein name and, optionally, the organism.

        Returns:
            The decoded JSON response, or ``{}`` when UniProt answers with
            a status other than 200 (matching how PDB lookups treat failures).
        """
        search_query = f'"{query.name}"'
        if query.organism:
            # ``organism_name`` is the UniProtKB query field for organism
            # names; the previous ``organism`` field is expected to be
            # rejected by rest.uniprot.org.
            search_query += f' AND organism_name:"{query.organism}"'

        params = {
            "query": search_query,
            "format": "json",
        }

        response = requests.get(
            f"{self.uniprot_api}/search",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return {}
        return response.json()

    def _get_pdb_structures(
        self, uniprot_id: str, uniprot_sequence: Optional[str] = None
    ) -> List["ProteinStructure"]:
        """Retrieve structure information from the PDB using its REST APIs.

        Args:
            uniprot_id: UniProt accession the PDB entries must reference.
            uniprot_sequence: Accepted for interface compatibility; unused.

        Returns:
            One ``ProteinStructure`` per search hit whose entry record could
            be fetched; ``[]`` when the search itself does not return 200.
        """
        url = "https://search.rcsb.org/rcsbsearch/v2/query"

        query = {
            "query": {
                "type": "group",
                "logical_operator": "and",
                "nodes": [
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession",
                            "operator": "exact_match",
                            "value": uniprot_id,
                        },
                    },
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {
                            "attribute": "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name",
                            "operator": "exact_match",
                            "value": "UniProt",
                        },
                    },
                ],
            },
            "return_type": "entry",
        }

        response = requests.post(url, json=query, timeout=self.REQUEST_TIMEOUT)

        # Anything other than 200 is treated as "no structures".
        if response.status_code != 200:
            return []

        # NOTE(review): no pagination options are sent, so only the first
        # page of hits returned by the search service is consumed here.
        hits = response.json().get("result_set") or []

        structures = []
        for hit in hits:
            structure = self._fetch_structure(hit["identifier"])
            if structure is not None:
                structures.append(structure)
        return structures

    def _fetch_structure(self, pdb_id: str) -> Optional["ProteinStructure"]:
        """Fetch entry metadata and sequence for one PDB ID, or ``None`` if the entry lookup fails."""
        entry_response = requests.get(
            f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}",
            timeout=self.REQUEST_TIMEOUT,
        )
        if entry_response.status_code != 200:
            return None
        entry_data = entry_response.json()

        # NOTE(review): the sequence is always read from polymer entity "1",
        # which is presumably not always the entity that references the
        # UniProt accession in multi-entity entries - confirm.
        entity_response = requests.get(
            f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1",
            timeout=self.REQUEST_TIMEOUT,
        )
        sequence = ""
        if entity_response.status_code == 200:
            entity_poly = entity_response.json().get("entity_poly") or {}
            sequence = entity_poly.get("pdbx_seq_one_letter_code") or ""

        return ProteinStructure(
            pdb_id=pdb_id,
            resolution=self._extract_resolution(entry_data),
            sequence=sequence,
            method=self._extract_method(entry_data),
            title=(entry_data.get("struct") or {}).get("title", ""),
            release_date=(entry_data.get("rcsb_accession_info") or {}).get(
                "initial_release_date", ""
            ),
        )

    @staticmethod
    def _extract_resolution(entry_data: Dict) -> float:
        """Return the first ``resolution_combined`` value, or 0.0 when none is reported."""
        values = (entry_data.get("rcsb_entry_info") or {}).get("resolution_combined")
        # The key may be missing, null, or an empty list; all mean "unknown".
        if not values or values[0] is None:
            return 0.0
        return float(values[0])

    @staticmethod
    def _extract_method(entry_data: Dict) -> str:
        """Return the method of the first ``exptl`` record, or "" when there is none."""
        experiments = entry_data.get("exptl") or [{}]
        return experiments[0].get("method", "")

    @staticmethod
    def _filter_and_rank(
        structures: List["ProteinStructure"], query: "ProteinQuery"
    ) -> List["ProteinStructure"]:
        """Drop duplicate and out-of-range structures, then sort by ascending resolution."""
        lower, upper = query.min_resolution, query.max_resolution

        seen_ids = set()
        kept = []
        for structure in structures:
            # The same PDB entry is found once per UniProt accession it
            # references; keep only the first occurrence.
            if structure.pdb_id in seen_ids:
                continue
            seen_ids.add(structure.pdb_id)

            # Compare against None explicitly so a bound of 0.0 still applies.
            if lower is not None and structure.resolution < lower:
                continue
            if upper is not None and structure.resolution > upper:
                continue
            kept.append(structure)

        # Structures without a reported resolution carry 0.0 and therefore
        # sort first; list.sort is stable, so ties keep discovery order.
        kept.sort(key=lambda item: item.resolution)
        return kept

    def search(self, query: "ProteinQuery") -> List["ProteinStructure"]:
        """Search for protein structures matching the given query.

        Args:
            query: Name, optional organism, and optional resolution bounds.

        Returns:
            Unique structures gathered from the leading UniProt hits that
            satisfy the resolution bounds, sorted by ascending resolution;
            ``[]`` when UniProt returns no results.
        """
        uniprot_data = self._get_uniprot_data(query)

        results = uniprot_data.get('results')
        if not results:
            return []

        all_structures = []
        for entry in results[:self.MAX_UNIPROT_ENTRIES]:
            uniprot_id = entry['primaryAccession']
            sequence = entry.get('sequence', {}).get('value', '')
            all_structures.extend(self._get_pdb_structures(uniprot_id, sequence))

        return self._filter_and_rank(all_structures, query)
|
|
|
def main():
    """Run a sample search and write the results to a text report.

    Searches for "human hemoglobin A" with a maximum resolution of 5.0,
    writes one section per structure to ``protein_search_results.txt`` in
    the current working directory, and prints a confirmation message.
    """
    search_engine = ProteinSearchEngine(debug=True)

    query = ProteinQuery(
        name="human hemoglobin A",
        max_resolution=5.0,
    )

    results = search_engine.search(query)

    # The report contains "Å"; write it as UTF-8 explicitly because the
    # platform default encoding is not guaranteed to be able to encode it.
    with open('protein_search_results.txt', 'w', encoding='utf-8') as f:
        f.write(f"Search Query: {query.name}\n")
        if query.organism:
            f.write(f"Organism: {query.organism}\n")
        f.write(f"Resolution Filter: <= {query.max_resolution} Å\n\n")

        f.write(f"Found {len(results)} structures matching the criteria:\n")
        for i, structure in enumerate(results, 1):
            f.write(f"\n{i}. PDB ID: {structure.pdb_id}\n")
            f.write(f" Resolution: {structure.resolution:.2f} Å\n")
            f.write(f" Method: {structure.method}\n")
            f.write(f" Title: {structure.title}\n")
            f.write(f" Release Date: {structure.release_date}\n")
            f.write(f" Sequence Length: {len(structure.sequence)} aa\n")
            f.write(f" Sequence:\n{structure.sequence}\n")
            # Visual separator after each structure's section.
            f.write("-" * 80 + "\n")

    print("Results have been saved to 'protein_search_results.txt'")
|
|
|
# Run the sample search only when executed as a script, not on import.
if __name__ == "__main__":
    main()