Spaces:

lkjjj26
/

query

Sleeping

App Files Files Community

query / UniprotKB_P_Sequence_RCSB_API_test.py

lkjjj26

updata uniprot.py

ae69c23 3 months ago

raw

history blame

7.56 kB

	import requests
	from typing import List, Dict, Optional
	from dataclasses import dataclass

	@dataclass
	class ProteinQuery:
	    """Search criteria for a protein structure lookup.

	    Only ``name`` is required; every other field defaults to ``None``,
	    meaning "no constraint".
	    """

	    # Free-text protein name used for the UniProt search.
	    name: str
	    # Optional organism used to narrow the UniProt search.
	    organism: Optional[str] = None
	    # Optional list of mutation labels.
	    mutations: Optional[List[str]] = None
	    # Optional lower bound on structure resolution.
	    min_resolution: Optional[float] = None
	    # Optional upper bound on structure resolution.
	    max_resolution: Optional[float] = None

	@dataclass
	class ProteinStructure:
	    """One PDB entry together with the metadata collected for it."""

	    # PDB entry identifier.
	    pdb_id: str
	    # Resolution value reported for the entry.
	    resolution: float
	    # One-letter amino-acid sequence string.
	    sequence: str
	    # Title of the entry.
	    title: str
	    # Experimental method name.
	    method: str
	    # Release date, kept as the string the data source returned.
	    release_date: str

	class ProteinSearchEngine:
	    """Find PDB structures for a protein by way of UniProt accessions.

	    A UniProt text search yields accessions, the RCSB search service maps
	    each accession to PDB entry IDs, and the RCSB data service supplies
	    the metadata for every entry.
	    """

	    # Seconds allowed for any single HTTP call. requests applies no
	    # timeout unless one is given, so a stalled server would block forever.
	    REQUEST_TIMEOUT = 30

	    # Only this many of the top UniProt hits are expanded into PDB lookups.
	    MAX_UNIPROT_ENTRIES = 5

	    def __init__(self, debug=False):
	        """Store the endpoint URLs and the debug flag.

	        Args:
	            debug: When true, progress messages are printed to stdout.
	        """
	        self.debug = debug
	        self.uniprot_api = "https://rest.uniprot.org/uniprotkb"
	        self.pdb_api = "https://data.rcsb.org/graphql"

	    def _debug_print(self, message: str) -> None:
	        """Print *message* if the engine was created with ``debug=True``."""
	        # getattr keeps this safe on instances built without __init__.
	        if getattr(self, "debug", False):
	            print(f"[DEBUG] {message}")

	    def _get_uniprot_data(self, query: "ProteinQuery") -> Dict:
	        """Search UniProtKB for entries matching the query.

	        Args:
	            query: Supplies the protein name and, optionally, an organism
	                that is ANDed onto the search string.

	        Returns:
	            The decoded JSON response, or an empty dict when the server
	            answers with a non-200 status or a body that is not JSON.

	        Raises:
	            requests.RequestException: On connection failure or timeout.
	        """
	        search_query = f'"{query.name}"'
	        if query.organism:
	            # The current UniProt REST API calls this field organism_name.
	            search_query += f' AND organism_name:"{query.organism}"'

	        self._debug_print(f"UniProt search query: {search_query}")
	        response = requests.get(
	            f"{self.uniprot_api}/search",
	            params={"query": search_query, "format": "json"},
	            timeout=self.REQUEST_TIMEOUT,
	        )
	        if response.status_code != 200:
	            self._debug_print(
	                f"Error querying UniProt ({response.status_code}): {response.text}"
	            )
	            return {}

	        try:
	            data = response.json()
	        except ValueError:
	            # requests signals an undecodable body with a ValueError subclass.
	            self._debug_print("UniProt returned a body that is not valid JSON")
	            return {}

	        self._debug_print(f"UniProt results count: {len(data.get('results', []))}")
	        return data

	    def _fetch_structure(self, pdb_id: str) -> Optional["ProteinStructure"]:
	        """Fetch entry metadata and the entity-1 sequence for one PDB ID.

	        Returns:
	            A ProteinStructure, or None when the entry request does not
	            return status 200.
	        """
	        entry_response = requests.get(
	            f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}",
	            timeout=self.REQUEST_TIMEOUT,
	        )
	        if entry_response.status_code != 200:
	            self._debug_print(f"Skipping {pdb_id}: entry lookup failed")
	            return None
	        entry = entry_response.json()

	        # Only polymer entity 1 is queried.
	        # NOTE(review): entity 1 is not necessarily the chain that matched
	        # the UniProt accession - confirm whether that matters to callers.
	        entity_response = requests.get(
	            f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/1",
	            timeout=self.REQUEST_TIMEOUT,
	        )
	        sequence = ""
	        if entity_response.status_code == 200:
	            entity_poly = entity_response.json().get("entity_poly") or {}
	            sequence = entity_poly.get("pdbx_seq_one_letter_code") or ""

	        # "or" fallbacks cover keys that are present but null or empty,
	        # which would otherwise fail on the [0] index or in float().
	        entry_info = entry.get("rcsb_entry_info") or {}
	        resolutions = entry_info.get("resolution_combined") or [0.0]
	        first_resolution = resolutions[0]
	        # A missing resolution is reported as 0.0.
	        resolution = float(first_resolution) if first_resolution is not None else 0.0

	        experiments = entry.get("exptl") or [{}]
	        method = (experiments[0] or {}).get("method", "")

	        return ProteinStructure(
	            pdb_id=pdb_id,
	            resolution=resolution,
	            sequence=sequence,
	            method=method,
	            title=(entry.get("struct") or {}).get("title", ""),
	            release_date=(entry.get("rcsb_accession_info") or {}).get(
	                "initial_release_date", ""
	            ),
	        )

	    def _get_pdb_structures(
	        self, uniprot_id: str, uniprot_sequence: Optional[str] = None
	    ) -> List["ProteinStructure"]:
	        """Look up the PDB entries that reference a UniProt accession.

	        Args:
	            uniprot_id: UniProt accession to match exactly.
	            uniprot_sequence: Accepted for interface compatibility; it is
	                not used by the lookup.

	        Returns:
	            One ProteinStructure per entry whose metadata could be
	            fetched; an empty list when the search request does not
	            return status 200.

	        Raises:
	            requests.RequestException: On connection failure or timeout.
	        """
	        url = "https://search.rcsb.org/rcsbsearch/v2/query"
	        identifiers = "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"

	        query = {
	            "query": {
	                "type": "group",
	                "logical_operator": "and",
	                "nodes": [
	                    {
	                        "type": "terminal",
	                        "service": "text",
	                        "parameters": {
	                            "attribute": f"{identifiers}.database_accession",
	                            "operator": "exact_match",
	                            "value": uniprot_id,
	                        },
	                    },
	                    {
	                        "type": "terminal",
	                        "service": "text",
	                        "parameters": {
	                            "attribute": f"{identifiers}.database_name",
	                            "operator": "exact_match",
	                            "value": "UniProt",
	                        },
	                    },
	                ],
	            },
	            "return_type": "entry",
	        }

	        response = requests.post(url, json=query, timeout=self.REQUEST_TIMEOUT)
	        if response.status_code != 200:
	            self._debug_print(f"Error querying PDB: {response.text}")
	            return []

	        hits = response.json().get("result_set") or []
	        fetched = (self._fetch_structure(hit["identifier"]) for hit in hits)
	        return [structure for structure in fetched if structure is not None]

	    def search(self, query: "ProteinQuery") -> List["ProteinStructure"]:
	        """Search for protein structures matching the query.

	        Args:
	            query: Name/organism for the UniProt search plus optional
	                inclusive resolution bounds.

	        Returns:
	            Structures within the requested resolution bounds, each PDB ID
	            at most once, sorted by ascending resolution. Empty when
	            UniProt returns no results.
	        """
	        uniprot_results = self._get_uniprot_data(query).get("results")
	        if not uniprot_results:
	            self._debug_print("No UniProt results found")
	            return []

	        # Keyed by PDB ID: one entry can be reached from several UniProt
	        # accessions and must not be reported more than once.
	        unique = {}
	        for entry in uniprot_results[: self.MAX_UNIPROT_ENTRIES]:
	            uniprot_id = entry["primaryAccession"]
	            sequence = entry.get("sequence", {}).get("value", "")
	            self._debug_print(f"Processing UniProt ID: {uniprot_id}")
	            for structure in self._get_pdb_structures(uniprot_id, sequence):
	                unique.setdefault(structure.pdb_id, structure)
	        self._debug_print(f"Total structures found: {len(unique)}")

	        # Compare against None explicitly so that a bound of 0.0 is honoured
	        # instead of being treated as "no bound".
	        lower, upper = query.min_resolution, query.max_resolution
	        filtered = [
	            structure
	            for structure in unique.values()
	            if (lower is None or structure.resolution >= lower)
	            and (upper is None or structure.resolution <= upper)
	        ]
	        self._debug_print(f"Structures after resolution filter: {len(filtered)}")

	        filtered.sort(key=lambda structure: structure.resolution)
	        return filtered

	def main(output_path="protein_search_results.txt"):
	    """Run a sample hemoglobin search and write the results to a text file.

	    Args:
	        output_path: File that receives the report; it is overwritten if
	            it already exists.
	    """
	    search_engine = ProteinSearchEngine(debug=True)

	    # Broad search: keep every structure with resolution of 5 or better.
	    query = ProteinQuery(
	        name="human hemoglobin A",
	        max_resolution=5.0,
	    )
	    results = search_engine.search(query)

	    report = [f"Search Query: {query.name}\n"]
	    if query.organism:
	        report.append(f"Organism: {query.organism}\n")
	    report.append(f"Resolution Filter: <= {query.max_resolution} Å\n\n")
	    report.append(f"Found {len(results)} structures matching the criteria:\n")

	    for i, structure in enumerate(results, 1):
	        report.extend(
	            (
	                f"\n{i}. PDB ID: {structure.pdb_id}\n",
	                f" Resolution: {structure.resolution:.2f} Å\n",
	                f" Method: {structure.method}\n",
	                f" Title: {structure.title}\n",
	                f" Release Date: {structure.release_date}\n",
	                f" Sequence Length: {len(structure.sequence)} aa\n",
	                f" Sequence:\n{structure.sequence}\n",
	                "-" * 80 + "\n",
	            )
	        )

	    # The report contains "Å", so the encoding is fixed rather than left
	    # to the platform default, which may be unable to encode it.
	    with open(output_path, "w", encoding="utf-8") as f:
	        f.writelines(report)

	    print(f"Results have been saved to '{output_path}'")

	# Run the sample search only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()