Upload 10 files
- Dockerfile +9 -0
- README.md +4 -6
- alphagenome_mcp.py +107 -0
- requirements.txt +10 -0
- tools/batch_variant_scoring.py +232 -0
- tools/essential_commands.py +590 -0
- tools/quick_start.py +664 -0
- tools/tissue_ontology_mapping.py +148 -0
- tools/variant_scoring_ui.py +389 -0
- tools/visualization_modality_tour.py +1053 -0
Dockerfile
ADDED
@@ -0,0 +1,9 @@
FROM python:3.12
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY alphagenome_mcp.py .
COPY tools/ tools/
RUN mkdir -p /app/data/upload /data/tmp_inputs /data/tmp_outputs && chmod -R 777 /app/data/upload /data
EXPOSE 7860
CMD ["uvicorn", "alphagenome_mcp:app", "--host", "0.0.0.0", "--port", "7860"]
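The image serves the MCP app with uvicorn on port 7860. As a quick local sanity check (assuming a local build and run such as docker build -t alphagenome-mcp . followed by docker run -p 7860:7860 -e ALPHAGENOME_API_KEY=... alphagenome-mcp), a minimal Python probe of the /health route defined in alphagenome_mcp.py could look like the sketch below; the localhost URL and retry count are assumptions for a local run, not part of the repository.

# Minimal sketch: poll the container's /health route until the server responds.
# Assumes the image is running locally with port 7860 published.
import time
import urllib.request

HEALTH_URL = "http://localhost:7860/health"  # assumed local mapping of EXPOSE 7860

for attempt in range(30):
    try:
        with urllib.request.urlopen(HEALTH_URL, timeout=2) as resp:
            if resp.status == 200 and resp.read() == b"OK":
                print("MCP server is up")
                break
    except OSError:
        pass  # server not ready yet
    time.sleep(1)
else:
    raise SystemExit("server did not become healthy in time")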
README.md
CHANGED
@@ -1,12 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: AlphaGenome Mcp
+emoji: 🔥
+colorFrom: red
+colorTo: green
 sdk: docker
 pinned: false
-license: apache-2.0
-short_description: Paper2Agent-generated AlphaGenome MCP server
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
alphagenome_mcp.py
ADDED
@@ -0,0 +1,107 @@
"""
Model Context Protocol (MCP) for AlphaGenome

AlphaGenome provides state-of-the-art AI-driven tools for genomic sequence analysis, variant effect prediction, and functional genomics visualization. The platform enables researchers to predict regulatory elements, assess variant impacts, and explore molecular mechanisms across diverse cell types and tissues.

This MCP server contains the tools extracted from the following tutorials:
1. batch_variant_scoring
   - score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers
   - filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types)
2. essential_commands
   - create_genomic_interval: Create genomic intervals for DNA regions
   - create_genomic_variant: Create genomic variants for genetic changes
   - create_track_data: Create TrackData objects from arrays and metadata
   - create_variant_scores: Create AnnData objects for variant scoring results
   - genomic_interval_operations: Perform operations on genomic intervals
   - variant_interval_operations: Check variant overlaps with intervals
   - track_data_operations: Filter, resize, slice and transform TrackData
   - track_data_resolution_conversion: Convert between track data resolutions
3. quick_start
   - predict_dna_sequence: Predict genomic tracks from DNA sequence
   - predict_genome_interval: Predict genomic tracks for reference genome intervals
   - ism_analysis: Perform in silico mutagenesis analysis with sequence logos
4. tissue_ontology_mapping
   - explore_output_metadata: Explore and filter output metadata for specific organisms and search terms
   - count_tracks_by_output_type: Count tracks by output type for human and mouse organisms
5. variant_scoring_ui
   - visualize_variant_effects: Generate comprehensive variant effect visualization across multiple modalities
6. visualization_modality_tour
   - visualize_gene_expression: Visualize RNA_SEQ and CAGE gene expression predictions
   - visualize_chromatin_accessibility: Visualize DNASE and ATAC chromatin accessibility
   - visualize_splicing_effects: Visualize splicing predictions with SPLICE_SITES, SPLICE_SITE_USAGE, SPLICE_JUNCTIONS
   - visualize_histone_modifications: Visualize CHIP_HISTONE predictions with custom colors
   - visualize_tf_binding: Visualize CHIP_TF transcription factor binding predictions
   - visualize_contact_maps: Visualize CONTACT_MAPS DNA-DNA contact predictions
"""
from fastmcp import FastMCP

# Import the MCP tools from the tools folder
from tools.batch_variant_scoring import batch_variant_scoring_mcp
from tools.essential_commands import essential_commands_mcp
from tools.quick_start import quick_start_mcp
from tools.tissue_ontology_mapping import tissue_ontology_mapping_mcp
from tools.variant_scoring_ui import variant_scoring_ui_mcp
from tools.visualization_modality_tour import visualization_modality_tour_mcp

from starlette.requests import Request
from starlette.responses import PlainTextResponse, JSONResponse
import os
from fastapi.staticfiles import StaticFiles
import uuid

# Define the MCP server
mcp = FastMCP(name="AlphaGenome")

# Mount the tools
mcp.mount(batch_variant_scoring_mcp)
mcp.mount(essential_commands_mcp)
mcp.mount(quick_start_mcp)
mcp.mount(tissue_ontology_mapping_mcp)
mcp.mount(variant_scoring_ui_mcp)
mcp.mount(visualization_modality_tour_mcp)

# Use absolute directory for uploads
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_DIR = os.path.join(BASE_DIR, "/data/upload")
os.makedirs(UPLOAD_DIR, exist_ok=True)

@mcp.custom_route("/health", methods=["GET"])
async def health_check(request: Request) -> PlainTextResponse:
    return PlainTextResponse("OK")


@mcp.custom_route("/", methods=["GET"])
async def index(request: Request) -> PlainTextResponse:
    return PlainTextResponse("MCP is on https://Paper2Agent-alphagenome-mcp.hf.space/mcp")

# Upload route
@mcp.custom_route("/upload", methods=["POST"])
async def upload(request: Request):
    form = await request.form()
    up = form.get("file")
    if up is None:
        return JSONResponse({"error": "missing form field 'file'"}, status_code=400)

    # Generate a safe filename
    orig = getattr(up, "filename", "") or ""
    ext = os.path.splitext(orig)[1]
    name = f"{uuid.uuid4().hex}{ext}"
    dst = os.path.join(UPLOAD_DIR, name)

    # up is a Starlette UploadFile-like object
    with open(dst, "wb") as out:
        out.write(await up.read())

    # Return only the absolute local path
    abs_path = os.path.abspath(dst)
    return JSONResponse({"path": abs_path})

app = mcp.http_app(path="/mcp")
# Serve uploaded input files
app.mount("/files", StaticFiles(directory=UPLOAD_DIR), name="files")
# Serve saved output files
app.mount("/outputs", StaticFiles(directory="/data/tmp_outputs"), name="outputs")


if __name__ == "__main__":
    mcp.run(transport="http", host="0.0.0.0", port=int(os.getenv("PORT", 7860)), path="/mcp")
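The /upload route accepts a multipart form with a single field named "file" and returns the absolute path of the stored copy, which can then be passed to tools such as score_variants_batch. A minimal client-side sketch follows; the Space URL is taken from the index route above, while the `requests` dependency and the "variants.tsv" filename are assumptions for illustration only.

# Minimal sketch: send a file to the /upload route and reuse the returned path.
# Assumes the `requests` package is installed on the client.
import requests

SPACE_URL = "https://Paper2Agent-alphagenome-mcp.hf.space"  # from the index route

with open("variants.tsv", "rb") as fh:  # hypothetical local input file
    resp = requests.post(f"{SPACE_URL}/upload", files={"file": fh}, timeout=60)
resp.raise_for_status()

server_path = resp.json()["path"]  # absolute path on the Space's filesystem
print("uploaded to", server_path)
# server_path can then be supplied as vcf_path to the score_variants_batch tool.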
requirements.txt
ADDED
@@ -0,0 +1,10 @@
alphagenome==0.2.0
anndata==0.12.2
fastmcp==2.12.2
matplotlib==3.10.6
numpy==2.3.2
pandas==2.3.2
plotnine==0.15.0
tqdm==4.67.1
fastapi
starlette==0.47.3
tools/batch_variant_scoring.py
ADDED
@@ -0,0 +1,232 @@
"""
Batch variant scoring tutorial demonstrating how to score multiple genetic variants using AlphaGenome.

This MCP Server provides 2 tools:
1. score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers
2. filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types)

All tools extracted from `/batch_variant_scoring.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime
from io import StringIO

# AlphaGenome imports
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from tqdm import tqdm

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Fetch your secret
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# MCP server instance
batch_variant_scoring_mcp = FastMCP(name="batch_variant_scoring")

@batch_variant_scoring_mcp.tool
def score_variants_batch(
    # Primary data inputs
    vcf_path: Annotated[str | None, "Path to VCF file with extension .vcf or .tsv. The header of the file should include the following columns: variant_id, CHROM, POS, REF, ALT"] = None,
    # Analysis parameters with tutorial defaults
    api_key: Annotated[str, "AlphaGenome API key for authentication"] = ALPHAGENOME_API_KEY,
    organism: Annotated[Literal["human", "mouse"], "Organism for variant scoring"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variants to predict"] = "1MB",
    score_rna_seq: Annotated[bool, "Score RNA-seq effects"] = True,
    score_cage: Annotated[bool, "Score CAGE effects"] = True,
    score_procap: Annotated[bool, "Score ProCAP effects"] = True,
    score_atac: Annotated[bool, "Score ATAC-seq effects"] = True,
    score_dnase: Annotated[bool, "Score DNase effects"] = True,
    score_chip_histone: Annotated[bool, "Score ChIP histone effects"] = True,
    score_chip_tf: Annotated[bool, "Score ChIP transcription factor effects"] = True,
    score_polyadenylation: Annotated[bool, "Score polyadenylation effects"] = True,
    score_splice_sites: Annotated[bool, "Score splice sites effects"] = True,
    score_splice_site_usage: Annotated[bool, "Score splice site usage effects"] = True,
    score_splice_junctions: Annotated[bool, "Score splice junctions effects"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score multiple genetic variants in batch using AlphaGenome with configurable variant scorers.
    Input is a VCF file with variant information; output is a comprehensive variant scores table.
    """
    # Validate input
    if vcf_path is None:
        raise ValueError("Path to VCF file must be provided")

    # Set output prefix
    if out_prefix is None:
        out_prefix = f"batch_variant_scores_{timestamp}"

    # Load VCF file containing variants
    vcf = pd.read_csv(vcf_path, sep='\t')

    # Validate required columns
    required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
    for column in required_columns:
        if column not in vcf.columns:
            raise ValueError(f'VCF file is missing required column: {column}.')

    # Load the model
    dna_model = dna_client.create(api_key)

    # Parse organism specification
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]

    # Parse sequence length
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]

    # Parse scorer specification
    scorer_selections = {
        'rna_seq': score_rna_seq,
        'cage': score_cage,
        'procap': score_procap,
        'atac': score_atac,
        'dnase': score_dnase,
        'chip_histone': score_chip_histone,
        'chip_tf': score_chip_tf,
        'polyadenylation': score_polyadenylation,
        'splice_sites': score_splice_sites,
        'splice_site_usage': score_splice_site_usage,
        'splice_junctions': score_splice_junctions,
    }

    all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
    selected_scorers = [
        all_scorers[key]
        for key in all_scorers
        if scorer_selections.get(key.lower(), False)
    ]

    # Remove any scorers or output types that are not supported for the chosen organism
    unsupported_scorers = [
        scorer
        for scorer in selected_scorers
        if (
            organism_enum.value
            not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
        )
        | (
            (scorer.requested_output == dna_client.OutputType.PROCAP)
            & (organism_enum == dna_client.Organism.MUS_MUSCULUS)
        )
    ]
    if len(unsupported_scorers) > 0:
        print(
            f'Excluding {unsupported_scorers} scorers as they are not supported for'
            f' {organism_enum}.'
        )
        for unsupported_scorer in unsupported_scorers:
            selected_scorers.remove(unsupported_scorer)

    # Score variants in the VCF file
    results = []

    for i, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
        variant = genome.Variant(
            chromosome=str(vcf_row.CHROM),
            position=int(vcf_row.POS),
            reference_bases=vcf_row.REF,
            alternate_bases=vcf_row.ALT,
            name=vcf_row.variant_id,
        )
        interval = variant.reference_interval.resize(sequence_length_value)

        variant_scores = dna_model.score_variant(
            interval=interval,
            variant=variant,
            variant_scorers=selected_scorers,
            organism=organism_enum,
        )
        results.append(variant_scores)

    df_scores = variant_scorers.tidy_scores(results)

    # Save results
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    df_scores.to_csv(output_file, index=False)

    return {
        "message": f"Batch variant scoring completed for {len(vcf)} variants with {len(selected_scorers)} scorers",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Batch variant scores results",
                "path": str(output_file.resolve())
            }
        ]
    }

@batch_variant_scoring_mcp.tool
def filter_variant_scores(
    # Primary data inputs
    scores_path: Annotated[str | None, "Path to variant scores CSV file from batch scoring"] = None,
    # Analysis parameters with tutorial defaults
    ontology_curie: Annotated[str, "Ontology CURIE for filtering (e.g., 'CL:0000084' for T-cells)"] = "CL:0000084",
    exclude_ontology_column: Annotated[bool, "Whether to exclude ontology_curie column from output"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Filter variant scores by ontology criteria to examine effects on specific cell types or tissues.
    Input is a variant scores CSV file; output is a filtered scores table.
    """
    # Validate input
    if scores_path is None:
        raise ValueError("Path to variant scores CSV file must be provided")

    # Set output prefix
    if out_prefix is None:
        out_prefix = f"filtered_variant_scores_{timestamp}"

    # Load variant scores
    df_scores = pd.read_csv(scores_path)

    # Filter by ontology criteria
    filtered_df = df_scores[df_scores['ontology_curie'] == ontology_curie]

    # Optionally exclude ontology column
    if exclude_ontology_column:
        columns = [c for c in filtered_df.columns if c != 'ontology_curie']
        filtered_df = filtered_df[columns]

    # Save filtered results
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    filtered_df.to_csv(output_file, index=False)

    return {
        "message": f"Filtered {len(filtered_df)} variant scores for ontology {ontology_curie}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Filtered variant scores",
                "path": str(output_file.resolve())
            }
        ]
    }
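score_variants_batch reads its input with pandas using tab separation and validates the header columns variant_id, CHROM, POS, REF, ALT. The sketch below builds such a table from Python; the two variant rows and the output filename are illustrative placeholders, not values from the tutorial.

# Minimal sketch: build a tab-separated variant table with the columns that
# score_variants_batch validates (variant_id, CHROM, POS, REF, ALT).
import pandas as pd

variants = pd.DataFrame(
    {
        "variant_id": ["chr3_58394738_A_T", "chr8_28520_G_C"],  # placeholder IDs
        "CHROM": ["chr3", "chr8"],
        "POS": [58394738, 28520],
        "REF": ["A", "G"],
        "ALT": ["T", "C"],
    }
)
variants.to_csv("variants.tsv", sep="\t", index=False)  # arbitrary local path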
tools/essential_commands.py
ADDED
@@ -0,0 +1,590 @@
"""
Essential commands for AlphaGenome API interaction covering data structures and operations.

This MCP Server provides 8 tools:
1. create_genomic_interval: Create genomic intervals for DNA regions
2. create_genomic_variant: Create genomic variants for genetic changes
3. create_track_data: Create TrackData objects from arrays and metadata
4. create_variant_scores: Create AnnData objects for variant scoring results
5. genomic_interval_operations: Perform operations on genomic intervals
6. variant_interval_operations: Check variant overlaps with intervals
7. track_data_operations: Filter, resize, slice and transform TrackData
8. track_data_resolution_conversion: Convert between track data resolutions

All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime

# AlphaGenome imports
from alphagenome.data import genome, track_data
from alphagenome.models import dna_client
import anndata

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

INPUT_DIR = Path(os.environ.get("ESSENTIAL_COMMANDS_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("ESSENTIAL_COMMANDS_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# MCP server instance
essential_commands_mcp = FastMCP(name="essential_commands")

@essential_commands_mcp.tool
def create_genomic_interval(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr1', 'chr2')"] = "chr1",
    start: Annotated[int, "Start position (0-based)"] = 1000,
    end: Annotated[int, "End position (0-based, exclusive)"] = 1010,
    strand: Annotated[Literal["+", "-", "."], "DNA strand"] = ".",
    name: Annotated[str, "Interval name"] = "",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create genomic intervals for DNA regions with basic properties and operations.
    Input is genomic coordinates and output is interval object with properties and operations results.
    """

    # Create genomic interval
    interval = genome.Interval(chromosome=chromosome, start=start, end=end, strand=strand, name=name)

    # Calculate basic properties
    results = {
        "interval": {
            "chromosome": interval.chromosome,
            "start": interval.start,
            "end": interval.end,
            "strand": interval.strand,
            "name": interval.name,
            "center": interval.center(),
            "width": interval.width,
        },
        "operations": {
            "resize_100bp": str(interval.resize(100)),
            "resize_1MB": str(interval.resize(dna_client.SEQUENCE_LENGTH_1MB)),
        }
    }

    # Save results
    prefix = out_prefix or f"genomic_interval_{timestamp}"
    output_file = OUTPUT_DIR / f"{prefix}.csv"

    # Flatten results for CSV
    rows = []
    for category, data in results.items():
        for key, value in data.items():
            rows.append({"category": category, "property": key, "value": str(value)})

    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)

    return {
        "message": f"Genomic interval created: {interval.chromosome}:{interval.start}-{interval.end}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Genomic interval properties",
                "path": str(output_file.resolve())
            }
        ]
    }

@essential_commands_mcp.tool
def create_genomic_variant(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr3')"] = "chr3",
    position: Annotated[int, "1-based position"] = 10000,
    reference_bases: Annotated[str, "Reference sequence (e.g., 'A')"] = "A",
    alternate_bases: Annotated[str, "Alternate sequence (e.g., 'C')"] = "C",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create genomic variants for genetic changes with reference interval operations.
    Input is variant coordinates and sequences and output is variant properties and intervals.
    """

    # Create genomic variant
    variant = genome.Variant(
        chromosome=chromosome,
        position=position,
        reference_bases=reference_bases,
        alternate_bases=alternate_bases
    )

    # Get reference interval and properties
    ref_interval = variant.reference_interval
    input_interval = ref_interval.resize(dna_client.SEQUENCE_LENGTH_1MB)

    # Create test interval for overlap testing
    test_interval = genome.Interval(chromosome=chromosome, start=position+5, end=position+10)

    results = {
        "variant": {
            "chromosome": variant.chromosome,
            "position": variant.position,
            "reference_bases": variant.reference_bases,
            "alternate_bases": variant.alternate_bases,
            "reference_interval": str(ref_interval),
            "input_interval_width": input_interval.width,
        },
        "overlap_tests": {
            "reference_overlaps_test": variant.reference_overlaps(test_interval),
            "alternate_overlaps_test": variant.alternate_overlaps(test_interval),
        }
    }

    # Save results
    prefix = out_prefix or f"genomic_variant_{timestamp}"
    output_file = OUTPUT_DIR / f"{prefix}.csv"

    # Flatten results for CSV
    rows = []
    for category, data in results.items():
        for key, value in data.items():
            rows.append({"category": category, "property": key, "value": str(value)})

    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)

    return {
        "message": f"Genomic variant created: {variant.chromosome}:{variant.position} {variant.reference_bases}>{variant.alternate_bases}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Genomic variant properties",
                "path": str(output_file.resolve())
            }
        ]
    }

@essential_commands_mcp.tool
def create_track_data(
    values_array: Annotated[str | None, "Path to CSV file with values array (shape: sequence_length x num_tracks)"] = None,
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands (+, -, .)"] = ["+", "-", "."],
    chromosome: Annotated[str, "Chromosome for interval"] = "chr1",
    start: Annotated[int, "Start position"] = 1000,
    end: Annotated[int, "End position"] = 1004,
    resolution: Annotated[int, "Base pair resolution"] = 1,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create TrackData objects from user arrays and metadata with validation.
    Input is values array and track metadata and output is TrackData object properties.
    """

    if values_array is None:
        # Use tutorial example data if no input provided
        values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(np.float32)
    else:
        # Load user data
        values = np.loadtxt(values_array, delimiter=',').astype(np.float32)

    # Create metadata
    metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands,
    })

    # Create genomic interval
    interval = genome.Interval(chromosome=chromosome, start=start, end=end)

    # Create TrackData object
    tdata = track_data.TrackData(
        values=values,
        metadata=metadata,
        resolution=resolution,
        interval=interval
    )

    # Save results
    prefix = out_prefix or f"track_data_{timestamp}"
    values_file = OUTPUT_DIR / f"{prefix}_values.csv"
    metadata_file = OUTPUT_DIR / f"{prefix}_metadata.csv"

    # Save values and metadata
    np.savetxt(values_file, tdata.values, delimiter=',')
    tdata.metadata.to_csv(metadata_file, index=False)

    results = {
        "shape": tdata.values.shape,
        "resolution": tdata.resolution,
        "interval": str(tdata.interval) if tdata.interval else "None",
        "num_tracks": len(tdata.metadata),
        "track_names": tdata.metadata.name.tolist(),
        "track_strands": tdata.metadata.strand.tolist(),
    }

    return {
        "message": f"TrackData created with shape {tdata.values.shape} and {len(tdata.metadata)} tracks",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Track data values",
                "path": str(values_file.resolve())
            },
            {
                "description": "Track metadata",
                "path": str(metadata_file.resolve())
            }
        ]
    }

@essential_commands_mcp.tool
def create_variant_scores(
    scores_array: Annotated[str | None, "Path to CSV file with scores array (shape: num_genes x num_tracks)"] = None,
    gene_ids: Annotated[list[str], "List of gene IDs"] = ["ENSG0001", "ENSG0002", "ENSG0003"],
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands"] = ["+", "-"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create AnnData objects for variant scoring results with gene and track metadata.
    Input is scores array and metadata and output is AnnData object structure.
    """

    if scores_array is None:
        # Use tutorial example data if no input provided
        scores = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
    else:
        # Load user data
        scores = np.loadtxt(scores_array, delimiter=',')

    # Create metadata
    gene_metadata = pd.DataFrame({'gene_id': gene_ids})
    track_metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands
    })

    # Create AnnData object
    variant_scores = anndata.AnnData(
        X=scores,
        obs=gene_metadata,
        var=track_metadata
    )

    # Save results
    prefix = out_prefix or f"variant_scores_{timestamp}"
    scores_file = OUTPUT_DIR / f"{prefix}_scores.csv"
    gene_metadata_file = OUTPUT_DIR / f"{prefix}_gene_metadata.csv"
    track_metadata_file = OUTPUT_DIR / f"{prefix}_track_metadata.csv"

    # Save components
    np.savetxt(scores_file, variant_scores.X, delimiter=',')
    variant_scores.obs.to_csv(gene_metadata_file, index=False)
    variant_scores.var.to_csv(track_metadata_file, index=False)

    results = {
        "scores_shape": variant_scores.X.shape,
        "num_genes": variant_scores.n_obs,
        "num_tracks": variant_scores.n_vars,
        "gene_ids": variant_scores.obs.gene_id.tolist(),
        "track_names": variant_scores.var.name.tolist(),
    }

    return {
        "message": f"Variant scores AnnData created with {variant_scores.n_obs} genes and {variant_scores.n_vars} tracks",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Variant scores matrix",
                "path": str(scores_file.resolve())
            },
            {
                "description": "Gene metadata",
                "path": str(gene_metadata_file.resolve())
            },
            {
                "description": "Track metadata",
                "path": str(track_metadata_file.resolve())
            }
        ]
    }

@essential_commands_mcp.tool
def genomic_interval_operations(
    interval1_chromosome: Annotated[str, "First interval chromosome"] = "chr1",
    interval1_start: Annotated[int, "First interval start"] = 1000,
    interval1_end: Annotated[int, "First interval end"] = 1010,
    interval2_chromosome: Annotated[str, "Second interval chromosome"] = "chr1",
    interval2_start: Annotated[int, "Second interval start"] = 1005,
    interval2_end: Annotated[int, "Second interval end"] = 1015,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Perform operations on genomic intervals including overlaps, intersections and comparisons.
    Input is two genomic intervals and output is comparison operations results.
    """

    # Create intervals
    interval1 = genome.Interval(chromosome=interval1_chromosome, start=interval1_start, end=interval1_end)
    interval2 = genome.Interval(chromosome=interval2_chromosome, start=interval2_start, end=interval2_end)

    # Perform operations
    operations = {
        "interval1": str(interval1),
        "interval2": str(interval2),
        "interval1_center": interval1.center(),
        "interval1_width": interval1.width,
        "interval2_center": interval2.center(),
        "interval2_width": interval2.width,
        "overlaps": interval1.overlaps(interval2),
        "contains": interval1.contains(interval2),
        "intersect": str(interval1.intersect(interval2)) if interval1.overlaps(interval2) else "No overlap",
        "interval1_resized_100": str(interval1.resize(100)),
        "interval2_resized_100": str(interval2.resize(100)),
    }

    # Save results
    prefix = out_prefix or f"genomic_interval_operations_{timestamp}"
    output_file = OUTPUT_DIR / f"{prefix}.csv"

    # Convert to DataFrame
    df = pd.DataFrame([
        {"operation": key, "result": str(value)}
        for key, value in operations.items()
    ])
    df.to_csv(output_file, index=False)

    return {
        "message": f"Genomic interval operations completed for {interval1} and {interval2}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Interval operations results",
                "path": str(output_file.resolve())
            }
        ]
    }

@essential_commands_mcp.tool
def variant_interval_operations(
    variant_chromosome: Annotated[str, "Variant chromosome"] = "chr3",
    variant_position: Annotated[int, "Variant position (1-based)"] = 10000,
    variant_ref: Annotated[str, "Reference bases"] = "T",
    variant_alt: Annotated[str, "Alternate bases"] = "CGTCAAT",
    interval_chromosome: Annotated[str, "Test interval chromosome"] = "chr3",
    interval_start: Annotated[int, "Test interval start"] = 10005,
    interval_end: Annotated[int, "Test interval end"] = 10010,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Check variant overlaps with genomic intervals for reference and alternate alleles.
    Input is variant and interval coordinates and output is overlap test results.
    """

    # Create variant and interval
    variant = genome.Variant(
        chromosome=variant_chromosome,
        position=variant_position,
        reference_bases=variant_ref,
        alternate_bases=variant_alt,
    )

    interval = genome.Interval(
        chromosome=interval_chromosome,
        start=interval_start,
        end=interval_end
    )

    # Perform overlap tests
    results = {
        "variant": f"{variant.chromosome}:{variant.position} {variant.reference_bases}>{variant.alternate_bases}",
        "interval": str(interval),
        "reference_interval": str(variant.reference_interval),
        "reference_overlaps": variant.reference_overlaps(interval),
        "alternate_overlaps": variant.alternate_overlaps(interval),
        "variant_ref_length": len(variant.reference_bases),
        "variant_alt_length": len(variant.alternate_bases),
    }

    # Save results
    prefix = out_prefix or f"variant_interval_operations_{timestamp}"
    output_file = OUTPUT_DIR / f"{prefix}.csv"

    # Convert to DataFrame
    df = pd.DataFrame([
        {"property": key, "value": str(value)}
        for key, value in results.items()
    ])
    df.to_csv(output_file, index=False)

    return {
        "message": f"Variant interval operations completed: ref_overlaps={results['reference_overlaps']}, alt_overlaps={results['alternate_overlaps']}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Variant interval operations results",
                "path": str(output_file.resolve())
            }
        ]
    }

@essential_commands_mcp.tool
def track_data_operations(
    values_array: Annotated[str | None, "Path to CSV file with values array"] = None,
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands"] = ["+", "-", "."],
    operation: Annotated[Literal["filter", "resize", "slice", "subset", "reverse_complement"], "Operation to perform"] = "filter",
    filter_strand: Annotated[Literal["+", "-", "."], "Strand to filter to"] = "+",
    resize_width: Annotated[int, "Width to resize to"] = 8,
    slice_start: Annotated[int, "Slice start position"] = 2,
    slice_end: Annotated[int, "Slice end position"] = 4,
    subset_names: Annotated[list[str], "Track names to subset to"] = ["track1"],
    chromosome: Annotated[str, "Chromosome"] = "chr1",
    start: Annotated[int, "Start position"] = 1000,
    end: Annotated[int, "End position"] = 1004,
    strand: Annotated[Literal["+", "-", "."], "Interval strand"] = "+",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Filter, resize, slice and transform TrackData objects with strand-aware operations.
    Input is TrackData and operation parameters and output is transformed TrackData.
    """

    if values_array is None:
        # Use tutorial example data
        values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(np.float32)
    else:
        values = np.loadtxt(values_array, delimiter=',').astype(np.float32)

    # Create metadata and interval
    metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands,
    })

    interval = genome.Interval(chromosome=chromosome, start=start, end=end, strand=strand)
    tdata = track_data.TrackData(values=values, metadata=metadata, resolution=1, interval=interval)

    # Perform operation
    if operation == "filter":
        if filter_strand == "+":
            result_tdata = tdata.filter_to_positive_strand()
        elif filter_strand == "-":
            result_tdata = tdata.filter_to_negative_strand()
        else:
            result_tdata = tdata.filter_to_unstranded()
        operation_info = f"filtered to {filter_strand} strand"
    elif operation == "resize":
        result_tdata = tdata.resize(width=resize_width)
        operation_info = f"resized to width {resize_width}"
    elif operation == "slice":
        result_tdata = tdata.slice_by_positions(start=slice_start, end=slice_end)
        operation_info = f"sliced from {slice_start} to {slice_end}"
    elif operation == "subset":
        result_tdata = tdata.select_tracks_by_name(names=subset_names)
        operation_info = f"subset to tracks {subset_names}"
    elif operation == "reverse_complement":
        result_tdata = tdata.reverse_complement()
        operation_info = "reverse complemented"

    # Save results
    prefix = out_prefix or f"track_data_{operation}_{timestamp}"
    values_file = OUTPUT_DIR / f"{prefix}_values.csv"
    metadata_file = OUTPUT_DIR / f"{prefix}_metadata.csv"

    # Save values and metadata
    np.savetxt(values_file, result_tdata.values, delimiter=',')
    result_tdata.metadata.to_csv(metadata_file, index=False)

    return {
        "message": f"TrackData {operation_info}: shape {result_tdata.values.shape}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": f"Track data values after {operation}",
                "path": str(values_file.resolve())
            },
            {
                "description": f"Track metadata after {operation}",
                "path": str(metadata_file.resolve())
            }
        ]
    }

@essential_commands_mcp.tool
def track_data_resolution_conversion(
    values_array: Annotated[str | None, "Path to CSV file with values array"] = None,
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands"] = ["+", "-", "."],
    original_resolution: Annotated[int, "Original resolution in bp"] = 1,
    target_resolution: Annotated[int, "Target resolution in bp"] = 2,
    chromosome: Annotated[str, "Chromosome"] = "chr1",
    start: Annotated[int, "Start position"] = 1000,
    end: Annotated[int, "End position"] = 1004,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Convert between different resolutions by upsampling or downsampling TrackData.
    Input is TrackData and resolution parameters and output is resolution-converted data.
    """

    if values_array is None:
        # Use tutorial example data
        values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(np.float32)
    else:
        values = np.loadtxt(values_array, delimiter=',').astype(np.float32)

    # Create metadata and interval
    metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands,
    })

    interval = genome.Interval(chromosome=chromosome, start=start, end=end)
    tdata = track_data.TrackData(
        values=values,
        metadata=metadata,
        resolution=original_resolution,
        interval=interval
    )

    # Change resolution
    converted_tdata = tdata.change_resolution(resolution=target_resolution)

    # Save results
    prefix = out_prefix or f"track_data_resolution_{original_resolution}_to_{target_resolution}_{timestamp}"
    original_file = OUTPUT_DIR / f"{prefix}_original.csv"
    converted_file = OUTPUT_DIR / f"{prefix}_converted.csv"
    metadata_file = OUTPUT_DIR / f"{prefix}_metadata.csv"

    # Save original, converted values, and metadata
    np.savetxt(original_file, tdata.values, delimiter=',')
    np.savetxt(converted_file, converted_tdata.values, delimiter=',')
    converted_tdata.metadata.to_csv(metadata_file, index=False)

    return {
        "message": f"Resolution converted from {original_resolution}bp to {target_resolution}bp: {tdata.values.shape} -> {converted_tdata.values.shape}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": f"Original values at {original_resolution}bp resolution",
                "path": str(original_file.resolve())
            },
            {
                "description": f"Converted values at {target_resolution}bp resolution",
                "path": str(converted_file.resolve())
            },
            {
                "description": "Track metadata",
                "path": str(metadata_file.resolve())
            }
        ]
    }
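These tools are thin wrappers around alphagenome.data.genome objects. A minimal sketch of the underlying calls, reusing the tools' default toy coordinates and assuming the alphagenome package is installed locally, is shown below; the printed results are not asserted here.

# Minimal sketch of the genome.Interval / genome.Variant calls these tools wrap.
from alphagenome.data import genome

interval1 = genome.Interval(chromosome="chr1", start=1000, end=1010)
interval2 = genome.Interval(chromosome="chr1", start=1005, end=1015)
print(interval1.overlaps(interval2))   # interval comparison
print(interval1.resize(100).width)     # widen around the interval's center

variant = genome.Variant(
    chromosome="chr3", position=10000, reference_bases="T", alternate_bases="CGTCAAT"
)
test_interval = genome.Interval(chromosome="chr3", start=10005, end=10010)
print(variant.reference_overlaps(test_interval))
print(variant.alternate_overlaps(test_interval))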
tools/quick_start.py
ADDED
@@ -0,0 +1,664 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
AlphaGenome Quick Start tutorial tools for DNA sequence analysis and prediction.
|
3 |
+
|
4 |
+
This MCP Server provides 6 tools:
|
5 |
+
1. predict_dna_sequence: Predict genomic tracks from DNA sequence
|
6 |
+
2. predict_genome_interval: Predict genomic tracks for reference genome intervals
|
7 |
+
3. predict_variant_effects: Predict and visualize genetic variant effects
|
8 |
+
4. score_variant_effect: Score genetic variant effects using variant scorers
|
9 |
+
5. ism_analysis: Perform in silico mutagenesis analysis with sequence logos
|
10 |
+
6. mouse_predictions: Make predictions for mouse sequences and intervals
|
11 |
+
|
12 |
+
All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb`.
|
13 |
+
"""
|
14 |
+
|
15 |
+
# Standard imports
|
16 |
+
from typing import Annotated, Literal, Any
|
17 |
+
import pandas as pd
|
18 |
+
import numpy as np
|
19 |
+
from pathlib import Path
|
20 |
+
import os
|
21 |
+
from fastmcp import FastMCP
|
22 |
+
from datetime import datetime
|
23 |
+
|
24 |
+
# AlphaGenome imports
|
25 |
+
from alphagenome import colab_utils
|
26 |
+
from alphagenome.data import gene_annotation
|
27 |
+
from alphagenome.data import genome
|
28 |
+
from alphagenome.data import transcript as transcript_utils
|
29 |
+
from alphagenome.interpretation import ism
|
30 |
+
from alphagenome.models import dna_client
|
31 |
+
from alphagenome.models import variant_scorers
|
32 |
+
from alphagenome.visualization import plot_components
|
33 |
+
import matplotlib.pyplot as plt
|
34 |
+
|
35 |
+
# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
|
36 |
+
BASE_DIR = Path("/data")
|
37 |
+
|
38 |
+
DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
|
39 |
+
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"
|
40 |
+
|
41 |
+
INPUT_DIR = Path(os.environ.get("QUICK_START_INPUT_DIR", DEFAULT_INPUT_DIR))
|
42 |
+
OUTPUT_DIR = Path(os.environ.get("QUICK_START_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))
|
43 |
+
|
44 |
+
# Ensure directories exist
|
45 |
+
INPUT_DIR.mkdir(parents=True, exist_ok=True)
|
46 |
+
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
47 |
+
|
48 |
+
# Timestamp for unique outputs
|
49 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
50 |
+
|
51 |
+
# MCP server instance
|
52 |
+
quick_start_mcp = FastMCP(name="quick_start")
|
53 |
+
|
54 |
+
@quick_start_mcp.tool
|
55 |
+
def predict_dna_sequence(
|
56 |
+
api_key: Annotated[str, "AlphaGenome API key for authentication"],
|
57 |
+
sequence: Annotated[str, "DNA sequence to analyze (will be center-padded to valid length)"] = 'GATTACA',
|
58 |
+
sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "2KB",
|
59 |
+
output_types: Annotated[list[str], "List of output types to predict (e.g. ['DNASE', 'CAGE', 'RNA_SEQ'])"] = ["DNASE"],
|
60 |
+
ontology_terms: Annotated[list[str], "List of ontology terms for tissues/cell types (e.g. ['UBERON:0002048'])"] = ["UBERON:0002048"],
|
61 |
+
out_prefix: Annotated[str | None, "Output file prefix"] = None,
|
62 |
+
) -> dict:
|
63 |
+
"""
|
64 |
+
Predict genomic tracks from a DNA sequence using AlphaGenome model.
|
65 |
+
Input is DNA sequence string and output is track predictions with metadata tables.
|
66 |
+
"""
|
67 |
+
# Set up output prefix
|
68 |
+
if out_prefix is None:
|
69 |
+
out_prefix = f"predict_dna_sequence_{timestamp}"
|
70 |
+
|
71 |
+
# Create DNA model
|
72 |
+
dna_model = dna_client.create(api_key)
|
73 |
+
|
74 |
+
# Convert sequence length to client constant
|
75 |
+
length_map = {
|
76 |
+
"2KB": dna_client.SEQUENCE_LENGTH_2KB,
|
77 |
+
"16KB": dna_client.SEQUENCE_LENGTH_16KB,
|
78 |
+
"100KB": dna_client.SEQUENCE_LENGTH_100KB,
|
79 |
+
"500KB": dna_client.SEQUENCE_LENGTH_500KB,
|
80 |
+
"1MB": dna_client.SEQUENCE_LENGTH_1MB
|
81 |
+
}
|
82 |
+
target_length = length_map[sequence_length]
|
83 |
+
|
84 |
+
# Pad sequence to valid length
|
85 |
+
padded_sequence = sequence.center(target_length, 'N')
|
86 |
+
|
87 |
+
# Convert output types to client enums
|
88 |
+
output_enums = []
|
89 |
+
for output_type in output_types:
|
90 |
+
output_enums.append(getattr(dna_client.OutputType, output_type))
|
91 |
+
|
92 |
+
# Make prediction
|
93 |
+
output = dna_model.predict_sequence(
|
94 |
+
sequence=padded_sequence,
|
95 |
+
requested_outputs=output_enums,
|
96 |
+
ontology_terms=ontology_terms,
|
97 |
+
)
|
98 |
+
|
99 |
+
artifacts = []
|
100 |
+
|
101 |
+
# Save track data and metadata for each output type
|
102 |
+
for output_type in output_types:
|
103 |
+
track_data = getattr(output, output_type.lower())
|
104 |
+
|
105 |
+
# Save values
|
106 |
+
values_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_values.csv"
|
107 |
+
pd.DataFrame(track_data.values).to_csv(values_file, index=False)
|
108 |
+
artifacts.append({
|
109 |
+
"description": f"{output_type} prediction values",
|
110 |
+
"path": str(values_file.resolve())
|
111 |
+
})
|
112 |
+
|
113 |
+
# Save metadata
|
114 |
+
metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
|
115 |
+
track_data.metadata.to_csv(metadata_file, index=False)
|
116 |
+
artifacts.append({
|
117 |
+
"description": f"{output_type} track metadata",
|
118 |
+
"path": str(metadata_file.resolve())
|
119 |
+
})
|
120 |
+
|
121 |
+
return {
|
122 |
+
"message": f"DNA sequence predictions completed for {len(output_types)} output types",
|
123 |
+
"reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
|
124 |
+
"artifacts": artifacts
|
125 |
+
}
|
126 |
+
|
127 |
+
@quick_start_mcp.tool
def predict_genome_interval(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome name (e.g. 'chr19')"] = "chr19",
    start_position: Annotated[int, "Start position on chromosome"] = 40991281,
    end_position: Annotated[int, "End position on chromosome"] = 41018398,
    strand: Annotated[Literal["+", "-", "."], "Strand orientation"] = "+",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "1MB",
    output_types: Annotated[list[str], "List of output types to predict (e.g. ['RNA_SEQ'])"] = ["RNA_SEQ"],
    ontology_terms: Annotated[list[str], "List of ontology terms for tissues/cell types"] = ["UBERON:0001114"],
    gene_symbol: Annotated[str | None, "Gene symbol to center interval on (overrides coordinates)"] = "CYP2B6",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Predict genomic tracks for a reference genome interval with transcript visualization.
    Input is genomic coordinates or gene symbol and output is prediction plot and metadata.
    """
    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"predict_genome_interval_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Load GTF file for gene annotation
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )

    # Set up transcript extractors
    gtf_transcripts = gene_annotation.filter_protein_coding(gtf)
    gtf_transcripts = gene_annotation.filter_to_longest_transcript(gtf_transcripts)
    transcript_extractor = transcript_utils.TranscriptExtractor(gtf_transcripts)

    # Create interval - use gene symbol if provided, otherwise coordinates
    if gene_symbol:
        interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
    else:
        interval = genome.Interval(chromosome, start_position, end_position, strand)

    # Resize to model-compatible length
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB
    }
    interval = interval.resize(length_map[sequence_length])

    # Convert output types to client enums
    output_enums = []
    for output_type in output_types:
        output_enums.append(getattr(dna_client.OutputType, output_type))

    # Make prediction
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs=output_enums,
        ontology_terms=ontology_terms,
    )

    # Extract transcripts for visualization
    longest_transcripts = transcript_extractor.extract(interval)

    artifacts = []

    # Save metadata for each output type
    for output_type in output_types:
        track_data = getattr(output, output_type.lower())

        # Save metadata
        metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
        track_data.metadata.to_csv(metadata_file, index=False)
        artifacts.append({
            "description": f"{output_type} track metadata",
            "path": str(metadata_file.resolve())
        })

    # Create visualization - full interval
    plt.figure(figsize=(15, 8))
    track_data = getattr(output, output_types[0].lower())
    plot_components.plot(
        components=[
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(track_data),
        ],
        interval=track_data.interval,
    )

    plot_file = OUTPUT_DIR / f"{out_prefix}_{output_types[0].lower()}_plot.png"
    plt.savefig(plot_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": f"{output_types[0]} prediction plot",
        "path": str(plot_file.resolve())
    })

    # Create zoomed visualization
    plt.figure(figsize=(15, 8))
    plot_components.plot(
        components=[
            plot_components.TranscriptAnnotation(longest_transcripts, fig_height=0.1),
            plot_components.Tracks(track_data),
        ],
        interval=track_data.interval.resize(2**15),
    )

    plot_zoomed_file = OUTPUT_DIR / f"{out_prefix}_{output_types[0].lower()}_plot_zoomed.png"
    plt.savefig(plot_zoomed_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": f"{output_types[0]} prediction plot (zoomed)",
        "path": str(plot_zoomed_file.resolve())
    })

    return {
        "message": f"Genome interval predictions completed with {len(longest_transcripts)} transcripts",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }

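# Illustrative usage sketch (added; not part of the extracted tutorial): centering the
# prediction on a gene symbol instead of explicit coordinates. Values are placeholders.
#
#   result = predict_genome_interval(
#       api_key="YOUR_ALPHAGENOME_API_KEY",
#       gene_symbol="CYP2B6",
#       output_types=["RNA_SEQ"],
#       ontology_terms=["UBERON:0001114"],
#   )
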
def predict_variant_effects(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome name (e.g. 'chr22')"] = "chr22",
    position: Annotated[int, "Variant position"] = 36201698,
    reference_bases: Annotated[str, "Reference allele"] = "A",
    alternate_bases: Annotated[str, "Alternative allele"] = "C",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "1MB",
    output_types: Annotated[list[str], "List of output types to predict"] = ["RNA_SEQ"],
    ontology_terms: Annotated[list[str], "List of ontology terms for tissues/cell types"] = ["UBERON:0001157"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Predict and visualize genetic variant effects comparing REF vs ALT predictions.
    Input is variant coordinates and output is overlaid REF/ALT visualization plot.
    """
    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"predict_variant_effects_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Create variant object
    variant = genome.Variant(
        chromosome=chromosome,
        position=position,
        reference_bases=reference_bases,
        alternate_bases=alternate_bases,
    )

    # Create interval from variant
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB
    }
    interval = variant.reference_interval.resize(length_map[sequence_length])

    # Convert output types to client enums
    output_enums = []
    for output_type in output_types:
        output_enums.append(getattr(dna_client.OutputType, output_type))

    # Make variant prediction
    variant_output = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        requested_outputs=output_enums,
        ontology_terms=ontology_terms,
    )

    # Load GTF for transcript annotation
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcripts = gene_annotation.filter_protein_coding(gtf)
    gtf_transcripts = gene_annotation.filter_to_longest_transcript(gtf_transcripts)
    transcript_extractor = transcript_utils.TranscriptExtractor(gtf_transcripts)
    longest_transcripts = transcript_extractor.extract(interval)

    artifacts = []

    # Create overlaid REF vs ALT visualization
    plt.figure(figsize=(15, 8))
    ref_track_data = getattr(variant_output.reference, output_types[0].lower())
    alt_track_data = getattr(variant_output.alternate, output_types[0].lower())

    plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': ref_track_data,
                    'ALT': alt_track_data,
                },
                colors={'REF': 'dimgrey', 'ALT': 'red'},
            ),
        ],
        interval=ref_track_data.interval.resize(2**15),
        # Annotate the location of the variant as a vertical line
        annotations=[plot_components.VariantAnnotation([variant], alpha=0.8)],
    )

    plot_file = OUTPUT_DIR / f"{out_prefix}_{output_types[0].lower()}_variant_plot.png"
    plt.savefig(plot_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": f"{output_types[0]} REF vs ALT comparison plot",
        "path": str(plot_file.resolve())
    })

    return {
        "message": f"Variant effect predictions completed for {chromosome}:{position}:{reference_bases}>{alternate_bases}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }

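# Illustrative usage sketch (added; not part of the extracted tutorial): comparing REF vs
# ALT RNA_SEQ tracks for the default chr22 variant. Values are placeholders.
#
#   result = predict_variant_effects(
#       api_key="YOUR_ALPHAGENOME_API_KEY",
#       chromosome="chr22",
#       position=36201698,
#       reference_bases="A",
#       alternate_bases="C",
#   )
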
def score_variant_effect(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome name (e.g. 'chr22')"] = "chr22",
    position: Annotated[int, "Variant position"] = 36201698,
    reference_bases: Annotated[str, "Reference allele"] = "A",
    alternate_bases: Annotated[str, "Alternative allele"] = "C",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "1MB",
    scorer_type: Annotated[Literal["RNA_SEQ", "DNASE", "CAGE", "ATAC"], "Variant scorer type to use"] = "RNA_SEQ",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score genetic variant effects using recommended variant scorers and produce tidy scores.
    Input is variant coordinates and output is variant scores table with genes and tracks.
    """
    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"score_variant_effect_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Create variant object
    variant = genome.Variant(
        chromosome=chromosome,
        position=position,
        reference_bases=reference_bases,
        alternate_bases=alternate_bases,
    )

    # Create interval from variant
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB
    }
    interval = variant.reference_interval.resize(length_map[sequence_length])

    # Get recommended variant scorer
    variant_scorer = variant_scorers.RECOMMENDED_VARIANT_SCORERS[scorer_type]

    # Score variant
    variant_scores = dna_model.score_variant(
        interval=interval,
        variant=variant,
        variant_scorers=[variant_scorer]
    )

    artifacts = []

    # Extract first scorer results
    scores_adata = variant_scores[0]

    # Save gene metadata (obs)
    genes_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_genes.csv"
    scores_adata.obs.to_csv(genes_file, index=True)
    artifacts.append({
        "description": f"{scorer_type} gene metadata",
        "path": str(genes_file.resolve())
    })

    # Save track metadata (var)
    tracks_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_tracks.csv"
    scores_adata.var.to_csv(tracks_file, index=True)
    artifacts.append({
        "description": f"{scorer_type} track metadata",
        "path": str(tracks_file.resolve())
    })

    # Save raw scores matrix
    raw_scores_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_raw_scores.csv"
    pd.DataFrame(scores_adata.X,
                 index=scores_adata.obs.index,
                 columns=scores_adata.var.index).to_csv(raw_scores_file, index=True)
    artifacts.append({
        "description": f"{scorer_type} raw scores matrix",
        "path": str(raw_scores_file.resolve())
    })

    # Create tidy scores dataframe
    tidy_scores_df = variant_scorers.tidy_scores([scores_adata], match_gene_strand=True)

    # Save tidy scores
    tidy_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_tidy_scores.csv"
    tidy_scores_df.to_csv(tidy_file, index=False)
    artifacts.append({
        "description": f"{scorer_type} tidy scores table",
        "path": str(tidy_file.resolve())
    })

    return {
        "message": f"Variant scoring completed: {scores_adata.X.shape[0]} genes × {scores_adata.X.shape[1]} tracks",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }

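# Note (added for clarity) on the artifacts saved by score_variant_effect above:
# `scores_adata` is an AnnData-style object in which rows (obs) are genes near the
# variant, columns (var) are prediction tracks, and X holds the raw scorer values;
# `tidy_scores` flattens this into roughly one row per (gene, track) combination.
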
@quick_start_mcp.tool
def ism_analysis(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome for ISM analysis"] = "chr20",
    start_position: Annotated[int, "Start position for sequence context"] = 3753000,
    end_position: Annotated[int, "End position for sequence context"] = 3753400,
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "2KB",
    ism_width: Annotated[int, "Width of region to mutate systematically"] = 256,
    output_type: Annotated[Literal["DNASE", "RNA_SEQ", "CAGE", "ATAC"], "Output type for scoring variants"] = "DNASE",
    mask_width: Annotated[int, "Width of center mask for scoring"] = 501,
    target_cell_line: Annotated[str, "Ontology term for specific cell line/tissue"] = "EFO:0002067",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Perform in silico mutagenesis analysis with sequence logo visualization of important regions.
    Input is genomic coordinates and output is ISM matrix and sequence logo plot.
    """
    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"ism_analysis_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Create sequence interval
    sequence_interval = genome.Interval(chromosome, start_position, end_position)
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB
    }
    sequence_interval = sequence_interval.resize(length_map[sequence_length])

    # Create ISM interval (region to mutate)
    ism_interval = sequence_interval.resize(ism_width)

    # Create variant scorer
    output_enum = getattr(dna_client.OutputType, output_type)
    variant_scorer = variant_scorers.CenterMaskScorer(
        requested_output=output_enum,
        width=mask_width,
        aggregation_type=variant_scorers.AggregationType.DIFF_MEAN,
    )

    # Score all ISM variants
    variant_scores = dna_model.score_ism_variants(
        interval=sequence_interval,
        ism_interval=ism_interval,
        variant_scorers=[variant_scorer],
    )

    # Extract scores for target cell line/tissue
    def extract_target_scores(adata):
        values = adata.X[:, adata.var['ontology_curie'] == target_cell_line]
        if values.size == 0:
            # If target not found, use first available track
            values = adata.X[:, 0:1]
        assert values.size >= 1
        return values.flatten()[0]

    # Create ISM matrix
    ism_result = ism.ism_matrix(
        [extract_target_scores(x[0]) for x in variant_scores],
        variants=[v[0].uns['variant'] for v in variant_scores],
    )

    artifacts = []

    # Save ISM matrix
    ism_matrix_file = OUTPUT_DIR / f"{out_prefix}_ism_matrix.csv"
    pd.DataFrame(ism_result).to_csv(ism_matrix_file, index=False)
    artifacts.append({
        "description": "ISM contribution matrix",
        "path": str(ism_matrix_file.resolve())
    })

    # Create sequence logo plot
    plt.figure(figsize=(35, 6))
    plot_components.plot(
        [
            plot_components.SeqLogo(
                scores=ism_result,
                scores_interval=ism_interval,
                ylabel=f'ISM {target_cell_line} {output_type}',
            )
        ],
        interval=ism_interval,
        fig_width=35,
    )

    logo_file = OUTPUT_DIR / f"{out_prefix}_sequence_logo.png"
    plt.savefig(logo_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": "ISM sequence logo plot",
        "path": str(logo_file.resolve())
    })

    return {
        "message": f"ISM analysis completed: {len(variant_scores)} variants scored ({ism_width} positions)",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }


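# Note (added for clarity): `ism_result` from ism_analysis above is expected to be a
# position-by-base matrix spanning `ism_interval` (roughly ism_width x 4), which is what
# SeqLogo renders; the saved CSV therefore has one row per mutated position.
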
def mouse_predictions(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    sequence: Annotated[str | None, "DNA sequence for sequence prediction"] = 'GATTACA',
    chromosome: Annotated[str | None, "Mouse chromosome for interval prediction"] = "chr1",
    start_position: Annotated[int | None, "Start position for interval prediction"] = 3000000,
    end_position: Annotated[int | None, "End position for interval prediction"] = 3000001,
    prediction_type: Annotated[Literal["sequence", "interval"], "Type of prediction to perform"] = "sequence",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "2KB",
    output_types: Annotated[list[str], "List of output types to predict"] = ["DNASE"],
    ontology_terms: Annotated[list[str], "List of ontology terms for mouse tissues"] = ["UBERON:0002048"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Make predictions for mouse sequences and genomic intervals using MUS_MUSCULUS organism.
    Input is mouse DNA sequence or coordinates and output is prediction metadata tables.
    """
    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"mouse_predictions_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Convert sequence length to client constant
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB
    }
    target_length = length_map[sequence_length]

    # Convert output types to client enums
    output_enums = []
    for output_type in output_types:
        output_enums.append(getattr(dna_client.OutputType, output_type))

    artifacts = []

    if prediction_type == "sequence":
        # Sequence prediction for mouse
        if sequence is None:
            raise ValueError("sequence must be provided for sequence prediction")

        # Pad sequence to valid length
        padded_sequence = sequence.center(target_length, 'N')

        # Make mouse sequence prediction
        output = dna_model.predict_sequence(
            sequence=padded_sequence,
            organism=dna_client.Organism.MUS_MUSCULUS,
            requested_outputs=output_enums,
            ontology_terms=ontology_terms,
        )

        # Save results for each output type
        for output_type in output_types:
            track_data = getattr(output, output_type.lower())

            # Save values
            values_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_values.csv"
            pd.DataFrame(track_data.values).to_csv(values_file, index=False)
            artifacts.append({
                "description": f"Mouse {output_type} prediction values",
                "path": str(values_file.resolve())
            })

            # Save metadata
            metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
            track_data.metadata.to_csv(metadata_file, index=False)
            artifacts.append({
                "description": f"Mouse {output_type} track metadata",
                "path": str(metadata_file.resolve())
            })

    elif prediction_type == "interval":
        # Interval prediction for mouse
        if chromosome is None or start_position is None or end_position is None:
            raise ValueError("chromosome, start_position, and end_position must be provided for interval prediction")

        # Create mouse interval
        interval = genome.Interval(chromosome, start_position, end_position).resize(target_length)

        # Make mouse interval prediction
        output = dna_model.predict_interval(
            interval=interval,
            organism=dna_client.Organism.MUS_MUSCULUS,
            requested_outputs=output_enums,
            ontology_terms=ontology_terms,
        )

        # Save results for each output type
        for output_type in output_types:
            track_data = getattr(output, output_type.lower())

            # Save metadata
            metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
            track_data.metadata.to_csv(metadata_file, index=False)
            artifacts.append({
                "description": f"Mouse {output_type} interval metadata",
                "path": str(metadata_file.resolve())
            })

    return {
        "message": f"Mouse {prediction_type} predictions completed for {len(output_types)} output types",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }
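# Illustrative usage sketch (added; not part of the extracted tutorial): running an
# interval prediction against the mouse genome. Values are placeholders.
#
#   result = mouse_predictions(
#       api_key="YOUR_ALPHAGENOME_API_KEY",
#       prediction_type="interval",
#       chromosome="chr1",
#       start_position=3000000,
#       end_position=3000001,
#       output_types=["DNASE"],
#   )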
tools/tissue_ontology_mapping.py
ADDED
@@ -0,0 +1,148 @@
"""
Tissue ontology mapping tutorial for navigating biological data ontologies in AlphaGenome.

This MCP Server provides 2 tools:
1. explore_output_metadata: Explore and filter output metadata for specific organisms and search terms
2. count_tracks_by_output_type: Count tracks by output type for human and mouse organisms

All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/tissue_ontology_mapping.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"


INPUT_DIR = Path(os.environ.get("TISSUE_ONTOLOGY_MAPPING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("TISSUE_ONTOLOGY_MAPPING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# MCP server instance
tissue_ontology_mapping_mcp = FastMCP(name="tissue_ontology_mapping")

@tissue_ontology_mapping_mcp.tool
def explore_output_metadata(
    # Analysis parameters with tutorial defaults
    organism: Annotated[Literal["HOMO_SAPIENS", "MUS_MUSCULUS"], "Target organism for metadata exploration"] = "HOMO_SAPIENS",
    api_key: Annotated[str, "AlphaGenome API key for accessing the DNA model"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Explore output metadata for specific organisms to find ontology terms and tissue types.
    Input is organism selection and API key and output is metadata table for interactive exploration.
    """
    # Import required modules
    from alphagenome.models import dna_client

    if not api_key:
        raise ValueError("API key must be provided")

    # Create DNA model client
    dna_model = dna_client.create(api_key)

    # Get organism enum
    org_enum = getattr(dna_client.Organism, organism)

    # Get output metadata
    output_metadata = dna_model.output_metadata(org_enum).concatenate()

    # Set output filename
    if out_prefix is None:
        out_prefix = f"output_metadata_{organism.lower()}"

    output_file = OUTPUT_DIR / f"{out_prefix}_{timestamp}.csv"

    # Save metadata as CSV
    output_metadata.to_csv(output_file, index=False)

    # Return standardized format
    return {
        "message": f"Output metadata exploration completed for {organism}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/tissue_ontology_mapping.ipynb",
        "artifacts": [
            {
                "description": f"Output metadata for {organism}",
                "path": str(output_file.resolve())
            }
        ]
    }

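# Illustrative follow-up (added; not part of the extracted tutorial): the saved CSV can be
# filtered locally to look up ontology terms, e.g. searching biosample names for "lung".
# Column names such as 'biosample_name', 'ontology_curie', and 'output_type' are assumed
# from the AlphaGenome metadata tables used elsewhere in these tools.
#
#   import pandas as pd
#   metadata = pd.read_csv("output_metadata_homo_sapiens_<timestamp>.csv")
#   lung = metadata[metadata["biosample_name"].str.contains("lung", case=False, na=False)]
#   print(lung[["ontology_curie", "biosample_name", "output_type"]].drop_duplicates())
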
@tissue_ontology_mapping_mcp.tool
def count_tracks_by_output_type(
    # Analysis parameters with tutorial defaults
    api_key: Annotated[str, "AlphaGenome API key for accessing the DNA model"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Count tracks by output type for both human and mouse organisms to understand data availability.
    Input is API key and output is track counts table comparing human vs mouse availability.
    """
    # Import required modules
    from alphagenome.models import dna_client

    if not api_key:
        raise ValueError("API key must be provided")

    # Create DNA model client
    dna_model = dna_client.create(api_key)

    # Count human tracks
    human_tracks = (
        dna_model.output_metadata(dna_client.Organism.HOMO_SAPIENS)
        .concatenate()
        .groupby('output_type')
        .size()
        .rename('# Human tracks')
    )

    # Count mouse tracks
    mouse_tracks = (
        dna_model.output_metadata(dna_client.Organism.MUS_MUSCULUS)
        .concatenate()
        .groupby('output_type')
        .size()
        .rename('# Mouse tracks')
    )

    # Combine the results
    track_counts = pd.concat([human_tracks, mouse_tracks], axis=1).astype(pd.Int64Dtype())

    # Set output filename
    if out_prefix is None:
        out_prefix = "track_counts"

    output_file = OUTPUT_DIR / f"{out_prefix}_{timestamp}.csv"

    # Save track counts as CSV
    track_counts.to_csv(output_file)

    # Return standardized format
    return {
        "message": "Track counting by output type completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/tissue_ontology_mapping.ipynb",
        "artifacts": [
            {
                "description": "Track counts by output type",
                "path": str(output_file.resolve())
            }
        ]
    }
tools/variant_scoring_ui.py
ADDED
@@ -0,0 +1,389 @@
"""
Interactive tutorial for scoring and visualizing single variants with comprehensive modality options.

This MCP Server provides 2 tools:
1. score_variant: Score a single variant with multiple variant scorers and save results
2. visualize_variant_effects: Generate comprehensive variant effect visualization across multiple modalities

All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/variant_scoring_ui.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Note: these reuse the BATCH_VARIANT_SCORING_* environment overrides; the defaults
# point at the same shared /data/tmp_* directories.
INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Fetch your secret
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# MCP server instance
variant_scoring_ui_mcp = FastMCP(name="variant_scoring_ui")


# Registered as an MCP tool to match the module docstring above (tool 1 of 2).
@variant_scoring_ui_mcp.tool
def score_variant(
    # Primary data inputs - variant specification
    variant_chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"] = "chr22",
    variant_position: Annotated[int, "Genomic position"] = 36201698,
    variant_reference_bases: Annotated[str, "Reference allele"] = "A",
    variant_alternate_bases: Annotated[str, "Alternate allele"] = "C",
    # Analysis parameters with tutorial defaults
    organism: Annotated[Literal["human", "mouse"], "Organism to analyze"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variant to predict"] = "1MB",
    api_key: Annotated[str | None, "API key for AlphaGenome model"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score a single variant using multiple variant scorers with comprehensive analysis.
    Input is variant coordinates and parameters, output is variant scores table and downloadable CSV file.
    """
    from alphagenome import colab_utils
    from alphagenome.data import genome
    from alphagenome.models import dna_client, variant_scorers

    # Use provided API key or get from environment/colab
    if api_key:
        dna_model = dna_client.create(api_key)
    else:
        dna_model = dna_client.create(colab_utils.get_api_key())

    # Map organism string to enum
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]

    # Create variant object
    variant = genome.Variant(
        chromosome=variant_chromosome,
        position=variant_position,
        reference_bases=variant_reference_bases,
        alternate_bases=variant_alternate_bases,
    )

    # Get sequence length
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]

    # The input interval is derived from the variant (centered on it)
    interval = variant.reference_interval.resize(sequence_length_value)

    # Score variant
    variant_scores = dna_model.score_variant(
        interval=interval,
        variant=variant,
        variant_scorers=list(variant_scorers.RECOMMENDED_VARIANT_SCORERS.values()),
    )

    # Convert to tidy format
    df_scores = variant_scorers.tidy_scores(variant_scores)

    # Save results
    if out_prefix is None:
        out_prefix = f"variant_{variant_chromosome}_{variant_position}_{variant_reference_bases}_{variant_alternate_bases}"

    output_file = OUTPUT_DIR / f"{out_prefix}_scores_{timestamp}.csv"

    # Filter columns for display (remove internal columns)
    columns = [
        c for c in df_scores.columns if c not in ['variant_id', 'scored_interval']
    ]
    df_display = df_scores[columns]

    # Save full results
    df_scores.to_csv(output_file, index=False)

    return {
        "message": f"Variant scoring completed with {len(df_scores)} scores across modalities",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/variant_scoring_ui.ipynb",
        "artifacts": [
            {
                "description": "Variant scores CSV",
                "path": str(output_file.resolve())
            }
        ]
    }

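# Illustrative usage sketch (added; not part of the extracted tutorial): scoring the
# default variant and inspecting the strongest effects in the saved tidy table. Values
# are placeholders, and the 'raw_score' column name is an assumption about the
# variant_scorers.tidy_scores output.
#
#   result = score_variant(api_key="YOUR_ALPHAGENOME_API_KEY")
#   df = pd.read_csv(result["artifacts"][0]["path"])
#   print(df.sort_values("raw_score", key=abs, ascending=False).head())
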
@variant_scoring_ui_mcp.tool
def visualize_variant_effects(
    # Primary data inputs - variant specification
    variant_chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"] = "chr22",
    variant_position: Annotated[int, "Genomic position"] = 36201698,
    variant_reference_bases: Annotated[str, "Reference allele"] = "A",
    variant_alternate_bases: Annotated[str, "Alternate allele"] = "C",
    # Analysis parameters with tutorial defaults
    organism: Annotated[Literal["human", "mouse"], "Organism to analyze"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variant to predict"] = "1MB",
    ontology_terms: Annotated[list[str] | None, "List of cell and tissue ontology terms"] = None,
    # Gene annotation options
    plot_gene_annotation: Annotated[bool, "Include gene annotation in plot"] = True,
    plot_longest_transcript_only: Annotated[bool, "Show only longest transcript per gene"] = True,
    # Output types to plot
    plot_rna_seq: Annotated[bool, "Plot RNA-seq tracks"] = True,
    plot_cage: Annotated[bool, "Plot CAGE tracks"] = True,
    plot_atac: Annotated[bool, "Plot ATAC-seq tracks"] = False,
    plot_dnase: Annotated[bool, "Plot DNase tracks"] = False,
    plot_chip_histone: Annotated[bool, "Plot ChIP-seq histone tracks"] = False,
    plot_chip_tf: Annotated[bool, "Plot ChIP-seq transcription factor tracks"] = False,
    plot_splice_sites: Annotated[bool, "Plot splice sites"] = True,
    plot_splice_site_usage: Annotated[bool, "Plot splice site usage"] = False,
    plot_contact_maps: Annotated[bool, "Plot contact maps"] = False,
    plot_splice_junctions: Annotated[bool, "Plot splice junctions"] = False,
    # DNA strand filtering
    filter_to_positive_strand: Annotated[bool, "Filter tracks to positive strand only"] = False,
    filter_to_negative_strand: Annotated[bool, "Filter tracks to negative strand only"] = False,
    # Visualization options
    ref_color: Annotated[str, "Color for reference allele"] = "dimgrey",
    alt_color: Annotated[str, "Color for alternate allele"] = "red",
    plot_interval_width: Annotated[int, "Width of plot interval in base pairs"] = 43008,
    plot_interval_shift: Annotated[int, "Shift of plot interval from variant center"] = 0,
    api_key: Annotated[str | None, "API key for AlphaGenome model"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Generate comprehensive variant effect visualization across multiple genomic modalities.
    Input is variant coordinates and visualization parameters, output is variant effect plots showing REF vs ALT predictions.
    """
    from alphagenome import colab_utils
    from alphagenome.data import gene_annotation, genome, transcript
    from alphagenome.models import dna_client
    from alphagenome.visualization import plot_components
    import matplotlib.pyplot as plt

    # Validate strand filtering parameters
    if filter_to_positive_strand and filter_to_negative_strand:
        raise ValueError(
            'Cannot specify both filter_to_positive_strand and '
            'filter_to_negative_strand.'
        )

    # Use provided API key or get from environment/colab
    if api_key:
        dna_model = dna_client.create(api_key)
    else:
        dna_model = dna_client.create(colab_utils.get_api_key())

    # Default ontology terms from tutorial
    if ontology_terms is None:
        ontology_terms = ['EFO:0001187', 'EFO:0002067', 'EFO:0002784']

    # Map organism string to enum
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]

    # Reference paths for gene annotation
    HG38_GTF_FEATHER = (
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    MM10_GTF_FEATHER = (
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'mm10/gencode.vM23.annotation.gtf.gz.feather'
    )

    # Create variant object
    variant = genome.Variant(
        chromosome=variant_chromosome,
        position=variant_position,
        reference_bases=variant_reference_bases,
        alternate_bases=variant_alternate_bases,
    )

    # Get sequence length
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]

    # The input interval is derived from the variant (centered on it)
    interval = variant.reference_interval.resize(sequence_length_value)

    # Load gene annotation
    match organism_enum:
        case dna_client.Organism.HOMO_SAPIENS:
            gtf_path = HG38_GTF_FEATHER
        case dna_client.Organism.MUS_MUSCULUS:
            gtf_path = MM10_GTF_FEATHER
        case _:
            raise ValueError(f'Unsupported organism: {organism_enum}')

    import pandas as pd
    gtf = pd.read_feather(gtf_path)

    # Filter to protein-coding genes and highly supported transcripts
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )

    # Extractor for identifying transcripts in a region
    transcript_extractor = transcript.TranscriptExtractor(gtf_transcript)

    # Also define an extractor that fetches only the longest transcript per gene
    gtf_longest_transcript = gene_annotation.filter_to_longest_transcript(
        gtf_transcript
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gtf_longest_transcript
    )

    # Predict variant effects
    output = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        organism=organism_enum,
        requested_outputs=[*dna_client.OutputType],
        ontology_terms=ontology_terms,
    )

    # Filter to DNA strand if requested
    ref, alt = output.reference, output.alternate

    if filter_to_positive_strand:
        ref = ref.filter_to_strand(strand='+')
        alt = alt.filter_to_strand(strand='+')
    elif filter_to_negative_strand:
        ref = ref.filter_to_strand(strand='-')
        alt = alt.filter_to_strand(strand='-')

    # Build plot components
    components = []
    ref_alt_colors = {'REF': ref_color, 'ALT': alt_color}

    # Gene and transcript annotation
    if plot_gene_annotation:
        if plot_longest_transcript_only:
            transcripts = longest_transcript_extractor.extract(interval)
        else:
            transcripts = transcript_extractor.extract(interval)
        components.append(plot_components.TranscriptAnnotation(transcripts))

    # Individual output type plots
    plot_map = {
        'plot_atac': (ref.atac, alt.atac, 'ATAC'),
        'plot_cage': (ref.cage, alt.cage, 'CAGE'),
        'plot_chip_histone': (ref.chip_histone, alt.chip_histone, 'CHIP_HISTONE'),
        'plot_chip_tf': (ref.chip_tf, alt.chip_tf, 'CHIP_TF'),
        'plot_contact_maps': (ref.contact_maps, alt.contact_maps, 'CONTACT_MAPS'),
        'plot_dnase': (ref.dnase, alt.dnase, 'DNASE'),
        'plot_rna_seq': (ref.rna_seq, alt.rna_seq, 'RNA_SEQ'),
        'plot_splice_junctions': (
            ref.splice_junctions,
            alt.splice_junctions,
            'SPLICE_JUNCTIONS',
        ),
        'plot_splice_sites': (ref.splice_sites, alt.splice_sites, 'SPLICE_SITES'),
        'plot_splice_site_usage': (
            ref.splice_site_usage,
            alt.splice_site_usage,
            'SPLICE_SITE_USAGE',
        ),
    }

    # eval(key) resolves the matching plot_* boolean parameter from the local scope.
    for key, (ref_data, alt_data, output_type) in plot_map.items():
        if eval(key) and ref_data is not None and ref_data.values.shape[-1] == 0:
            print(
                f'Requested plot for output {output_type} but no tracks exist in'
                ' output. This is likely because this output does not exist for your'
                ' ontologies or requested DNA strand.'
            )
        if eval(key) and ref_data and alt_data:
            match output_type:
                case 'CHIP_HISTONE':
                    ylabel_template = (
                        f'{output_type}: {{biosample_name}} ({{strand}})\n{{histone_mark}}'
                    )
                case 'CHIP_TF':
                    ylabel_template = (
                        f'{output_type}: {{biosample_name}}'
                        ' ({strand})\n{transcription_factor}'
                    )
                case 'CONTACT_MAPS':
                    ylabel_template = f'{output_type}: {{biosample_name}} ({{strand}})'
                case 'SPLICE_SITES':
                    ylabel_template = f'{output_type}: {{name}} ({{strand}})'
                case _:
                    ylabel_template = (
                        f'{output_type}: {{biosample_name}} ({{strand}})\n{{name}}'
                    )

            if output_type == 'CONTACT_MAPS':
                component = plot_components.ContactMapsDiff(
                    tdata=alt_data - ref_data,
                    ylabel_template=ylabel_template,
                )
                components.append(component)
            elif output_type == 'SPLICE_JUNCTIONS':
                ref_plot = plot_components.Sashimi(
                    ref_data,
                    ylabel_template='REF: ' + ylabel_template,
                )
                alt_plot = plot_components.Sashimi(
                    alt_data,
                    ylabel_template='ALT: ' + ylabel_template,
                )
                components.extend([ref_plot, alt_plot])
            else:
                component = plot_components.OverlaidTracks(
                    tdata={'REF': ref_data, 'ALT': alt_data},
                    colors=ref_alt_colors,
                    ylabel_template=ylabel_template,
                )
                components.append(component)

    # Validate plot interval width
    if plot_interval_width > interval.width:
        raise ValueError(
            f'plot_interval_width ({plot_interval_width}) must be less than '
            f'interval.width ({interval.width}).'
        )

    # Generate plot
    plot = plot_components.plot(
        components=components,
        interval=interval.shift(plot_interval_shift).resize(plot_interval_width),
        annotations=[
            plot_components.VariantAnnotation([variant]),
        ],
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"variant_{variant_chromosome}_{variant_position}_{variant_reference_bases}_{variant_alternate_bases}"

    output_file = OUTPUT_DIR / f"{out_prefix}_effects_{timestamp}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": f"Variant visualization completed with {len(components)} plot components",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/variant_scoring_ui.ipynb",
        "artifacts": [
            {
                "description": "Variant effects plot",
                "path": str(output_file.resolve())
            }
        ]
    }
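# Illustrative usage sketch (added; not part of the extracted tutorial): rendering RNA_SEQ
# and CAGE REF vs ALT tracks for the default variant with a narrower plotting window.
# Values are placeholders.
#
#   result = visualize_variant_effects(
#       variant_chromosome="chr22",
#       variant_position=36201698,
#       variant_reference_bases="A",
#       variant_alternate_bases="C",
#       plot_rna_seq=True,
#       plot_cage=True,
#       plot_interval_width=16384,
#   )
#   print(result["artifacts"][0]["path"])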
tools/visualization_modality_tour.py
ADDED
@@ -0,0 +1,1053 @@
1 |
+
"""
|
2 |
+
AlphaGenome visualization modality tour for different genomic data types.
|
3 |
+
|
4 |
+
This MCP Server provides 9 tools:
|
5 |
+
1. visualize_gene_expression: Visualize RNA_SEQ and CAGE gene expression predictions
|
6 |
+
2. visualize_variant_expression_effects: Show REF vs ALT variant effects on gene expression
|
7 |
+
3. visualize_custom_annotations: Plot custom annotations like polyadenylation sites
|
8 |
+
4. visualize_chromatin_accessibility: Visualize DNASE and ATAC chromatin accessibility
|
9 |
+
5. visualize_splicing_effects: Visualize splicing predictions with SPLICE_SITES, SPLICE_SITE_USAGE, SPLICE_JUNCTIONS
|
10 |
+
6. visualize_variant_splicing_effects: Show REF vs ALT variant effects on splicing with sashimi plots
|
11 |
+
7. visualize_histone_modifications: Visualize CHIP_HISTONE predictions with custom colors
|
12 |
+
8. visualize_tf_binding: Visualize CHIP_TF transcription factor binding predictions
|
13 |
+
9. visualize_contact_maps: Visualize CONTACT_MAPS DNA-DNA contact predictions
|
14 |
+
|
15 |
+
All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb`.
|
16 |
+
"""
|
17 |
+
|
18 |
+
# Standard imports
|
19 |
+
from typing import Annotated, Literal, Any
|
20 |
+
import pandas as pd
|
21 |
+
import numpy as np
|
22 |
+
from pathlib import Path
|
23 |
+
import os
|
24 |
+
from fastmcp import FastMCP
|
25 |
+
from datetime import datetime
|
26 |
+
import matplotlib.pyplot as plt
|
27 |
+
|
28 |
+
# AlphaGenome imports
|
29 |
+
from alphagenome import colab_utils
|
30 |
+
from alphagenome.data import gene_annotation, genome, track_data, transcript
|
31 |
+
from alphagenome.models import dna_client
|
32 |
+
from alphagenome.visualization import plot_components
|
33 |
+
|
34 |
+
# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
|
35 |
+
BASE_DIR = Path("/data")
|
36 |
+
|
37 |
+
DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
|
38 |
+
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"
|
39 |
+
|
40 |
+
INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR))
|
41 |
+
OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))
|
42 |
+
|
43 |
+
# Ensure directories exist
|
44 |
+
INPUT_DIR.mkdir(parents=True, exist_ok=True)
|
45 |
+
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
46 |
+
|
47 |
+
# Fetch your secret
|
48 |
+
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]
|
49 |
+
|
50 |
+
# Timestamp for unique outputs
|
51 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
52 |
+
|
53 |
+
# MCP server instance
|
54 |
+
visualization_modality_tour_mcp = FastMCP(name="visualization_modality_tour")
|
55 |
+
|
56 |
+
@visualization_modality_tour_mcp.tool
def visualize_gene_expression(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0001155'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize RNA_SEQ and CAGE gene expression predictions for a genomic interval.
    Input is genomic coordinates and ontology terms and output is gene expression visualization plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval and resize to supported length
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Make predictions
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.CAGE,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=output.rna_seq,
                ylabel_template='RNA_SEQ: {biosample_name} ({strand})\n{name}',
            ),
            plot_components.Tracks(
                tdata=output.cage,
                ylabel_template='CAGE: {biosample_name} ({strand})\n{name}',
            ),
        ],
        interval=interval,
        title='Predicted RNA Expression (RNA_SEQ, CAGE) for colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"gene_expression_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Gene expression visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Gene expression visualization plot",
                "path": str(output_file.resolve())
            }
        ]
    }

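# Example tool arguments for visualize_gene_expression (defined above). The
# values are illustrative only; a real call issues an AlphaGenome API request:
#
#   {
#       "chromosome": "chr22",
#       "start_position": 36000000,
#       "end_position": 36100000,
#       "ontology_terms": ["UBERON:0001159", "UBERON:0001155"]
#   }
#
# The tool responds with a "message", the tutorial "reference" URL, and an
# "artifacts" list pointing at the PNG saved under OUTPUT_DIR.
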
@visualization_modality_tour_mcp.tool
def visualize_variant_expression_effects(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    position: Annotated[int, "Variant genomic position"],
    reference_bases: Annotated[str, "Reference allele sequence"],
    alternate_bases: Annotated[str, "Alternate allele sequence"],
    interval_start: Annotated[int, "Start position for prediction interval"],
    interval_end: Annotated[int, "End position for prediction interval"],
    gene_symbol: Annotated[str | None, "Gene symbol to zoom in on"] = None,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0001155'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize REF vs ALT variant effects on gene expression with overlaid tracks.
    Input is variant coordinates and interval and output is variant effect visualization plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create variant and interval
    variant = genome.Variant(chromosome, position, reference_bases, alternate_bases)
    interval = genome.Interval(chromosome, interval_start, interval_end).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Make variant predictions
    output = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.CAGE,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Determine plot interval
    if gene_symbol is not None:
        plot_interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
        plot_interval.resize_inplace(plot_interval.width + 1000)
    else:
        plot_interval = interval

    # Define colors for REF and ALT
    ref_alt_colors = {'REF': 'dimgrey', 'ALT': 'red'}

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': output.reference.rna_seq.filter_to_nonpositive_strand(),
                    'ALT': output.alternate.rna_seq.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='{biosample_name} ({strand})\n{name}',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': output.reference.cage.filter_to_nonpositive_strand(),
                    'ALT': output.alternate.cage.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='{biosample_name} ({strand})\n{name}',
            ),
        ],
        annotations=[plot_components.VariantAnnotation([variant])],
        interval=plot_interval,
        title='Effect of variant on predicted RNA Expression in colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"variant_expression_effects_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Variant expression effects visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Variant expression effects plot",
                "path": str(output_file.resolve())
            }
        ]
    }

@visualization_modality_tour_mcp.tool
def visualize_custom_annotations(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    annotation_intervals: Annotated[list, "List of annotation intervals as [chr, start, end, strand, label] tuples"],
    plot_start: Annotated[int, "Start position for plotting window"],
    plot_end: Annotated[int, "End position for plotting window"],
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0002048'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize RNA predictions with custom interval annotations like polyadenylation sites.
    Input is genomic coordinates and custom annotations and output is annotated RNA visualization plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Make predictions
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Create custom annotation intervals
    custom_intervals = []
    labels = []
    for ann in annotation_intervals:
        if len(ann) >= 5:
            chr_name, start, end, strand, label = ann[:5]
            custom_intervals.append(genome.Interval(chr_name, start, end, strand))
            labels.append(label)

    # Define plotting interval
    plot_interval = genome.Interval(chromosome, plot_start, plot_end, '-')

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=output.rna_seq.filter_to_negative_strand(),
                ylabel_template='RNA_SEQ: {biosample_name} ({strand})\n{name}',
                shared_y_scale=True,
            )
        ],
        annotations=[
            plot_components.IntervalAnnotation(
                custom_intervals,
                alpha=1,
                labels=labels,
                label_angle=90
            )
        ],
        interval=plot_interval,
        title='Custom annotations with RNA expression',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"custom_annotations_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Custom annotations visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Custom annotations plot",
                "path": str(output_file.resolve())
            }
        ]
    }

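# Example `annotation_intervals` payload for visualize_custom_annotations
# (defined above). Values are illustrative; each entry is
# [chromosome, start, end, strand, label]:
#
#   [
#       ["chr22", 36200000, 36200200, "-", "proximal polyA site"],
#       ["chr22", 36201500, 36201700, "-", "distal polyA site"]
#   ]
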
@visualization_modality_tour_mcp.tool
def visualize_chromatin_accessibility(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    variant_position: Annotated[int | None, "Variant position to highlight"] = None,
    variant_ref: Annotated[str | None, "Variant reference allele"] = None,
    variant_alt: Annotated[str | None, "Variant alternate allele"] = None,
    promoter_intervals: Annotated[list | None, "List of promoter intervals as [chr, start, end, name] tuples"] = None,
    window_size: Annotated[int, "Size of plotting window around variant"] = 8000,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0000317', 'UBERON:0001155', 'UBERON:0001157', 'UBERON:0001159', 'UBERON:0004992', 'UBERON:0008971'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize DNASE and ATAC chromatin accessibility predictions for intestinal tissues.
    Input is genomic coordinates and optional variant information and output is chromatin accessibility plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Make predictions
    output = dna_model.predict_interval(
        interval,
        requested_outputs={
            dna_client.OutputType.DNASE,
            dna_client.OutputType.ATAC,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Prepare annotations
    annotations = []

    # Add variant annotation if provided
    if variant_position is not None and variant_ref is not None and variant_alt is not None:
        variant = genome.Variant(chromosome, variant_position, variant_ref, variant_alt)
        annotations.append(plot_components.VariantAnnotation([variant]))
        plot_interval = variant.reference_interval.resize(window_size)
    else:
        plot_interval = interval

    # Add promoter annotations if provided
    if promoter_intervals is not None:
        promoter_objs = []
        for prom in promoter_intervals:
            if len(prom) >= 4:
                chr_name, start, end, name = prom[:4]
                promoter_objs.append(genome.Interval(chr_name, start, end, name=name))
        if promoter_objs:
            annotations.append(plot_components.IntervalAnnotation(promoter_objs))

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=output.dnase,
                ylabel_template='DNASE: {biosample_name} ({strand})\n{name}',
            ),
            plot_components.Tracks(
                tdata=output.atac,
                ylabel_template='ATAC: {biosample_name} ({strand})\n{name}',
            ),
        ],
        interval=plot_interval,
        annotations=annotations,
        title='Predicted chromatin accessibility (DNASE, ATAC) for colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"chromatin_accessibility_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Chromatin accessibility visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Chromatin accessibility plot",
                "path": str(output_file.resolve())
            }
        ]
    }

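# Example arguments for visualize_chromatin_accessibility (defined above),
# highlighting a variant and one promoter. Values are illustrative; when a
# variant is given, the plot window is centred on it and spans `window_size`
# bases (default 8000). Promoter entries are [chromosome, start, end, name]:
#
#   {
#       "chromosome": "chr22",
#       "start_position": 36000000,
#       "end_position": 36100000,
#       "variant_position": 36050000,
#       "variant_ref": "A",
#       "variant_alt": "C",
#       "promoter_intervals": [["chr22", 36049000, 36050000, "example promoter"]]
#   }
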
@visualization_modality_tour_mcp.tool
def visualize_splicing_effects(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    gene_symbol: Annotated[str | None, "Gene symbol to focus on"] = None,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001157', 'UBERON:0001159'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize splicing predictions including SPLICE_SITES, SPLICE_SITE_USAGE, and SPLICE_JUNCTIONS.
    Input is genomic coordinates and optional gene symbol and output is splicing effects plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Make predictions
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.SPLICE_SITES,
            dna_client.OutputType.SPLICE_SITE_USAGE,
            dna_client.OutputType.SPLICE_JUNCTIONS,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Determine plot interval
    if gene_symbol is not None:
        plot_interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
        plot_interval.resize_inplace(plot_interval.width + 1000)
    else:
        plot_interval = interval

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=output.splice_sites.filter_to_negative_strand(),
                ylabel_template='SPLICE SITES: {name} ({strand})',
            ),
        ],
        interval=plot_interval,
        title='Predicted splicing effects for colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"splicing_effects_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Splicing effects visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Splicing effects plot",
                "path": str(output_file.resolve())
            }
        ]
    }

@visualization_modality_tour_mcp.tool
def visualize_variant_splicing_effects(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    variant_position: Annotated[int, "Variant genomic position"],
    variant_ref: Annotated[str, "Variant reference allele"],
    variant_alt: Annotated[str, "Variant alternate allele"],
    interval_start: Annotated[int, "Start position for prediction interval"],
    interval_end: Annotated[int, "End position for prediction interval"],
    gene_symbol: Annotated[str | None, "Gene symbol to focus on"] = None,
    tissue_filter: Annotated[str, "Specific tissue to filter for sashimi plots"] = "Colon_Transverse",
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001157', 'UBERON:0001159'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize REF vs ALT variant effects on splicing with sashimi plots and overlaid tracks.
    Input is variant coordinates and interval and output is variant splicing effects plot with sashimi arcs.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create variant and interval
    variant = genome.Variant(chromosome, variant_position, variant_ref, variant_alt)
    interval = genome.Interval(chromosome, interval_start, interval_end).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    transcript_extractor = transcript.TranscriptExtractor(gtf_transcript)

    # Make variant predictions
    output = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.SPLICE_SITES,
            dna_client.OutputType.SPLICE_SITE_USAGE,
            dna_client.OutputType.SPLICE_JUNCTIONS,
        },
        ontology_terms=ontology_terms,
    )

    # Extract all transcripts
    transcripts = transcript_extractor.extract(interval)

    # Determine plot interval
    if gene_symbol is not None:
        plot_interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
        plot_interval.resize_inplace(plot_interval.width + 1000)
    else:
        plot_interval = interval

    ref_output = output.reference
    alt_output = output.alternate

    # Define colors for REF and ALT
    ref_alt_colors = {'REF': 'dimgrey', 'ALT': 'red'}

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(transcripts),
            plot_components.Sashimi(
                ref_output.splice_junctions
                .filter_to_strand('-')
                .filter_by_tissue(tissue_filter),
                ylabel_template='Reference {biosample_name} ({strand})\n{name}',
            ),
            plot_components.Sashimi(
                alt_output.splice_junctions
                .filter_to_strand('-')
                .filter_by_tissue(tissue_filter),
                ylabel_template='Alternate {biosample_name} ({strand})\n{name}',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': ref_output.rna_seq.filter_to_nonpositive_strand(),
                    'ALT': alt_output.rna_seq.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='RNA_SEQ: {biosample_name} ({strand})\n{name}',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': ref_output.splice_sites.filter_to_nonpositive_strand(),
                    'ALT': alt_output.splice_sites.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='SPLICE SITES: {name} ({strand})',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': (
                        ref_output.splice_site_usage.filter_to_nonpositive_strand()
                    ),
                    'ALT': (
                        alt_output.splice_site_usage.filter_to_nonpositive_strand()
                    ),
                },
                colors=ref_alt_colors,
                ylabel_template=(
                    'SPLICE SITE USAGE: {biosample_name} ({strand})\n{name}'
                ),
            ),
        ],
        interval=plot_interval,
        annotations=[plot_components.VariantAnnotation([variant])],
        title='Predicted REF vs. ALT effects of variant in colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"variant_splicing_effects_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Variant splicing effects visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Variant splicing effects plot",
                "path": str(output_file.resolve())
            }
        ]
    }

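# Example arguments for visualize_variant_splicing_effects (defined above).
# Values are illustrative; `tissue_filter` must match the tissue naming used in
# the returned splice-junction track metadata (default "Colon_Transverse"):
#
#   {
#       "chromosome": "chr22",
#       "variant_position": 36050000,
#       "variant_ref": "G",
#       "variant_alt": "T",
#       "interval_start": 36000000,
#       "interval_end": 36100000,
#       "tissue_filter": "Colon_Transverse"
#   }
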
@visualization_modality_tour_mcp.tool
def visualize_histone_modifications(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    include_tss_annotations: Annotated[bool, "Whether to include transcription start site annotations"] = True,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0000317', 'UBERON:0001155', 'UBERON:0001157', 'UBERON:0001159'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize CHIP_HISTONE predictions with custom colors grouped by histone mark.
    Input is genomic coordinates and output is histone modifications plot with colored tracks.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    gtf_longest_transcript = gene_annotation.filter_to_longest_transcript(gtf_transcript)
    longest_transcript_extractor = transcript.TranscriptExtractor(gtf_longest_transcript)

    # Make predictions
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={dna_client.OutputType.CHIP_HISTONE},
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Reorder tracks by histone mark and apply colors
    reordered_chip_histone = output.chip_histone.select_tracks_by_index(
        output.chip_histone.metadata.sort_values('histone_mark').index
    )

    histone_to_color = {
        'H3K27AC': '#e41a1c',
        'H3K36ME3': '#ff7f00',
        'H3K4ME1': '#377eb8',
        'H3K4ME3': '#984ea3',
        'H3K9AC': '#4daf4a',
        'H3K27ME3': '#ffc0cb',
    }

    track_colors = (
        reordered_chip_histone.metadata['histone_mark']
        .map(lambda x: histone_to_color.get(x.upper(), '#000000'))
        .values
    )

    # Prepare annotations
    annotations = []
    if include_tss_annotations:
        # Extract TSS annotations
        gtf_tss = gene_annotation.extract_tss(gtf_longest_transcript)
        tss_as_intervals = [
            genome.Interval(
                chromosome=row.Chromosome,
                start=row.Start,
                end=row.End + 1000,  # Add extra 1Kb so the TSSs are visible
                name=row.gene_name,
            )
            for _, row in gtf_tss.iterrows()
        ]
        annotations.append(
            plot_components.IntervalAnnotation(
                tss_as_intervals, alpha=0.5, colors='blue'
            )
        )

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=reordered_chip_histone,
                ylabel_template=(
                    'CHIP HISTONE: {biosample_name} ({strand})\n{histone_mark}'
                ),
                filled=True,
                track_colors=track_colors,
            ),
        ],
        interval=interval,
        annotations=annotations,
        despine_keep_bottom=True,
        title='Predicted histone modification markers in colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"histone_modifications_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Histone modifications visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Histone modifications plot",
                "path": str(output_file.resolve())
            }
        ]
    }

@visualization_modality_tour_mcp.tool
def visualize_tf_binding(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    transcription_factor: Annotated[str | None, "Specific transcription factor to visualize (e.g., 'CTCF')"] = None,
    min_max_prediction: Annotated[float, "Minimum maximum prediction value to include tracks"] = 8000.0,
    gene_symbol: Annotated[str | None, "Gene symbol to focus analysis on"] = None,
    include_tss_annotations: Annotated[bool, "Whether to include transcription start site annotations"] = True,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0001157', 'EFO:0002067', 'EFO:0001187'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize CHIP_TF transcription factor binding predictions with filtering and averaging options.
    Input is genomic coordinates and filtering parameters and output is TF binding visualization plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    gtf_longest_transcript = gene_annotation.filter_to_longest_transcript(gtf_transcript)
    longest_transcript_extractor = transcript.TranscriptExtractor(gtf_longest_transcript)
    transcript_extractor = transcript.TranscriptExtractor(gtf_transcript)

    # Make predictions
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={dna_client.OutputType.CHIP_TF},
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)
    all_transcripts = transcript_extractor.extract(interval)

    # Determine plot interval and filtering logic
    if gene_symbol is not None:
        gene_interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
        gene_interval.resize_inplace(gene_interval.width + 1000)

        # Filter based on gene interval
        max_predictions = output.chip_tf.slice_by_interval(
            gene_interval, match_resolution=True
        ).values.max(axis=0)

        # Get top 10 tracks for gene-specific analysis
        output_filtered = output.chip_tf.filter_tracks(
            (max_predictions >= np.sort(max_predictions)[-10])
        )
        plot_interval = gene_interval
        transcripts_to_use = all_transcripts
    else:
        # Filter by minimum max prediction globally
        output_filtered = output.chip_tf.filter_tracks(
            output.chip_tf.values.max(axis=0) > min_max_prediction
        )
        plot_interval = interval
        transcripts_to_use = longest_transcripts

    # Handle specific transcription factor averaging
    if transcription_factor is not None:
        # Filter to specific TF and create mean track
        tf_mask = output_filtered.metadata['transcription_factor'] == transcription_factor
        if tf_mask.any():
            mean_tf_values = output_filtered.values[:, tf_mask].mean(axis=1)

            # Create new TrackData object
            tdata_mean_tf = track_data.TrackData(
                values=mean_tf_values[:, None],
                metadata=pd.DataFrame({
                    'transcription_factor': [transcription_factor],
                    'name': ['mean'],
                    'strand': ['.']
                }),
                interval=output_filtered.interval,
                resolution=output_filtered.resolution,
            )

            track_data_to_plot = tdata_mean_tf
            ylabel_template = '{name} {transcription_factor}'
            plot_title = f'Predicted {transcription_factor} binding (mean across cell types)'
        else:
            raise ValueError(f"No tracks found for transcription factor: {transcription_factor}")
    else:
        track_data_to_plot = output_filtered
        ylabel_template = 'CHIP TF: {biosample_name} ({strand})\n{transcription_factor}'
        plot_title = 'Predicted TF-binding in K562, HepG2, and sigmoid colon.'

    # Prepare annotations
    annotations = []
    if include_tss_annotations:
        # Extract TSS annotations
        gtf_tss = gene_annotation.extract_tss(gtf_longest_transcript)
        tss_as_intervals = [
            genome.Interval(
                chromosome=row.Chromosome,
                start=row.Start,
                end=row.End + 1000,  # Add extra 1Kb so the TSSs are visible
                name=row.gene_name,
            )
            for _, row in gtf_tss.iterrows()
        ]
        annotations.append(
            plot_components.IntervalAnnotation(
                tss_as_intervals, alpha=0.3, colors='blue'
            )
        )

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(transcripts_to_use),
            plot_components.Tracks(
                tdata=track_data_to_plot,
                ylabel_template=ylabel_template,
                filled=True,
            ),
        ],
        interval=plot_interval,
        annotations=annotations,
        despine_keep_bottom=True,
        title=plot_title,
    )

    # Save plot
    if out_prefix is None:
        tf_suffix = f"_{transcription_factor}" if transcription_factor else ""
        out_prefix = f"tf_binding{tf_suffix}_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "TF binding visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "TF binding plot",
                "path": str(output_file.resolve())
            }
        ]
    }

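# Example arguments for visualize_tf_binding (defined above): averaging a single
# factor across cell types. Values are illustrative; the factor name must be
# present in the CHIP_TF track metadata, otherwise the tool raises ValueError:
#
#   {
#       "chromosome": "chr22",
#       "start_position": 36000000,
#       "end_position": 36100000,
#       "transcription_factor": "CTCF"
#   }
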
@visualization_modality_tour_mcp.tool
def visualize_contact_maps(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    colormap: Annotated[str, "Matplotlib colormap for contact map"] = 'autumn_r',
    vmax: Annotated[float, "Maximum value for colormap scaling"] = 1.0,
    ontology_terms: Annotated[list, "List of ontology terms for cell lines"] = ['EFO:0002824'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize CONTACT_MAPS DNA-DNA contact predictions showing topologically-associated domains.
    Input is genomic coordinates and colormap parameters and output is contact maps visualization plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Make predictions
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={dna_client.OutputType.CONTACT_MAPS},
        ontology_terms=ontology_terms,
    )

    # Extract transcripts
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Build plot
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.ContactMaps(
                tdata=output.contact_maps,
                ylabel_template='{biosample_name}\n{name}',
                cmap=colormap,
                vmax=vmax,
            ),
        ],
        interval=interval,
        title='Predicted contact maps',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"contact_maps_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Contact maps visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Contact maps plot",
                "path": str(output_file.resolve())
            }
        ]
    }
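
# Minimal local-run sketch for ad-hoc testing. This block is an assumption, not
# part of the generated server setup: it relies on FastMCP's standard `run()`
# entry point (stdio transport by default) to expose the tools registered above.
if __name__ == "__main__":
    visualization_modality_tour_mcp.run()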