Paper2Agent commited on
Commit
0cdac39
·
verified ·
1 Parent(s): 8ac7142

Upload 10 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
# Container image for the AlphaGenome MCP server (served by uvicorn on HF Spaces).
FROM python:3.12
WORKDIR /app
# Install dependencies before copying code so Docker layer caching survives code-only edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY alphagenome_mcp.py .
COPY tools/ tools/
# Pre-create upload and scratch directories used by the app and tools.
# NOTE(review): chmod 777 presumably allows the Space's non-root runtime user to write — confirm.
RUN mkdir -p /app/data/upload /data/tmp_inputs /data/tmp_outputs && chmod -R 777 /app/data/upload /data
EXPOSE 7860
# Run the ASGI app exported as `app` in alphagenome_mcp.py.
CMD ["uvicorn", "alphagenome_mcp:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,10 @@
1
  ---
2
- title: Alphagenome Mcp
3
- emoji: 🐨
4
- colorFrom: purple
5
- colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
- short_description: Paper2Agent-generated AlphaGenome MCP server
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: AlphaGenome Mcp
3
+ emoji: 🔥
4
+ colorFrom: red
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
 
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
alphagenome_mcp.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Model Context Protocol (MCP) for AlphaGenome

AlphaGenome provides state-of-the-art AI-driven tools for genomic sequence analysis, variant effect prediction, and functional genomics visualization. The platform enables researchers to predict regulatory elements, assess variant impacts, and explore molecular mechanisms across diverse cell types and tissues.

This MCP Server contains the tools extracted from the following tutorials with their tools:
1. batch_variant_scoring
   - score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers
   - filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types)
2. essential_commands
   - create_genomic_interval: Create genomic intervals for DNA regions
   - create_genomic_variant: Create genomic variants for genetic changes
   - create_track_data: Create TrackData objects from arrays and metadata
   - create_variant_scores: Create AnnData objects for variant scoring results
   - genomic_interval_operations: Perform operations on genomic intervals
   - variant_interval_operations: Check variant overlaps with intervals
   - track_data_operations: Filter, resize, slice and transform TrackData
   - track_data_resolution_conversion: Convert between track data resolutions
3. quick_start
   - predict_dna_sequence: Predict genomic tracks from DNA sequence
   - predict_genome_interval: Predict genomic tracks for reference genome intervals
   - ism_analysis: Perform in silico mutagenesis analysis with sequence logos
4. tissue_ontology_mapping
   - explore_output_metadata: Explore and filter output metadata for specific organisms and search terms
   - count_tracks_by_output_type: Count tracks by output type for human and mouse organisms
5. variant_scoring_ui
   - visualize_variant_effects: Generate comprehensive variant effect visualization across multiple modalities
6. visualization_modality_tour
   - visualize_gene_expression: Visualize RNA_SEQ and CAGE gene expression predictions
   - visualize_chromatin_accessibility: Visualize DNASE and ATAC chromatin accessibility
   - visualize_splicing_effects: Visualize splicing predictions with SPLICE_SITES, SPLICE_SITE_USAGE, SPLICE_JUNCTIONS
   - visualize_histone_modifications: Visualize CHIP_HISTONE predictions with custom colors
   - visualize_tf_binding: Visualize CHIP_TF transcription factor binding predictions
   - visualize_contact_maps: Visualize CONTACT_MAPS DNA-DNA contact predictions
"""
import os
import uuid

from fastapi.staticfiles import StaticFiles
from fastmcp import FastMCP
from starlette.requests import Request
from starlette.responses import PlainTextResponse, JSONResponse

# Import the MCP tools from the tools folder
from tools.batch_variant_scoring import batch_variant_scoring_mcp
from tools.essential_commands import essential_commands_mcp
from tools.quick_start import quick_start_mcp
from tools.tissue_ontology_mapping import tissue_ontology_mapping_mcp
from tools.variant_scoring_ui import variant_scoring_ui_mcp
from tools.visualization_modality_tour import visualization_modality_tour_mcp

# Define the MCP server
mcp = FastMCP(name = "AlphaGenome")

# Mount the per-tutorial tool servers onto the root server
mcp.mount(batch_variant_scoring_mcp)
mcp.mount(essential_commands_mcp)
mcp.mount(quick_start_mcp)
mcp.mount(tissue_ontology_mapping_mcp)
mcp.mount(variant_scoring_ui_mcp)
mcp.mount(visualization_modality_tour_mcp)

# Upload directory.
# BUG FIX: the original `os.path.join(BASE_DIR, "/data/upload")` silently
# discarded BASE_DIR, because os.path.join drops every component that precedes
# an absolute one -- the effective path was always "/data/upload".  Spell the
# intended absolute path out directly so the code says what it does.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # kept in case external code imports it
UPLOAD_DIR = "/data/upload"
os.makedirs(UPLOAD_DIR, exist_ok=True)


@mcp.custom_route("/health", methods=["GET"])
async def health_check(request: Request) -> PlainTextResponse:
    """Liveness probe endpoint for the hosting platform."""
    return PlainTextResponse("OK")


@mcp.custom_route("/", methods=["GET"])
async def index(request: Request) -> PlainTextResponse:
    """Landing page that points clients at the MCP endpoint."""
    return PlainTextResponse("MCP is on https://Paper2Agent-alphagenome-mcp.hf.space/mcp")


# Upload route
@mcp.custom_route("/upload", methods=["POST"])
async def upload(request: Request):
    """Accept a multipart file upload and return its absolute server-side path.

    The file is stored under UPLOAD_DIR with a random uuid4 stem plus the
    original extension, so uploads cannot collide or escape the directory.
    """
    form = await request.form()
    up = form.get("file")
    if up is None:
        return JSONResponse({"error": "missing form field 'file'"}, status_code=400)

    # Generate a safe filename: random stem, keep only the original extension.
    orig = getattr(up, "filename", "") or ""
    ext = os.path.splitext(orig)[1]
    name = f"{uuid.uuid4().hex}{ext}"
    dst = os.path.join(UPLOAD_DIR, name)

    # up is a Starlette UploadFile-like object
    with open(dst, "wb") as out:
        out.write(await up.read())

    # Return only the absolute local path
    abs_path = os.path.abspath(dst)
    return JSONResponse({"path": abs_path})


# ASGI application exposing the MCP protocol under /mcp
app = mcp.http_app(path="/mcp")
# Saved uploaded input files
app.mount("/files", StaticFiles(directory=UPLOAD_DIR), name="files")
# Saved output files (written by the tools in tools/)
app.mount("/outputs", StaticFiles(directory="/data/tmp_outputs"), name="outputs")


if __name__ == "__main__":
    mcp.run(transport="http", host="0.0.0.0", port=int(os.getenv("PORT", 7860)), path="/mcp")
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ alphagenome==0.2.0
2
+ anndata==0.12.2
3
+ fastmcp==2.12.2
4
+ matplotlib==3.10.6
5
+ numpy==2.3.2
6
+ pandas==2.3.2
7
+ plotnine==0.15.0
8
+ tqdm==4.67.1
9
+ fastapi
10
+ starlette==0.47.3
tools/batch_variant_scoring.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Batch variant scoring tutorial demonstrating how to score multiple genetic variants using AlphaGenome.

This MCP Server provides 2 tools:
1. score_variants_batch: Score multiple genetic variants in batch using configurable variant scorers
2. filter_variant_scores: Filter variant scores by ontology criteria (e.g., cell types)

All tools extracted from `/batch_variant_scoring.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime
from io import StringIO

# AlphaGenome imports
from alphagenome import colab_utils
from alphagenome.data import genome
from alphagenome.models import dna_client, variant_scorers
from tqdm import tqdm

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Input/output locations are overridable via environment variables (useful for
# local runs and tests); they default to the persistent /data tree.
INPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
# NOTE: captured once at import time, so all outputs produced by one server
# process share the same timestamp suffix.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Fetch your secret
# Fails fast with KeyError at import time if the secret is not configured.
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# MCP server instance
batch_variant_scoring_mcp = FastMCP(name="batch_variant_scoring")
49
@batch_variant_scoring_mcp.tool
def score_variants_batch(
    # Primary data inputs
    vcf_path: Annotated[str | None, "Path to VCF file with extension .vcf or .tsv. The header of the file should include the following columns: variant_id, CHROM, POS, REF, ALT"] = None,
    # Analysis parameters with tutorial defaults
    api_key: Annotated[str, "AlphaGenome API key for authentication"] = ALPHAGENOME_API_KEY,
    organism: Annotated[Literal["human", "mouse"], "Organism for variant scoring"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variants to predict"] = "1MB",
    score_rna_seq: Annotated[bool, "Score RNA-seq effects"] = True,
    score_cage: Annotated[bool, "Score CAGE effects"] = True,
    score_procap: Annotated[bool, "Score ProCAP effects"] = True,
    score_atac: Annotated[bool, "Score ATAC-seq effects"] = True,
    score_dnase: Annotated[bool, "Score DNase effects"] = True,
    score_chip_histone: Annotated[bool, "Score ChIP histone effects"] = True,
    score_chip_tf: Annotated[bool, "Score ChIP transcription factor effects"] = True,
    score_polyadenylation: Annotated[bool, "Score polyadenylation effects"] = True,
    score_splice_sites: Annotated[bool, "Score splice sites effects"] = True,
    score_splice_site_usage: Annotated[bool, "Score splice site usage effects"] = True,
    score_splice_junctions: Annotated[bool, "Score splice junctions effects"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score multiple genetic variants in batch using AlphaGenome with configurable variant scorers.
    Input is VCF file with variant information and output is comprehensive variant scores table.

    Returns a dict with a summary message, the tutorial reference URL, and an
    `artifacts` list pointing at the CSV written to OUTPUT_DIR.

    Raises:
        ValueError: if no VCF path is given or a required column is missing.
    """
    # Validate input is present (the tool schema allows None, so check here).
    if vcf_path is None:
        raise ValueError("Path to VCF file must be provided")

    # Set output prefix
    if out_prefix is None:
        out_prefix = f"batch_variant_scores_{timestamp}"

    # Load VCF file containing variants (tab-separated with a header row)
    vcf = pd.read_csv(vcf_path, sep='\t')

    # Validate required columns
    required_columns = ['variant_id', 'CHROM', 'POS', 'REF', 'ALT']
    for column in required_columns:
        if column not in vcf.columns:
            raise ValueError(f'VCF file is missing required column: {column}.')

    # Load the model (authenticates against the AlphaGenome API)
    dna_model = dna_client.create(api_key)

    # Parse organism specification
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]

    # Parse sequence length
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]

    # Parse scorer specification: map each boolean flag to its scorer key
    scorer_selections = {
        'rna_seq': score_rna_seq,
        'cage': score_cage,
        'procap': score_procap,
        'atac': score_atac,
        'dnase': score_dnase,
        'chip_histone': score_chip_histone,
        'chip_tf': score_chip_tf,
        'polyadenylation': score_polyadenylation,
        'splice_sites': score_splice_sites,
        'splice_site_usage': score_splice_site_usage,
        'splice_junctions': score_splice_junctions,
    }

    all_scorers = variant_scorers.RECOMMENDED_VARIANT_SCORERS
    selected_scorers = [
        all_scorers[key]
        for key in all_scorers
        if scorer_selections.get(key.lower(), False)
    ]

    # Remove any scorers or output types that are not supported for the chosen
    # organism.  (Idiom fix: the original used bitwise `|`/`&` on booleans;
    # short-circuiting `or`/`and` is equivalent here and idiomatic.)
    unsupported_scorers = [
        scorer
        for scorer in selected_scorers
        if (
            organism_enum.value
            not in variant_scorers.SUPPORTED_ORGANISMS[scorer.base_variant_scorer]
        )
        or (
            (scorer.requested_output == dna_client.OutputType.PROCAP)
            and (organism_enum == dna_client.Organism.MUS_MUSCULUS)
        )
    ]
    if len(unsupported_scorers) > 0:
        print(
            f'Excluding {unsupported_scorers} scorers as they are not supported for'
            f' {organism_enum}.'
        )
        for unsupported_scorer in unsupported_scorers:
            selected_scorers.remove(unsupported_scorer)

    # Score variants in the VCF file, one API call per variant
    results = []

    for _, vcf_row in tqdm(vcf.iterrows(), total=len(vcf)):
        variant = genome.Variant(
            chromosome=str(vcf_row.CHROM),
            position=int(vcf_row.POS),
            reference_bases=vcf_row.REF,
            alternate_bases=vcf_row.ALT,
            name=vcf_row.variant_id,
        )
        # Center the prediction window on the variant's reference interval
        interval = variant.reference_interval.resize(sequence_length_value)

        variant_scores = dna_model.score_variant(
            interval=interval,
            variant=variant,
            variant_scorers=selected_scorers,
            organism=organism_enum,
        )
        results.append(variant_scores)

    # Collapse per-variant results into one tidy table
    df_scores = variant_scorers.tidy_scores(results)

    # Save results
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    df_scores.to_csv(output_file, index=False)

    return {
        "message": f"Batch variant scoring completed for {len(vcf)} variants with {len(selected_scorers)} scorers",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Batch variant scores results",
                "path": str(output_file.resolve())
            }
        ]
    }
186
+
187
@batch_variant_scoring_mcp.tool
def filter_variant_scores(
    # Primary data inputs
    scores_path: Annotated[str | None, "Path to variant scores CSV file from batch scoring"] = None,
    # Analysis parameters with tutorial defaults
    ontology_curie: Annotated[str, "Ontology CURIE for filtering (e.g., 'CL:0000084' for T-cells)"] = "CL:0000084",
    exclude_ontology_column: Annotated[bool, "Whether to exclude ontology_curie column from output"] = True,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Filter variant scores by ontology criteria to examine effects on specific cell types or tissues.
    Input is variant scores CSV file and output is filtered scores table.

    Returns a dict with a summary message, the tutorial reference URL, and an
    `artifacts` list pointing at the filtered CSV written to OUTPUT_DIR.

    Raises:
        ValueError: if no scores path is given, or the file lacks the
            'ontology_curie' column required for filtering.
    """
    # Validate input is present (the tool schema allows None, so check here).
    if scores_path is None:
        raise ValueError("Path to variant scores CSV file must be provided")

    # Set output prefix
    if out_prefix is None:
        out_prefix = f"filtered_variant_scores_{timestamp}"

    # Load variant scores
    df_scores = pd.read_csv(scores_path)

    # Robustness fix: fail with a clear message instead of a bare KeyError
    # when the input was not produced by score_variants_batch.
    if 'ontology_curie' not in df_scores.columns:
        raise ValueError("Scores file is missing required column: ontology_curie.")

    # Filter by ontology criteria
    filtered_df = df_scores[df_scores['ontology_curie'] == ontology_curie]

    # Optionally exclude ontology column (it is constant after filtering)
    if exclude_ontology_column:
        columns = [c for c in filtered_df.columns if c != 'ontology_curie']
        filtered_df = filtered_df[columns]

    # Save filtered results
    output_file = OUTPUT_DIR / f"{out_prefix}.csv"
    filtered_df.to_csv(output_file, index=False)

    return {
        "message": f"Filtered {len(filtered_df)} variant scores for ontology {ontology_curie}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/batch_variant_scoring.ipynb",
        "artifacts": [
            {
                "description": "Filtered variant scores",
                "path": str(output_file.resolve())
            }
        ]
    }
tools/essential_commands.py ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Essential commands for AlphaGenome API interaction covering data structures and operations.

This MCP Server provides 8 tools:
1. create_genomic_interval: Create genomic intervals for DNA regions
2. create_genomic_variant: Create genomic variants for genetic changes
3. create_track_data: Create TrackData objects from arrays and metadata
4. create_variant_scores: Create AnnData objects for variant scoring results
5. genomic_interval_operations: Perform operations on genomic intervals
6. variant_interval_operations: Check variant overlaps with intervals
7. track_data_operations: Filter, resize, slice and transform TrackData
8. track_data_resolution_conversion: Convert between track data resolutions

All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime

# AlphaGenome imports
from alphagenome.data import genome, track_data
from alphagenome.models import dna_client
import anndata

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Input/output locations are overridable via environment variables (useful for
# local runs and tests); they default to the persistent /data tree.
INPUT_DIR = Path(os.environ.get("ESSENTIAL_COMMANDS_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("ESSENTIAL_COMMANDS_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs
# NOTE: captured once at import time, so all outputs produced by one server
# process share the same timestamp suffix.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# MCP server instance
essential_commands_mcp = FastMCP(name="essential_commands")
50
@essential_commands_mcp.tool
def create_genomic_interval(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr1', 'chr2')"] = "chr1",
    start: Annotated[int, "Start position (0-based)"] = 1000,
    end: Annotated[int, "End position (0-based, exclusive)"] = 1010,
    strand: Annotated[Literal["+", "-", "."], "DNA strand"] = ".",
    name: Annotated[str, "Interval name"] = "",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create genomic intervals for DNA regions with basic properties and operations.
    Input is genomic coordinates and output is interval object with properties and operations results.

    Returns a dict with a summary message, the tutorial reference URL, and an
    `artifacts` list pointing at the CSV of properties written to OUTPUT_DIR.
    """

    # Create genomic interval
    interval = genome.Interval(chromosome=chromosome, start=start, end=end, strand=strand, name=name)

    # Calculate basic properties
    results = {
        "interval": {
            "chromosome": interval.chromosome,
            "start": interval.start,
            "end": interval.end,
            "strand": interval.strand,
            "name": interval.name,
            "center": interval.center(),
            "width": interval.width,
        },
        "operations": {
            # NOTE(review): resize() presumably re-centers the interval at the
            # requested width — confirm against the alphagenome docs.
            "resize_100bp": str(interval.resize(100)),
            "resize_1MB": str(interval.resize(dna_client.SEQUENCE_LENGTH_1MB)),
        }
    }

    # Save results
    prefix = out_prefix or f"genomic_interval_{timestamp}"
    output_file = OUTPUT_DIR / f"{prefix}.csv"

    # Flatten results for CSV: one (category, property, value) row per entry
    rows = []
    for category, data in results.items():
        for key, value in data.items():
            rows.append({"category": category, "property": key, "value": str(value)})

    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)

    return {
        "message": f"Genomic interval created: {interval.chromosome}:{interval.start}-{interval.end}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Genomic interval properties",
                "path": str(output_file.resolve())
            }
        ]
    }
107
+
108
@essential_commands_mcp.tool
def create_genomic_variant(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr3')"] = "chr3",
    position: Annotated[int, "1-based position"] = 10000,
    reference_bases: Annotated[str, "Reference sequence (e.g., 'A')"] = "A",
    alternate_bases: Annotated[str, "Alternate sequence (e.g., 'C')"] = "C",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create genomic variants for genetic changes with reference interval operations.
    Input is variant coordinates and sequences and output is variant properties and intervals.

    Returns a dict with a summary message, the tutorial reference URL, and an
    `artifacts` list pointing at the CSV of properties written to OUTPUT_DIR.
    """
    # Build the variant from its coordinates and allele sequences.
    variant = genome.Variant(
        chromosome=chromosome,
        position=position,
        reference_bases=reference_bases,
        alternate_bases=alternate_bases,
    )

    # Reference interval plus the 1MB model-input window derived from it.
    ref_iv = variant.reference_interval
    model_iv = ref_iv.resize(dna_client.SEQUENCE_LENGTH_1MB)

    # A nearby probe interval used to demonstrate the overlap checks.
    probe = genome.Interval(chromosome=chromosome, start=position + 5, end=position + 10)

    # Nested report; insertion order of the keys fixes the CSV row order.
    report = {
        "variant": {
            "chromosome": variant.chromosome,
            "position": variant.position,
            "reference_bases": variant.reference_bases,
            "alternate_bases": variant.alternate_bases,
            "reference_interval": str(ref_iv),
            "input_interval_width": model_iv.width,
        },
        "overlap_tests": {
            "reference_overlaps_test": variant.reference_overlaps(probe),
            "alternate_overlaps_test": variant.alternate_overlaps(probe),
        }
    }

    # Flatten into (category, property, value) rows and persist as CSV.
    prefix = out_prefix or f"genomic_variant_{timestamp}"
    output_file = OUTPUT_DIR / f"{prefix}.csv"
    flat_rows = [
        {"category": category, "property": key, "value": str(value)}
        for category, section in report.items()
        for key, value in section.items()
    ]
    pd.DataFrame(flat_rows).to_csv(output_file, index=False)

    return {
        "message": f"Genomic variant created: {variant.chromosome}:{variant.position} {variant.reference_bases}>{variant.alternate_bases}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Genomic variant properties",
                "path": str(output_file.resolve())
            }
        ]
    }
174
+
175
@essential_commands_mcp.tool
def create_track_data(
    values_array: Annotated[str | None, "Path to CSV file with values array (shape: sequence_length x num_tracks)"] = None,
    # NOTE(review): mutable list defaults are shared across calls; safe here only
    # because the lists are never mutated.  The duplicated "track1" pairs with
    # the "+" / "-" strands below (one stranded track) — presumably intentional.
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands (+, -, .)"] = ["+", "-", "."],
    chromosome: Annotated[str, "Chromosome for interval"] = "chr1",
    start: Annotated[int, "Start position"] = 1000,
    end: Annotated[int, "End position"] = 1004,
    resolution: Annotated[int, "Base pair resolution"] = 1,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create TrackData objects from user arrays and metadata with validation.
    Input is values array and track metadata and output is TrackData object properties.

    Returns a dict with a summary message, the tutorial reference URL, and an
    `artifacts` list pointing at the values/metadata CSVs written to OUTPUT_DIR.
    """

    if values_array is None:
        # Use tutorial example data if no input provided
        values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(np.float32)
    else:
        # Load user data
        values = np.loadtxt(values_array, delimiter=',').astype(np.float32)

    # Create metadata (one row per track: name + strand)
    metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands,
    })

    # Create genomic interval
    interval = genome.Interval(chromosome=chromosome, start=start, end=end)

    # Create TrackData object
    # NOTE(review): TrackData presumably validates that values' shape matches
    # the interval width / resolution and metadata length — confirm.
    tdata = track_data.TrackData(
        values=values,
        metadata=metadata,
        resolution=resolution,
        interval=interval
    )

    # Save results
    prefix = out_prefix or f"track_data_{timestamp}"
    values_file = OUTPUT_DIR / f"{prefix}_values.csv"
    metadata_file = OUTPUT_DIR / f"{prefix}_metadata.csv"

    # Save values and metadata
    np.savetxt(values_file, tdata.values, delimiter=',')
    tdata.metadata.to_csv(metadata_file, index=False)

    # NOTE(review): `results` is computed but never returned or saved — dead
    # code kept for parity with the tutorial.
    results = {
        "shape": tdata.values.shape,
        "resolution": tdata.resolution,
        "interval": str(tdata.interval) if tdata.interval else "None",
        "num_tracks": len(tdata.metadata),
        "track_names": tdata.metadata.name.tolist(),
        "track_strands": tdata.metadata.strand.tolist(),
    }

    return {
        "message": f"TrackData created with shape {tdata.values.shape} and {len(tdata.metadata)} tracks",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Track data values",
                "path": str(values_file.resolve())
            },
            {
                "description": "Track metadata",
                "path": str(metadata_file.resolve())
            }
        ]
    }
247
+
248
@essential_commands_mcp.tool
def create_variant_scores(
    scores_array: Annotated[str | None, "Path to CSV file with scores array (shape: num_genes x num_tracks)"] = None,
    # NOTE: mutable list defaults are shared across calls; safe here because
    # they are never mutated.
    gene_ids: Annotated[list[str], "List of gene IDs"] = ["ENSG0001", "ENSG0002", "ENSG0003"],
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands"] = ["+", "-"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Create AnnData objects for variant scoring results with gene and track metadata.
    Input is scores array and metadata and output is AnnData object structure.

    Returns a dict with a summary message, the tutorial reference URL, and an
    `artifacts` list pointing at the scores/metadata CSVs written to OUTPUT_DIR.
    """

    if scores_array is None:
        # Use tutorial example data if no input provided
        scores = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
    else:
        # Load user data
        scores = np.loadtxt(scores_array, delimiter=',')

    # Create metadata: genes on the obs axis, tracks on the var axis
    gene_metadata = pd.DataFrame({'gene_id': gene_ids})
    track_metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands
    })

    # Create AnnData object (X: genes x tracks score matrix)
    variant_scores = anndata.AnnData(
        X=scores,
        obs=gene_metadata,
        var=track_metadata
    )

    # Save results
    prefix = out_prefix or f"variant_scores_{timestamp}"
    scores_file = OUTPUT_DIR / f"{prefix}_scores.csv"
    gene_metadata_file = OUTPUT_DIR / f"{prefix}_gene_metadata.csv"
    track_metadata_file = OUTPUT_DIR / f"{prefix}_track_metadata.csv"

    # Save components
    np.savetxt(scores_file, variant_scores.X, delimiter=',')
    variant_scores.obs.to_csv(gene_metadata_file, index=False)
    variant_scores.var.to_csv(track_metadata_file, index=False)

    # (Cleanup: the original built an unused `results` summary dict here that
    # was neither returned nor saved; it has been removed as dead code.)

    return {
        "message": f"Variant scores AnnData created with {variant_scores.n_obs} genes and {variant_scores.n_vars} tracks",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Variant scores matrix",
                "path": str(scores_file.resolve())
            },
            {
                "description": "Gene metadata",
                "path": str(gene_metadata_file.resolve())
            },
            {
                "description": "Track metadata",
                "path": str(track_metadata_file.resolve())
            }
        ]
    }
319
+
320
@essential_commands_mcp.tool
def genomic_interval_operations(
    interval1_chromosome: Annotated[str, "First interval chromosome"] = "chr1",
    interval1_start: Annotated[int, "First interval start"] = 1000,
    interval1_end: Annotated[int, "First interval end"] = 1010,
    interval2_chromosome: Annotated[str, "Second interval chromosome"] = "chr1",
    interval2_start: Annotated[int, "Second interval start"] = 1005,
    interval2_end: Annotated[int, "Second interval end"] = 1015,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Perform operations on genomic intervals including overlaps, intersections and comparisons.
    Input is two genomic intervals and output is comparison operations results.

    Returns a dict with a summary message, the tutorial reference URL, and an
    `artifacts` list pointing at the CSV of results written to OUTPUT_DIR.
    """
    # Construct the two intervals from their coordinates.
    iv_a = genome.Interval(chromosome=interval1_chromosome, start=interval1_start, end=interval1_end)
    iv_b = genome.Interval(chromosome=interval2_chromosome, start=interval2_start, end=interval2_end)

    # Computed once; reused for both the 'overlaps' entry and the intersect guard.
    overlap = iv_a.overlaps(iv_b)

    # Keyed results; insertion order determines the CSV row order.
    operations = {
        "interval1": str(iv_a),
        "interval2": str(iv_b),
        "interval1_center": iv_a.center(),
        "interval1_width": iv_a.width,
        "interval2_center": iv_b.center(),
        "interval2_width": iv_b.width,
        "overlaps": overlap,
        "contains": iv_a.contains(iv_b),
        "intersect": str(iv_a.intersect(iv_b)) if overlap else "No overlap",
        "interval1_resized_100": str(iv_a.resize(100)),
        "interval2_resized_100": str(iv_b.resize(100)),
    }

    # Persist as a two-column (operation, result) CSV.
    prefix = out_prefix or f"genomic_interval_operations_{timestamp}"
    output_file = OUTPUT_DIR / f"{prefix}.csv"
    rows = [{"operation": op, "result": str(res)} for op, res in operations.items()]
    pd.DataFrame(rows).to_csv(output_file, index=False)

    return {
        "message": f"Genomic interval operations completed for {iv_a} and {iv_b}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Interval operations results",
                "path": str(output_file.resolve())
            }
        ]
    }
375
+
376
@essential_commands_mcp.tool
def variant_interval_operations(
    variant_chromosome: Annotated[str, "Variant chromosome"] = "chr3",
    variant_position: Annotated[int, "Variant position (1-based)"] = 10000,
    variant_ref: Annotated[str, "Reference bases"] = "T",
    variant_alt: Annotated[str, "Alternate bases"] = "CGTCAAT",
    interval_chromosome: Annotated[str, "Test interval chromosome"] = "chr3",
    interval_start: Annotated[int, "Test interval start"] = 10005,
    interval_end: Annotated[int, "Test interval end"] = 10010,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Check variant overlaps with genomic intervals for reference and alternate alleles.
    Input is variant and interval coordinates and output is overlap test results.
    """
    # Build the variant and the interval it is tested against.
    variant = genome.Variant(
        chromosome=variant_chromosome,
        position=variant_position,
        reference_bases=variant_ref,
        alternate_bases=variant_alt,
    )
    test_interval = genome.Interval(
        chromosome=interval_chromosome,
        start=interval_start,
        end=interval_end,
    )

    # Overlap is evaluated separately for the REF and ALT alleles because an
    # indel can change the genomic footprint of the variant.
    results = {
        "variant": f"{variant.chromosome}:{variant.position} {variant.reference_bases}>{variant.alternate_bases}",
        "interval": str(test_interval),
        "reference_interval": str(variant.reference_interval),
        "reference_overlaps": variant.reference_overlaps(test_interval),
        "alternate_overlaps": variant.alternate_overlaps(test_interval),
        "variant_ref_length": len(variant.reference_bases),
        "variant_alt_length": len(variant.alternate_bases),
    }

    # Persist the results as a two-column (property, value) CSV.
    stem = out_prefix or f"variant_interval_operations_{timestamp}"
    output_file = OUTPUT_DIR / f"{stem}.csv"
    pd.DataFrame(
        [{"property": name, "value": str(value)} for name, value in results.items()]
    ).to_csv(output_file, index=False)

    return {
        "message": f"Variant interval operations completed: ref_overlaps={results['reference_overlaps']}, alt_overlaps={results['alternate_overlaps']}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": "Variant interval operations results",
                "path": str(output_file.resolve()),
            }
        ],
    }
438
+
439
@essential_commands_mcp.tool
def track_data_operations(
    values_array: Annotated[str | None, "Path to CSV file with values array"] = None,
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands"] = ["+", "-", "."],
    operation: Annotated[Literal["filter", "resize", "slice", "subset", "reverse_complement"], "Operation to perform"] = "filter",
    filter_strand: Annotated[Literal["+", "-", "."], "Strand to filter to"] = "+",
    resize_width: Annotated[int, "Width to resize to"] = 8,
    slice_start: Annotated[int, "Slice start position"] = 2,
    slice_end: Annotated[int, "Slice end position"] = 4,
    subset_names: Annotated[list[str], "Track names to subset to"] = ["track1"],
    chromosome: Annotated[str, "Chromosome"] = "chr1",
    start: Annotated[int, "Start position"] = 1000,
    end: Annotated[int, "End position"] = 1004,
    strand: Annotated[Literal["+", "-", "."], "Interval strand"] = "+",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Filter, resize, slice and transform TrackData objects with strand-aware operations.
    Input is TrackData and operation parameters and output is transformed TrackData.
    """
    # Load the values matrix from CSV when given, otherwise fall back to the
    # tutorial's small 4x3 example matrix.
    if values_array is None:
        values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(np.float32)
    else:
        values = np.loadtxt(values_array, delimiter=',').astype(np.float32)

    # One metadata row per track (i.e. per column of `values`).
    metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands,
    })

    interval = genome.Interval(chromosome=chromosome, start=start, end=end, strand=strand)
    tdata = track_data.TrackData(values=values, metadata=metadata, resolution=1, interval=interval)

    # Dispatch on the requested operation.
    if operation == "filter":
        if filter_strand == "+":
            result_tdata = tdata.filter_to_positive_strand()
        elif filter_strand == "-":
            result_tdata = tdata.filter_to_negative_strand()
        else:
            result_tdata = tdata.filter_to_unstranded()
        operation_info = f"filtered to {filter_strand} strand"
    elif operation == "resize":
        result_tdata = tdata.resize(width=resize_width)
        operation_info = f"resized to width {resize_width}"
    elif operation == "slice":
        result_tdata = tdata.slice_by_positions(start=slice_start, end=slice_end)
        operation_info = f"sliced from {slice_start} to {slice_end}"
    elif operation == "subset":
        result_tdata = tdata.select_tracks_by_name(names=subset_names)
        operation_info = f"subset to tracks {subset_names}"
    elif operation == "reverse_complement":
        result_tdata = tdata.reverse_complement()
        operation_info = "reverse complemented"
    else:
        # The Literal annotation should rule this out, but guard against an
        # unvalidated caller so result_tdata/operation_info are never unbound.
        raise ValueError(f"Unsupported operation: {operation!r}")

    # Persist the transformed values and their track metadata side by side.
    prefix = out_prefix or f"track_data_{operation}_{timestamp}"
    values_file = OUTPUT_DIR / f"{prefix}_values.csv"
    metadata_file = OUTPUT_DIR / f"{prefix}_metadata.csv"

    np.savetxt(values_file, result_tdata.values, delimiter=',')
    result_tdata.metadata.to_csv(metadata_file, index=False)

    return {
        "message": f"TrackData {operation_info}: shape {result_tdata.values.shape}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": f"Track data values after {operation}",
                "path": str(values_file.resolve())
            },
            {
                "description": f"Track metadata after {operation}",
                "path": str(metadata_file.resolve())
            }
        ]
    }
521
+
522
@essential_commands_mcp.tool
def track_data_resolution_conversion(
    values_array: Annotated[str | None, "Path to CSV file with values array"] = None,
    track_names: Annotated[list[str], "List of track names"] = ["track1", "track1", "track2"],
    track_strands: Annotated[list[str], "List of track strands"] = ["+", "-", "."],
    original_resolution: Annotated[int, "Original resolution in bp"] = 1,
    target_resolution: Annotated[int, "Target resolution in bp"] = 2,
    chromosome: Annotated[str, "Chromosome"] = "chr1",
    start: Annotated[int, "Start position"] = 1000,
    end: Annotated[int, "End position"] = 1004,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Convert between different resolutions by upsampling or downsampling TrackData.
    Input is TrackData and resolution parameters and output is resolution-converted data.
    """
    # Load the values matrix, defaulting to the tutorial's 4x3 example.
    if values_array is None:
        values = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]).astype(np.float32)
    else:
        values = np.loadtxt(values_array, delimiter=',').astype(np.float32)

    # One metadata row per track (column of `values`).
    track_metadata = pd.DataFrame({
        'name': track_names,
        'strand': track_strands,
    })

    region = genome.Interval(chromosome=chromosome, start=start, end=end)
    source = track_data.TrackData(
        values=values,
        metadata=track_metadata,
        resolution=original_resolution,
        interval=region,
    )

    # Up- or downsample to the requested resolution.
    converted = source.change_resolution(resolution=target_resolution)

    # Persist the before/after matrices plus the track metadata.
    stem = out_prefix or f"track_data_resolution_{original_resolution}_to_{target_resolution}_{timestamp}"
    original_file = OUTPUT_DIR / f"{stem}_original.csv"
    converted_file = OUTPUT_DIR / f"{stem}_converted.csv"
    metadata_file = OUTPUT_DIR / f"{stem}_metadata.csv"

    np.savetxt(original_file, source.values, delimiter=',')
    np.savetxt(converted_file, converted.values, delimiter=',')
    converted.metadata.to_csv(metadata_file, index=False)

    return {
        "message": f"Resolution converted from {original_resolution}bp to {target_resolution}bp: {source.values.shape} -> {converted.values.shape}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/essential_commands.ipynb",
        "artifacts": [
            {
                "description": f"Original values at {original_resolution}bp resolution",
                "path": str(original_file.resolve()),
            },
            {
                "description": f"Converted values at {target_resolution}bp resolution",
                "path": str(converted_file.resolve()),
            },
            {
                "description": "Track metadata",
                "path": str(metadata_file.resolve()),
            },
        ],
    }
tools/quick_start.py ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AlphaGenome Quick Start tutorial tools for DNA sequence analysis and prediction.
3
+
4
+ This MCP Server provides 6 tools:
5
+ 1. predict_dna_sequence: Predict genomic tracks from DNA sequence
6
+ 2. predict_genome_interval: Predict genomic tracks for reference genome intervals
7
+ 3. predict_variant_effects: Predict and visualize genetic variant effects
8
+ 4. score_variant_effect: Score genetic variant effects using variant scorers
9
+ 5. ism_analysis: Perform in silico mutagenesis analysis with sequence logos
10
+ 6. mouse_predictions: Make predictions for mouse sequences and intervals
11
+
12
+ All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb`.
13
+ """
14
+
15
+ # Standard imports
16
+ from typing import Annotated, Literal, Any
17
+ import pandas as pd
18
+ import numpy as np
19
+ from pathlib import Path
20
+ import os
21
+ from fastmcp import FastMCP
22
+ from datetime import datetime
23
+
24
+ # AlphaGenome imports
25
+ from alphagenome import colab_utils
26
+ from alphagenome.data import gene_annotation
27
+ from alphagenome.data import genome
28
+ from alphagenome.data import transcript as transcript_utils
29
+ from alphagenome.interpretation import ism
30
+ from alphagenome.models import dna_client
31
+ from alphagenome.models import variant_scorers
32
+ from alphagenome.visualization import plot_components
33
+ import matplotlib.pyplot as plt
34
+
35
# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Environment variables may override the default locations.
INPUT_DIR = Path(os.environ.get("QUICK_START_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("QUICK_START_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Make sure both directories exist before any tool runs.
for _dir in (INPUT_DIR, OUTPUT_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

# Timestamp shared by all default output filenames in this module.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# MCP server instance
quick_start_mcp = FastMCP(name="quick_start")
53
+
54
@quick_start_mcp.tool
def predict_dna_sequence(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    sequence: Annotated[str, "DNA sequence to analyze (will be center-padded to valid length)"] = 'GATTACA',
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "2KB",
    output_types: Annotated[list[str], "List of output types to predict (e.g. ['DNASE', 'CAGE', 'RNA_SEQ'])"] = ["DNASE"],
    ontology_terms: Annotated[list[str], "List of ontology terms for tissues/cell types (e.g. ['UBERON:0002048'])"] = ["UBERON:0002048"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Predict genomic tracks from a DNA sequence using AlphaGenome model.
    Input is DNA sequence string and output is track predictions with metadata tables.
    """
    if out_prefix is None:
        out_prefix = f"predict_dna_sequence_{timestamp}"

    # Authenticated model client.
    dna_model = dna_client.create(api_key)

    # Map the human-readable length label to the client constant, then pad the
    # input with 'N' on both sides until it reaches that exact length.
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB,
    }
    padded_sequence = sequence.center(length_map[sequence_length], 'N')

    # Resolve the requested output names to client enum members.
    requested = [getattr(dna_client.OutputType, name) for name in output_types]

    output = dna_model.predict_sequence(
        sequence=padded_sequence,
        requested_outputs=requested,
        ontology_terms=ontology_terms,
    )

    artifacts = []

    # Persist both the raw values and the per-track metadata for each output.
    for output_type in output_types:
        tdata = getattr(output, output_type.lower())

        values_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_values.csv"
        pd.DataFrame(tdata.values).to_csv(values_file, index=False)
        artifacts.append({
            "description": f"{output_type} prediction values",
            "path": str(values_file.resolve()),
        })

        metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
        tdata.metadata.to_csv(metadata_file, index=False)
        artifacts.append({
            "description": f"{output_type} track metadata",
            "path": str(metadata_file.resolve()),
        })

    return {
        "message": f"DNA sequence predictions completed for {len(output_types)} output types",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts,
    }
126
+
127
@quick_start_mcp.tool
def predict_genome_interval(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome name (e.g. 'chr19')"] = "chr19",
    start_position: Annotated[int, "Start position on chromosome"] = 40991281,
    end_position: Annotated[int, "End position on chromosome"] = 41018398,
    strand: Annotated[Literal["+", "-", "."], "Strand orientation"] = "+",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "1MB",
    output_types: Annotated[list[str], "List of output types to predict (e.g. ['RNA_SEQ'])"] = ["RNA_SEQ"],
    ontology_terms: Annotated[list[str], "List of ontology terms for tissues/cell types"] = ["UBERON:0001114"],
    gene_symbol: Annotated[str | None, "Gene symbol to center interval on (overrides coordinates)"] = "CYP2B6",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Predict genomic tracks for a reference genome interval with transcript visualization.
    Input is genomic coordinates or gene symbol and output is prediction plot and metadata.
    """
    if out_prefix is None:
        out_prefix = f"predict_genome_interval_{timestamp}"

    # Authenticated model client.
    dna_model = dna_client.create(api_key)

    # GENCODE v46 annotation, pre-converted to feather for fast loading.
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )

    # Restrict to protein-coding genes and keep only the longest transcript
    # per gene for a readable annotation track.
    coding = gene_annotation.filter_protein_coding(gtf)
    coding = gene_annotation.filter_to_longest_transcript(coding)
    transcript_extractor = transcript_utils.TranscriptExtractor(coding)

    # A gene symbol, when given, takes precedence over the raw coordinates.
    if gene_symbol:
        interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
    else:
        interval = genome.Interval(chromosome, start_position, end_position, strand)

    # Resize to a model-compatible input length.
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB,
    }
    interval = interval.resize(length_map[sequence_length])

    requested = [getattr(dna_client.OutputType, name) for name in output_types]

    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs=requested,
        ontology_terms=ontology_terms,
    )

    # Transcripts overlapping the interval, used for the annotation panel.
    longest_transcripts = transcript_extractor.extract(interval)

    artifacts = []

    # Save the per-track metadata for every requested output type.
    for output_type in output_types:
        metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
        getattr(output, output_type.lower()).metadata.to_csv(metadata_file, index=False)
        artifacts.append({
            "description": f"{output_type} track metadata",
            "path": str(metadata_file.resolve()),
        })

    # Plot the first requested output across the full interval...
    first_tdata = getattr(output, output_types[0].lower())
    plt.figure(figsize=(15, 8))
    plot_components.plot(
        components=[
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(first_tdata),
        ],
        interval=first_tdata.interval,
    )
    plot_file = OUTPUT_DIR / f"{out_prefix}_{output_types[0].lower()}_plot.png"
    plt.savefig(plot_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": f"{output_types[0]} prediction plot",
        "path": str(plot_file.resolve()),
    })

    # ...and again zoomed into the central 2**15 bp window.
    plt.figure(figsize=(15, 8))
    plot_components.plot(
        components=[
            plot_components.TranscriptAnnotation(longest_transcripts, fig_height=0.1),
            plot_components.Tracks(first_tdata),
        ],
        interval=first_tdata.interval.resize(2**15),
    )
    plot_zoomed_file = OUTPUT_DIR / f"{out_prefix}_{output_types[0].lower()}_plot_zoomed.png"
    plt.savefig(plot_zoomed_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": f"{output_types[0]} prediction plot (zoomed)",
        "path": str(plot_zoomed_file.resolve()),
    })

    return {
        "message": f"Genome interval predictions completed with {len(longest_transcripts)} transcripts",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts,
    }
249
+
250
+
251
@quick_start_mcp.tool
def predict_variant_effects(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome name (e.g. 'chr22')"] = "chr22",
    position: Annotated[int, "Variant position"] = 36201698,
    reference_bases: Annotated[str, "Reference allele"] = "A",
    alternate_bases: Annotated[str, "Alternative allele"] = "C",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "1MB",
    output_types: Annotated[list[str], "List of output types to predict"] = ["RNA_SEQ"],
    ontology_terms: Annotated[list[str], "List of ontology terms for tissues/cell types"] = ["UBERON:0001157"],
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Predict and visualize genetic variant effects comparing REF vs ALT predictions.
    Input is variant coordinates and output is overlaid REF/ALT visualization plot.
    """
    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"predict_variant_effects_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Create variant object
    variant = genome.Variant(
        chromosome=chromosome,
        position=position,
        reference_bases=reference_bases,
        alternate_bases=alternate_bases,
    )

    # Create interval from variant, resized to a model-compatible length.
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB
    }
    interval = variant.reference_interval.resize(length_map[sequence_length])

    # Convert output type names to client enums
    output_enums = []
    for output_type in output_types:
        output_enums.append(getattr(dna_client.OutputType, output_type))

    # Predict both REF and ALT tracks in a single call.
    variant_output = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        requested_outputs=output_enums,
        ontology_terms=ontology_terms,
    )

    # Load GTF (GENCODE v46, feather format) for transcript annotation
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcripts = gene_annotation.filter_protein_coding(gtf)
    gtf_transcripts = gene_annotation.filter_to_longest_transcript(gtf_transcripts)
    transcript_extractor = transcript_utils.TranscriptExtractor(gtf_transcripts)
    longest_transcripts = transcript_extractor.extract(interval)

    artifacts = []

    # Create overlaid REF vs ALT visualization for the first output type.
    plt.figure(figsize=(15, 8))
    ref_track_data = getattr(variant_output.reference, output_types[0].lower())
    alt_track_data = getattr(variant_output.alternate, output_types[0].lower())

    plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': ref_track_data,
                    'ALT': alt_track_data,
                },
                colors={'REF': 'dimgrey', 'ALT': 'red'},
            ),
        ],
        interval=ref_track_data.interval.resize(2**15),
        # Annotate the location of the variant as a vertical line
        annotations=[plot_components.VariantAnnotation([variant], alpha=0.8)],
    )

    plot_file = OUTPUT_DIR / f"{out_prefix}_{output_types[0].lower()}_variant_plot.png"
    plt.savefig(plot_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": f"{output_types[0]} REF vs ALT comparison plot",
        "path": str(plot_file.resolve())
    })

    return {
        "message": f"Variant effect predictions completed for {chromosome}:{position}:{reference_bases}>{alternate_bases}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }
350
+
351
+
352
@quick_start_mcp.tool
def score_variant_effect(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome name (e.g. 'chr22')"] = "chr22",
    position: Annotated[int, "Variant position"] = 36201698,
    reference_bases: Annotated[str, "Reference allele"] = "A",
    alternate_bases: Annotated[str, "Alternative allele"] = "C",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "1MB",
    scorer_type: Annotated[Literal["RNA_SEQ", "DNASE", "CAGE", "ATAC"], "Variant scorer type to use"] = "RNA_SEQ",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score genetic variant effects using recommended variant scorers and produce tidy scores.
    Input is variant coordinates and output is variant scores table with genes and tracks.
    """
    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"score_variant_effect_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Create variant object
    variant = genome.Variant(
        chromosome=chromosome,
        position=position,
        reference_bases=reference_bases,
        alternate_bases=alternate_bases,
    )

    # Create interval from variant, resized to a model-compatible length.
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB
    }
    interval = variant.reference_interval.resize(length_map[sequence_length])

    # Get the recommended variant scorer for the requested modality.
    variant_scorer = variant_scorers.RECOMMENDED_VARIANT_SCORERS[scorer_type]

    # Score variant
    variant_scores = dna_model.score_variant(
        interval=interval,
        variant=variant,
        variant_scorers=[variant_scorer]
    )

    artifacts = []

    # Extract the results of the (single) scorer: an AnnData with genes as
    # obs rows and tracks as var columns.
    scores_adata = variant_scores[0]

    # Save gene metadata (obs)
    genes_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_genes.csv"
    scores_adata.obs.to_csv(genes_file, index=True)
    artifacts.append({
        "description": f"{scorer_type} gene metadata",
        "path": str(genes_file.resolve())
    })

    # Save track metadata (var)
    tracks_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_tracks.csv"
    scores_adata.var.to_csv(tracks_file, index=True)
    artifacts.append({
        "description": f"{scorer_type} track metadata",
        "path": str(tracks_file.resolve())
    })

    # Save raw genes x tracks scores matrix
    raw_scores_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_raw_scores.csv"
    pd.DataFrame(scores_adata.X,
                 index=scores_adata.obs.index,
                 columns=scores_adata.var.index).to_csv(raw_scores_file, index=True)
    artifacts.append({
        "description": f"{scorer_type} raw scores matrix",
        "path": str(raw_scores_file.resolve())
    })

    # Create tidy (long-format) scores dataframe
    tidy_scores_df = variant_scorers.tidy_scores([scores_adata], match_gene_strand=True)

    # Save tidy scores
    tidy_file = OUTPUT_DIR / f"{out_prefix}_{scorer_type}_tidy_scores.csv"
    tidy_scores_df.to_csv(tidy_file, index=False)
    artifacts.append({
        "description": f"{scorer_type} tidy scores table",
        "path": str(tidy_file.resolve())
    })

    return {
        "message": f"Variant scoring completed: {scores_adata.X.shape[0]} genes × {scores_adata.X.shape[1]} tracks",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }
448
+
449
@quick_start_mcp.tool
def ism_analysis(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    chromosome: Annotated[str, "Chromosome for ISM analysis"] = "chr20",
    start_position: Annotated[int, "Start position for sequence context"] = 3753000,
    end_position: Annotated[int, "End position for sequence context"] = 3753400,
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "2KB",
    ism_width: Annotated[int, "Width of region to mutate systematically"] = 256,
    output_type: Annotated[Literal["DNASE", "RNA_SEQ", "CAGE", "ATAC"], "Output type for scoring variants"] = "DNASE",
    mask_width: Annotated[int, "Width of center mask for scoring"] = 501,
    target_cell_line: Annotated[str, "Ontology term for specific cell line/tissue"] = "EFO:0002067",
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Perform in silico mutagenesis analysis with sequence logo visualization of important regions.
    Input is genomic coordinates and output is ISM matrix and sequence logo plot.
    """
    if out_prefix is None:
        out_prefix = f"ism_analysis_{timestamp}"

    # Authenticated model client.
    dna_model = dna_client.create(api_key)

    # Sequence context, resized to a model-compatible length.
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB,
    }
    sequence_interval = genome.Interval(chromosome, start_position, end_position)
    sequence_interval = sequence_interval.resize(length_map[sequence_length])

    # The central window that gets systematically mutated.
    ism_interval = sequence_interval.resize(ism_width)

    # Score each mutation as the mean REF/ALT difference within a center mask.
    variant_scorer = variant_scorers.CenterMaskScorer(
        requested_output=getattr(dna_client.OutputType, output_type),
        width=mask_width,
        aggregation_type=variant_scorers.AggregationType.DIFF_MEAN,
    )

    variant_scores = dna_model.score_ism_variants(
        interval=sequence_interval,
        ism_interval=ism_interval,
        variant_scorers=[variant_scorer],
    )

    def _score_for_target(adata):
        # Select the column matching the requested ontology term; if it is
        # absent, fall back to the first available track.
        selected = adata.X[:, adata.var['ontology_curie'] == target_cell_line]
        if selected.size == 0:
            selected = adata.X[:, 0:1]
        assert selected.size >= 1
        return selected.flatten()[0]

    # Fold the per-variant scores into a positions x bases contribution matrix.
    ism_result = ism.ism_matrix(
        [_score_for_target(entry[0]) for entry in variant_scores],
        variants=[entry[0].uns['variant'] for entry in variant_scores],
    )

    artifacts = []

    # Persist the contribution matrix.
    ism_matrix_file = OUTPUT_DIR / f"{out_prefix}_ism_matrix.csv"
    pd.DataFrame(ism_result).to_csv(ism_matrix_file, index=False)
    artifacts.append({
        "description": "ISM contribution matrix",
        "path": str(ism_matrix_file.resolve()),
    })

    # Render the contributions as a sequence logo over the mutated window.
    plt.figure(figsize=(35, 6))
    plot_components.plot(
        [
            plot_components.SeqLogo(
                scores=ism_result,
                scores_interval=ism_interval,
                ylabel=f'ISM {target_cell_line} {output_type}',
            )
        ],
        interval=ism_interval,
        fig_width=35,
    )
    logo_file = OUTPUT_DIR / f"{out_prefix}_sequence_logo.png"
    plt.savefig(logo_file, dpi=300, bbox_inches='tight')
    plt.close()
    artifacts.append({
        "description": "ISM sequence logo plot",
        "path": str(logo_file.resolve()),
    })

    return {
        "message": f"ISM analysis completed: {len(variant_scores)} variants scored ({ism_width} positions)",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts,
    }
554
+
555
+
556
def mouse_predictions(
    api_key: Annotated[str, "AlphaGenome API key for authentication"],
    sequence: Annotated[str | None, "DNA sequence for sequence prediction"] = 'GATTACA',
    chromosome: Annotated[str | None, "Mouse chromosome for interval prediction"] = "chr1",
    start_position: Annotated[int | None, "Start position for interval prediction"] = 3000000,
    end_position: Annotated[int | None, "End position for interval prediction"] = 3000001,
    prediction_type: Annotated[Literal["sequence", "interval"], "Type of prediction to perform"] = "sequence",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Model input sequence length"] = "2KB",
    output_types: Annotated[list[str] | None, "List of output types to predict (defaults to ['DNASE'])"] = None,
    ontology_terms: Annotated[list[str] | None, "List of ontology terms for mouse tissues (defaults to ['UBERON:0002048'])"] = None,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Make predictions for mouse sequences and genomic intervals using MUS_MUSCULUS organism.
    Input is mouse DNA sequence or coordinates and output is prediction metadata tables.

    Returns a dict with keys "message", "reference", and "artifacts" (a list of
    description/path entries for every CSV written under OUTPUT_DIR).

    Raises:
        ValueError: if the inputs required by the chosen prediction_type are missing.
    """
    # Fix: the previous defaults were mutable lists shared across calls.
    # A None sentinel preserves the tutorial defaults with identical behavior.
    if output_types is None:
        output_types = ["DNASE"]
    if ontology_terms is None:
        ontology_terms = ["UBERON:0002048"]

    # Set up output prefix
    if out_prefix is None:
        out_prefix = f"mouse_predictions_{timestamp}"

    # Create DNA model
    dna_model = dna_client.create(api_key)

    # Convert sequence length name to the client constant
    length_map = {
        "2KB": dna_client.SEQUENCE_LENGTH_2KB,
        "16KB": dna_client.SEQUENCE_LENGTH_16KB,
        "100KB": dna_client.SEQUENCE_LENGTH_100KB,
        "500KB": dna_client.SEQUENCE_LENGTH_500KB,
        "1MB": dna_client.SEQUENCE_LENGTH_1MB,
    }
    target_length = length_map[sequence_length]

    # Convert output type names to client enums
    output_enums = [getattr(dna_client.OutputType, name) for name in output_types]

    artifacts = []

    if prediction_type == "sequence":
        # Sequence prediction for mouse
        if sequence is None:
            raise ValueError("sequence must be provided for sequence prediction")

        # Pad the sequence with Ns to a model-supported input length
        padded_sequence = sequence.center(target_length, 'N')

        # Make mouse sequence prediction
        output = dna_model.predict_sequence(
            sequence=padded_sequence,
            organism=dna_client.Organism.MUS_MUSCULUS,
            requested_outputs=output_enums,
            ontology_terms=ontology_terms,
        )

        # Save values and metadata for each requested output type
        for output_type in output_types:
            track_data = getattr(output, output_type.lower())

            # Save predicted track values
            values_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_values.csv"
            pd.DataFrame(track_data.values).to_csv(values_file, index=False)
            artifacts.append({
                "description": f"Mouse {output_type} prediction values",
                "path": str(values_file.resolve())
            })

            # Save track metadata
            metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
            track_data.metadata.to_csv(metadata_file, index=False)
            artifacts.append({
                "description": f"Mouse {output_type} track metadata",
                "path": str(metadata_file.resolve())
            })

    elif prediction_type == "interval":
        # Interval prediction for mouse
        if chromosome is None or start_position is None or end_position is None:
            raise ValueError("chromosome, start_position, and end_position must be provided for interval prediction")

        # Create mouse interval resized to the supported model input length
        interval = genome.Interval(chromosome, start_position, end_position).resize(target_length)

        # Make mouse interval prediction
        output = dna_model.predict_interval(
            interval=interval,
            organism=dna_client.Organism.MUS_MUSCULUS,
            requested_outputs=output_enums,
            ontology_terms=ontology_terms,
        )

        # Interval predictions persist metadata only (matches the tutorial flow)
        for output_type in output_types:
            track_data = getattr(output, output_type.lower())

            metadata_file = OUTPUT_DIR / f"{out_prefix}_{output_type.lower()}_metadata.csv"
            track_data.metadata.to_csv(metadata_file, index=False)
            artifacts.append({
                "description": f"Mouse {output_type} interval metadata",
                "path": str(metadata_file.resolve())
            })

    return {
        "message": f"Mouse {prediction_type} predictions completed for {len(output_types)} output types",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/quick_start.ipynb",
        "artifacts": artifacts
    }
tools/tissue_ontology_mapping.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Tissue ontology mapping tutorial for navigating biological data ontologies in AlphaGenome.

This MCP Server provides 2 tools:
1. explore_output_metadata: Explore and filter output metadata for specific organisms and search terms
2. count_tracks_by_output_type: Count tracks by output type for human and mouse organisms

All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/tissue_ontology_mapping.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Working directories; overridable per deployment via environment variables.
INPUT_DIR = Path(os.environ.get("TISSUE_ONTOLOGY_MAPPING_INPUT_DIR", DEFAULT_INPUT_DIR))
OUTPUT_DIR = Path(os.environ.get("TISSUE_ONTOLOGY_MAPPING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR))

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs (fixed at import time, shared by all tool calls)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# NOTE: raises KeyError at import time if the ALPHAGENOME_API_KEY secret is not set.
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# MCP server instance
tissue_ontology_mapping_mcp = FastMCP(name="tissue_ontology_mapping")
41
+
42
@tissue_ontology_mapping_mcp.tool
def explore_output_metadata(
    # Analysis parameters with tutorial defaults
    organism: Annotated[Literal["HOMO_SAPIENS", "MUS_MUSCULUS"], "Target organism for metadata exploration"] = "HOMO_SAPIENS",
    api_key: Annotated[str, "AlphaGenome API key for accessing the DNA model"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Explore output metadata for specific organisms to find ontology terms and tissue types.
    Input is organism selection and API key and output is metadata table for interactive exploration.
    """
    from alphagenome.models import dna_client

    if not api_key:
        raise ValueError("API key must be provided")

    # Resolve the organism enum and fetch the full metadata table in one pass.
    model = dna_client.create(api_key)
    organism_enum = getattr(dna_client.Organism, organism)
    metadata_table = model.output_metadata(organism_enum).concatenate()

    # Derive the CSV destination (default prefix is organism-specific).
    prefix = f"output_metadata_{organism.lower()}" if out_prefix is None else out_prefix
    csv_path = OUTPUT_DIR / f"{prefix}_{timestamp}.csv"

    # Persist the metadata for offline exploration.
    metadata_table.to_csv(csv_path, index=False)

    return {
        "message": f"Output metadata exploration completed for {organism}",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/tissue_ontology_mapping.ipynb",
        "artifacts": [
            {
                "description": f"Output metadata for {organism}",
                "path": str(csv_path.resolve())
            }
        ]
    }
88
+
89
@tissue_ontology_mapping_mcp.tool
def count_tracks_by_output_type(
    # Analysis parameters with tutorial defaults
    api_key: Annotated[str, "AlphaGenome API key for accessing the DNA model"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Count tracks by output type for both human and mouse organisms to understand data availability.
    Input is API key and output is track counts table comparing human vs mouse availability.
    """
    from alphagenome.models import dna_client

    if not api_key:
        raise ValueError("API key must be provided")

    model = dna_client.create(api_key)

    def _track_counts(organism, column_label):
        # Group the concatenated metadata by output type and count the tracks.
        return (
            model.output_metadata(organism)
            .concatenate()
            .groupby('output_type')
            .size()
            .rename(column_label)
        )

    # Side-by-side comparison of track availability per organism.
    track_counts = pd.concat(
        [
            _track_counts(dna_client.Organism.HOMO_SAPIENS, '# Human tracks'),
            _track_counts(dna_client.Organism.MUS_MUSCULUS, '# Mouse tracks'),
        ],
        axis=1,
    ).astype(pd.Int64Dtype())

    # Resolve the CSV destination.
    if out_prefix is None:
        out_prefix = "track_counts"
    csv_path = OUTPUT_DIR / f"{out_prefix}_{timestamp}.csv"
    track_counts.to_csv(csv_path)

    return {
        "message": "Track counting by output type completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/tissue_ontology_mapping.ipynb",
        "artifacts": [
            {
                "description": "Track counts by output type",
                "path": str(csv_path.resolve())
            }
        ]
    }
tools/variant_scoring_ui.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Interactive tutorial for scoring and visualizing single variants with comprehensive modality options.

This MCP Server provides 2 tools:
1. score_variant: Score a single variant with multiple variant scorers and save results
2. visualize_variant_effects: Generate comprehensive variant effect visualization across multiple modalities

All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/variant_scoring_ui.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Fix: this module previously read the BATCH_VARIANT_SCORING_* variables — a
# copy-paste from the batch_variant_scoring tool. Prefer module-specific
# variables, but fall back to the old names so existing deployments keep working.
INPUT_DIR = Path(
    os.environ.get(
        "VARIANT_SCORING_UI_INPUT_DIR",
        os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR),
    )
)
OUTPUT_DIR = Path(
    os.environ.get(
        "VARIANT_SCORING_UI_OUTPUT_DIR",
        os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR),
    )
)

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Timestamp for unique outputs (fixed at import time, shared by all tool calls)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Fetch your secret (raises KeyError at import time if not configured)
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# MCP server instance
variant_scoring_ui_mcp = FastMCP(name="variant_scoring_ui")
41
+
42
+
43
@variant_scoring_ui_mcp.tool
def score_variant(
    # Primary data inputs - variant specification
    variant_chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"] = "chr22",
    variant_position: Annotated[int, "Genomic position"] = 36201698,
    variant_reference_bases: Annotated[str, "Reference allele"] = "A",
    variant_alternate_bases: Annotated[str, "Alternate allele"] = "C",
    # Analysis parameters with tutorial defaults
    organism: Annotated[Literal["human", "mouse"], "Organism to analyze"] = "human",
    sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variant to predict"] = "1MB",
    api_key: Annotated[str | None, "API key for AlphaGenome model"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Score a single variant using multiple variant scorers with comprehensive analysis.
    Input is variant coordinates and parameters, output is variant scores table and downloadable CSV file.

    Fix: the module docstring declares score_variant as one of the server's two
    tools, but the @variant_scoring_ui_mcp.tool decorator was missing, so the
    function was never registered with the MCP server. The decorator is now added
    (matching the sibling visualize_variant_effects tool).
    """
    from alphagenome import colab_utils
    from alphagenome.data import genome
    from alphagenome.models import dna_client, variant_scorers

    # Use provided API key or get from environment/colab
    if api_key:
        dna_model = dna_client.create(api_key)
    else:
        dna_model = dna_client.create(colab_utils.get_api_key())

    # Map organism string to enum.
    # NOTE(review): organism_enum is currently not forwarded to score_variant —
    # confirm whether the client defaults to human or infers organism elsewhere.
    organism_map = {
        'human': dna_client.Organism.HOMO_SAPIENS,
        'mouse': dna_client.Organism.MUS_MUSCULUS,
    }
    organism_enum = organism_map[organism]

    # Create variant object
    variant = genome.Variant(
        chromosome=variant_chromosome,
        position=variant_position,
        reference_bases=variant_reference_bases,
        alternate_bases=variant_alternate_bases,
    )

    # Get sequence length
    sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
        f'SEQUENCE_LENGTH_{sequence_length}'
    ]

    # The input interval is derived from the variant (centered on it)
    interval = variant.reference_interval.resize(sequence_length_value)

    # Score variant with every recommended scorer
    variant_scores = dna_model.score_variant(
        interval=interval,
        variant=variant,
        variant_scorers=list(variant_scorers.RECOMMENDED_VARIANT_SCORERS.values()),
    )

    # Convert to tidy format
    df_scores = variant_scorers.tidy_scores(variant_scores)

    # Save results (full table, including internal columns)
    if out_prefix is None:
        out_prefix = f"variant_{variant_chromosome}_{variant_position}_{variant_reference_bases}_{variant_alternate_bases}"

    output_file = OUTPUT_DIR / f"{out_prefix}_scores_{timestamp}.csv"
    df_scores.to_csv(output_file, index=False)

    return {
        "message": f"Variant scoring completed with {len(df_scores)} scores across modalities",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/variant_scoring_ui.ipynb",
        "artifacts": [
            {
                "description": "Variant scores CSV",
                "path": str(output_file.resolve())
            }
        ]
    }
127
+
128
+ @variant_scoring_ui_mcp.tool
129
+ def visualize_variant_effects(
130
+ # Primary data inputs - variant specification
131
+ variant_chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"] = "chr22",
132
+ variant_position: Annotated[int, "Genomic position"] = 36201698,
133
+ variant_reference_bases: Annotated[str, "Reference allele"] = "A",
134
+ variant_alternate_bases: Annotated[str, "Alternate allele"] = "C",
135
+ # Analysis parameters with tutorial defaults
136
+ organism: Annotated[Literal["human", "mouse"], "Organism to analyze"] = "human",
137
+ sequence_length: Annotated[Literal["2KB", "16KB", "100KB", "500KB", "1MB"], "Length of sequence around variant to predict"] = "1MB",
138
+ ontology_terms: Annotated[list[str], "List of cell and tissue ontology terms"] = None,
139
+ # Gene annotation options
140
+ plot_gene_annotation: Annotated[bool, "Include gene annotation in plot"] = True,
141
+ plot_longest_transcript_only: Annotated[bool, "Show only longest transcript per gene"] = True,
142
+ # Output types to plot
143
+ plot_rna_seq: Annotated[bool, "Plot RNA-seq tracks"] = True,
144
+ plot_cage: Annotated[bool, "Plot CAGE tracks"] = True,
145
+ plot_atac: Annotated[bool, "Plot ATAC-seq tracks"] = False,
146
+ plot_dnase: Annotated[bool, "Plot DNase tracks"] = False,
147
+ plot_chip_histone: Annotated[bool, "Plot ChIP-seq histone tracks"] = False,
148
+ plot_chip_tf: Annotated[bool, "Plot ChIP-seq transcription factor tracks"] = False,
149
+ plot_splice_sites: Annotated[bool, "Plot splice sites"] = True,
150
+ plot_splice_site_usage: Annotated[bool, "Plot splice site usage"] = False,
151
+ plot_contact_maps: Annotated[bool, "Plot contact maps"] = False,
152
+ plot_splice_junctions: Annotated[bool, "Plot splice junctions"] = False,
153
+ # DNA strand filtering
154
+ filter_to_positive_strand: Annotated[bool, "Filter tracks to positive strand only"] = False,
155
+ filter_to_negative_strand: Annotated[bool, "Filter tracks to negative strand only"] = False,
156
+ # Visualization options
157
+ ref_color: Annotated[str, "Color for reference allele"] = "dimgrey",
158
+ alt_color: Annotated[str, "Color for alternate allele"] = "red",
159
+ plot_interval_width: Annotated[int, "Width of plot interval in base pairs"] = 43008,
160
+ plot_interval_shift: Annotated[int, "Shift of plot interval from variant center"] = 0,
161
+ api_key: Annotated[str | None, "API key for AlphaGenome model"] = ALPHAGENOME_API_KEY,
162
+ out_prefix: Annotated[str | None, "Output file prefix"] = None,
163
+ ) -> dict:
164
+ """
165
+ Generate comprehensive variant effect visualization across multiple genomic modalities.
166
+ Input is variant coordinates and visualization parameters, output is variant effect plots showing REF vs ALT predictions.
167
+ """
168
+ from alphagenome import colab_utils
169
+ from alphagenome.data import gene_annotation, genome, transcript
170
+ from alphagenome.models import dna_client
171
+ from alphagenome.visualization import plot_components
172
+ import matplotlib.pyplot as plt
173
+
174
+ # Validate strand filtering parameters
175
+ if filter_to_positive_strand and filter_to_negative_strand:
176
+ raise ValueError(
177
+ 'Cannot specify both filter_to_positive_strand and '
178
+ 'filter_to_negative_strand.'
179
+ )
180
+
181
+ # Use provided API key or get from environment/colab
182
+ if api_key:
183
+ dna_model = dna_client.create(api_key)
184
+ else:
185
+ dna_model = dna_client.create(colab_utils.get_api_key())
186
+
187
+ # Default ontology terms from tutorial
188
+ if ontology_terms is None:
189
+ ontology_terms = ['EFO:0001187', 'EFO:0002067', 'EFO:0002784']
190
+
191
+ # Map organism string to enum
192
+ organism_map = {
193
+ 'human': dna_client.Organism.HOMO_SAPIENS,
194
+ 'mouse': dna_client.Organism.MUS_MUSCULUS,
195
+ }
196
+ organism_enum = organism_map[organism]
197
+
198
+ # Reference paths for gene annotation
199
+ HG38_GTF_FEATHER = (
200
+ 'https://storage.googleapis.com/alphagenome/reference/gencode/'
201
+ 'hg38/gencode.v46.annotation.gtf.gz.feather'
202
+ )
203
+ MM10_GTF_FEATHER = (
204
+ 'https://storage.googleapis.com/alphagenome/reference/gencode/'
205
+ 'mm10/gencode.vM23.annotation.gtf.gz.feather'
206
+ )
207
+
208
+ # Create variant object
209
+ variant = genome.Variant(
210
+ chromosome=variant_chromosome,
211
+ position=variant_position,
212
+ reference_bases=variant_reference_bases,
213
+ alternate_bases=variant_alternate_bases,
214
+ )
215
+
216
+ # Get sequence length
217
+ sequence_length_value = dna_client.SUPPORTED_SEQUENCE_LENGTHS[
218
+ f'SEQUENCE_LENGTH_{sequence_length}'
219
+ ]
220
+
221
+ # The input interval is derived from the variant (centered on it)
222
+ interval = variant.reference_interval.resize(sequence_length_value)
223
+
224
+ # Load gene annotation
225
+ match organism_enum:
226
+ case dna_client.Organism.HOMO_SAPIENS:
227
+ gtf_path = HG38_GTF_FEATHER
228
+ case dna_client.Organism.MUS_MUSCULUS:
229
+ gtf_path = MM10_GTF_FEATHER
230
+ case _:
231
+ raise ValueError(f'Unsupported organism: {organism_enum}')
232
+
233
+ import pandas as pd
234
+ gtf = pd.read_feather(gtf_path)
235
+
236
+ # Filter to protein-coding genes and highly supported transcripts
237
+ gtf_transcript = gene_annotation.filter_transcript_support_level(
238
+ gene_annotation.filter_protein_coding(gtf), ['1']
239
+ )
240
+
241
+ # Extractor for identifying transcripts in a region
242
+ transcript_extractor = transcript.TranscriptExtractor(gtf_transcript)
243
+
244
+ # Also define an extractor that fetches only the longest transcript per gene
245
+ gtf_longest_transcript = gene_annotation.filter_to_longest_transcript(
246
+ gtf_transcript
247
+ )
248
+ longest_transcript_extractor = transcript.TranscriptExtractor(
249
+ gtf_longest_transcript
250
+ )
251
+
252
+ # Predict variant effects
253
+ output = dna_model.predict_variant(
254
+ interval=interval,
255
+ variant=variant,
256
+ organism=organism_enum,
257
+ requested_outputs=[*dna_client.OutputType],
258
+ ontology_terms=ontology_terms,
259
+ )
260
+
261
+ # Filter to DNA strand if requested
262
+ ref, alt = output.reference, output.alternate
263
+
264
+ if filter_to_positive_strand:
265
+ ref = ref.filter_to_strand(strand='+')
266
+ alt = alt.filter_to_strand(strand='+')
267
+ elif filter_to_negative_strand:
268
+ ref = ref.filter_to_strand(strand='-')
269
+ alt = alt.filter_to_strand(strand='-')
270
+
271
+ # Build plot components
272
+ components = []
273
+ ref_alt_colors = {'REF': ref_color, 'ALT': alt_color}
274
+
275
+ # Gene and transcript annotation
276
+ if plot_gene_annotation:
277
+ if plot_longest_transcript_only:
278
+ transcripts = longest_transcript_extractor.extract(interval)
279
+ else:
280
+ transcripts = transcript_extractor.extract(interval)
281
+ components.append(plot_components.TranscriptAnnotation(transcripts))
282
+
283
+ # Individual output type plots
284
+ plot_map = {
285
+ 'plot_atac': (ref.atac, alt.atac, 'ATAC'),
286
+ 'plot_cage': (ref.cage, alt.cage, 'CAGE'),
287
+ 'plot_chip_histone': (ref.chip_histone, alt.chip_histone, 'CHIP_HISTONE'),
288
+ 'plot_chip_tf': (ref.chip_tf, alt.chip_tf, 'CHIP_TF'),
289
+ 'plot_contact_maps': (ref.contact_maps, alt.contact_maps, 'CONTACT_MAPS'),
290
+ 'plot_dnase': (ref.dnase, alt.dnase, 'DNASE'),
291
+ 'plot_rna_seq': (ref.rna_seq, alt.rna_seq, 'RNA_SEQ'),
292
+ 'plot_splice_junctions': (
293
+ ref.splice_junctions,
294
+ alt.splice_junctions,
295
+ 'SPLICE_JUNCTIONS',
296
+ ),
297
+ 'plot_splice_sites': (ref.splice_sites, alt.splice_sites, 'SPLICE_SITES'),
298
+ 'plot_splice_site_usage': (
299
+ ref.splice_site_usage,
300
+ alt.splice_site_usage,
301
+ 'SPLICE_SITE_USAGE',
302
+ ),
303
+ }
304
+
305
+ for key, (ref_data, alt_data, output_type) in plot_map.items():
306
+ if eval(key) and ref_data is not None and ref_data.values.shape[-1] == 0:
307
+ print(
308
+ f'Requested plot for output {output_type} but no tracks exist in'
309
+ ' output. This is likely because this output does not exist for your'
310
+ ' ontologies or requested DNA strand.'
311
+ )
312
+ if eval(key) and ref_data and alt_data:
313
+ match output_type:
314
+ case 'CHIP_HISTONE':
315
+ ylabel_template = (
316
+ f'{output_type}: {{biosample_name}} ({{strand}})\n{{histone_mark}}'
317
+ )
318
+ case 'CHIP_TF':
319
+ ylabel_template = (
320
+ f'{output_type}: {{biosample_name}}'
321
+ ' ({strand})\n{transcription_factor}'
322
+ )
323
+ case 'CONTACT_MAPS':
324
+ ylabel_template = f'{output_type}: {{biosample_name}} ({{strand}})'
325
+ case 'SPLICE_SITES':
326
+ ylabel_template = f'{output_type}: {{name}} ({{strand}})'
327
+ case _:
328
+ ylabel_template = (
329
+ f'{output_type}: {{biosample_name}} ({{strand}})\n{{name}}'
330
+ )
331
+
332
+ if output_type == 'CONTACT_MAPS':
333
+ component = plot_components.ContactMapsDiff(
334
+ tdata=alt_data - ref_data,
335
+ ylabel_template=ylabel_template,
336
+ )
337
+ components.append(component)
338
+ elif output_type == 'SPLICE_JUNCTIONS':
339
+ ref_plot = plot_components.Sashimi(
340
+ ref_data,
341
+ ylabel_template='REF: ' + ylabel_template,
342
+ )
343
+ alt_plot = plot_components.Sashimi(
344
+ alt_data,
345
+ ylabel_template='ALT: ' + ylabel_template,
346
+ )
347
+ components.extend([ref_plot, alt_plot])
348
+ else:
349
+ component = plot_components.OverlaidTracks(
350
+ tdata={'REF': ref_data, 'ALT': alt_data},
351
+ colors=ref_alt_colors,
352
+ ylabel_template=ylabel_template,
353
+ )
354
+ components.append(component)
355
+
356
+ # Validate plot interval width
357
+ if plot_interval_width > interval.width:
358
+ raise ValueError(
359
+ f'plot_interval_width ({plot_interval_width}) must be less than '
360
+ f'interval.width ({interval.width}).'
361
+ )
362
+
363
+ # Generate plot
364
+ plot = plot_components.plot(
365
+ components=components,
366
+ interval=interval.shift(plot_interval_shift).resize(plot_interval_width),
367
+ annotations=[
368
+ plot_components.VariantAnnotation([variant]),
369
+ ],
370
+ )
371
+
372
+ # Save plot
373
+ if out_prefix is None:
374
+ out_prefix = f"variant_{variant_chromosome}_{variant_position}_{variant_reference_bases}_{variant_alternate_bases}"
375
+
376
+ output_file = OUTPUT_DIR / f"{out_prefix}_effects_{timestamp}.png"
377
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
378
+ plt.close()
379
+
380
+ return {
381
+ "message": f"Variant visualization completed with {len(components)} plot components",
382
+ "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/variant_scoring_ui.ipynb",
383
+ "artifacts": [
384
+ {
385
+ "description": "Variant effects plot",
386
+ "path": str(output_file.resolve())
387
+ }
388
+ ]
389
+ }
tools/visualization_modality_tour.py ADDED
@@ -0,0 +1,1053 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
AlphaGenome visualization modality tour for different genomic data types.

This MCP Server provides 9 tools:
1. visualize_gene_expression: Visualize RNA_SEQ and CAGE gene expression predictions
2. visualize_variant_expression_effects: Show REF vs ALT variant effects on gene expression
3. visualize_custom_annotations: Plot custom annotations like polyadenylation sites
4. visualize_chromatin_accessibility: Visualize DNASE and ATAC chromatin accessibility
5. visualize_splicing_effects: Visualize splicing predictions with SPLICE_SITES, SPLICE_SITE_USAGE, SPLICE_JUNCTIONS
6. visualize_variant_splicing_effects: Show REF vs ALT variant effects on splicing with sashimi plots
7. visualize_histone_modifications: Visualize CHIP_HISTONE predictions with custom colors
8. visualize_tf_binding: Visualize CHIP_TF transcription factor binding predictions
9. visualize_contact_maps: Visualize CONTACT_MAPS DNA-DNA contact predictions

All tools extracted from `https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb`.
"""

# Standard imports
from typing import Annotated, Literal, Any
import pandas as pd
import numpy as np
from pathlib import Path
import os
from fastmcp import FastMCP
from datetime import datetime
import matplotlib.pyplot as plt

# AlphaGenome imports
from alphagenome import colab_utils
from alphagenome.data import gene_annotation, genome, track_data, transcript
from alphagenome.models import dna_client
from alphagenome.visualization import plot_components

# Base persistent directory (HF Spaces guarantees /data is writable & persistent)
BASE_DIR = Path("/data")

DEFAULT_INPUT_DIR = BASE_DIR / "tmp_inputs"
DEFAULT_OUTPUT_DIR = BASE_DIR / "tmp_outputs"

# Fix: this module previously read the BATCH_VARIANT_SCORING_* variables — a
# copy-paste from the batch_variant_scoring tool. Prefer module-specific
# variables, but fall back to the old names so existing deployments keep working.
INPUT_DIR = Path(
    os.environ.get(
        "VISUALIZATION_MODALITY_TOUR_INPUT_DIR",
        os.environ.get("BATCH_VARIANT_SCORING_INPUT_DIR", DEFAULT_INPUT_DIR),
    )
)
OUTPUT_DIR = Path(
    os.environ.get(
        "VISUALIZATION_MODALITY_TOUR_OUTPUT_DIR",
        os.environ.get("BATCH_VARIANT_SCORING_OUTPUT_DIR", DEFAULT_OUTPUT_DIR),
    )
)

# Ensure directories exist
INPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Fetch your secret (raises KeyError at import time if not configured)
ALPHAGENOME_API_KEY = os.environ["ALPHAGENOME_API_KEY"]

# Timestamp for unique outputs (fixed at import time, shared by all tool calls)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# MCP server instance
visualization_modality_tour_mcp = FastMCP(name="visualization_modality_tour")
55
+
@visualization_modality_tour_mcp.tool
def visualize_gene_expression(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0001155'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize RNA_SEQ and CAGE gene expression predictions for a genomic interval.
    Input is genomic coordinates and ontology terms and output is gene expression visualization plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # Resize the requested region to the model's supported 1Mb input length.
    region = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Model client for the prediction API.
    client = dna_client.create(api_key)

    # Gene annotations: keep protein-coding transcripts with the strongest
    # support level, then reduce each gene to its longest transcript.
    annotation_df = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    supported = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(annotation_df), ['1']
    )
    extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(supported)
    )

    # Request RNA_SEQ and CAGE predictions for the chosen tissues.
    predictions = client.predict_interval(
        interval=region,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.CAGE,
        },
        ontology_terms=ontology_terms,
    )

    # Transcript models overlapping the prediction window.
    region_transcripts = extractor.extract(region)

    # Compose the figure: transcript annotation on top, one track group per assay.
    figure = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(region_transcripts),
            plot_components.Tracks(
                tdata=predictions.rna_seq,
                ylabel_template='RNA_SEQ: {biosample_name} ({strand})\n{name}',
            ),
            plot_components.Tracks(
                tdata=predictions.cage,
                ylabel_template='CAGE: {biosample_name} ({strand})\n{name}',
            ),
        ],
        interval=region,
        title='Predicted RNA Expression (RNA_SEQ, CAGE) for colon tissue',
    )

    # Persist the current matplotlib figure into the shared output directory.
    prefix = f"gene_expression_{timestamp}" if out_prefix is None else out_prefix
    output_file = OUTPUT_DIR / f"{prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Gene expression visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Gene expression visualization plot",
                "path": str(output_file.resolve())
            }
        ]
    }
@visualization_modality_tour_mcp.tool
def visualize_variant_expression_effects(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    position: Annotated[int, "Variant genomic position"],
    reference_bases: Annotated[str, "Reference allele sequence"],
    alternate_bases: Annotated[str, "Alternate allele sequence"],
    interval_start: Annotated[int, "Start position for prediction interval"],
    interval_end: Annotated[int, "End position for prediction interval"],
    gene_symbol: Annotated[str | None, "Gene symbol to zoom in on"] = None,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0001155'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize REF vs ALT variant effects on gene expression with overlaid tracks.
    Input is variant coordinates and interval and output is variant effect visualization plot.

    Returns:
        dict with a status message, the source-notebook reference, and the
        saved plot artifact path.

    Raises:
        ValueError: if no API key is provided.
    """
    # NOTE(fix): the @visualization_modality_tour_mcp.tool decorator was missing,
    # so this function was never registered with the MCP server even though the
    # module docstring lists it as a provided tool. All sibling tools carry it.
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create variant and the 1Mb prediction interval around the requested region.
    variant = genome.Variant(chromosome, position, reference_bases, alternate_bases)
    interval = genome.Interval(chromosome, interval_start, interval_end).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations: protein-coding, support level 1, longest transcript per gene.
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Predict RNA_SEQ and CAGE for both the reference and alternate alleles.
    output = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.CAGE,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts overlapping the prediction interval.
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Zoom to the gene (plus 1kb padding) when a symbol is given; otherwise plot
    # the full prediction interval.
    if gene_symbol is not None:
        plot_interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
        plot_interval.resize_inplace(plot_interval.width + 1000)
    else:
        plot_interval = interval

    # Define colors for REF and ALT
    ref_alt_colors = {'REF': 'dimgrey', 'ALT': 'red'}

    # Build the plot: REF/ALT tracks overlaid per assay, restricted to the
    # non-positive strand.
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': output.reference.rna_seq.filter_to_nonpositive_strand(),
                    'ALT': output.alternate.rna_seq.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='{biosample_name} ({strand})\n{name}',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': output.reference.cage.filter_to_nonpositive_strand(),
                    'ALT': output.alternate.cage.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='{biosample_name} ({strand})\n{name}',
            ),
        ],
        annotations=[plot_components.VariantAnnotation([variant])],
        interval=plot_interval,
        title='Effect of variant on predicted RNA Expression in colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"variant_expression_effects_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Variant expression effects visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Variant expression effects plot",
                "path": str(output_file.resolve())
            }
        ]
    }
@visualization_modality_tour_mcp.tool
def visualize_custom_annotations(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    annotation_intervals: Annotated[list, "List of annotation intervals as [chr, start, end, strand, label] tuples"],
    plot_start: Annotated[int, "Start position for plotting window"],
    plot_end: Annotated[int, "End position for plotting window"],
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0002048'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize RNA predictions with custom interval annotations like polyadenylation sites.
    Input is genomic coordinates and custom annotations and output is annotated RNA visualization plot.

    Returns:
        dict with a status message, the source-notebook reference, and the
        saved plot artifact path.

    Raises:
        ValueError: if no API key is provided.
    """
    # NOTE(fix): the @visualization_modality_tour_mcp.tool decorator was missing,
    # so this function was never registered with the MCP server even though the
    # module docstring lists it as a provided tool. All sibling tools carry it.
    if api_key is None:
        raise ValueError("API key must be provided")

    # Resize the requested region to the model's supported 1Mb input length.
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations: protein-coding, support level 1, longest transcript per gene.
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Predict RNA_SEQ only for the chosen tissues.
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts overlapping the prediction interval.
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Convert user-supplied tuples into Interval objects plus parallel labels.
    # Entries with fewer than 5 fields are silently skipped (best-effort input
    # handling); extra fields beyond the fifth are ignored.
    custom_intervals = []
    labels = []
    for ann in annotation_intervals:
        if len(ann) >= 5:
            chr_name, start, end, strand, label = ann[:5]
            custom_intervals.append(genome.Interval(chr_name, start, end, strand))
            labels.append(label)

    # Plotting window on the negative strand (matches the negative-strand
    # RNA_SEQ filter below).
    plot_interval = genome.Interval(chromosome, plot_start, plot_end, '-')

    # Build the plot with the custom intervals drawn as labeled annotations.
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=output.rna_seq.filter_to_negative_strand(),
                ylabel_template='RNA_SEQ: {biosample_name} ({strand})\n{name}',
                shared_y_scale=True,
            )
        ],
        annotations=[
            plot_components.IntervalAnnotation(
                custom_intervals,
                alpha=1,
                labels=labels,
                label_angle=90
            )
        ],
        interval=plot_interval,
        title='Custom annotations with RNA expression',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"custom_annotations_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Custom annotations visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Custom annotations plot",
                "path": str(output_file.resolve())
            }
        ]
    }
@visualization_modality_tour_mcp.tool
def visualize_chromatin_accessibility(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    variant_position: Annotated[int | None, "Variant position to highlight"] = None,
    variant_ref: Annotated[str | None, "Variant reference allele"] = None,
    variant_alt: Annotated[str | None, "Variant alternate allele"] = None,
    promoter_intervals: Annotated[list | None, "List of promoter intervals as [chr, start, end, name] tuples"] = None,
    window_size: Annotated[int, "Size of plotting window around variant"] = 8000,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0000317', 'UBERON:0001155', 'UBERON:0001157', 'UBERON:0001159', 'UBERON:0004992', 'UBERON:0008971'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize DNASE and ATAC chromatin accessibility predictions for intestinal tissues.
    Input is genomic coordinates and optional variant information and output is chromatin accessibility plot.
    """
    # Fail fast when no key is available (module default may be None if the
    # ALPHAGENOME_API_KEY environment variable is unset).
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval, resized to the model's supported 1Mb input length.
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations (GENCODE v46 feather hosted by AlphaGenome), then
    # narrow to protein-coding transcripts with support level '1' and keep only
    # the longest transcript per gene for a compact annotation row.
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Make predictions for both accessibility assays in one request.
    output = dna_model.predict_interval(
        interval,
        requested_outputs={
            dna_client.OutputType.DNASE,
            dna_client.OutputType.ATAC,
        },
        ontology_terms=ontology_terms,
    )

    # Extract transcripts overlapping the prediction interval.
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Prepare annotations (variant marker and/or promoter boxes).
    annotations = []

    # Add variant annotation if provided; all three variant fields must be
    # present. When a variant is given, the plot zooms to a window_size window
    # centered on it instead of showing the full 1Mb interval.
    if variant_position is not None and variant_ref is not None and variant_alt is not None:
        variant = genome.Variant(chromosome, variant_position, variant_ref, variant_alt)
        annotations.append(plot_components.VariantAnnotation([variant]))
        plot_interval = variant.reference_interval.resize(window_size)
    else:
        plot_interval = interval

    # Add promoter annotations if provided. Entries with fewer than 4 fields
    # are silently skipped; extra fields beyond the fourth are ignored.
    if promoter_intervals is not None:
        promoter_objs = []
        for prom in promoter_intervals:
            if len(prom) >= 4:
                chr_name, start, end, name = prom[:4]
                promoter_objs.append(genome.Interval(chr_name, start, end, name=name))
        if promoter_objs:
            annotations.append(plot_components.IntervalAnnotation(promoter_objs))

    # Build plot: transcript annotation plus one track group per assay.
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=output.dnase,
                ylabel_template='DNASE: {biosample_name} ({strand})\n{name}',
            ),
            plot_components.Tracks(
                tdata=output.atac,
                ylabel_template='ATAC: {biosample_name} ({strand})\n{name}',
            ),
        ],
        interval=plot_interval,
        annotations=annotations,
        title='Predicted chromatin accessibility (DNASE, ATAC) for colon tissue',
    )

    # Save the current matplotlib figure to the shared output directory.
    if out_prefix is None:
        out_prefix = f"chromatin_accessibility_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Chromatin accessibility visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Chromatin accessibility plot",
                "path": str(output_file.resolve())
            }
        ]
    }
@visualization_modality_tour_mcp.tool
def visualize_splicing_effects(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    gene_symbol: Annotated[str | None, "Gene symbol to focus on"] = None,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001157', 'UBERON:0001159'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize splicing predictions including SPLICE_SITES, SPLICE_SITE_USAGE, and SPLICE_JUNCTIONS.
    Input is genomic coordinates and optional gene symbol and output is splicing effects plot.
    """
    if api_key is None:
        raise ValueError("API key must be provided")

    # 1Mb prediction window around the requested region.
    region = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    client = dna_client.create(api_key)

    # Gene annotations: protein-coding, support level 1, longest transcript per gene.
    annotation_df = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    supported = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(annotation_df), ['1']
    )
    extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(supported)
    )

    # Request all splicing-related modalities (plus RNA_SEQ) in one call.
    predictions = client.predict_interval(
        interval=region,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.SPLICE_SITES,
            dna_client.OutputType.SPLICE_SITE_USAGE,
            dna_client.OutputType.SPLICE_JUNCTIONS,
        },
        ontology_terms=ontology_terms,
    )

    region_transcripts = extractor.extract(region)

    # Zoom to the gene (plus 1kb padding) when a symbol is given; otherwise
    # show the full prediction window.
    if gene_symbol is None:
        plot_interval = region
    else:
        plot_interval = gene_annotation.get_gene_interval(gtf=annotation_df, gene_symbol=gene_symbol)
        plot_interval.resize_inplace(plot_interval.width + 1000)

    # Compose the figure: transcript models plus negative-strand splice sites.
    figure = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(region_transcripts),
            plot_components.Tracks(
                tdata=predictions.splice_sites.filter_to_negative_strand(),
                ylabel_template='SPLICE SITES: {name} ({strand})',
            ),
        ],
        interval=plot_interval,
        title='Predicted splicing effects for colon tissue',
    )

    # Persist the current matplotlib figure.
    prefix = f"splicing_effects_{timestamp}" if out_prefix is None else out_prefix
    output_file = OUTPUT_DIR / f"{prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Splicing effects visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Splicing effects plot",
                "path": str(output_file.resolve())
            }
        ]
    }
@visualization_modality_tour_mcp.tool
def visualize_variant_splicing_effects(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    variant_position: Annotated[int, "Variant genomic position"],
    variant_ref: Annotated[str, "Variant reference allele"],
    variant_alt: Annotated[str, "Variant alternate allele"],
    interval_start: Annotated[int, "Start position for prediction interval"],
    interval_end: Annotated[int, "End position for prediction interval"],
    gene_symbol: Annotated[str | None, "Gene symbol to focus on"] = None,
    tissue_filter: Annotated[str, "Specific tissue to filter for sashimi plots"] = "Colon_Transverse",
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001157', 'UBERON:0001159'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize REF vs ALT variant effects on splicing with sashimi plots and overlaid tracks.
    Input is variant coordinates and interval and output is variant splicing effects plot with sashimi arcs.

    Returns:
        dict with a status message, the source-notebook reference, and the
        saved plot artifact path.

    Raises:
        ValueError: if no API key is provided.
    """
    # NOTE(fix): the @visualization_modality_tour_mcp.tool decorator was missing,
    # so this function was never registered with the MCP server even though the
    # module docstring lists it as a provided tool. All sibling tools carry it.
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create variant and the 1Mb prediction interval around the requested region.
    variant = genome.Variant(chromosome, variant_position, variant_ref, variant_alt)
    interval = genome.Interval(chromosome, interval_start, interval_end).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations: protein-coding transcripts with support level 1.
    # Unlike the other tools, ALL matching transcripts are kept (no
    # longest-per-gene reduction) so isoform structure stays visible.
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    transcript_extractor = transcript.TranscriptExtractor(gtf_transcript)

    # Predict all splicing modalities (plus RNA_SEQ) for REF and ALT alleles.
    output = dna_model.predict_variant(
        interval=interval,
        variant=variant,
        requested_outputs={
            dna_client.OutputType.RNA_SEQ,
            dna_client.OutputType.SPLICE_SITES,
            dna_client.OutputType.SPLICE_SITE_USAGE,
            dna_client.OutputType.SPLICE_JUNCTIONS,
        },
        ontology_terms=ontology_terms,
    )

    # Extract all transcripts overlapping the prediction interval.
    transcripts = transcript_extractor.extract(interval)

    # Zoom to the gene (plus 1kb padding) when a symbol is given; otherwise
    # plot the full prediction interval.
    if gene_symbol is not None:
        plot_interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
        plot_interval.resize_inplace(plot_interval.width + 1000)
    else:
        plot_interval = interval

    ref_output = output.reference
    alt_output = output.alternate

    # Define colors for REF and ALT
    ref_alt_colors = {'REF': 'dimgrey', 'ALT': 'red'}

    # Build the plot: sashimi arcs for REF and ALT junctions in the selected
    # tissue, then REF/ALT overlays for RNA_SEQ, splice sites, and site usage
    # (all restricted to the non-positive strand).
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(transcripts),
            plot_components.Sashimi(
                ref_output.splice_junctions
                .filter_to_strand('-')
                .filter_by_tissue(tissue_filter),
                ylabel_template='Reference {biosample_name} ({strand})\n{name}',
            ),
            plot_components.Sashimi(
                alt_output.splice_junctions
                .filter_to_strand('-')
                .filter_by_tissue(tissue_filter),
                ylabel_template='Alternate {biosample_name} ({strand})\n{name}',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': ref_output.rna_seq.filter_to_nonpositive_strand(),
                    'ALT': alt_output.rna_seq.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='RNA_SEQ: {biosample_name} ({strand})\n{name}',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': ref_output.splice_sites.filter_to_nonpositive_strand(),
                    'ALT': alt_output.splice_sites.filter_to_nonpositive_strand(),
                },
                colors=ref_alt_colors,
                ylabel_template='SPLICE SITES: {name} ({strand})',
            ),
            plot_components.OverlaidTracks(
                tdata={
                    'REF': (
                        ref_output.splice_site_usage.filter_to_nonpositive_strand()
                    ),
                    'ALT': (
                        alt_output.splice_site_usage.filter_to_nonpositive_strand()
                    ),
                },
                colors=ref_alt_colors,
                ylabel_template=(
                    'SPLICE SITE USAGE: {biosample_name} ({strand})\n{name}'
                ),
            ),
        ],
        interval=plot_interval,
        annotations=[plot_components.VariantAnnotation([variant])],
        title='Predicted REF vs. ALT effects of variant in colon tissue',
    )

    # Save plot
    if out_prefix is None:
        out_prefix = f"variant_splicing_effects_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Variant splicing effects visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Variant splicing effects plot",
                "path": str(output_file.resolve())
            }
        ]
    }
@visualization_modality_tour_mcp.tool
def visualize_histone_modifications(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    include_tss_annotations: Annotated[bool, "Whether to include transcription start site annotations"] = True,
    ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0000317', 'UBERON:0001155', 'UBERON:0001157', 'UBERON:0001159'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize CHIP_HISTONE predictions with custom colors grouped by histone mark.
    Input is genomic coordinates and output is histone modifications plot with colored tracks.
    """
    # Fail fast when no key is available (module default may be None if the
    # ALPHAGENOME_API_KEY environment variable is unset).
    if api_key is None:
        raise ValueError("API key must be provided")

    # Create interval, resized to the model's supported 1Mb input length.
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load gene annotations: protein-coding, support level 1, longest
    # transcript per gene. The longest-transcript frame is kept in its own
    # variable because it is reused below for TSS extraction.
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    gtf_longest_transcript = gene_annotation.filter_to_longest_transcript(gtf_transcript)
    longest_transcript_extractor = transcript.TranscriptExtractor(gtf_longest_transcript)

    # Make predictions for the histone ChIP modality only.
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={dna_client.OutputType.CHIP_HISTONE},
        ontology_terms=ontology_terms,
    )

    # Extract transcripts overlapping the prediction interval.
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Reorder tracks so that tracks for the same histone mark are adjacent,
    # which groups them visually in the plot.
    reordered_chip_histone = output.chip_histone.select_tracks_by_index(
        output.chip_histone.metadata.sort_values('histone_mark').index
    )

    # Fixed color per histone mark (hex colors); unknown marks fall back to
    # black via the .get() default below.
    histone_to_color = {
        'H3K27AC': '#e41a1c',
        'H3K36ME3': '#ff7f00',
        'H3K4ME1': '#377eb8',
        'H3K4ME3': '#984ea3',
        'H3K9AC': '#4daf4a',
        'H3K27ME3': '#ffc0cb',
    }

    # One color per track, aligned with the reordered track metadata. The
    # .upper() normalizes mark casing before the lookup.
    track_colors = (
        reordered_chip_histone.metadata['histone_mark']
        .map(lambda x: histone_to_color.get(x.upper(), '#000000'))
        .values
    )

    # Prepare annotations (optional TSS highlight boxes).
    annotations = []
    if include_tss_annotations:
        # Extract TSS annotations from the longest-transcript GTF frame.
        gtf_tss = gene_annotation.extract_tss(gtf_longest_transcript)
        tss_as_intervals = [
            genome.Interval(
                chromosome=row.Chromosome,
                start=row.Start,
                end=row.End + 1000,  # Add extra 1Kb so the TSSs are visible
                name=row.gene_name,
            )
            for _, row in gtf_tss.iterrows()
        ]
        annotations.append(
            plot_components.IntervalAnnotation(
                tss_as_intervals, alpha=0.5, colors='blue'
            )
        )

    # Build plot: transcript annotation plus the grouped, per-mark-colored
    # histone tracks rendered as filled areas.
    plot = plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.Tracks(
                tdata=reordered_chip_histone,
                ylabel_template=(
                    'CHIP HISTONE: {biosample_name} ({strand})\n{histone_mark}'
                ),
                filled=True,
                track_colors=track_colors,
            ),
        ],
        interval=interval,
        annotations=annotations,
        despine_keep_bottom=True,
        title='Predicted histone modification markers in colon tissue',
    )

    # Save the current matplotlib figure to the shared output directory.
    if out_prefix is None:
        out_prefix = f"histone_modifications_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Histone modifications visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Histone modifications plot",
                "path": str(output_file.resolve())
            }
        ]
    }
815
+ @visualization_modality_tour_mcp.tool
816
+ def visualize_tf_binding(
817
+ chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
818
+ start_position: Annotated[int, "Start genomic position"],
819
+ end_position: Annotated[int, "End genomic position"],
820
+ transcription_factor: Annotated[str | None, "Specific transcription factor to visualize (e.g., 'CTCF')"] = None,
821
+ min_max_prediction: Annotated[float, "Minimum maximum prediction value to include tracks"] = 8000.0,
822
+ gene_symbol: Annotated[str | None, "Gene symbol to focus analysis on"] = None,
823
+ include_tss_annotations: Annotated[bool, "Whether to include transcription start site annotations"] = True,
824
+ ontology_terms: Annotated[list, "List of ontology terms for tissues/cell types"] = ['UBERON:0001159', 'UBERON:0001157', 'EFO:0002067', 'EFO:0001187'],
825
+ api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
826
+ out_prefix: Annotated[str | None, "Output file prefix"] = None,
827
+ ) -> dict:
828
+ """
829
+ Visualize CHIP_TF transcription factor binding predictions with filtering and averaging options.
830
+ Input is genomic coordinates and filtering parameters and output is TF binding visualization plot.
831
+ """
832
+ if api_key is None:
833
+ raise ValueError("API key must be provided")
834
+
835
+ # Create interval
836
+ interval = genome.Interval(chromosome, start_position, end_position).resize(
837
+ dna_client.SEQUENCE_LENGTH_1MB
838
+ )
839
+
840
+ # Create model client
841
+ dna_model = dna_client.create(api_key)
842
+
843
+ # Load gene annotations
844
+ gtf = pd.read_feather(
845
+ 'https://storage.googleapis.com/alphagenome/reference/gencode/'
846
+ 'hg38/gencode.v46.annotation.gtf.gz.feather'
847
+ )
848
+ gtf_transcript = gene_annotation.filter_transcript_support_level(
849
+ gene_annotation.filter_protein_coding(gtf), ['1']
850
+ )
851
+ gtf_longest_transcript = gene_annotation.filter_to_longest_transcript(gtf_transcript)
852
+ longest_transcript_extractor = transcript.TranscriptExtractor(gtf_longest_transcript)
853
+ transcript_extractor = transcript.TranscriptExtractor(gtf_transcript)
854
+
855
+ # Make predictions
856
+ output = dna_model.predict_interval(
857
+ interval=interval,
858
+ requested_outputs={dna_client.OutputType.CHIP_TF},
859
+ ontology_terms=ontology_terms,
860
+ )
861
+
862
+ # Extract transcripts
863
+ longest_transcripts = longest_transcript_extractor.extract(interval)
864
+ all_transcripts = transcript_extractor.extract(interval)
865
+
866
+ # Determine plot interval and filtering logic
867
+ if gene_symbol is not None:
868
+ gene_interval = gene_annotation.get_gene_interval(gtf, gene_symbol=gene_symbol)
869
+ gene_interval.resize_inplace(gene_interval.width + 1000)
870
+
871
+ # Filter based on gene interval
872
+ max_predictions = output.chip_tf.slice_by_interval(
873
+ gene_interval, match_resolution=True
874
+ ).values.max(axis=0)
875
+
876
+ # Get top 10 tracks for gene-specific analysis
877
+ output_filtered = output.chip_tf.filter_tracks(
878
+ (max_predictions >= np.sort(max_predictions)[-10])
879
+ )
880
+ plot_interval = gene_interval
881
+ transcripts_to_use = all_transcripts
882
+ else:
883
+ # Filter by minimum max prediction globally
884
+ output_filtered = output.chip_tf.filter_tracks(
885
+ output.chip_tf.values.max(axis=0) > min_max_prediction
886
+ )
887
+ plot_interval = interval
888
+ transcripts_to_use = longest_transcripts
889
+
890
+ # Handle specific transcription factor averaging
891
+ if transcription_factor is not None:
892
+ # Filter to specific TF and create mean track
893
+ tf_mask = output_filtered.metadata['transcription_factor'] == transcription_factor
894
+ if tf_mask.any():
895
+ mean_tf_values = output_filtered.values[:, tf_mask].mean(axis=1)
896
+
897
+ # Create new TrackData object
898
+ tdata_mean_tf = track_data.TrackData(
899
+ values=mean_tf_values[:, None],
900
+ metadata=pd.DataFrame({
901
+ 'transcription_factor': [transcription_factor],
902
+ 'name': ['mean'],
903
+ 'strand': ['.']
904
+ }),
905
+ interval=output_filtered.interval,
906
+ resolution=output_filtered.resolution,
907
+ )
908
+
909
+ track_data_to_plot = tdata_mean_tf
910
+ ylabel_template = '{name} {transcription_factor}'
911
+ plot_title = f'Predicted {transcription_factor} binding (mean across cell types)'
912
+ else:
913
+ raise ValueError(f"No tracks found for transcription factor: {transcription_factor}")
914
+ else:
915
+ track_data_to_plot = output_filtered
916
+ ylabel_template = 'CHIP TF: {biosample_name} ({strand})\n{transcription_factor}'
917
+ plot_title = 'Predicted TF-binding in K562, HepG2, and sigmoid colon.'
918
+
919
+ # Prepare annotations
920
+ annotations = []
921
+ if include_tss_annotations:
922
+ # Extract TSS annotations
923
+ gtf_tss = gene_annotation.extract_tss(gtf_longest_transcript)
924
+ tss_as_intervals = [
925
+ genome.Interval(
926
+ chromosome=row.Chromosome,
927
+ start=row.Start,
928
+ end=row.End + 1000, # Add extra 1Kb so the TSSs are visible
929
+ name=row.gene_name,
930
+ )
931
+ for _, row in gtf_tss.iterrows()
932
+ ]
933
+ annotations.append(
934
+ plot_components.IntervalAnnotation(
935
+ tss_as_intervals, alpha=0.3, colors='blue'
936
+ )
937
+ )
938
+
939
+ # Build plot
940
+ plot = plot_components.plot(
941
+ [
942
+ plot_components.TranscriptAnnotation(transcripts_to_use),
943
+ plot_components.Tracks(
944
+ tdata=track_data_to_plot,
945
+ ylabel_template=ylabel_template,
946
+ filled=True,
947
+ ),
948
+ ],
949
+ interval=plot_interval,
950
+ annotations=annotations,
951
+ despine_keep_bottom=True,
952
+ title=plot_title,
953
+ )
954
+
955
+ # Save plot
956
+ if out_prefix is None:
957
+ tf_suffix = f"_{transcription_factor}" if transcription_factor else ""
958
+ out_prefix = f"tf_binding{tf_suffix}_{timestamp}"
959
+ output_file = OUTPUT_DIR / f"{out_prefix}.png"
960
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
961
+ plt.close()
962
+
963
+ return {
964
+ "message": "TF binding visualization completed successfully",
965
+ "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
966
+ "artifacts": [
967
+ {
968
+ "description": "TF binding plot",
969
+ "path": str(output_file.resolve())
970
+ }
971
+ ]
972
+ }
973
+
974
@visualization_modality_tour_mcp.tool
def visualize_contact_maps(
    chromosome: Annotated[str, "Chromosome name (e.g., 'chr22')"],
    start_position: Annotated[int, "Start genomic position"],
    end_position: Annotated[int, "End genomic position"],
    colormap: Annotated[str, "Matplotlib colormap for contact map"] = 'autumn_r',
    vmax: Annotated[float, "Maximum value for colormap scaling"] = 1.0,
    ontology_terms: Annotated[list, "List of ontology terms for cell lines"] = ['EFO:0002824'],
    api_key: Annotated[str | None, "AlphaGenome API key"] = ALPHAGENOME_API_KEY,
    out_prefix: Annotated[str | None, "Output file prefix"] = None,
) -> dict:
    """
    Visualize CONTACT_MAPS DNA-DNA contact predictions showing topologically-associated domains.
    Input is genomic coordinates and colormap parameters and output is contact maps visualization plot.

    Args:
        chromosome: Chromosome name, e.g. 'chr22'.
        start_position: Start coordinate of the region of interest.
        end_position: End coordinate; must be greater than start_position.
        colormap: Matplotlib colormap name used for the contact-map heatmap.
        vmax: Upper bound for colormap scaling.
        ontology_terms: Ontology terms selecting the cell lines to predict for.
            NOTE: mutable list default — never mutated here, so it is safe,
            and it is kept to preserve the advertised tool schema.
        api_key: AlphaGenome API key; defaults to the module-level ALPHAGENOME_API_KEY.
        out_prefix: Basename (without extension) for the saved PNG; auto-generated
            from a timestamp when None.

    Returns:
        dict with a status message, a tutorial reference URL, and the saved
        plot's absolute path under "artifacts".

    Raises:
        ValueError: If no API key is available or the coordinates do not form
            a valid (start < end) interval.
    """
    if api_key is None:
        raise ValueError("API key must be provided")
    # Fail fast on inverted or empty coordinates instead of letting them
    # propagate silently through resize() into the model request.
    if end_position <= start_position:
        raise ValueError("end_position must be greater than start_position")

    # Center a fixed 1 Mb window (the model's expected input size) on the
    # requested region.
    interval = genome.Interval(chromosome, start_position, end_position).resize(
        dna_client.SEQUENCE_LENGTH_1MB
    )

    # Create model client
    dna_model = dna_client.create(api_key)

    # Load GENCODE v46 gene annotations. NOTE(review): this is fetched over
    # the network on every call — consider caching if invoked frequently.
    gtf = pd.read_feather(
        'https://storage.googleapis.com/alphagenome/reference/gencode/'
        'hg38/gencode.v46.annotation.gtf.gz.feather'
    )
    # Keep protein-coding transcripts with transcript support level 1, then
    # reduce to the single longest transcript per gene for a clean track.
    gtf_transcript = gene_annotation.filter_transcript_support_level(
        gene_annotation.filter_protein_coding(gtf), ['1']
    )
    longest_transcript_extractor = transcript.TranscriptExtractor(
        gene_annotation.filter_to_longest_transcript(gtf_transcript)
    )

    # Request only the CONTACT_MAPS output head for the selected cell lines.
    output = dna_model.predict_interval(
        interval=interval,
        requested_outputs={dna_client.OutputType.CONTACT_MAPS},
        ontology_terms=ontology_terms,
    )

    # Extract transcripts overlapping the prediction window.
    longest_transcripts = longest_transcript_extractor.extract(interval)

    # Build plot. The returned figure object is not needed: the original code
    # relied on plt.savefig() picking up the current matplotlib figure, so the
    # previously unused `plot =` assignment has been dropped.
    plot_components.plot(
        [
            plot_components.TranscriptAnnotation(longest_transcripts),
            plot_components.ContactMaps(
                tdata=output.contact_maps,
                ylabel_template='{biosample_name}\n{name}',
                cmap=colormap,
                vmax=vmax,
            ),
        ],
        interval=interval,
        title='Predicted contact maps',
    )

    # Save plot as a high-resolution PNG under the configured output directory.
    if out_prefix is None:
        out_prefix = f"contact_maps_{timestamp}"
    output_file = OUTPUT_DIR / f"{out_prefix}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()

    return {
        "message": "Contact maps visualization completed successfully",
        "reference": "https://github.com/google-deepmind/alphagenome/tree/main/colabs/visualization_modality_tour.ipynb",
        "artifacts": [
            {
                "description": "Contact maps plot",
                "path": str(output_file.resolve())
            }
        ]
    }