Upload 7 files

- cas12.py +220 -0
- cas12lstm.py +258 -0
- cas12lstmvcf.py +351 -0
- cas9att.py +296 -0
- cas9attvcf.py +393 -0
- cas9off.py +134 -0
- tiger.py +417 -0
cas12.py
ADDED
from keras import Model
from keras.layers import Input
from keras.layers import Multiply
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, AveragePooling1D
import pandas as pd
import numpy as np
import keras
import requests
from functools import reduce
from operator import add
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq
from Bio import SeqIO

ntmap = {'A': (1, 0, 0, 0),
         'C': (0, 1, 0, 0),
         'G': (0, 0, 1, 0),
         'T': (0, 0, 0, 1)
         }

def get_seqcode(seq):
    return np.array(reduce(add, map(lambda c: ntmap[c], seq.upper()))).reshape((1, len(seq), -1))

def Seq_DeepCpf1_model(input_shape):
    Seq_deepCpf1_Input_SEQ = Input(shape=input_shape)
    Seq_deepCpf1_C1 = Convolution1D(80, 5, activation='relu')(Seq_deepCpf1_Input_SEQ)
    Seq_deepCpf1_P1 = AveragePooling1D(2)(Seq_deepCpf1_C1)
    Seq_deepCpf1_F = Flatten()(Seq_deepCpf1_P1)
    Seq_deepCpf1_DO1 = Dropout(0.3)(Seq_deepCpf1_F)
    Seq_deepCpf1_D1 = Dense(80, activation='relu')(Seq_deepCpf1_DO1)
    Seq_deepCpf1_DO2 = Dropout(0.3)(Seq_deepCpf1_D1)
    Seq_deepCpf1_D2 = Dense(40, activation='relu')(Seq_deepCpf1_DO2)
    Seq_deepCpf1_DO3 = Dropout(0.3)(Seq_deepCpf1_D2)
    Seq_deepCpf1_D3 = Dense(40, activation='relu')(Seq_deepCpf1_DO3)
    Seq_deepCpf1_DO4 = Dropout(0.3)(Seq_deepCpf1_D3)
    Seq_deepCpf1_Output = Dense(1, activation='linear')(Seq_deepCpf1_DO4)
    Seq_deepCpf1 = Model(inputs=[Seq_deepCpf1_Input_SEQ], outputs=[Seq_deepCpf1_Output])
    return Seq_deepCpf1

# seq-ca model (DeepCpf1)
def DeepCpf1_model(input_shape):
    DeepCpf1_Input_SEQ = Input(shape=input_shape)
    DeepCpf1_C1 = Convolution1D(80, 5, activation='relu')(DeepCpf1_Input_SEQ)
    DeepCpf1_P1 = AveragePooling1D(2)(DeepCpf1_C1)
    DeepCpf1_F = Flatten()(DeepCpf1_P1)
    DeepCpf1_DO1 = Dropout(0.3)(DeepCpf1_F)
    DeepCpf1_D1 = Dense(80, activation='relu')(DeepCpf1_DO1)
    DeepCpf1_DO2 = Dropout(0.3)(DeepCpf1_D1)
    DeepCpf1_D2 = Dense(40, activation='relu')(DeepCpf1_DO2)
    DeepCpf1_DO3 = Dropout(0.3)(DeepCpf1_D2)
    DeepCpf1_D3_SEQ = Dense(40, activation='relu')(DeepCpf1_DO3)
    DeepCpf1_Input_CA = Input(shape=(1,))
    DeepCpf1_D3_CA = Dense(40, activation='relu')(DeepCpf1_Input_CA)
    DeepCpf1_M = Multiply()([DeepCpf1_D3_SEQ, DeepCpf1_D3_CA])
    DeepCpf1_DO4 = Dropout(0.3)(DeepCpf1_M)
    DeepCpf1_Output = Dense(1, activation='linear')(DeepCpf1_DO4)
    DeepCpf1 = Model(inputs=[DeepCpf1_Input_SEQ, DeepCpf1_Input_CA], outputs=[DeepCpf1_Output])
    return DeepCpf1

def fetch_ensembl_transcripts(gene_symbol):
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        gene_data = response.json()
        if 'Transcript' in gene_data:
            return gene_data['Transcript']
        else:
            print("No transcripts found for gene:", gene_symbol)
            return None
    else:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

def fetch_ensembl_sequence(transcript_id):
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        sequence_data = response.json()
        if 'seq' in sequence_data:
            return sequence_data['seq']
        else:
            print("No sequence found for transcript:", transcript_id)
            return None
    else:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None

def find_crispr_targets(sequence, chr, start, end, strand, transcript_id, exon_id, pam="TTTN", target_length=34):
    targets = []
    len_sequence = len(sequence)
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}

    for i in range(len_sequence - target_length + 1):
        target_seq = sequence[i:i + target_length]
        if target_seq[4:7] == 'TTT':
            if strand == -1:
                tar_start = end - i - target_length + 1
                tar_end = end - i
                #seq_in_ref = ''.join([complement[base] for base in target_seq])[::-1]
            else:
                tar_start = start + i
                tar_end = start + i + target_length - 1
                #seq_in_ref = target_seq
            gRNA = ''.join([dnatorna[base] for base in target_seq[8:28]])
            targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])
    return targets

def format_prediction_output(targets, model_path):
    # Loading weights for the model
    Seq_deepCpf1 = Seq_DeepCpf1_model(input_shape=(34, 4))
    Seq_deepCpf1.load_weights(model_path)

    formatted_data = []
    for target in targets:
        # Predict
        encoded_seq = get_seqcode(target[0])
        prediction = float(list(Seq_deepCpf1.predict(encoded_seq)[0])[0])
        if prediction > 100:
            prediction = 100

        # Format output
        gRNA = target[1]
        chr = target[2]
        start = target[3]
        end = target[4]
        strand = target[5]
        transcript_id = target[6]
        exon_id = target[7]
        formatted_data.append([chr, start, end, strand, transcript_id, exon_id, target[0], gRNA, prediction])

    return formatted_data

def process_gene(gene_symbol, model_path):
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    results = []
    all_exons = []
    all_gene_sequences = []

    if transcripts:
        for i in range(len(transcripts)):
            Exons = transcripts[i]['Exon']
            all_exons.append(Exons)
            transcript_id = transcripts[i]['id']
            for j in range(len(Exons)):
                exon_id = Exons[j]['id']
                gene_sequence = fetch_ensembl_sequence(exon_id)
                if gene_sequence:
                    all_gene_sequences.append(gene_sequence)
                    start = Exons[j]['start']
                    end = Exons[j]['end']
                    strand = Exons[j]['strand']
                    chr = Exons[j]['seq_region_name']
                    targets = find_crispr_targets(gene_sequence, chr, start, end, strand, transcript_id, exon_id)
                    if targets:
                        formatted_data = format_prediction_output(targets, model_path)
                        results.append(formatted_data)
                        # for data in formatted_data:
                        #     print(f"Chr: {data[0]}, Start: {data[1]}, End: {data[2]}, Strand: {data[3]}, target: {data[4]}, gRNA: {data[5]}, pred_Score: {data[6]}")
                else:
                    print("Failed to retrieve gene sequence.")
    else:
        print("Failed to retrieve transcripts.")

    return results, all_gene_sequences, all_exons


# def create_genbank_features(formatted_data):
#     features = []
#     for data in formatted_data:
#         try:
#             # Attempt to convert start and end positions to integers
#             start = int(data[1])
#             end = int(data[2])
#         except ValueError as e:
#             # Log the error and skip this iteration if conversion fails
#             print(f"Error converting start/end to int: {data[1]}, {data[2]} - {e}")
#             continue  # Skip this iteration
#
#         # Proceed as normal if conversion is successful
#         strand = 1 if data[3] == '+' else -1
#         location = FeatureLocation(start=start, end=end, strand=strand)
#         feature = SeqFeature(location=location, type="misc_feature", qualifiers={
#             'label': data[5],  # gRNA as label
#             'note': f"Prediction: {data[6]}"  # Prediction score in note
#         })
#         features.append(feature)
#     return features
#
# def generate_genbank_file_from_data(formatted_data, gene_sequence, gene_symbol, output_path):
#     features = create_genbank_features(formatted_data)
#     record = SeqRecord(Seq(gene_sequence), id=gene_symbol, name=gene_symbol,
#                        description='CRISPR Cas12 predicted targets', features=features)
#     record.annotations["molecule_type"] = "DNA"
#     SeqIO.write(record, output_path, "genbank")
#
# def create_csv_from_df(df, output_path):
#     df.to_csv(output_path, index=False)
#
# def generate_bed_file_from_data(formatted_data, output_path):
#     with open(output_path, 'w') as bed_file:
#         for data in formatted_data:
#             try:
#                 # Ensure data has the expected number of elements
#                 if len(data) < 7:
#                     raise ValueError("Incomplete data item")
#
#                 chrom = data[0]
#                 start = data[1]
#                 end = data[2]
#                 strand = '+' if data[3] == '+' else '-'
#                 gRNA = data[5]
#                 score = data[6]  # Ensure this index exists
#
#                 bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\n")
#             except ValueError as e:
#                 print(f"Skipping an item due to error: {e}")
#                 continue
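
A minimal usage sketch may help tie cas12.py together. It is not part of the upload: the weights path and gene symbol are placeholders, and the column labels are one reasonable naming for the row layout that format_prediction_output produces.

# Minimal usage sketch (not part of the upload; paths and gene symbol are placeholders).
import pandas as pd

results, sequences, exons = process_gene("FANCF", "models/Seq_deepCpf1_weights.h5")

# process_gene appends one list of rows per exon, so flatten before tabulating
rows = [row for exon_rows in results for row in exon_rows]
df = pd.DataFrame(rows, columns=["Chr", "Start Pos", "End Pos", "Strand",
                                 "Transcript", "Exon", "Target", "gRNA", "Prediction"])
df.to_csv("cas12_predictions.csv", index=False)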
cas12lstm.py
ADDED
import tensorflow as tf
from keras import regularizers
from keras.layers import Input, Dense, Dropout, Activation, Conv1D
from keras.layers import GlobalAveragePooling1D, AveragePooling1D
from keras.layers import Bidirectional, LSTM
from keras import Model
from keras.metrics import MeanSquaredError

import pandas as pd
import numpy as np

import requests
from functools import reduce
from operator import add
import tabulate
from difflib import SequenceMatcher
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq

import cyvcf2
import parasail

import re

ntmap = {'A': (1, 0, 0, 0),
         'C': (0, 1, 0, 0),
         'G': (0, 0, 1, 0),
         'T': (0, 0, 0, 1)
         }

def get_seqcode(seq):
    return np.array(reduce(add, map(lambda c: ntmap[c], seq.upper()))).reshape((1, len(seq), -1))

def BiLSTM_model(input_shape):
    input = Input(shape=input_shape)

    conv1 = Conv1D(128, 5, activation="relu")(input)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.1)(pool1)

    conv2 = Conv1D(128, 5, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.1)(pool2)

    lstm1 = Bidirectional(LSTM(128,
                               dropout=0.1,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(1e-4)))(drop2)
    avgpool = GlobalAveragePooling1D()(lstm1)

    dense1 = Dense(128,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(avgpool)
    drop3 = Dropout(0.1)(dense1)

    dense2 = Dense(32,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.1)(dense2)

    dense3 = Dense(32,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.1)(dense3)

    output = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[input], outputs=[output])
    return model

def fetch_ensembl_transcripts(gene_symbol):
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        gene_data = response.json()
        if 'Transcript' in gene_data:
            return gene_data['Transcript']
        else:
            print("No transcripts found for gene:", gene_symbol)
            return None
    else:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

def fetch_ensembl_sequence(transcript_id):
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        sequence_data = response.json()
        if 'seq' in sequence_data:
            return sequence_data['seq']
        else:
            print("No sequence found for transcript:", transcript_id)
            return None
    else:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None

def find_crispr_targets(sequence, chr, start, end, strand, transcript_id, exon_id, pam="TTTN", target_length=34):
    targets = []
    len_sequence = len(sequence)
    #complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}

    for i in range(len_sequence - target_length + 1):
        target_seq = sequence[i:i + target_length]
        if target_seq[4:7] == 'TTT':
            if strand == -1:
                tar_start = end - i - target_length + 1
                tar_end = end - i
                #seq_in_ref = ''.join([complement[base] for base in target_seq])[::-1]
            else:
                tar_start = start + i
                tar_end = start + i + target_length - 1
                #seq_in_ref = target_seq
            gRNA = ''.join([dnatorna[base] for base in target_seq[8:28]])
            targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])
            #targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id, seq_in_ref])
    return targets

def format_prediction_output(targets, model_path):
    # Loading weights for the model
    Crispr_BiLSTM = BiLSTM_model(input_shape=(34, 4))
    Crispr_BiLSTM.load_weights(model_path)

    formatted_data = []
    for target in targets:
        # Predict
        encoded_seq = get_seqcode(target[0])
        prediction = float(list(Crispr_BiLSTM.predict(encoded_seq, verbose=0)[0])[0])
        if prediction > 100:
            prediction = 100

        # Format output
        gRNA = target[1]
        chr = target[2]
        start = target[3]
        end = target[4]
        strand = target[5]
        transcript_id = target[6]
        exon_id = target[7]
        #seq_in_ref = target[8]
        #formatted_data.append([chr, start, end, strand, transcript_id, exon_id, target[0], gRNA, seq_in_ref, prediction])
        formatted_data.append([chr, start, end, strand, transcript_id, exon_id, target[0], gRNA, prediction])

    return formatted_data


def process_gene(gene_symbol, model_path):
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    results = []
    all_exons = []  # To accumulate all exons
    all_gene_sequences = []  # To accumulate all gene sequences

    if transcripts:
        for transcript in transcripts:
            Exons = transcript['Exon']
            all_exons.extend(Exons)  # Add all exons from this transcript to the list
            transcript_id = transcript['id']

            for Exon in Exons:
                exon_id = Exon['id']
                gene_sequence = fetch_ensembl_sequence(exon_id)
                if gene_sequence:
                    all_gene_sequences.append(gene_sequence)  # Add this gene sequence to the list
                    chr = Exon['seq_region_name']
                    start = Exon['start']
                    end = Exon['end']
                    strand = Exon['strand']

                    targets = find_crispr_targets(gene_sequence, chr, start, end, strand, transcript_id, exon_id)
                    if targets:
                        # Predict on-target efficiency for each gRNA site
                        formatted_data = format_prediction_output(targets, model_path)
                        results.extend(formatted_data)  # Flatten the results
                else:
                    print(f"Failed to retrieve gene sequence for exon {exon_id}.")
    else:
        print("Failed to retrieve transcripts.")

    # Return the flattened results, combined gene sequences, and all exons
    return results, all_gene_sequences, all_exons

def create_genbank_features(data):
    features = []

    # If the input data is a DataFrame, convert it to a list of lists
    if isinstance(data, pd.DataFrame):
        formatted_data = data.values.tolist()
    elif isinstance(data, list):
        formatted_data = data
    else:
        raise TypeError("Data should be either a list or a pandas DataFrame.")

    for row in formatted_data:
        try:
            start = int(row[1])
            end = int(row[2])
        except ValueError as e:
            print(f"Error converting start/end to int: {row[1]}, {row[2]} - {e}")
            continue

        strand = 1 if row[3] == '+' else -1
        location = FeatureLocation(start=start, end=end, strand=strand)
        feature = SeqFeature(location=location, type="misc_feature", qualifiers={
            'label': row[7],  # Use gRNA as the label
            'note': f"Prediction: {row[8]}"  # Include the prediction score
        })
        features.append(feature)

    return features


def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
    # Ensure gene_sequence is a string before creating Seq object
    if not isinstance(gene_sequence, str):
        gene_sequence = str(gene_sequence)

    features = create_genbank_features(df)

    # Now gene_sequence is guaranteed to be a string, suitable for Seq
    seq_obj = Seq(gene_sequence)
    record = SeqRecord(seq_obj, id=gene_symbol, name=gene_symbol,
                       description=f'CRISPR Cas12 predicted targets for {gene_symbol}', features=features)
    record.annotations["molecule_type"] = "DNA"
    SeqIO.write(record, output_path, "genbank")


def create_bed_file_from_df(df, output_path):
    with open(output_path, 'w') as bed_file:
        for index, row in df.iterrows():
            chrom = row["Chr"]
            start = int(row["Start Pos"])
            end = int(row["End Pos"])
            strand = '+' if row["Strand"] == '1' else '-'
            gRNA = row["gRNA"]
            score = str(row["Prediction"])
            # transcript_id is not typically part of the standard BED columns but added here for completeness
            transcript_id = row["Transcript"]

            # Writing only standard BED columns; additional columns can be appended as needed
            bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\n")


def create_csv_from_df(df, output_path):
    df.to_csv(output_path, index=False)
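
A minimal usage sketch, not part of the upload: the output writers above index the DataFrame by column name, so the flat rows returned by process_gene must be labeled as below. The weights path and gene symbol are placeholders.

# Minimal usage sketch (paths and gene symbol are placeholders).
import pandas as pd

results, sequences, exons = process_gene("FANCF", "models/Cas12_BiLSTM_weights.h5")

df = pd.DataFrame(results, columns=["Chr", "Start Pos", "End Pos", "Strand",
                                    "Transcript", "Exon", "Target", "gRNA", "Prediction"])
create_csv_from_df(df, "cas12lstm_predictions.csv")
create_bed_file_from_df(df, "cas12lstm_predictions.bed")  # expects Strand stored as '1' or '-1'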
cas12lstmvcf.py
ADDED
import tensorflow as tf
from keras import regularizers
from keras.layers import Input, Dense, Dropout, Activation, Conv1D
from keras.layers import GlobalAveragePooling1D, AveragePooling1D
from keras.layers import Bidirectional, LSTM
from keras import Model
from keras.metrics import MeanSquaredError

import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq

import requests
from functools import reduce
from operator import add
import tabulate
from difflib import SequenceMatcher

import cyvcf2
import parasail

import re

ntmap = {'A': (1, 0, 0, 0),
         'C': (0, 1, 0, 0),
         'G': (0, 0, 1, 0),
         'T': (0, 0, 0, 1)
         }

def get_seqcode(seq):
    return np.array(reduce(add, map(lambda c: ntmap[c], seq.upper()))).reshape((1, len(seq), -1))

def BiLSTM_model(input_shape):
    input = Input(shape=input_shape)

    conv1 = Conv1D(128, 5, activation="relu")(input)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.1)(pool1)

    conv2 = Conv1D(128, 5, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.1)(pool2)

    lstm1 = Bidirectional(LSTM(128,
                               dropout=0.1,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(1e-4)))(drop2)
    avgpool = GlobalAveragePooling1D()(lstm1)

    dense1 = Dense(128,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(avgpool)
    drop3 = Dropout(0.1)(dense1)

    dense2 = Dense(32,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.1)(dense2)

    dense3 = Dense(32,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.1)(dense3)

    output = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[input], outputs=[output])
    return model

def fetch_ensembl_transcripts(gene_symbol):
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        gene_data = response.json()
        if 'Transcript' in gene_data:
            return gene_data['Transcript']
        else:
            print("No transcripts found for gene:", gene_symbol)
            return None
    else:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

def fetch_ensembl_sequence(transcript_id):
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        sequence_data = response.json()
        if 'seq' in sequence_data:
            return sequence_data['seq']
        else:
            print("No sequence found for transcript:", transcript_id)
            return None
    else:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None

def apply_mutation(ref_sequence, offset, ref, alt):
    """
    Apply a single mutation to the sequence.
    """
    if len(ref) == len(alt) and alt != "*":  # SNP
        mutated_seq = ref_sequence[:offset] + alt + ref_sequence[offset+len(alt):]

    elif len(ref) < len(alt):  # Insertion
        mutated_seq = ref_sequence[:offset] + alt + ref_sequence[offset+1:]

    elif len(ref) == len(alt) and alt == "*":  # Deletion
        mutated_seq = ref_sequence[:offset] + ref_sequence[offset+1:]

    elif len(ref) > len(alt) and alt != "*":  # Deletion
        mutated_seq = ref_sequence[:offset] + alt + ref_sequence[offset+len(ref):]

    elif len(ref) > len(alt) and alt == "*":  # Deletion
        mutated_seq = ref_sequence[:offset] + ref_sequence[offset+len(ref):]

    return mutated_seq


def construct_combinations(sequence, mutations):
    """
    Construct all combinations of mutations.
    mutations is a list of tuples (position, ref, [alts])
    """
    if not mutations:
        return [sequence]

    # Take the first mutation and recursively construct combinations for the rest
    first_mutation = mutations[0]
    rest_mutations = mutations[1:]
    offset, ref, alts = first_mutation

    sequences = []
    for alt in alts:
        mutated_sequence = apply_mutation(sequence, offset, ref, alt)
        sequences.extend(construct_combinations(mutated_sequence, rest_mutations))

    return sequences

def needleman_wunsch_alignment(query_seq, ref_seq):
    """
    Use Needleman-Wunsch alignment to find the maximum alignment position in ref_seq
    Use this position to represent the position of target sequence with mutations
    """
    # Needleman-Wunsch alignment
    alignment = parasail.nw_trace(query_seq, ref_seq, 10, 1, parasail.blosum62)

    # extract CIGAR object
    cigar = alignment.cigar
    cigar_string = cigar.decode.decode("utf-8")

    # record ref_pos
    ref_pos = 0

    matches = re.findall(r'(\d+)([MIDNSHP=X])', cigar_string)
    max_num_before_equal = 0
    max_equal_index = -1
    total_before_max_equal = 0

    for i, (num_str, op) in enumerate(matches):
        num = int(num_str)
        if op == '=':
            if num > max_num_before_equal:
                max_num_before_equal = num
                max_equal_index = i
    total_before_max_equal = sum(int(matches[j][0]) for j in range(max_equal_index))

    ref_pos = total_before_max_equal

    return ref_pos

def find_gRNA_with_mutation(ref_sequence, exon_chr, start, end, strand, transcript_id,
                            exon_id, gene_symbol, vcf_reader, pam="TTTN", target_length=34):
    # initialization
    mutated_sequences = [ref_sequence]

    # find mutations within interested region
    mutations = vcf_reader(f"{exon_chr}:{start}-{end}")
    if mutations:
        # find mutations
        mutation_list = []
        for mutation in mutations:
            offset = mutation.POS - start
            ref = mutation.REF
            alts = mutation.ALT[:-1]
            mutation_list.append((offset, ref, alts))

        # replace reference sequence of mutation
        mutated_sequences = construct_combinations(ref_sequence, mutation_list)

    # find gRNA in ref_sequence or all mutated_sequences
    targets = []
    for seq in mutated_sequences:
        len_sequence = len(seq)
        dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}
        for i in range(len_sequence - target_length + 1):
            target_seq = seq[i:i + target_length]
            if target_seq[4:7] == 'TTT':
                pos = ref_sequence.find(target_seq)
                if pos != -1:
                    is_mut = False
                    if strand == -1:
                        tar_start = end - pos - target_length + 1
                    else:
                        tar_start = start + pos
                else:
                    is_mut = True
                    nw_pos = needleman_wunsch_alignment(target_seq, ref_sequence)
                    if strand == -1:
                        tar_start = str(end - nw_pos - target_length + 1) + '*'
                    else:
                        tar_start = str(start + nw_pos) + '*'
                gRNA = ''.join([dnatorna[base] for base in target_seq[8:28]])
                targets.append([target_seq, gRNA, exon_chr, str(strand), str(tar_start), transcript_id, exon_id, gene_symbol, is_mut])

    # filter duplicated targets
    unique_targets_set = set(tuple(element) for element in targets)
    unique_targets = [list(element) for element in unique_targets_set]

    return unique_targets

def format_prediction_output_with_mutation(targets, model_path):
    Crispr_BiLSTM = BiLSTM_model(input_shape=(34, 4))
    Crispr_BiLSTM.load_weights(model_path)

    formatted_data = []
    for target in targets:
        # Predict
        encoded_seq = get_seqcode(target[0])
        prediction = float(list(Crispr_BiLSTM.predict(encoded_seq, verbose=0)[0])[0])
        if prediction > 100:
            prediction = 100

        # Format output
        gRNA = target[1]
        exon_chr = target[2]
        strand = target[3]
        tar_start = target[4]
        transcript_id = target[5]
        exon_id = target[6]
        gene_symbol = target[7]
        is_mut = target[8]
        formatted_data.append([gene_symbol, exon_chr, strand, tar_start, transcript_id, exon_id, target[0], gRNA, prediction, is_mut])

    return formatted_data

def process_gene(gene_symbol, vcf_reader, model_path):
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    results = []
    all_exons = []  # To accumulate all exons
    all_gene_sequences = []  # To accumulate all gene sequences

    if transcripts:
        for transcript in transcripts:
            Exons = transcript['Exon']
            all_exons.extend(Exons)  # Add all exons from this transcript to the list
            transcript_id = transcript['id']

            for Exon in Exons:
                exon_id = Exon['id']
                gene_sequence = fetch_ensembl_sequence(exon_id)  # Reference exon sequence
                if gene_sequence:
                    all_gene_sequences.append(gene_sequence)  # Add this gene sequence to the list
                    exon_chr = Exon['seq_region_name']
                    start = Exon['start']
                    end = Exon['end']
                    strand = Exon['strand']

                    targets = find_gRNA_with_mutation(gene_sequence, exon_chr, start, end, strand, transcript_id, exon_id, gene_symbol, vcf_reader)
                    if targets:
                        # Predict on-target efficiency for each gRNA site
                        formatted_data = format_prediction_output_with_mutation(targets, model_path)
                        results.extend(formatted_data)  # Flatten the results
                else:
                    print(f"Failed to retrieve gene sequence for exon {exon_id}.")
    else:
        print("Failed to retrieve transcripts.")

    # Return the collected results, combined gene sequences, and all exons
    return results, all_gene_sequences, all_exons

def create_genbank_features(data):
    features = []

    # If the input data is a DataFrame, convert it to a list of lists
    if isinstance(data, pd.DataFrame):
        formatted_data = data.values.tolist()
    elif isinstance(data, list):
        formatted_data = data
    else:
        raise TypeError("Data should be either a list or a pandas DataFrame.")

    for row in formatted_data:
        try:
            start = int(row[1])
            end = start + len(row[6])  # Calculate the end position based on the target sequence length
        except ValueError as e:
            print(f"Error converting start/end to int: {row[1]}, {row[2]} - {e}")
            continue

        strand = 1 if row[3] == '1' else -1
        location = FeatureLocation(start=start, end=end, strand=strand)
        is_mutation = 'Yes' if row[9] else 'No'
        feature = SeqFeature(location=location, type="misc_feature", qualifiers={
            'label': row[7],  # Use gRNA as the label
            'note': f"Prediction: {row[8]}, Mutation: {is_mutation}"  # Include the prediction score and mutation status
        })
        features.append(feature)

    return features

def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
    # Ensure gene_sequence is a string before creating Seq object
    if not isinstance(gene_sequence, str):
        gene_sequence = str(gene_sequence)

    features = create_genbank_features(df)

    # Now gene_sequence is guaranteed to be a string, suitable for Seq
    seq_obj = Seq(gene_sequence)
    record = SeqRecord(seq_obj, id=gene_symbol, name=gene_symbol,
                       description=f'CRISPR Cas12 predicted targets for {gene_symbol}', features=features)
    record.annotations["molecule_type"] = "DNA"
    SeqIO.write(record, output_path, "genbank")

def create_bed_file_from_df(df, output_path):
    with open(output_path, 'w') as bed_file:
        for index, row in df.iterrows():
            chrom = row["Chr"]
            start = int(row["Target Start"])
            end = start + len(row["Target"])  # Calculate the end position based on the target sequence length
            strand = '+' if row["Strand"] == '1' else '-'
            gRNA = row["gRNA"]
            score = str(row["Prediction"])
            is_mutation = 'Yes' if row["Is Mutation"] else 'No'
            # transcript_id is not typically part of the standard BED columns but added here for completeness
            transcript_id = row["Transcript"]

            # Writing only standard BED columns; additional columns can be appended as needed
            bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\t{is_mutation}\n")

def create_csv_from_df(df, output_path):
    df.to_csv(output_path, index=False)
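
Two short sketches may make the mutation handling above concrete. Both are illustrations, not part of the upload: the toy sequence and variant tuples are invented, the VCF path is a placeholder, and the '*' allele is read as a one-base deletion per the apply_mutation branches above.

# Worked example (illustration only, toy data): construct_combinations enumerates the
# cartesian product of alternate alleles, so one variant with a single ALT and one
# with two ALTs expand into 1 x 2 = 2 mutated sequences. Offsets are 0-based
# positions into the exon sequence (mutation.POS - start above).
seq = "AAAAATAAA"
muts = [(2, "A", ["G"]),        # SNP at offset 2: A -> G
        (5, "T", ["C", "*"])]   # offset 5: T -> C, or '*' for a 1-bp deletion
print(construct_combinations(seq, muts))
# -> ['AAGAACAAA', 'AAGAAAAA']

# Usage sketch: vcf_reader is the cyvcf2.VCF object itself; find_gRNA_with_mutation
# calls it with a "chr:start-end" region string, which requires a bgzipped,
# tabix-indexed file. Paths are placeholders.
vcf_reader = cyvcf2.VCF("variants.vcf.gz")
results, sequences, exons = process_gene("FANCF", vcf_reader, "models/Cas12_BiLSTM_weights.h5")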
cas9att.py
ADDED
import requests
import tensorflow as tf
import pandas as pd
import numpy as np
from operator import add
from functools import reduce
import random
import tabulate

from keras import Model
from keras import regularizers
from keras.optimizers import Adam
from keras.layers import Conv2D, BatchNormalization, ReLU, Input, Flatten, Softmax
from keras.layers import Concatenate, Activation, Dense, GlobalAveragePooling2D, Dropout
from keras.layers import AveragePooling1D, Bidirectional, LSTM, GlobalAveragePooling1D, MaxPool1D, Reshape
from keras.layers import LayerNormalization, Conv1D, MultiHeadAttention, Layer
from keras.models import load_model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq

import cyvcf2
import parasail

import re

ntmap = {'A': (1, 0, 0, 0),
         'C': (0, 1, 0, 0),
         'G': (0, 0, 1, 0),
         'T': (0, 0, 0, 1)
         }

def get_seqcode(seq):
    return np.array(reduce(add, map(lambda c: ntmap[c], seq.upper()))).reshape((1, len(seq), -1))

class PositionalEncoding(Layer):
    def __init__(self, sequence_len=None, embedding_dim=None, **kwargs):
        super(PositionalEncoding, self).__init__()
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim

    def call(self, x):
        position_embedding = np.array([
            [pos / np.power(10000, 2. * i / self.embedding_dim) for i in range(self.embedding_dim)]
            for pos in range(self.sequence_len)])

        position_embedding[:, 0::2] = np.sin(position_embedding[:, 0::2])  # dim 2i
        position_embedding[:, 1::2] = np.cos(position_embedding[:, 1::2])  # dim 2i+1
        position_embedding = tf.cast(position_embedding, dtype=tf.float32)

        return position_embedding + x

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'sequence_len': self.sequence_len,
            'embedding_dim': self.embedding_dim,
        })
        return config

def MultiHeadAttention_model(input_shape):
    input = Input(shape=input_shape)

    conv1 = Conv1D(256, 3, activation="relu")(input)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.4)(pool1)

    conv2 = Conv1D(256, 3, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.4)(pool2)

    lstm = Bidirectional(LSTM(128,
                              dropout=0.5,
                              activation='tanh',
                              return_sequences=True,
                              kernel_regularizer=regularizers.l2(0.01)))(drop2)

    pos_embedding = PositionalEncoding(sequence_len=int(((23-3+1)/2-3+1)/2), embedding_dim=2*128)(lstm)
    atten = MultiHeadAttention(num_heads=2,
                               key_dim=64,
                               dropout=0.2,
                               kernel_regularizer=regularizers.l2(0.01))(pos_embedding, pos_embedding)

    flat = Flatten()(atten)

    dense1 = Dense(512,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(flat)
    drop3 = Dropout(0.1)(dense1)

    dense2 = Dense(128,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.1)(dense2)

    dense3 = Dense(256,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.1)(dense3)

    output = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[input], outputs=[output])
    return model

def fetch_ensembl_transcripts(gene_symbol):
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        gene_data = response.json()
        if 'Transcript' in gene_data:
            return gene_data['Transcript']
        else:
            print("No transcripts found for gene:", gene_symbol)
            return None
    else:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

def fetch_ensembl_sequence(transcript_id):
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        sequence_data = response.json()
        if 'seq' in sequence_data:
            return sequence_data['seq']
        else:
            print("No sequence found for transcript:", transcript_id)
            return None
    else:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None

def find_crispr_targets(sequence, chr, start, end, strand, transcript_id, exon_id, pam="NGG", target_length=20):
    targets = []
    len_sequence = len(sequence)
    #complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}

    for i in range(len_sequence - len(pam) + 1):
        if sequence[i + 1:i + 3] == pam[1:]:
            if i >= target_length:
                target_seq = sequence[i - target_length:i + 3]
                if strand == -1:
                    tar_start = end - (i + 2)
                    tar_end = end - (i - target_length)
                    #seq_in_ref = ''.join([complement[base] for base in target_seq])[::-1]
                else:
                    tar_start = start + i - target_length
                    tar_end = start + i + 3 - 1
                    #seq_in_ref = target_seq
                gRNA = ''.join([dnatorna[base] for base in sequence[i - target_length:i]])
                #targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id, seq_in_ref])
                targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])

    return targets

# Function to predict on-target efficiency and format output
def format_prediction_output(targets, model_path):
    model = MultiHeadAttention_model(input_shape=(23, 4))
    model.load_weights(model_path)

    formatted_data = []

    for target in targets:
        # Encode the gRNA sequence
        encoded_seq = get_seqcode(target[0])

        # Predict on-target efficiency using the model
        prediction = float(list(model.predict(encoded_seq, verbose=0)[0])[0])
        if prediction > 100:
            prediction = 100

        # Format output
        gRNA = target[1]
        chr = target[2]
        start = target[3]
        end = target[4]
        strand = target[5]
        transcript_id = target[6]
        exon_id = target[7]
        #seq_in_ref = target[8]
        #formatted_data.append([chr, start, end, strand, transcript_id, exon_id, target[0], gRNA, seq_in_ref, prediction[0]])
        formatted_data.append([chr, start, end, strand, transcript_id, exon_id, target[0], gRNA, prediction])

    return formatted_data

def process_gene(gene_symbol, model_path):
    # Fetch transcripts for the given gene symbol
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    results = []
    all_exons = []  # To accumulate all exons
    all_gene_sequences = []  # To accumulate all gene sequences

    if transcripts:
        for transcript in transcripts:
            Exons = transcript['Exon']
            all_exons.extend(Exons)  # Add all exons from this transcript to the list
            transcript_id = transcript['id']

            for exon in Exons:
                exon_id = exon['id']
                gene_sequence = fetch_ensembl_sequence(exon_id)
                if gene_sequence:
                    all_gene_sequences.append(gene_sequence)  # Add this gene sequence to the list
                    start = exon['start']
                    end = exon['end']
                    strand = exon['strand']
                    chr = exon['seq_region_name']
                    # Find potential CRISPR targets within the exon
                    targets = find_crispr_targets(gene_sequence, chr, start, end, strand, transcript_id, exon_id)
                    if targets:
                        # Format the prediction output for the targets found
                        formatted_data = format_prediction_output(targets, model_path)
                        results.extend(formatted_data)  # Append results
                else:
                    print(f"Failed to retrieve gene sequence for exon {exon_id}.")
    else:
        print("Failed to retrieve transcripts.")

    # Return the collected results, combined gene sequences, and all exons
    return results, all_gene_sequences, all_exons


def create_genbank_features(data):
    features = []

    # If the input data is a DataFrame, convert it to a list of lists
    if isinstance(data, pd.DataFrame):
        formatted_data = data.values.tolist()
    elif isinstance(data, list):
        formatted_data = data
    else:
        raise TypeError("Data should be either a list or a pandas DataFrame.")

    for row in formatted_data:
        try:
            start = int(row[1])
            end = int(row[2])
        except ValueError as e:
            print(f"Error converting start/end to int: {row[1]}, {row[2]} - {e}")
            continue

        strand = 1 if row[3] == '+' else -1
        location = FeatureLocation(start=start, end=end, strand=strand)
        feature = SeqFeature(location=location, type="misc_feature", qualifiers={
            'label': row[7],  # Use gRNA as the label
            'note': f"Prediction: {row[8]}"  # Include the prediction score
        })
        features.append(feature)

    return features


def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
    # Ensure gene_sequence is a string before creating Seq object
    if not isinstance(gene_sequence, str):
        gene_sequence = str(gene_sequence)

    features = create_genbank_features(df)

    # Now gene_sequence is guaranteed to be a string, suitable for Seq
    seq_obj = Seq(gene_sequence)
    record = SeqRecord(seq_obj, id=gene_symbol, name=gene_symbol,
                       description=f'CRISPR Cas9 predicted targets for {gene_symbol}', features=features)
    record.annotations["molecule_type"] = "DNA"
    SeqIO.write(record, output_path, "genbank")


def create_bed_file_from_df(df, output_path):
    with open(output_path, 'w') as bed_file:
        for index, row in df.iterrows():
            chrom = row["Chr"]
            start = int(row["Start Pos"])
            end = int(row["End Pos"])
            strand = '+' if row["Strand"] == '1' else '-'
            gRNA = row["gRNA"]
            score = str(row["Prediction"])
            # transcript_id is not typically part of the standard BED columns but added here for completeness
            transcript_id = row["Transcript"]

            # Writing only standard BED columns; additional columns can be appended as needed
            bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\n")


def create_csv_from_df(df, output_path):
    df.to_csv(output_path, index=False)
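
A minimal scoring sketch may clarify the expected input: find_crispr_targets emits 23-nt sites (20-nt protospacer plus the NGG PAM), matching the (23, 4) input shape of the attention model. The weights path and the example site are placeholders.

# Minimal usage sketch (illustration only): score a single 23-mer directly.
model = MultiHeadAttention_model(input_shape=(23, 4))
model.load_weights("models/Cas9_MultiHeadAttention_weights.h5")  # hypothetical weights file

site = "GTCACCTCCAATGACTAGGGTGG"  # 20-nt protospacer + 'TGG' PAM (23 nt total)
x = get_seqcode(site)            # one-hot tensor of shape (1, 23, 4)
score = float(model.predict(x, verbose=0)[0][0])
print(f"predicted on-target efficiency: {score:.2f}")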
cas9attvcf.py
ADDED
1 |
+
import requests
|
2 |
+
import tensorflow as tf
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from operator import add
|
6 |
+
from functools import reduce
|
7 |
+
import random
|
8 |
+
import tabulate
|
9 |
+
|
10 |
+
from keras import Model
|
11 |
+
from keras import regularizers
|
12 |
+
from keras.optimizers import Adam
|
13 |
+
from keras.layers import Conv2D, BatchNormalization, ReLU, Input, Flatten, Softmax
|
14 |
+
from keras.layers import Concatenate, Activation, Dense, GlobalAveragePooling2D, Dropout
|
15 |
+
from keras.layers import AveragePooling1D, Bidirectional, LSTM, GlobalAveragePooling1D, MaxPool1D, Reshape
|
16 |
+
from keras.layers import LayerNormalization, Conv1D, MultiHeadAttention, Layer
|
17 |
+
from keras.models import load_model
|
18 |
+
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
|
19 |
+
from Bio import SeqIO
|
20 |
+
from Bio.SeqRecord import SeqRecord
|
21 |
+
from Bio.SeqFeature import SeqFeature, FeatureLocation
|
22 |
+
from Bio.Seq import Seq
|
23 |
+
|
24 |
+
import cyvcf2
|
25 |
+
import parasail
|
26 |
+
|
27 |
+
import re
|
28 |
+
|
29 |
+
ntmap = {'A': (1, 0, 0, 0),
|
30 |
+
'C': (0, 1, 0, 0),
|
31 |
+
'G': (0, 0, 1, 0),
|
32 |
+
'T': (0, 0, 0, 1)
|
33 |
+
}
|
34 |
+
|
35 |
+
def get_seqcode(seq):
|
36 |
+
return np.array(reduce(add, map(lambda c: ntmap[c], seq.upper()))).reshape((1, len(seq), -1))
|
37 |
+
|
38 |
+
class PositionalEncoding(Layer):
|
39 |
+
def __init__(self, sequence_len=None, embedding_dim=None,**kwargs):
|
40 |
+
super(PositionalEncoding, self).__init__()
|
41 |
+
self.sequence_len = sequence_len
|
42 |
+
self.embedding_dim = embedding_dim
|
43 |
+
|
44 |
+
def call(self, x):
|
45 |
+
|
46 |
+
position_embedding = np.array([
|
47 |
+
[pos / np.power(10000, 2. * i / self.embedding_dim) for i in range(self.embedding_dim)]
|
48 |
+
for pos in range(self.sequence_len)])
|
49 |
+
|
50 |
+
position_embedding[:, 0::2] = np.sin(position_embedding[:, 0::2]) # dim 2i
|
51 |
+
position_embedding[:, 1::2] = np.cos(position_embedding[:, 1::2]) # dim 2i+1
|
52 |
+
position_embedding = tf.cast(position_embedding, dtype=tf.float32)
|
53 |
+
|
54 |
+
return position_embedding+x
|
55 |
+
|
56 |
+
def get_config(self):
|
57 |
+
config = super().get_config().copy()
|
58 |
+
config.update({
|
59 |
+
'sequence_len' : self.sequence_len,
|
60 |
+
'embedding_dim' : self.embedding_dim,
|
61 |
+
})
|
62 |
+
return config
|
63 |
+

def MultiHeadAttention_model(input_shape):
    input = Input(shape=input_shape)

    conv1 = Conv1D(256, 3, activation="relu")(input)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.4)(pool1)

    conv2 = Conv1D(256, 3, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.4)(pool2)

    lstm = Bidirectional(LSTM(128,
                              dropout=0.5,
                              activation='tanh',
                              return_sequences=True,
                              kernel_regularizer=regularizers.l2(0.01)))(drop2)

    pos_embedding = PositionalEncoding(sequence_len=int(((23-3+1)/2-3+1)/2), embedding_dim=2*128)(lstm)
    atten = MultiHeadAttention(num_heads=2,
                               key_dim=64,
                               dropout=0.2,
                               kernel_regularizer=regularizers.l2(0.01))(pos_embedding, pos_embedding)

    flat = Flatten()(atten)

    dense1 = Dense(512,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(flat)
    drop3 = Dropout(0.1)(dense1)

    dense2 = Dense(128,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.1)(dense2)

    dense3 = Dense(256,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.1)(dense3)

    output = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[input], outputs=[output])
    return model
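
# Shape check (derived from the layers above, for a 23-nt input): conv1
# ('valid', kernel 3) leaves 23-3+1 = 21 positions, pooled to 10; conv2 leaves
# 10-3+1 = 8, pooled to 4 -- hence sequence_len = int(((23-3+1)/2-3+1)/2) = 4.
# A hypothetical build step (not part of the upload) might look like:
#   model = MultiHeadAttention_model(input_shape=(23, 4))
#   model.compile(optimizer='adam', loss='mse')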

def fetch_ensembl_transcripts(gene_symbol):
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        gene_data = response.json()
        if 'Transcript' in gene_data:
            return gene_data['Transcript']
        else:
            print("No transcripts found for gene:", gene_symbol)
            return None
    else:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

def fetch_ensembl_sequence(transcript_id):
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    response = requests.get(url)
    if response.status_code == 200:
        sequence_data = response.json()
        if 'seq' in sequence_data:
            return sequence_data['seq']
        else:
            print("No sequence found for transcript:", transcript_id)
            return None
    else:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None
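
# Usage sketch (the gene symbol and ID below are examples, not from this repo):
#   fetch_ensembl_transcripts("TP53")          # -> list of transcript dicts, each with an 'Exon' list
#   fetch_ensembl_sequence("ENST00000269305")  # -> plain sequence string
# Both helpers return None (and print a message) on any API failure.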

def apply_mutation(ref_sequence, offset, ref, alt):
    """
    Apply a single mutation to the sequence.
    """
    if len(ref) == len(alt) and alt != "*":  # SNP
        mutated_seq = ref_sequence[:offset] + alt + ref_sequence[offset+len(alt):]

    elif len(ref) < len(alt):  # Insertion (replaces one base, assuming the standard single-base VCF anchor in ref)
        mutated_seq = ref_sequence[:offset] + alt + ref_sequence[offset+1:]

    elif len(ref) == len(alt) and alt == "*":  # Deletion signalled by the VCF '*' (overlapping-deletion) allele
        mutated_seq = ref_sequence[:offset] + ref_sequence[offset+1:]

    elif len(ref) > len(alt) and alt != "*":  # Deletion
        mutated_seq = ref_sequence[:offset] + alt + ref_sequence[offset+len(ref):]

    elif len(ref) > len(alt) and alt == "*":  # Deletion
        mutated_seq = ref_sequence[:offset] + ref_sequence[offset+len(ref):]

    return mutated_seq
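
# Worked examples (illustrative):
#   apply_mutation("AAACCC", 3, "C", "G")    # SNP       -> "AAAGCC"
#   apply_mutation("AAACCC", 2, "A", "AT")   # insertion -> "AAATCCC"
#   apply_mutation("AAACCC", 1, "AAC", "A")  # deletion  -> "AACC"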

def construct_combinations(sequence, mutations):
    """
    Construct all combinations of mutations.
    mutations is a list of tuples (position, ref, [alts])
    """
    if not mutations:
        return [sequence]

    # Take the first mutation and recursively construct combinations for the rest
    first_mutation = mutations[0]
    rest_mutations = mutations[1:]
    offset, ref, alts = first_mutation

    sequences = []
    for alt in alts:
        mutated_sequence = apply_mutation(sequence, offset, ref, alt)
        sequences.extend(construct_combinations(mutated_sequence, rest_mutations))

    return sequences
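
# Illustrative example: two variant sites expand into the cross product of
# their ALT alleles, e.g.
#   construct_combinations("AAACCC", [(1, "A", ["T", "G"]), (4, "C", ["G"])])
#   # -> ["ATACGC", "AGACGC"]
# Caveat (observation, not a fix): every listed site is always mutated (the
# reference allele is not kept as an option), and offsets are not re-anchored
# after an indel shifts the downstream sequence.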

def needleman_wunsch_alignment(query_seq, ref_seq):
    """
    Use Needleman-Wunsch alignment to find the best-matching position of query_seq in ref_seq.
    That position stands in for the position of a target sequence carrying mutations.
    """
    # Needleman-Wunsch (global) alignment; gap open 10, gap extend 1.
    # parasail.blosum62 is a protein matrix, but parasail also accepts it for ACGT strings.
    alignment = parasail.nw_trace(query_seq, ref_seq, 10, 1, parasail.blosum62)

    # extract the CIGAR string
    cigar = alignment.cigar
    cigar_string = cigar.decode.decode("utf-8")

    # record ref_pos
    ref_pos = 0

    # locate the longest exact-match run ('=' operation) in the CIGAR
    matches = re.findall(r'(\d+)([MIDNSHP=X])', cigar_string)
    max_num_before_equal = 0
    max_equal_index = -1
    total_before_max_equal = 0

    for i, (num_str, op) in enumerate(matches):
        num = int(num_str)
        if op == '=':
            if num > max_num_before_equal:
                max_num_before_equal = num
                max_equal_index = i

    # the reference position is the summed length of all CIGAR operations
    # preceding that longest exact-match run
    total_before_max_equal = sum(int(matches[j][0]) for j in range(max_equal_index))

    ref_pos = total_before_max_equal

    return ref_pos

def find_gRNA_with_mutation(ref_sequence, exon_chr, start, end, strand, transcript_id,
                            exon_id, gene_symbol, vcf_reader, pam="NGG", target_length=20):
    # initialization
    mutated_sequences = [ref_sequence]

    # find mutations within the region of interest
    mutations = vcf_reader(f"{exon_chr}:{start}-{end}")
    if mutations:
        # collect mutations
        mutation_list = []
        for mutation in mutations:
            offset = mutation.POS - start
            ref = mutation.REF
            alts = mutation.ALT[:-1]  # note: drops the final ALT allele; assumes a trailing placeholder entry in the VCF
            mutation_list.append((offset, ref, alts))

        # apply the mutations to the reference sequence
        mutated_sequences = construct_combinations(ref_sequence, mutation_list)

    # find gRNAs in the reference sequence and in all mutated sequences
    targets = []
    for seq in mutated_sequences:
        len_sequence = len(seq)
        dnatorna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}
        for i in range(len_sequence - len(pam) + 1):
            if seq[i + 1:i + 3] == pam[1:]:
                if i >= target_length:
                    target_seq = seq[i - target_length:i + 3]
                    pos = ref_sequence.find(target_seq)
                    if pos != -1:
                        is_mut = False
                        if strand == -1:
                            tar_start = end - pos - target_length - 2
                        else:
                            tar_start = start + pos
                    else:
                        is_mut = True
                        nw_pos = needleman_wunsch_alignment(target_seq, ref_sequence)
                        if strand == -1:
                            tar_start = str(end - nw_pos - target_length - 2) + '*'
                        else:
                            tar_start = str(start + nw_pos) + '*'
                    gRNA = ''.join([dnatorna[base] for base in seq[i - target_length:i]])
                    targets.append([target_seq, gRNA, exon_chr, str(strand), str(tar_start), transcript_id, exon_id, gene_symbol, is_mut])

    # filter duplicated targets
    unique_targets_set = set(tuple(element) for element in targets)
    unique_targets = [list(element) for element in unique_targets_set]

    return unique_targets

def format_prediction_output_with_mutation(targets, model_path):
    model = MultiHeadAttention_model(input_shape=(23, 4))
    model.load_weights(model_path)

    formatted_data = []

    for target in targets:
        # Encode the 23-nt target sequence (20-nt protospacer + PAM)
        encoded_seq = get_seqcode(target[0])

        # Predict on-target efficiency using the model
        prediction = float(list(model.predict(encoded_seq, verbose=0)[0])[0])
        if prediction > 100:
            prediction = 100

        # Format output
        gRNA = target[1]
        exon_chr = target[2]
        strand = target[3]
        tar_start = target[4]
        transcript_id = target[5]
        exon_id = target[6]
        gene_symbol = target[7]
        is_mut = target[8]
        formatted_data.append([gene_symbol, exon_chr, strand, tar_start, transcript_id,
                               exon_id, target[0], gRNA, prediction, is_mut])

    return formatted_data


def process_gene(gene_symbol, vcf_reader, model_path):
    transcripts = fetch_ensembl_transcripts(gene_symbol)
    results = []
    all_exons = []  # To accumulate all exons
    all_gene_sequences = []  # To accumulate all gene sequences

    if transcripts:
        for transcript in transcripts:
            Exons = transcript['Exon']
            all_exons.extend(Exons)  # Add all exons from this transcript to the list
            transcript_id = transcript['id']

            for Exon in Exons:
                exon_id = Exon['id']
                gene_sequence = fetch_ensembl_sequence(exon_id)  # Reference exon sequence
                if gene_sequence:
                    all_gene_sequences.append(gene_sequence)  # Add this gene sequence to the list
                    exon_chr = Exon['seq_region_name']
                    start = Exon['start']
                    end = Exon['end']
                    strand = Exon['strand']

                    targets = find_gRNA_with_mutation(gene_sequence, exon_chr, start, end, strand,
                                                      transcript_id, exon_id, gene_symbol, vcf_reader)
                    if targets:
                        # Predict on-target efficiency for each gRNA site, including mutated ones
                        formatted_data = format_prediction_output_with_mutation(targets, model_path)
                        results.extend(formatted_data)
                else:
                    print(f"Failed to retrieve gene sequence for exon {exon_id}.")
    else:
        print("Failed to retrieve transcripts.")

    # Return the predictions, the collected exon sequences, and all exons
    return results, all_gene_sequences, all_exons
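
# Usage sketch (the VCF path, weights path, and the "Gene" column name are
# assumptions; the remaining column names are the ones consumed by
# create_bed_file_from_df below):
#   vcf_reader = cyvcf2.VCF("variants.vcf.gz")  # indexed VCF with matching contig names
#   results, seqs, exons = process_gene("TP53", vcf_reader, "cas9_model/weights.h5")
#   df = pd.DataFrame(results, columns=["Gene", "Chr", "Strand", "Target Start",
#                                       "Transcript", "Exon", "Target", "gRNA",
#                                       "Prediction", "Is Mutation"])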

def create_genbank_features(data):
    features = []

    # If the input data is a DataFrame, convert it to a list of lists
    if isinstance(data, pd.DataFrame):
        formatted_data = data.values.tolist()
    elif isinstance(data, list):
        formatted_data = data
    else:
        raise TypeError("Data should be either a list or a pandas DataFrame.")

    # Row layout (see format_prediction_output_with_mutation):
    # [gene, chr, strand, target_start, transcript, exon, target_seq, gRNA, prediction, is_mutation]
    for row in formatted_data:
        try:
            start = int(row[3])  # target start lives at index 3 (index 1 is the chromosome name)
            end = start + len(row[6])  # end position follows from the target sequence length
        except ValueError as e:
            # mutated targets carry a '*' suffix on their start coordinate and are skipped here
            print(f"Error converting start to int: {row[3]} - {e}")
            continue

        strand = 1 if row[2] == '1' else -1  # strand lives at index 2
        location = FeatureLocation(start=start, end=end, strand=strand)
        is_mutation = 'Yes' if row[9] else 'No'
        feature = SeqFeature(location=location, type="misc_feature", qualifiers={
            'label': row[7],  # Use gRNA as the label
            'note': f"Prediction: {row[8]}, Mutation: {is_mutation}"  # Include the prediction score and mutation status
        })
        features.append(feature)

    return features

def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
    # Ensure gene_sequence is a string before creating the Seq object
    if not isinstance(gene_sequence, str):
        gene_sequence = str(gene_sequence)

    features = create_genbank_features(df)

    seq_obj = Seq(gene_sequence)
    record = SeqRecord(seq_obj, id=gene_symbol, name=gene_symbol,
                       description=f'CRISPR Cas9 predicted targets for {gene_symbol}', features=features)
    record.annotations["molecule_type"] = "DNA"
    SeqIO.write(record, output_path, "genbank")

def create_bed_file_from_df(df, output_path):
    with open(output_path, 'w') as bed_file:
        for index, row in df.iterrows():
            chrom = row["Chr"]
            # strip the '*' marker that flags coordinates estimated for mutated targets
            start = int(str(row["Target Start"]).rstrip('*'))
            end = start + len(row["Target"])  # end position follows from the target sequence length
            strand = '+' if row["Strand"] == '1' else '-'
            gRNA = row["gRNA"]
            score = str(row["Prediction"])
            is_mutation = 'Yes' if row["Is Mutation"] else 'No'
            # transcript_id is not part of the standard BED columns; kept here for completeness
            transcript_id = row["Transcript"]

            # The first six columns are standard BED; the mutation flag is a nonstandard seventh column
            bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\t{is_mutation}\n")

def create_csv_from_df(df, output_path):
    df.to_csv(output_path, index=False)
cas9off.py
ADDED
@@ -0,0 +1,134 @@
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import argparse

# configure GPUs
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if len(tf.config.list_physical_devices('GPU')) > 0:
    tf.config.experimental.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')

class Encoder:
    def __init__(self, on_seq, off_seq, with_category=False, label=None, with_reg_val=False, value=None):
        tlen = 24
        self.on_seq = "-" * (tlen - len(on_seq)) + on_seq
        self.off_seq = "-" * (tlen - len(off_seq)) + off_seq
        self.encoded_dict_indel = {'A': [1, 0, 0, 0, 0], 'T': [0, 1, 0, 0, 0],
                                   'G': [0, 0, 1, 0, 0], 'C': [0, 0, 0, 1, 0], '_': [0, 0, 0, 0, 1], '-': [0, 0, 0, 0, 0]}
        self.direction_dict = {'A': 5, 'G': 4, 'C': 3, 'T': 2, '_': 1}
        if with_category:
            self.label = label
        if with_reg_val:
            self.value = value
        self.encode_on_off_dim7()

    def encode_sgRNA(self):
        code_list = []
        encoded_dict = self.encoded_dict_indel
        sgRNA_bases = list(self.on_seq)
        for i in range(len(sgRNA_bases)):
            if sgRNA_bases[i] == "N":
                sgRNA_bases[i] = list(self.off_seq)[i]
            code_list.append(encoded_dict[sgRNA_bases[i]])
        self.sgRNA_code = np.array(code_list)

    def encode_off(self):
        code_list = []
        encoded_dict = self.encoded_dict_indel
        off_bases = list(self.off_seq)
        for i in range(len(off_bases)):
            code_list.append(encoded_dict[off_bases[i]])
        self.off_code = np.array(code_list)

    def encode_on_off_dim7(self):
        self.encode_sgRNA()
        self.encode_off()
        on_bases = list(self.on_seq)
        off_bases = list(self.off_seq)
        on_off_dim7_codes = []
        for i in range(len(on_bases)):
            diff_code = np.bitwise_or(self.sgRNA_code[i], self.off_code[i])
            on_b = on_bases[i]
            off_b = off_bases[i]
            if on_b == "N":
                on_b = off_b
            dir_code = np.zeros(2)
            if on_b == "-" or off_b == "-" or self.direction_dict[on_b] == self.direction_dict[off_b]:
                pass
            else:
                if self.direction_dict[on_b] > self.direction_dict[off_b]:
                    dir_code[0] = 1
                else:
                    dir_code[1] = 1
            on_off_dim7_codes.append(np.concatenate((diff_code, dir_code)))
        self.on_off_code = np.array(on_off_dim7_codes)
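
# Encoding note: each of the 24 aligned positions becomes a 7-dim vector. The
# first 5 channels are the bitwise OR of the on- and off-target one-hot codes
# (a mismatch therefore lights up two channels at once), and the last 2 channels
# mark the mismatch direction via the A>G>C>T>_ ranking in direction_dict.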

def encode_on_off_seq_pairs(input_file):
    inputs = pd.read_csv(input_file, delimiter=",", header=None, names=['on_seq', 'off_seq'])
    input_codes = []
    for idx, row in inputs.iterrows():
        on_seq = row['on_seq']
        off_seq = row['off_seq']
        en = Encoder(on_seq=on_seq, off_seq=off_seq)
        input_codes.append(en.on_off_code)
    input_codes = np.array(input_codes)
    input_codes = input_codes.reshape((len(input_codes), 1, 24, 7))
    y_pred = CRISPR_net_predict(input_codes)
    inputs['CRISPR_Net_score'] = y_pred
    inputs.to_csv("CRISPR_net_results.csv", index=False)

def CRISPR_net_predict(X_test):
    with open("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json", 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = tf.keras.models.model_from_json(loaded_model_json)  # Updated for TensorFlow 2
    loaded_model.load_weights("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5")
    y_pred = loaded_model.predict(X_test).flatten()
    return y_pred

def process_input_and_predict(input_data, input_type='manual'):
    if input_type == 'manual':
        sequences = [seq.split(',') for seq in input_data.split('\n')]
        inputs = pd.DataFrame(sequences, columns=['on_seq', 'off_seq'])
    elif input_type == 'file':
        inputs = pd.read_csv(input_data, delimiter=",", header=None, names=['on_seq', 'off_seq'])

    valid_inputs = []
    input_codes = []
    for idx, row in inputs.iterrows():
        on_seq = row['on_seq']
        off_seq = row['off_seq']
        if not on_seq or not off_seq:
            continue

        en = Encoder(on_seq=on_seq, off_seq=off_seq)
        input_codes.append(en.on_off_code)
        valid_inputs.append((on_seq, off_seq))

    input_codes = np.array(input_codes)
    input_codes = input_codes.reshape((len(input_codes), 1, 24, 7))

    y_pred = CRISPR_net_predict(input_codes)

    # Create a new DataFrame from the valid inputs and predictions
    result_df = pd.DataFrame(valid_inputs, columns=['on_seq', 'off_seq'])
    result_df['CRISPR_Net_score'] = y_pred

    return result_df
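
# Usage sketch (the sequence pair is taken from the argparse help text below):
#   df = process_input_and_predict(
#       "GAGT_CCGAGCAGAAGAAGAATGG,GAGTACCAAGTAGAAGAAAAATTT", input_type='manual')
#   # -> DataFrame with columns ['on_seq', 'off_seq', 'CRISPR_Net_score']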

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="CRISPR-Net v1.0 (Aug 10 2019)")
    parser.add_argument("input_file",
                        help="input_file example (on-target seq, off-target seq):\n GAGT_CCGAGCAGAAGAAGAATGG,GAGTACCAAGTAGAAGAAAAATTT\n"
                             "GTTGCCCCACAGGGCAGTAAAGG,GTGGACACCCCGGGCAGGAAAGG\n"
                             "GGGTGGGGGGAGTTTGCTCCAGG,AGGTGGGGTGA_TTTGCTCCAGG")
    args = parser.parse_args()
    file = args.input_file
    if not os.path.exists(args.input_file):
        print("File doesn't exist!")
    else:
        encode_on_off_seq_pairs(file)
    tf.keras.backend.clear_session()
tiger.py
ADDED
@@ -0,0 +1,417 @@
import argparse
import os
import gzip
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from Bio import SeqIO

# column names
ID_COL = 'Transcript ID'
SEQ_COL = 'Transcript Sequence'
TARGET_COL = 'Target Sequence'
GUIDE_COL = 'Guide Sequence'
MM_COL = 'Number of Mismatches'
SCORE_COL = 'Guide Score'

# nucleotide tokens
NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T', 'N'], [0, 1, 2, 3, 255]))
NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))

# model hyper-parameters
GUIDE_LEN = 23
CONTEXT_5P = 3
CONTEXT_3P = 0
TARGET_LEN = CONTEXT_5P + GUIDE_LEN + CONTEXT_3P
UNIT_INTERVAL_MAP = 'sigmoid'

# reference transcript files
REFERENCE_TRANSCRIPTS = ('gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz')

# application configuration
BATCH_SIZE_COMPUTE = 500
BATCH_SIZE_SCAN = 20
BATCH_SIZE_TRANSCRIPTS = 50
NUM_TOP_GUIDES = 10
NUM_MISMATCHES = 3
RUN_MODES = dict(
    all='All on-target guides per transcript',
    top_guides='Top {:d} guides per transcript'.format(NUM_TOP_GUIDES),
    titration='Top {:d} guides per transcript & their titration candidates'.format(NUM_TOP_GUIDES)
)


# configure GPUs
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if len(tf.config.list_physical_devices('GPU')) > 0:
    tf.config.experimental.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')


def load_transcripts(fasta_files: list, enforce_unique_ids: bool = True):

    # load all transcripts from fasta files into a DataFrame
    transcripts = pd.DataFrame()
    for file in fasta_files:
        try:
            if os.path.splitext(file)[1] == '.gz':
                with gzip.open(file, 'rt') as f:
                    df = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(f, 'fasta')], columns=[ID_COL, SEQ_COL])
            else:
                df = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')], columns=[ID_COL, SEQ_COL])
        except Exception as e:
            print(e, 'while loading', file)
            continue
        transcripts = pd.concat([transcripts, df])

    # set index
    transcripts[ID_COL] = transcripts[ID_COL].apply(lambda s: s.split('|')[0])
    transcripts.set_index(ID_COL, inplace=True)
    if enforce_unique_ids:
        assert not transcripts.index.has_duplicates, "duplicate transcript ID's detected in fasta file"

    return transcripts


def sequence_complement(sequence: list):
    return [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in sequence]
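
# Illustrative example: sequence_complement complements without reversing,
#   sequence_complement(['ACGT'])  # -> ['TGCA']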


def one_hot_encode_sequence(sequence: list, add_context_padding: bool = False):

    # stack list of sequences into a tensor
    sequence = tf.ragged.stack([tf.constant(list(seq)) for seq in sequence], axis=0)

    # tokenize sequence
    nucleotide_table = tf.lookup.StaticVocabularyTable(
        initializer=tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(list(NUCLEOTIDE_TOKENS.keys()), dtype=tf.string),
            values=tf.constant(list(NUCLEOTIDE_TOKENS.values()), dtype=tf.int64)),
        num_oov_buckets=1)
    sequence = tf.RaggedTensor.from_row_splits(values=nucleotide_table.lookup(sequence.values),
                                               row_splits=sequence.row_splits).to_tensor(255)

    # add context padding if requested
    if add_context_padding:
        pad_5p = 255 * tf.ones([sequence.shape[0], CONTEXT_5P], dtype=sequence.dtype)
        pad_3p = 255 * tf.ones([sequence.shape[0], CONTEXT_3P], dtype=sequence.dtype)
        sequence = tf.concat([pad_5p, sequence, pad_3p], axis=1)

    # one-hot encode
    sequence = tf.one_hot(sequence, depth=4, dtype=tf.float16)

    return sequence


def process_data(transcript_seq: str):

    # convert to upper case
    transcript_seq = transcript_seq.upper()

    # get all target sites
    target_seq = [transcript_seq[i: i + TARGET_LEN] for i in range(len(transcript_seq) - TARGET_LEN + 1)]

    # prepare guide sequences
    guide_seq = sequence_complement([seq[CONTEXT_5P:len(seq) - CONTEXT_3P] for seq in target_seq])

    # model inputs
    model_inputs = tf.concat([
        tf.reshape(one_hot_encode_sequence(target_seq, add_context_padding=False), [len(target_seq), -1]),
        tf.reshape(one_hot_encode_sequence(guide_seq, add_context_padding=True), [len(guide_seq), -1]),
    ], axis=-1)

    return target_seq, guide_seq, model_inputs
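
# Shape note (from the constants above): TARGET_LEN = 3 + 23 + 0 = 26, so each
# target flattens to 26*4 = 104 features; each 23-nt guide is padded back to 26
# positions and contributes another 104, giving model_inputs of shape
# (num_sites, 208).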


def calibrate_predictions(predictions: np.array, num_mismatches: np.array, params: pd.DataFrame = None):
    if params is None:
        params = pd.read_pickle('calibration_params.pkl')
    correction = np.squeeze(params.set_index('num_mismatches').loc[num_mismatches, 'slope'].to_numpy())
    return correction * predictions


def score_predictions(predictions: np.array, params: pd.DataFrame = None):
    if params is None:
        params = pd.read_pickle('scoring_params.pkl')

    if UNIT_INTERVAL_MAP == 'sigmoid':
        params = params.iloc[0]
        return 1 - 1 / (1 + np.exp(params['a'] * predictions + params['b']))

    elif UNIT_INTERVAL_MAP == 'min-max':
        return 1 - (predictions - params['a']) / (params['b'] - params['a'])

    elif UNIT_INTERVAL_MAP == 'exp-lin-exp':
        # regime indices
        active_saturation = predictions < params['a']
        linear_regime = (params['a'] <= predictions) & (predictions <= params['c'])
        inactive_saturation = params['c'] < predictions

        # linear regime
        slope = (params['d'] - params['b']) / (params['c'] - params['a'])
        intercept = -params['a'] * slope + params['b']
        predictions[linear_regime] = slope * predictions[linear_regime] + intercept

        # active saturation regime
        alpha = slope / params['b']
        beta = alpha * params['a'] - np.log(params['b'])
        predictions[active_saturation] = np.exp(alpha * predictions[active_saturation] - beta)

        # inactive saturation regime
        alpha = slope / (1 - params['d'])
        beta = -alpha * params['c'] - np.log(1 - params['d'])
        predictions[inactive_saturation] = 1 - np.exp(-alpha * predictions[inactive_saturation] - beta)

        return 1 - predictions

    else:
        raise NotImplementedError
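
# Note on the default 'sigmoid' map: a raw log-fold-change estimate x becomes
#   score = 1 - 1 / (1 + exp(a*x + b)),
# a logistic squashing of the regression output onto the unit interval, with
# the fitted a and b taken from scoring_params.pkl.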


def get_on_target_predictions(transcripts: pd.DataFrame, model: tf.keras.Model, status_update_fn=None):

    # loop over transcripts
    predictions = pd.DataFrame()
    for i, (index, row) in enumerate(transcripts.iterrows()):

        # parse transcript sequence
        target_seq, guide_seq, model_inputs = process_data(row[SEQ_COL])

        # get predictions
        lfc_estimate = model.predict(model_inputs, batch_size=BATCH_SIZE_COMPUTE, verbose=False)[:, 0]
        lfc_estimate = calibrate_predictions(lfc_estimate, num_mismatches=np.zeros_like(lfc_estimate))
        scores = score_predictions(lfc_estimate)
        predictions = pd.concat([predictions, pd.DataFrame({
            ID_COL: [index] * len(scores),
            TARGET_COL: target_seq,
            GUIDE_COL: guide_seq,
            SCORE_COL: scores})])

        # progress update
        percent_complete = 100 * min((i + 1) / len(transcripts), 1)
        update_text = 'Evaluating on-target guides for each transcript: {:.2f}%'.format(percent_complete)
        print('\r' + update_text, end='')
        if status_update_fn is not None:
            status_update_fn(update_text, percent_complete)
    print('')

    return predictions


def top_guides_per_transcript(predictions: pd.DataFrame):

    # select and sort the top guides for each transcript
    top_guides = pd.DataFrame()
    for transcript in predictions[ID_COL].unique():
        df = predictions.loc[predictions[ID_COL] == transcript]
        df = df.sort_values(SCORE_COL, ascending=False).reset_index(drop=True).iloc[:NUM_TOP_GUIDES]
        top_guides = pd.concat([top_guides, df])

    return top_guides.reset_index(drop=True)
def get_titration_candidates(top_guide_predictions: pd.DataFrame):

    # generate a table of all titration candidates
    titration_candidates = pd.DataFrame()
    for _, row in top_guide_predictions.iterrows():
        for i in range(len(row[GUIDE_COL])):
            nt = row[GUIDE_COL][i]
            for mutation in set(NUCLEOTIDE_TOKENS.keys()) - {nt, 'N'}:
                sm_guide = list(row[GUIDE_COL])
                sm_guide[i] = mutation
                sm_guide = ''.join(sm_guide)
                assert row[GUIDE_COL] != sm_guide
                titration_candidates = pd.concat([titration_candidates, pd.DataFrame({
                    ID_COL: [row[ID_COL]],
                    TARGET_COL: [row[TARGET_COL]],
                    GUIDE_COL: [sm_guide],
                    MM_COL: [1]
                })])

    return titration_candidates
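
# Count note: each guide has GUIDE_LEN = 23 positions and 3 alternative bases
# per position, so every top guide yields 23 * 3 = 69 single-mismatch titration
# candidates (MM_COL is fixed at 1 for all of them).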
def find_off_targets(top_guides: pd.DataFrame, status_update_fn=None):

    # load reference transcripts
    reference_transcripts = load_transcripts([os.path.join('transcripts', f) for f in REFERENCE_TRANSCRIPTS])

    # one-hot encode guides to form a filter
    guide_filter = one_hot_encode_sequence(sequence_complement(top_guides[GUIDE_COL]), add_context_padding=False)
    guide_filter = tf.transpose(guide_filter, [1, 2, 0])

    # loop over transcripts in batches
    i = 0
    off_targets = pd.DataFrame()
    while i < len(reference_transcripts):
        # select batch
        df_batch = reference_transcripts.iloc[i:min(i + BATCH_SIZE_SCAN, len(reference_transcripts))]
        i += BATCH_SIZE_SCAN

        # find locations of off-targets
        transcripts = one_hot_encode_sequence(df_batch[SEQ_COL].values.tolist(), add_context_padding=False)
        num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
        loc_off_targets = tf.where(tf.round(num_mismatches) <= NUM_MISMATCHES).numpy()

        # off-targets discovered
        if len(loc_off_targets) > 0:

            # log off-targets
            dict_off_targets = pd.DataFrame({
                'On-target ' + ID_COL: top_guides.iloc[loc_off_targets[:, 2]][ID_COL],
                GUIDE_COL: top_guides.iloc[loc_off_targets[:, 2]][GUIDE_COL],
                'Off-target ' + ID_COL: df_batch.index.values[loc_off_targets[:, 0]],
                'Guide Midpoint': loc_off_targets[:, 1],
                SEQ_COL: df_batch[SEQ_COL].values[loc_off_targets[:, 0]],
                MM_COL: tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
            }).to_dict('records')

            # trim transcripts to targets
            for row in dict_off_targets:
                start_location = row['Guide Midpoint'] - (GUIDE_LEN // 2)
                del row['Guide Midpoint']
                target = row[SEQ_COL]
                del row[SEQ_COL]
                if start_location < CONTEXT_5P:
                    target = target[0:GUIDE_LEN + CONTEXT_3P]
                    target = 'N' * (TARGET_LEN - len(target)) + target
                elif start_location + GUIDE_LEN + CONTEXT_3P > len(target):
                    target = target[start_location - CONTEXT_5P:]
                    target = target + 'N' * (TARGET_LEN - len(target))
                else:
                    target = target[start_location - CONTEXT_5P:start_location + GUIDE_LEN + CONTEXT_3P]
                if row[MM_COL] == 0 and 'N' not in target:
                    assert row[GUIDE_COL] == sequence_complement([target[CONTEXT_5P:TARGET_LEN - CONTEXT_3P]])[0]
                row[TARGET_COL] = target

            # append new off-targets
            off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])

        # progress update
        percent_complete = 100 * min((i + 1) / len(reference_transcripts), 1)
        update_text = 'Scanning for off-targets: {:.2f}%'.format(percent_complete)
        print('\r' + update_text, end='')
        if status_update_fn is not None:
            status_update_fn(update_text, percent_complete)
    print('')

    return off_targets
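
# How the scan works: the complemented guides are stacked as conv1d filters, so
# the convolution at each position counts bases that match between guide and
# transcript; GUIDE_LEN minus that correlation gives the mismatch count, and
# any window with at most NUM_MISMATCHES (3) mismatches is kept as a candidate.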
def predict_off_target(off_targets: pd.DataFrame, model: tf.keras.Model):
    if len(off_targets) == 0:
        return pd.DataFrame()

    # compute off-target predictions
    model_inputs = tf.concat([
        tf.reshape(one_hot_encode_sequence(off_targets[TARGET_COL], add_context_padding=False), [len(off_targets), -1]),
        tf.reshape(one_hot_encode_sequence(off_targets[GUIDE_COL], add_context_padding=True), [len(off_targets), -1]),
    ], axis=-1)
    lfc_estimate = model.predict(model_inputs, batch_size=BATCH_SIZE_COMPUTE, verbose=False)[:, 0]
    lfc_estimate = calibrate_predictions(lfc_estimate, off_targets[MM_COL].to_numpy())
    off_targets[SCORE_COL] = score_predictions(lfc_estimate)

    return off_targets.reset_index(drop=True)
def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_update_fn=None):

    # load model
    if os.path.exists('cas13_model'):
        tiger = tf.keras.models.load_model('cas13_model')
    else:
        print('no saved model!')
        exit()

    # evaluate all on-target guides per transcript
    on_target_predictions = get_on_target_predictions(transcripts, tiger, status_update_fn)

    # initialize other outputs
    titration_predictions = off_target_predictions = None

    if mode == 'all' and not check_off_targets:
        off_target_candidates = None

    elif mode == 'top_guides':
        on_target_predictions = top_guides_per_transcript(on_target_predictions)
        off_target_candidates = on_target_predictions

    elif mode == 'titration':
        on_target_predictions = top_guides_per_transcript(on_target_predictions)
        titration_candidates = get_titration_candidates(on_target_predictions)
        titration_predictions = predict_off_target(titration_candidates, model=tiger)
        off_target_candidates = pd.concat([on_target_predictions, titration_predictions])

    else:
        raise NotImplementedError

    # check off-target effects for top guides
    if check_off_targets and off_target_candidates is not None:
        off_target_candidates = find_off_targets(off_target_candidates, status_update_fn)
        off_target_predictions = predict_off_target(off_target_candidates, model=tiger)
        if len(off_target_predictions) > 0:
            off_target_predictions = off_target_predictions.sort_values(SCORE_COL, ascending=False)
            off_target_predictions = off_target_predictions.reset_index(drop=True)

    # finalize tables
    for df in [on_target_predictions, titration_predictions, off_target_predictions]:
        if df is not None and len(df) > 0:
            for col in df.columns:
                if ID_COL in col and set(df[col].unique()) == {'ManualEntry'}:
                    del df[col]
            df[GUIDE_COL] = df[GUIDE_COL].apply(lambda s: s[::-1])  # reverse guide sequences
            df[TARGET_COL] = df[TARGET_COL].apply(lambda seq: seq[CONTEXT_5P:len(seq) - CONTEXT_3P])  # remove context

    return on_target_predictions, titration_predictions, off_target_predictions
if __name__ == '__main__':

    # common arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='titration')
    parser.add_argument('--check_off_targets', action='store_true', default=False)
    parser.add_argument('--fasta_path', type=str, default=None)
    args = parser.parse_args()

    # check for any existing results
    if os.path.exists('on_target.csv') or os.path.exists('titration.csv') or os.path.exists('off_target.csv'):
        raise FileExistsError('please rename or delete existing results')

    # load transcripts from a directory of fasta files
    if args.fasta_path is not None and os.path.exists(args.fasta_path):
        df_transcripts = load_transcripts([os.path.join(args.fasta_path, f) for f in os.listdir(args.fasta_path)])

    # otherwise fall back to a simple test case: the first 50 nucleotides of EIF3B-003's CDS
    else:
        df_transcripts = pd.DataFrame({
            ID_COL: ['ManualEntry'],
            SEQ_COL: ['ATGCAGGACGCGGAGAACGTGGCGGTGCCCGAGGCGGCCGAGGAGCGCGC']})
        df_transcripts.set_index(ID_COL, inplace=True)

    # process in batches
    batch = 0
    num_batches = len(df_transcripts) // BATCH_SIZE_TRANSCRIPTS
    num_batches += (len(df_transcripts) % BATCH_SIZE_TRANSCRIPTS > 0)
    for idx in range(0, len(df_transcripts), BATCH_SIZE_TRANSCRIPTS):
        batch += 1
        print('Batch {:d} of {:d}'.format(batch, num_batches))

        # run batch
        idx_stop = min(idx + BATCH_SIZE_TRANSCRIPTS, len(df_transcripts))
        df_on_target, df_titration, df_off_target = tiger_exhibit(
            transcripts=df_transcripts[idx:idx_stop],
            mode=args.mode,
            check_off_targets=args.check_off_targets
        )

        # save batch results
        df_on_target.to_csv('on_target.csv', header=batch == 1, index=False, mode='a')
        if df_titration is not None:
            df_titration.to_csv('titration.csv', header=batch == 1, index=False, mode='a')
        if df_off_target is not None:
            df_off_target.to_csv('off_target.csv', header=batch == 1, index=False, mode='a')

        # clear the session to keep memory bounded across batches
        tf.keras.backend.clear_session()
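
# CLI sketch (the fasta directory name is hypothetical; the flags are defined above):
#   python tiger.py --mode top_guides --check_off_targets --fasta_path ./fastas
# Valid --mode values are the RUN_MODES keys: 'all', 'top_guides', 'titration'.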