# Hugging Face Space (status: sleeping) · file size: 5,783 bytes · revision a52cd62
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import re
import tempfile
# Symbolic phrase dictionary, read once at import time from the JSON file
# that sits next to this script. Lookups treat it as {keyword: seed}.
with open("semillas.json", mode="r", encoding="utf-8") as seeds_file:
    diccionario_semillas = json.load(seeds_file)
def phrase_to_seed(phrase):
    """Return the seed of the first dictionary keyword found in *phrase*.

    The comparison is a case-insensitive substring test, carried out in the
    iteration order of ``diccionario_semillas``. When no keyword occurs in
    the phrase, the fallback seed ``"M"`` is returned.
    """
    lowered = phrase.lower()
    matching_seeds = (
        seed
        for keyword, seed in diccionario_semillas.items()
        if keyword.lower() in lowered
    )
    # next() stops at the first hit, exactly like an early return in a loop.
    return next(matching_seeds, "M")
# Tokenizer and causal language model, both loaded once at import time from
# the same checkpoint identifier.
_CHECKPOINT = "nferruz/ProtGPT2"

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, do_lower_case=False)
# Padding reuses the end-of-sequence token, so tokenizer calls made with
# padding=True have a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT)
# Average masses (Da) of the *free* amino acids, keyed by one-letter code.
_FREE_AMINO_ACID_MASS = {
    "A": 89.1, "C": 121.2, "D": 133.1, "E": 147.1, "F": 165.2,
    "G": 75.1, "H": 155.2, "I": 131.2, "K": 146.2, "L": 131.2,
    "M": 149.2, "N": 132.1, "P": 115.1, "Q": 146.2, "R": 174.2,
    "S": 105.1, "T": 119.1, "V": 117.1, "W": 204.2, "Y": 181.2,
}

# Mass (Da) of the water molecule released when one peptide bond forms.
_WATER_MASS = 18.015


def _sequence_properties(seq):
    """Return ``(length, charge, mw)`` for a one-letter protein sequence.

    * ``length`` is the number of characters in *seq*.
    * ``charge`` is a simple estimate: count of K and R minus count of D and E.
    * ``mw`` is the sum of the free amino-acid masses of the 20 standard
      residues minus one water per peptide bond between them. Characters
      outside the standard alphabet contribute neither mass nor water.
    """
    counts = {aa: seq.count(aa) for aa in _FREE_AMINO_ACID_MASS}
    charge = counts["K"] + counts["R"] - counts["D"] - counts["E"]

    free_mass = sum(
        counts[aa] * mass for aa, mass in _FREE_AMINO_ACID_MASS.items()
    )
    # The table holds free amino-acid masses, so a chain of n residues is
    # lighter than their plain sum by (n - 1) water molecules.
    peptide_bonds = max(sum(counts.values()) - 1, 0)
    mw = free_mass - peptide_bonds * _WATER_MASS

    return len(seq), charge, mw


def generate_protein_and_props(phrase):
    """Generate a protein from a symbolic phrase and summarise it.

    The phrase is mapped to a seed with ``phrase_to_seed``; the seed primes
    the language model, which samples one sequence (``max_length=100``,
    ``min_length=20``, ``top_k=50``, ``temperature=0.9``).

    Args:
        phrase: GeneForgeLang phrase typed by the user.

    Returns:
        A ``(props, fasta_path)`` tuple: ``props`` is a human-readable
        report (seed, sequence, length, charge, molecular weight) and
        ``fasta_path`` is the path of a temporary ``.fasta`` file holding
        the sequence. The file is created with ``delete=False`` so that it
        still exists when the caller serves it for download.
    """
    seed = phrase_to_seed(phrase)

    inputs = tokenizer(seed, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs.get("attention_mask", torch.ones_like(input_ids))

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=100,
            min_length=20,
            do_sample=True,
            top_k=50,
            temperature=0.9,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text can carry FASTA-style line breaks and padding
    # whitespace; strip them so they are neither counted as residues nor
    # written into the middle of the FASTA record.
    seq = "".join(decoded.split())

    length, charge, mw = _sequence_properties(seq)
    props = f"🧪 Seed: {seed}\n🧬 Protein: {seq}\n\n🔬 Properties:\n- Length: {length} aa\n- Charge: {charge}\n- MW: {mw:.1f} Da"

    # Save to FASTA.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".fasta", mode="w", encoding="utf-8"
    ) as fasta_file:
        fasta_file.write(f">Generated_Protein\n{seq}\n")
        fasta_path = fasta_file.name

    return props, fasta_path
# Heuristic (tag, pattern) rules; a tag is reported when its pattern is found
# anywhere in the normalised sequence. Compiled once at import time.
_MOTIF_PATTERNS = (
    ("Dom(Kin)", re.compile(r"^M*K{3,}")),
    ("Mot(NLS)", re.compile(r"[RK]{3,}")),
    ("*AcK@X", re.compile(r"KQAK|QAK")),
    ("*P@X", re.compile(r"[RST]P")),
    ("Localize(Nucleus)", re.compile(r"PRKRK|PKKKRKV")),
    ("Localize(Membrane)", re.compile(r"(AILFL|LAGGAV|LVLL|AAVL)")),
)


def _normalize_sequence(seq):
    """Return *seq* as one upper-case run of residues.

    FASTA header lines (starting with ``>``) are dropped and all whitespace,
    including line breaks, is removed. ``None`` and empty input give ``""``.
    """
    if not seq:
        return ""
    residue_lines = (
        line for line in seq.splitlines() if not line.lstrip().startswith(">")
    )
    return "".join("".join(residue_lines).split()).upper()


def sequence_to_phrase(seq):
    """Translate a protein sequence into a GeneForgeLang phrase.

    The sequence is normalised (case, whitespace, FASTA headers) and then
    scanned with a fixed set of pattern heuristics. Each matching heuristic
    contributes one tag; the tags are de-duplicated, sorted and joined with
    ``-`` after the ``^p:`` prefix.

    Args:
        seq: Protein sequence in one-letter code. May be multi-line, may
            include a FASTA header, may be empty or ``None``.

    Returns:
        The phrase, e.g. ``"^p:Dom(Kin)-Mot(NLS)"``, or the literal
        ``"// No symbolic motifs found"`` when no heuristic matches.
    """
    residues = _normalize_sequence(seq)

    tags = {tag for tag, pattern in _MOTIF_PATTERNS if pattern.search(residues)}
    # The PEST heuristic is count-based rather than a single pattern:
    # at least five E residues overall, or the literal "DEG".
    if residues.count("E") >= 5 or "DEG" in residues:
        tags.add("Mot(PEST)")

    if not tags:
        return "// No symbolic motifs found"
    return "^p:" + "-".join(sorted(tags))
# Tags rendered as noun phrases after "This protein contains ...".
_FEATURE_DESCRIPTIONS = {
    "Dom(Kin)": "a kinase domain",
    "Mot(NLS)": "a nuclear localization signal",
    "Mot(PEST)": "a PEST motif indicating protein degradation",
    "*AcK@X": "lysine acetylation at a specific position",
    "*P@X": "a phosphorylation site",
}

# Tags rendered as verb phrases; they cannot follow "contains".
_LOCALIZATION_DESCRIPTIONS = {
    "Localize(Nucleus)": "localizes to the cell nucleus",
    "Localize(Membrane)": "localizes to the cell membrane",
}

# Modifications written with a concrete position, e.g. "*AcK@147".
_POSITIONED_MODIFICATIONS = {
    "*AcK": "lysine acetylation at position {position}",
    "*P": "a phosphorylation site at position {position}",
}

# One tag: an optional leading "*" followed by everything up to the next "*".
_TAG_PATTERN = re.compile(r"\*?[^*]+")
_POSITIONED_PATTERN = re.compile(r"(\*\w+)@(\d+)")


def _split_tags(phrase):
    """Split a phrase into tags on ``-`` and before each ``*`` modifier."""
    body = (phrase or "").replace("^p:", "")
    tags = []
    for fragment in body.split("-"):
        tags.extend(tag.strip() for tag in _TAG_PATTERN.findall(fragment))
    return [tag for tag in tags if tag]


def _describe_feature(tag):
    """Return the noun phrase for *tag*, or *tag* itself when unknown."""
    if tag in _FEATURE_DESCRIPTIONS:
        return _FEATURE_DESCRIPTIONS[tag]
    match = _POSITIONED_PATTERN.fullmatch(tag)
    if match and match.group(1) in _POSITIONED_MODIFICATIONS:
        template = _POSITIONED_MODIFICATIONS[match.group(1)]
        return template.format(position=match.group(2))
    return tag


def _join_clauses(items):
    """Join items as ``a``, ``a, and b`` or ``a, b, and c``."""
    if len(items) == 1:
        return items[0]
    return ", ".join(items[:-1]) + f", and {items[-1]}"


def phrase_to_description(phrase):
    """Describe a GeneForgeLang phrase in plain English.

    The ``^p:`` prefix is removed and the phrase is split into tags, both on
    ``-`` and in front of each ``*`` modification, so that a phrase such as
    ``^p:Dom(Kin)-Mot(NLS)*AcK@147`` yields three tags. Known tags are
    translated; modifications with a numeric position mention that position;
    unknown tags are passed through verbatim.

    Args:
        phrase: The phrase to describe. May be empty or ``None``.

    Returns:
        A sentence starting with ``"This protein"``. Structural features
        follow "contains"; localization tags form their own clause so the
        sentence stays grammatical. When the phrase holds no tags, the
        literal ``"No interpretable symbolic elements found."`` is returned.
    """
    tags = _split_tags(phrase)
    features = [
        _describe_feature(tag)
        for tag in tags
        if tag not in _LOCALIZATION_DESCRIPTIONS
    ]
    localizations = [
        _LOCALIZATION_DESCRIPTIONS[tag]
        for tag in tags
        if tag in _LOCALIZATION_DESCRIPTIONS
    ]

    if not features and not localizations:
        return "No interpretable symbolic elements found."

    clauses = []
    if features:
        clauses.append("contains " + _join_clauses(features))
    if localizations:
        # After a "contains" clause the verb phrase needs its own subject.
        subject = "it " if features else ""
        clauses.append(subject + _join_clauses(localizations))
    return "This protein " + "; ".join(clauses) + "."
def analyze_graph(seq):
    """Return the amino-acid composition plot for *seq*.

    The plotting helper ``generar_composicion_grafico`` is looked up at call
    time. It is neither defined nor imported in this file, so calling it
    directly raised ``NameError`` on every click; a missing helper is now
    reported to the user as a ``gr.Error`` instead.
    """
    plot_composition = globals().get("generar_composicion_grafico")
    if plot_composition is None:
        raise gr.Error(
            "Amino acid composition plotting is not available in this build."
        )
    return plot_composition(seq)


# User interface: one tab per tool, each wiring a text input to a function.
with gr.Blocks() as demo:
    gr.Markdown("# 🧬 GeneForgeLang AI Tools")
    gr.Markdown("Design, interpret, describe, and export proteins using symbolic language and AI.")

    with gr.Tab("🧠 Phrase → Protein"):
        inp = gr.Textbox(label="GeneForgeLang Phrase", placeholder="^p:Dom(Kin)-Mot(NLS)*AcK@147")
        out = gr.Textbox(label="Protein + Properties")
        fasta = gr.File(label="Download FASTA")
        btn = gr.Button("Generate")
        btn.click(fn=generate_protein_and_props, inputs=inp, outputs=[out, fasta])

    with gr.Tab("🧪 Protein → Phrase"):
        inp2 = gr.Textbox(label="Protein Sequence", placeholder="MKKKPRRRDEEGEK...")
        out2 = gr.Textbox(label="Interpreted GeneForgeLang")
        btn2 = gr.Button("Translate")
        btn2.click(fn=sequence_to_phrase, inputs=inp2, outputs=out2)

    with gr.Tab("🧬 Mutate Protein"):
        inp4 = gr.Textbox(label="GeneForgeLang Phrase", placeholder="^p:Dom(Kin)-Mot(NLS)*AcK@147")
        out4 = gr.Textbox(label="Mutated Protein")
        btn4 = gr.Button("Mutate")
        # This tab reuses the generator, which returns (text, fasta_path);
        # the hidden File component only absorbs the second return value.
        hidden_fasta = gr.File(visible=False)
        btn4.click(fn=generate_protein_and_props, inputs=inp4, outputs=[out4, hidden_fasta])

    with gr.Tab("📊 Analyze Protein"):
        inp5 = gr.Textbox(label="Protein Sequence", placeholder="Paste sequence to analyze")
        out5 = gr.Image(label="Amino Acid Composition")
        btn5 = gr.Button("Analyze")
        btn5.click(fn=analyze_graph, inputs=inp5, outputs=out5)

    with gr.Tab("📖 Phrase → Natural Language"):
        inp3 = gr.Textbox(label="GeneForgeLang Phrase", placeholder="^p:Dom(Kin)-Mot(NLS)*AcK@147")
        out3 = gr.Textbox(label="Scientific Description")
        btn3 = gr.Button("Describe")
        btn3.click(fn=phrase_to_description, inputs=inp3, outputs=out3)

if __name__ == "__main__":
    demo.launch()