Spaces:

fneurociencias
/

GeneForgeLang

Sleeping

App Files Files Community

ManMenGon committed on Apr 22

Commit

a52cd62

verified ·

1 Parent(s): d1c63e4

Upload 5 files

Browse files

Files changed (5) hide show

README.md +75 -14
app.py +141 -0
requirements.txt +0 -0
semillas.json +12 -0
translate_to_geneforgelang.py +49 -0

README.md CHANGED Viewed

@@ -1,14 +1,75 @@
----
-title: GeneForgeLang
-emoji: ⚡
-colorFrom: gray
-colorTo: red
-sdk: gradio
-sdk_version: 5.25.2
-app_file: app.py
-pinned: false
-license: mit
-short_description: Symbolic-to-Sequence Protein Design Toolkit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# 🧬 GeneForgeLang: Symbolic-to-Sequence Protein Design Toolkit
+GeneForgeLang is a symbolic language and toolset for generative biology. It connects high-level biological design intentions to low-level amino acid sequences via AI, rules, and natural language.
+---
+## 🚀 Features
+| Module                      | Description |
+|----------------------------|-------------|
+| 🧠 Phrase → Protein         | Generate realistic protein sequences from symbolic phrases |
+| 🧪 Protein → Phrase         | Infer functional motifs from amino acid sequences |
+| 📖 Phrase → Description     | Translate symbolic design into scientific English |
+| 🧬 Mutate Protein           | Generate variants of proteins from the same symbolic seed |
+| 📦 Export to FASTA          | Download generated proteins for downstream use |
+| 📊 Analyze Protein          | Visualize amino acid composition as bar plot |
+| 📚 Symbolic Language        | GeneForgeLang syntax allows structured protein definitions |
+---
+## 🧪 Example
+### Input Phrase:
+```
+^p:Dom(Kin)-Mot(NLS)*AcK@147=Localize(Nucleus)
+```
+### Output:
+- Seed: `MKKK`
+- Generated protein: realistic sequence (via ProtGPT2)
+- Properties: length, charge, MW
+- Description: *“This protein contains a kinase domain, a nuclear localization signal, and lysine acetylation at a specific position.”*
+- Export: `.fasta` format
+- Graph: bar plot of amino acid composition
+---
+## ▶️ How to Use
+1. Clone this repo
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Launch the interface:
+   ```bash
+   python app.py
+   ```
+4. Navigate to:
+   ```
+   http://127.0.0.1:7860
+   ```
+---
+## 📁 Repository Structure
+| File | Description |
+|------|-------------|
+| `app.py` | Main UI app with all functionality |
+| `semillas.json` | Phrase-to-seed dictionary |
+| `translate_to_geneforgelang.py` | Reverse translator |
+| `README.md` | This file |
+| `requirements.txt` | Python dependencies |
+---
+## 🧠 Developed by
+Fundación de Neurociencias
+
+Licensed under the MIT License.
+
+Join us in shaping symbolic bio-AI.

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import json
+import re
+import tempfile
+# Symbolic tag -> seed peptide table, read once when the module is imported.
+with open("semillas.json", mode="r", encoding="utf-8") as seeds_file:
+    diccionario_semillas = json.loads(seeds_file.read())
+def phrase_to_seed(phrase):
+    """Return the seed peptide of the first dictionary tag found in *phrase*.
+
+    Tags are tried in the iteration order of ``diccionario_semillas`` using a
+    case-insensitive substring test. If no tag occurs in the phrase, the
+    fallback seed ``"M"`` is returned.
+    """
+    lowered = phrase.lower()
+    candidates = (
+        seed
+        for tag, seed in diccionario_semillas.items()
+        if tag.lower() in lowered
+    )
+    return next(candidates, "M")
+# Load the ProtGPT2 tokenizer and causal language model once, at import time.
+tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2", do_lower_case=False)
+# The EOS token doubles as the pad token; generate_protein_and_props tokenizes
+# with padding=True and passes eos_token_id as pad_token_id to generate().
+tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2")
+def _sequence_properties(seq):
+    """Return ``(length, charge, mw)`` for a string of one-letter residues.
+
+    ``length`` is the number of characters in *seq*, ``charge`` is the count
+    of K and R minus the count of D and E, and ``mw`` is the estimated
+    molecular weight in daltons over the 20 standard residues.
+    """
+    # Average masses of the free amino acids, in daltons.
+    free_mass = {
+        "A": 89.1, "C": 121.2, "D": 133.1, "E": 147.1, "F": 165.2,
+        "G": 75.1, "H": 155.2, "I": 131.2, "K": 146.2, "L": 131.2,
+        "M": 149.2, "N": 132.1, "P": 115.1, "Q": 146.2, "R": 174.2,
+        "S": 105.1, "T": 119.1, "V": 117.1, "W": 204.2, "Y": 181.2,
+    }
+    aa_count = {aa: seq.count(aa) for aa in free_mass}
+    charge = aa_count["K"] + aa_count["R"] - aa_count["D"] - aa_count["E"]
+    # Each peptide bond releases one water molecule, so a chain of n residues
+    # weighs the sum of its free amino acids minus (n - 1) waters.
+    n_standard = sum(aa_count.values())
+    mw = sum(aa_count[aa] * mass for aa, mass in free_mass.items())
+    mw -= max(n_standard - 1, 0) * 18.02
+    return len(seq), charge, mw
+def generate_protein_and_props(phrase):
+    """Generate a protein from a GeneForgeLang phrase and summarise it.
+
+    The phrase is mapped to a seed peptide with ``phrase_to_seed``; the seed
+    is extended by sampling from the module-level ProtGPT2 ``model``.
+
+    Args:
+        phrase: GeneForgeLang phrase typed by the user.
+
+    Returns:
+        A ``(props, fasta_path)`` tuple: ``props`` is the text report (seed,
+        sequence, length, charge, molecular weight) and ``fasta_path`` is the
+        path of a temporary ``.fasta`` file holding the sequence. The file is
+        created with ``delete=False`` so it can be offered for download; it is
+        not removed by this function.
+    """
+    seed = phrase_to_seed(phrase)
+    inputs = tokenizer(seed, return_tensors="pt", padding=True)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs.get("attention_mask", torch.ones_like(input_ids))
+    with torch.no_grad():
+        output = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_length=100,
+            min_length=20,
+            do_sample=True,
+            top_k=50,
+            temperature=0.9,
+            pad_token_id=tokenizer.eos_token_id,
+            num_return_sequences=1
+        )
+    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
+    # ProtGPT2 output is FASTA-like text that can contain line breaks inside
+    # the sequence (see the model card). Drop all whitespace so the reported
+    # length counts residues only and the FASTA record holds a clean sequence.
+    seq = "".join(decoded.split())
+    length, charge, mw = _sequence_properties(seq)
+    props = f"🧪 Seed: {seed}\n🧬 Protein: {seq}\n\n🔬 Properties:\n- Length: {length} aa\n- Charge: {charge}\n- MW: {mw:.1f} Da"
+    # Save to FASTA
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta", mode="w", encoding="utf-8") as f:
+        f.write(f">Generated_Protein\n{seq}\n")
+        fasta_path = f.name
+    return props, fasta_path
+def sequence_to_phrase(seq):
+    """Infer a GeneForgeLang phrase from heuristic motifs in a protein sequence.
+
+    The sequence is upper-cased and tested against a fixed table of pattern
+    detectors. Matching tags are de-duplicated, sorted as strings and joined
+    with ``-`` behind the ``^p:`` prefix. If nothing matches, a ``//``
+    comment string is returned instead.
+    """
+    residues = seq.upper()
+    # (tag, predicate) pairs; each predicate is truthy when its motif is present.
+    detectors = (
+        ("Dom(Kin)", lambda s: re.search(r"^M*K{3,}", s)),
+        ("Mot(NLS)", lambda s: re.search(r"[RK]{3,}", s)),
+        ("Mot(PEST)", lambda s: len(re.findall(r"E", s)) >= 5 or "DEG" in s),
+        ("*AcK@X", lambda s: re.search(r"KQAK|QAK", s)),
+        ("*P@X", lambda s: re.search(r"[RST]P", s)),
+        ("Localize(Nucleus)", lambda s: "PRKRK" in s or "PKKKRKV" in s),
+        ("Localize(Membrane)", lambda s: re.search(r"(AILFL|LAGGAV|LVLL|AAVL)", s)),
+    )
+    found = {tag for tag, is_present in detectors if is_present(residues)}
+    if not found:
+        return "// No symbolic motifs found"
+    return "^p:" + "-".join(sorted(found))
+def _join_english(items):
+    """Join items as English prose: "a", "a and b", or "a, b, and c"."""
+    if len(items) <= 2:
+        return " and ".join(items)
+    return ", ".join(items[:-1]) + f", and {items[-1]}"
+def phrase_to_description(phrase):
+    """Translate a GeneForgeLang phrase into an English sentence.
+
+    Tags may be separated by ``-`` or ``=``, and a modification tag starting
+    with ``*`` may be attached directly to the preceding tag, as in
+    ``^p:Dom(Kin)-Mot(NLS)*AcK@147=Localize(Nucleus)``. A concrete position
+    after ``@`` is quoted in the text, while ``@X`` (or no position) is
+    described as unspecified. Unknown tags are passed through verbatim.
+
+    Args:
+        phrase: GeneForgeLang phrase, with or without the ``^p:`` prefix.
+
+    Returns:
+        A sentence starting with "This protein ...", or
+        "No interpretable symbolic elements found." when the phrase holds
+        no tags.
+    """
+    # Noun phrases that read naturally after "contains".
+    features = {
+        "Dom(Kin)": "a kinase domain",
+        "Mot(NLS)": "a nuclear localization signal",
+        "Mot(PEST)": "a PEST motif indicating protein degradation",
+    }
+    # Modification name -> (text without a position, template with one).
+    modifications = {
+        "*AcK": ("lysine acetylation at a specific position", "lysine acetylation at position {}"),
+        "*P": ("a phosphorylation site", "a phosphorylation site at position {}"),
+    }
+    # Verb phrases: these must not be placed after "contains".
+    behaviours = {
+        "Localize(Nucleus)": "localizes to the cell nucleus",
+        "Localize(Membrane)": "localizes to the cell membrane",
+    }
+    body = phrase.replace("^p:", "")
+    contained = []
+    actions = []
+    # A token is an optional leading "*" followed by a run of non-separator
+    # characters, so "Mot(NLS)*AcK@147" yields "Mot(NLS)" and "*AcK@147".
+    for raw in re.findall(r"\*?[^-=*]+", body):
+        tag = raw.strip()
+        if not tag:
+            continue
+        name, _, position = tag.partition("@")
+        if tag in behaviours:
+            actions.append(behaviours[tag])
+        elif tag in features:
+            contained.append(features[tag])
+        elif name in modifications:
+            unspecified, positioned = modifications[name]
+            if position in ("", "X"):
+                contained.append(unspecified)
+            else:
+                contained.append(positioned.format(position))
+        else:
+            contained.append(tag)
+    clauses = []
+    if contained:
+        clauses.append("contains " + _join_english(contained))
+    clauses.extend(actions)
+    if not clauses:
+        return "No interpretable symbolic elements found."
+    return "This protein " + _join_english(clauses) + "."
+def analyze_graph(seq):
+    """Tabulate how often each of the 20 standard amino acids occurs in *seq*.
+
+    The previous handler delegated to ``generar_composicion_grafico``, which
+    is not defined anywhere in the visible file, so clicking "Analyze" raised
+    NameError. The counts are now computed here and drawn by a bar-plot
+    component.
+
+    Args:
+        seq: Protein sequence as pasted by the user; case and whitespace are
+            ignored.
+
+    Returns:
+        A pandas DataFrame with one row per residue letter and the columns
+        "Amino acid" and "Count".
+    """
+    # Function-scope import keeps the file's import block unchanged.
+    import pandas as pd
+    alphabet = "ACDEFGHIKLMNPQRSTVWY"
+    residues = "".join(seq.split()).upper()
+    return pd.DataFrame(
+        {"Amino acid": list(alphabet), "Count": [residues.count(aa) for aa in alphabet]}
+    )
+# Build the tabbed Gradio interface; each tab wires one button to one handler.
+with gr.Blocks() as demo:
+    gr.Markdown("# 🧬 GeneForgeLang AI Tools")
+    gr.Markdown("Design, interpret, describe, and export proteins using symbolic language and AI.")
+    with gr.Tab("🧠 Phrase → Protein"):
+        inp = gr.Textbox(label="GeneForgeLang Phrase", placeholder="^p:Dom(Kin)-Mot(NLS)*AcK@147")
+        out = gr.Textbox(label="Protein + Properties")
+        fasta = gr.File(label="Download FASTA")
+        btn = gr.Button("Generate")
+        btn.click(fn=generate_protein_and_props, inputs=inp, outputs=[out, fasta])
+    with gr.Tab("🧪 Protein → Phrase"):
+        inp2 = gr.Textbox(label="Protein Sequence", placeholder="MKKKPRRRDEEGEK...")
+        out2 = gr.Textbox(label="Interpreted GeneForgeLang")
+        btn2 = gr.Button("Translate")
+        btn2.click(fn=sequence_to_phrase, inputs=inp2, outputs=out2)
+    with gr.Tab("🧬 Mutate Protein"):
+        inp4 = gr.Textbox(label="GeneForgeLang Phrase", placeholder="^p:Dom(Kin)-Mot(NLS)*AcK@147")
+        out4 = gr.Textbox(label="Mutated Protein")
+        btn4 = gr.Button("Mutate")
+        # Generation is sampled, so re-running the same phrase yields a new
+        # variant; the FASTA output goes to a hidden file component.
+        btn4.click(fn=generate_protein_and_props, inputs=inp4, outputs=[out4, gr.File(visible=False)])
+    with gr.Tab("📊 Analyze Protein"):
+        inp5 = gr.Textbox(label="Protein Sequence", placeholder="Paste sequence to analyze")
+        out5 = gr.BarPlot(x="Amino acid", y="Count", label="Amino Acid Composition")
+        btn5 = gr.Button("Analyze")
+        btn5.click(fn=analyze_graph, inputs=inp5, outputs=out5)
+    with gr.Tab("📖 Phrase → Natural Language"):
+        inp3 = gr.Textbox(label="GeneForgeLang Phrase", placeholder="^p:Dom(Kin)-Mot(NLS)*AcK@147")
+        out3 = gr.Textbox(label="Scientific Description")
+        btn3 = gr.Button("Describe")
+        btn3.click(fn=phrase_to_description, inputs=inp3, outputs=out3)
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

Binary file (78 Bytes). View file

semillas.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "Dom(Kin)": "MKKK",
+  "Mot(NLS)": "MPRRR",
+  "Mot(PEST)": "MDGQL",
+  "TF(GATA1)": "MKTFG",
+  "*AcK": "MKQAK",
+  "*Ac": "MKQAK",
+  "*P": "MKRP",
+  "*Phos": "MKRP",
+  "Localize(Nucleus)": "MPKRK",
+  "Localize(Membrane)": "MAIFL"
+}

translate_to_geneforgelang.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import sys
+import re
+# Recognition rules based on loose sequence patterns
+def traducir_a_geneforge(secuencia):
+    """Translate a protein sequence into a GeneForgeLang phrase.
+
+    The sequence is matched as given (no case folding happens here). Each
+    rule that fires contributes one tag; tags are de-duplicated, sorted as
+    strings and joined with ``-`` after the ``^p:`` prefix. When no rule
+    fires, a ``//`` comment string is returned.
+    """
+    encontrados = set()
+    # Dom(Kin): three or more K at the start, after any leading M
+    if re.search(r"^M*K{3,}", secuencia) is not None:
+        encontrados.add("Dom(Kin)")
+    # Mot(NLS): a run of three or more R/K residues
+    if re.search(r"[RK]{3,}", secuencia) is not None:
+        encontrados.add("Mot(NLS)")
+    # Mot(PEST): the literal DEG triplet, or at least five E residues overall
+    if "DEG" in secuencia or len(re.findall(r"E", secuencia)) >= 5:
+        encontrados.add("Mot(PEST)")
+    # *AcK@X: "KQAK" or "QAK" taken as an acetylation motif
+    if re.search(r"KQAK|QAK", secuencia) is not None:
+        encontrados.add("*AcK@X")
+    # *P@X: R, S or T immediately followed by P, taken as a phosphorylation motif
+    if re.search(r"[RST]P", secuencia) is not None:
+        encontrados.add("*P@X")
+    # Localize(Nucleus): presence of PRKRK or PKKKRKV
+    if any(senal in secuencia for senal in ("PRKRK", "PKKKRKV")):
+        encontrados.add("Localize(Nucleus)")
+    # Localize(Membrane): hydrophobic stretches such as AILFL or LAGGAV
+    if re.search(r"(AILFL|LAGGAV|LVLL|AAVL)", secuencia) is not None:
+        encontrados.add("Localize(Membrane)")
+    if encontrados:
+        return "^p:" + "-".join(sorted(encontrados))
+    return "// No se encontraron motivos reconocibles"
+if __name__ == "__main__":
+    # Command-line entry point: translate the sequence passed as first argument.
+    if len(sys.argv) < 2:
+        # Show the name the script was actually invoked with; the previous
+        # message hard-coded a different file name. Usage errors go to stderr.
+        print(f"Uso: python {sys.argv[0]} <secuencia_proteina>", file=sys.stderr)
+        sys.exit(1)
+    # Upper-case and drop any whitespace so a sequence pasted with line
+    # breaks or spaces is still matched as one contiguous chain.
+    secuencia = "".join(sys.argv[1].split()).upper()
+    resultado = traducir_a_geneforge(secuencia)
+    print("🔍 GeneForgeLang:")
+    print(resultado)