BifrostTitan committed
Commit 8abbcd4 · verified · 1 Parent(s): 11d8028

Update app.py

Files changed (1)
app.py +123 -70
app.py CHANGED
@@ -1,4 +1,41 @@
- import os
  import sys
  import json
  import logging
@@ -12,15 +49,21 @@ from torch.utils.data import DataLoader, Dataset
  import transformers
  from transformers import AutoModelForCausalLM, AutoTokenizer
  from zeta.optim import StableAdamWUnfused
- import gradio as gr
- import os
- import subprocess
- os.system("pip install git+https://github.com/shumingma/transformers.git")
- os.system("pip install zetascale==2.8.0")
  # Suppress TorchDynamo errors (this will fallback to eager mode)
  import torch._dynamo
  torch._dynamo.config.suppress_errors = True

  ##################
  # Data Processing
  ##################
@@ -61,24 +104,25 @@ transformers.utils.logging.enable_explicit_format()
  # Load Hugging Face model and tokenizer
  # ---------------------------------------------------------------------------------
  model_id = "microsoft/bitnet-b1.58-2B-4T-bf16"
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- hf_save_dir = "./bitnet"
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
-     torch_dtype=torch.bfloat32,
-     device_map="auto"
  )
- device = model.device
  if torch.cuda.is_available():
      print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
  else:
      print("CUDA not available; using CPU.")
-
  print(f"Loaded pre-trained Hugging Face model '{model_id}'.")

  # ---------------------------------------------------------------------------------
  # Load new Hugging Face dataset and preprocess it using the new formatting_func
  # ---------------------------------------------------------------------------------
  full_dataset = load_dataset("Bifrost-AI/Solana-blockchain-360-Coding", split="train")

  def preprocess_function(example):
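Note on the hunk above: `torch.bfloat32` does not exist in PyTorch (the valid dtypes are `torch.bfloat16` and `torch.float32`), which is why the old load path was broken; the new side of this diff switches to `torch.bfloat16`. A minimal sketch of dtype selection that also honors the CPU tip from the updated instructions — the fallback logic here is an illustration, not part of this commit:

```
import torch
from transformers import AutoModelForCausalLM

model_id = "microsoft/bitnet-b1.58-2B-4T-bf16"
# bf16 is fine on recent GPUs; on CPU-only machines fall back to float32,
# as the troubleshooting tip in the updated app suggests.
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype)
model.to("cuda" if torch.cuda.is_available() else "cpu")
```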
@@ -101,7 +145,7 @@ def preprocess_function(example):

      input_ids = tokenized_full["input_ids"]
      labels = input_ids.copy()
-     # Mask out the prompt tokens (loss is computed only on answer tokens)
      for i in range(prompt_len):
          labels[i] = -100
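For reference, the masking idiom this hunk touches: any label set to -100 is skipped by PyTorch's cross-entropy loss, so only answer tokens contribute to the gradient. A compact sketch, assuming `tokenizer`, `prompt_text`, and `answer_text` as in the script (the variable names are illustrative):

```
# Standalone illustration of prompt masking for SFT.
prompt_ids = tokenizer(prompt_text)["input_ids"]
full_ids = tokenizer(prompt_text + answer_text)["input_ids"]

labels = full_ids.copy()
# -100 is the default ignore_index of the cross-entropy loss.
labels[:len(prompt_ids)] = [-100] * len(prompt_ids)
```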
@@ -113,6 +157,8 @@ def preprocess_function(example):

  # Apply preprocessing and remove the original columns.
  processed_dataset = full_dataset.map(preprocess_function, remove_columns=full_dataset.column_names)
  processed_dataset.set_format(type="torch", columns=["input_ids", "labels", "prompt_len"])

  # Split the processed dataset into train and validation sets (90/10 split).
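The split code itself falls outside this hunk; with the `datasets` library it would typically look like the following (the `seed` value is an illustrative choice, not taken from the commit):

```
# 90/10 train/validation split of the processed dataset (sketch).
split = processed_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split["train"], split["test"]
```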
@@ -156,71 +202,78 @@ val_loader = cycle(DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
  optim = StableAdamWUnfused(model.parameters(), lr=LEARNING_RATE)

  # ---------------------------------------------------------------------------------
- # Define training function for Gradio UI
  # ---------------------------------------------------------------------------------
-
- def train_model():
-     """
-     Runs a training loop for a fixed number of batches and returns training logs.
-     """
      model.train()
-     logs = []
-     for i in range(NUM_BATCHES):
-         total_loss = 0.0
-         for _ in range(GRADIENT_ACCUMULATE_EVERY):
-             batch = next(train_loader)
          input_ids = batch["input_ids"].to(device)
          labels = batch["labels"].to(device)
-
          outputs = model(input_ids=input_ids, labels=labels)
-         loss = outputs.loss
-         loss.backward()
-         total_loss += loss.item()
-         avg_loss = total_loss / GRADIENT_ACCUMULATE_EVERY
-         logs.append(f"Batch {i}: Training loss = {avg_loss:.4f}")
-         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
-         optim.step()
-         optim.zero_grad()
-     return "\n".join(logs)

  # ---------------------------------------------------------------------------------
- # Define text generation function for Gradio UI
  # ---------------------------------------------------------------------------------
- def generate_text_from_prompt(prompt: str):
-     """
-     Generates output text from a given prompt.
-     """
-     model.eval()
-     # Ensure the prompt is formatted as expected for the model.
-     if not prompt.strip().startswith("### Question:"):
-         prompt = "### Question: " + prompt.strip() + "\n ### Answer:"
-     tokenized_input = tokenizer(prompt, return_tensors="pt").to(device)
-     generated_ids = model.generate(
-         input_ids=tokenized_input["input_ids"],
-         max_new_tokens=GENERATE_LENGTH,
-         do_sample=True,
-         temperature=1.0
-     )
-     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-     return generated_text

- # ---------------------------------------------------------------------------------
- # Gradio UI Setup for Auto-Trainer App
- # ---------------------------------------------------------------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("## Bitnet SFT Fine-Tuning & Generation App")
-     gr.Markdown("This app allows you to fine-tune the Bitnet model using your dataset and generate outputs from it.")

-     with gr.Tab("Train Model"):
-         train_button = gr.Button("Run Training")
-         train_output = gr.Textbox(label="Training Logs", lines=10)
-         train_button.click(fn=train_model, inputs=[], outputs=train_output)
-
-     with gr.Tab("Generate Text"):
-         instruction_input = gr.Textbox(label="Enter your question/instruction", placeholder="Type your question here...", lines=4)
-         generate_button = gr.Button("Generate Answer")
-         generation_output = gr.Textbox(label="Generated Output", lines=10)
-         generate_button.click(fn=generate_text_from_prompt, inputs=instruction_input, outputs=generation_output)
-
- if __name__ == "__main__":
-     demo.launch()
 
+ import gradio as gr
+
+ # Markdown text with instructions for running the script locally.
+ instructions = """
+ # How to Run the SFT Training Script Locally
+
+ This Space shows you how to run the SFT fine-tuning script on your own machine.
+
+ ## Instructions:
+
+ 1. **Clone or Copy the Repository:**
+    Make sure you have the repository containing the SFT training script. You can clone it or download the code.
+
+ 2. **Install Dependencies:**
+    Ensure you have Python 3.10 or above. Install the required packages by running:
+    ```
+    pip install -r requirements.txt
+    ```
+    Your `requirements.txt` should include all necessary packages and install your custom GitHub fork of `transformers` last (see the sample sketch below).
+
+ 3. **Review or Edit the Training Script:**
+    Open the `finetune_sft_training.py` file (or whichever file contains the SFT training script) to review the code and adjust hyperparameters, file paths, or other settings as needed.
+
+ 4. **Run the Script Locally:**
+    From the terminal, execute:
+    ```
+    python finetune_sft_training.py
+    ```
+    This will start the fine-tuning process. Check your terminal for training loss logs and progress messages.
+
+ 5. **Troubleshooting Tips:**
+    - If you’re running on a CPU-only machine, ensure the model is loaded in `torch.float32` instead of `torch.bfloat16`.
+    - Verify that your dataset paths and configurations match your local environment.
+
+ Enjoy fine-tuning your model locally!
+ """
+
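Step 2 above references a `requirements.txt` without showing one. A plausible sketch based on the packages this Space imports and the `os.system` installs removed earlier in the diff (versions are unpinned except where the old code pinned one):

```
torch
datasets
zetascale==2.8.0
gradio
tqdm
# install the custom transformers fork last, per step 2
git+https://github.com/shumingma/transformers.git
```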
+ sft_training_script = r'''import os
  import sys
  import json
  import logging
 
  import transformers
  from transformers import AutoModelForCausalLM, AutoTokenizer
  from zeta.optim import StableAdamWUnfused
+ import pkg_resources
+ import sys
+
  # Suppress TorchDynamo errors (this will fallback to eager mode)
  import torch._dynamo
  torch._dynamo.config.suppress_errors = True

+ print("Installed Packages:")
+ for dist in pkg_resources.working_set:
+     print(f"{dist.project_name}=={dist.version}")
+
+ print("Currently imported modules:")
+ for module_name in sys.modules.keys():
+     print(module_name)
+
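A side note on the listing added above: `pkg_resources` is deprecated in modern setuptools. The standard-library equivalent (a sketch of an alternative, not what the commit uses) would be:

```
# Stdlib replacement for the pkg_resources listing (Python 3.8+).
from importlib.metadata import distributions

for dist in distributions():
    print(f"{dist.metadata['Name']}=={dist.version}")
```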
  ##################
  # Data Processing
  ##################
 
  # Load Hugging Face model and tokenizer
  # ---------------------------------------------------------------------------------
  model_id = "microsoft/bitnet-b1.58-2B-4T-bf16"
+ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
+     torch_dtype=torch.bfloat16
  )
+ hf_save_dir = "./bitnet"
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  if torch.cuda.is_available():
      print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
  else:
      print("CUDA not available; using CPU.")
+ model.to(device)
  print(f"Loaded pre-trained Hugging Face model '{model_id}'.")

  # ---------------------------------------------------------------------------------
  # Load new Hugging Face dataset and preprocess it using the new formatting_func
  # ---------------------------------------------------------------------------------
+ # Load the dataset from Hugging Face
  full_dataset = load_dataset("Bifrost-AI/Solana-blockchain-360-Coding", split="train")

  def preprocess_function(example):
 

      input_ids = tokenized_full["input_ids"]
      labels = input_ids.copy()
+     # Mask the prompt tokens (loss computed only on answer tokens)
      for i in range(prompt_len):
          labels[i] = -100

  # Apply preprocessing and remove the original columns.
  processed_dataset = full_dataset.map(preprocess_function, remove_columns=full_dataset.column_names)
+
+ # Set the format so that when the dataset is indexed, the fields are torch tensors.
  processed_dataset.set_format(type="torch", columns=["input_ids", "labels", "prompt_len"])

  # Split the processed dataset into train and validation sets (90/10 split).
 
  optim = StableAdamWUnfused(model.parameters(), lr=LEARNING_RATE)

  # ---------------------------------------------------------------------------------
+ # Training loop for SFT fine tuning.
+ #
+ # For Hugging Face causal LM models, supplying 'labels' automatically shifts inputs
+ # and computes the loss only on the unmasked portion (i.e. the answer tokens).
  # ---------------------------------------------------------------------------------
+ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
      model.train()
+     total_loss = 0.0
+     for _ in range(GRADIENT_ACCUMULATE_EVERY):
+         batch = next(train_loader)
+         input_ids = batch["input_ids"].to(device)
+         labels = batch["labels"].to(device)
+
+         outputs = model(input_ids=input_ids, labels=labels)
+         loss = outputs.loss
+         loss.backward()
+         total_loss += loss.item()
+
+     print(f"training loss: {total_loss / GRADIENT_ACCUMULATE_EVERY}")
+     torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+     optim.step()
+     optim.zero_grad()
+
+     if i % VALIDATE_EVERY == 0:
+         model.eval()
+         with torch.no_grad():
+             batch = next(val_loader)
              input_ids = batch["input_ids"].to(device)
              labels = batch["labels"].to(device)
              outputs = model(input_ids=input_ids, labels=labels)
+             val_loss = outputs.loss
+             print(f"validation loss: {val_loss.item()}")
+
+     if i % GENERATE_EVERY == 5:
+         model.eval()
+         # For generation, pick a random validation sample and extract its prompt.
+         sample = random.choice(val_dataset)
+         prompt_len = sample["prompt_len"]
+         if prompt_len == 0:
+             continue
+         prime_ids = sample["input_ids"][:prompt_len].unsqueeze(0).to(device)
+         prime_text = tokenizer.decode(prime_ids[0], skip_special_tokens=True)
+         print(f"Prompt:\n{prime_text}\n{'*' * 100}")
+
+         generated_ids = model.generate(
+             input_ids=prime_ids,
+             max_new_tokens=GENERATE_LENGTH,
+             do_sample=True,
+             temperature=1.0
+         )
+         output_str = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+         print(f"Generated output:\n{output_str}")

  # ---------------------------------------------------------------------------------
+ # Save the final fine-tuned model after training.
  # ---------------------------------------------------------------------------------
+ output_checkpoint = "finetuned-bitnet.pt"
+ torch.save(model.state_dict(), output_checkpoint)
+ model.save_pretrained(hf_save_dir)
+ tokenizer.save_pretrained(hf_save_dir)
+ print(f"Model saved to '{output_checkpoint}' and Hugging Face artifacts saved to '{hf_save_dir}'!")
+ '''
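One subtlety in the accumulation loop above: each micro-batch backpropagates its unscaled loss, so gradients are summed rather than averaged over `GRADIENT_ACCUMULATE_EVERY` steps. A common variant (an alternative sketch, not what this commit does) scales each loss before `backward()` so the accumulated gradient matches one batch of `BATCH_SIZE * GRADIENT_ACCUMULATE_EVERY` samples:

```
# Averaged gradient accumulation; assumes model, train_loader, device,
# and GRADIENT_ACCUMULATE_EVERY as defined in the script above.
for _ in range(GRADIENT_ACCUMULATE_EVERY):
    batch = next(train_loader)
    outputs = model(input_ids=batch["input_ids"].to(device),
                    labels=batch["labels"].to(device))
    # Scale so the summed gradient equals the mean over the effective batch.
    (outputs.loss / GRADIENT_ACCUMULATE_EVERY).backward()
```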

+ # Build the Gradio interface with two tabs: one for instructions and one for the script.
  with gr.Blocks() as demo:
+     gr.Markdown("# Local SFT Training Script Viewer")
+     gr.Markdown("This app shows you the SFT training script along with detailed instructions on how to run it locally.")

+     with gr.Tabs():
+         with gr.TabItem("Instructions"):
+             gr.Markdown(instructions)
+         with gr.TabItem("SFT Training Script"):
+             gr.Textbox(value=sft_training_script, label="SFT Training Script", lines=40)
+
+ demo.launch()
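If syntax highlighting is wanted, Gradio's `gr.Code` component is a drop-in alternative to the plain `Textbox` used above — a hypothetical tweak, assuming `sft_training_script` holds the script string as defined in the new app:

```
import gradio as gr

# Placeholder so this sketch runs standalone; use the real script string.
sft_training_script = "print('hello')"

with gr.Blocks() as demo:
    # gr.Code renders the value with Python syntax highlighting.
    gr.Code(value=sft_training_script, language="python", label="SFT Training Script")

demo.launch()
```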