Spaces:

Bifrost-AI
/

Bifrost-Bitnet-b1.58-autotrain-and-chat

Sleeping

File size: 10,918 Bytes

import gradio as gr

# Markdown shown in the "Instructions" tab: a step-by-step guide for running
# the SFT training script outside of this Space. The trailing double spaces
# after the step titles are intentional Markdown hard line breaks.
instructions = """
# How to Run Bifrost's Bitnet SFT Training Script Locally

This Space shows you how to run the SFT fine-tuning training script on your own machine or on a GPU space.

## Instructions:


1. **Install Dependencies:**  
   Ensure you have Python 3.10 or above. Create a `requirements.txt` with the following contents, then install it with `pip install -r requirements.txt`:
   ```
    torch==2.6.0
    pandas
    sympy
    scikit-learn==1.6.1
    huggingface-hub
    beartype
    matplotlib==3.10.3
    bitnet
    git+https://github.com/shumingma/transformers.git#egg=transformers
   ```
   Your `requirements.txt` should include all necessary packages, with the custom GitHub fork of `transformers` listed last.

2. **Review or Edit the Training Script:**  
   Copy the script from the "SFT Training Script" tab and save it as `bitnet_sft_training.py` (or any file name you prefer). Then open the file to review the code and adjust hyperparameters, file paths, or other settings as needed.

3. **Run the Script Locally:**  
   From the terminal, execute:
   ```
   python bitnet_sft_training.py
   ```
   This will start the fine-tuning process. Check your terminal for training loss logs and progress messages.

4. **Troubleshooting Tips:**  
   - If you’re running on a CPU-only machine, ensure the model is loaded in `torch.float32` instead of `torch.bfloat16`.
   - Verify that your dataset paths and configurations match your local environment.

Enjoy fine-tuning your model locally!
"""

# Full text of the standalone SFT training script displayed in the
# "SFT Training Script" tab. It is a raw string so that escape sequences
# inside the script (for example the "\n" in formatting_func) are shown to
# the user exactly as they must appear in the saved .py file.
sft_training_script = r'''import sys
import logging
import random
import torch
import tqdm
import datasets
from datasets import load_dataset
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from zeta.optim import StableAdamWUnfused

# Suppress TorchDynamo errors (this will fallback to eager mode)
import torch._dynamo
torch._dynamo.config.suppress_errors = True

##################
# Data Processing
##################
def formatting_func(example):
    """
    Formats an example using the new style.
    """
    text = f"### Question: {example['instruction']}\n ### Answer: {example['output']}"
    return text

# ---------------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------------
NUM_BATCHES = 6
BATCH_SIZE = 2
GRADIENT_ACCUMULATE_EVERY = 4
LEARNING_RATE = 2e-4
VALIDATE_EVERY = 5
GENERATE_EVERY = 10
GENERATE_LENGTH = 512

###############
# Setup logging
###############
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = 1

datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# ---------------------------------------------------------------------------------
# Load Hugging Face model and tokenizer
# ---------------------------------------------------------------------------------
model_id = "microsoft/bitnet-b1.58-2B-4T-bf16"
hf_save_dir = "./bitnet"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA not available; using CPU.")

# Load the weights in bfloat16 on GPU, but fall back to float32 on CPU-only machines.
model_dtype = torch.bfloat16 if device.type == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=model_dtype
)
model.to(device)
print(f"Loaded pre-trained Hugging Face model '{model_id}'.")

# ---------------------------------------------------------------------------------
# Load new Hugging Face dataset and preprocess it using the new formatting_func
# ---------------------------------------------------------------------------------
# Load the dataset from Hugging Face
full_dataset = load_dataset("Bifrost-AI/Solana-blockchain-360-Coding", split="train")

def preprocess_function(example):
    # Format the example using the new formatting function.
    formatted_text = formatting_func(example)
    
    # Determine the prompt portion by looking for the answer marker.
    answer_marker = "### Answer:"
    if answer_marker in formatted_text:
        # Include the answer marker in the prompt.
        prompt_text = formatted_text.split(answer_marker, 1)[0] + answer_marker
    else:
        prompt_text = formatted_text

    # Tokenize the full formatted text.
    tokenized_full = tokenizer(formatted_text, truncation=True, padding=False)
    # Tokenize only the prompt portion to measure its token length.
    tokenized_prompt = tokenizer(prompt_text, truncation=True, padding=False)
    prompt_len = len(tokenized_prompt["input_ids"])
    
    input_ids = tokenized_full["input_ids"]
    labels = input_ids.copy()
    # Mask the prompt tokens (loss computed only on answer tokens)
    for i in range(prompt_len):
        labels[i] = -100

    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long),
        "labels": torch.tensor(labels, dtype=torch.long),
        "prompt_len": prompt_len
    }

# Apply preprocessing and remove the original columns.
processed_dataset = full_dataset.map(preprocess_function, remove_columns=full_dataset.column_names)

# Set the format so that when the dataset is indexed, the fields are torch tensors.
processed_dataset.set_format(type="torch", columns=["input_ids", "labels", "prompt_len"])

# Split the processed dataset into train and validation sets (90/10 split).
split_idx = int(0.9 * len(processed_dataset))
train_dataset = torch.utils.data.Subset(processed_dataset, list(range(0, split_idx)))
val_dataset = torch.utils.data.Subset(processed_dataset, list(range(split_idx, len(processed_dataset))))

# ---------------------------------------------------------------------------------
# Collate function for DataLoader
# ---------------------------------------------------------------------------------
def sft_collate_fn(batch):
    """
    Collate a list of examples by padding them to the maximum sequence length in the batch.
    """
    max_len = max(x["input_ids"].size(0) for x in batch)
    input_ids_batch = []
    labels_batch = []
    prompt_lens = []
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    for ex in batch:
        input_ids = ex["input_ids"]
        labels = ex["labels"]
        pad_len = max_len - input_ids.size(0)
        input_ids_padded = torch.cat([input_ids, torch.full((pad_len,), pad_id, dtype=input_ids.dtype)])
        labels_padded = torch.cat([labels, torch.full((pad_len,), -100, dtype=labels.dtype)])
        input_ids_batch.append(input_ids_padded)
        labels_batch.append(labels_padded)
        prompt_lens.append(ex["prompt_len"])
    return {"input_ids": torch.stack(input_ids_batch), "labels": torch.stack(labels_batch), "prompt_len": prompt_lens}

def cycle(loader):
    while True:
        yield from loader

train_loader = cycle(DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=sft_collate_fn))
val_loader = cycle(DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=sft_collate_fn))

# ---------------------------------------------------------------------------------
# Setup optimizer
# ---------------------------------------------------------------------------------
optim = StableAdamWUnfused(model.parameters(), lr=LEARNING_RATE)

# ---------------------------------------------------------------------------------
# Training loop for SFT fine tuning.
#
# For Hugging Face causal LM models, supplying 'labels' automatically shifts inputs
# and computes the loss only on the unmasked portion (i.e. the answer tokens).
# ---------------------------------------------------------------------------------
for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
    model.train()
    total_loss = 0.0
    for _ in range(GRADIENT_ACCUMULATE_EVERY):
        batch = next(train_loader)
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        # Scale each micro-batch loss so the accumulated gradient is the mean
        # over the accumulation steps rather than their sum.
        (loss / GRADIENT_ACCUMULATE_EVERY).backward()
        total_loss += loss.item()
    
    print(f"training loss: {total_loss / GRADIENT_ACCUMULATE_EVERY}")
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optim.step()
    optim.zero_grad()
    
    if i % VALIDATE_EVERY == 0:
        model.eval()
        with torch.no_grad():
            batch = next(val_loader)
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, labels=labels)
            val_loss = outputs.loss
            print(f"validation loss: {val_loss.item()}")
    
    if i % GENERATE_EVERY == 5:
        model.eval()
        # For generation, pick a random validation sample and extract its prompt.
        sample = random.choice(val_dataset)
        prompt_len = sample["prompt_len"]
        if prompt_len == 0:
            continue
        prime_ids = sample["input_ids"][:prompt_len].unsqueeze(0).to(device)
        prime_text = tokenizer.decode(prime_ids[0], skip_special_tokens=True)
        print(f"Prompt:\n{prime_text}\n{'*' * 100}")
        
        generated_ids = model.generate(
            input_ids=prime_ids,
            max_new_tokens=GENERATE_LENGTH,
            do_sample=True,
            temperature=1.0
        )
        output_str = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        print(f"Generated output:\n{output_str}")

# ---------------------------------------------------------------------------------
# Save the final fine-tuned model after training.
# ---------------------------------------------------------------------------------
output_checkpoint = "finetuned-bitnet.pt"
torch.save(model.state_dict(), output_checkpoint)
model.save_pretrained(hf_save_dir)
tokenizer.save_pretrained(hf_save_dir)
print(f"Model saved to '{output_checkpoint}' and Hugging Face artifacts saved to '{hf_save_dir}'!")
'''

def _build_demo():
    """Assemble the two-tab viewer: one tab for the guide, one for the script text.

    Returns the `gr.Blocks` app; nothing is launched here.
    """
    with gr.Blocks() as app:
        # Page heading and a one-line description of the app.
        gr.Markdown("# Bitnet SFT Training Script Viewer & Tutorial")
        gr.Markdown("This app shows you Bifrost's SFT training script along with detailed instructions on how to run it locally or on a GPU space.")

        with gr.Tabs():
            # Tab 1: the rendered Markdown walkthrough.
            with gr.TabItem("Instructions"):
                gr.Markdown(instructions)
            # Tab 2: the training script in a 40-line text box.
            with gr.TabItem("SFT Training Script"):
                gr.Textbox(value=sft_training_script, label="SFT Training Script", lines=40)
    return app


# The app object stays available under the module-level name `demo`.
demo = _build_demo()
demo.launch()