|
|
|
|
|
""" |
|
Requirements: |
|
streamlit |
|
torch |
|
pandas |
|
transformers |
|
""" |
|
|
|
import os |
|
import streamlit as st |
|
import torch |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
from torch.utils.data import Dataset, DataLoader |
|
import csv |
|
|
|
|
|
st.set_page_config( |
|
page_title="SFT Model Builder π", |
|
page_icon="π€", |
|
layout="wide", |
|
initial_sidebar_state="expanded", |
|
) |
|
|
|
|
|
HELP_DOC = """ |
|
# SFT Model Builder - Help Guide π |
|
|
|
## Overview |
|
This Streamlit app allows users to **download, fine-tune, and test Transformer models** with **Supervised Fine-Tuning (SFT)** using CSV data. It is designed for NLP tasks and can be expanded for **CV and Speech models**. |
|
|
|
## Features |
|
- β
**Download a pre-trained model** from Hugging Face. |
|
- β
**Upload a CSV dataset** for fine-tuning. |
|
- β
**Train the model** with multiple epochs and adjustable batch sizes. |
|
- β
**Test the fine-tuned model** with text prompts. |
|
|
|
## Installation |
|
To run the app, install dependencies: |
|
```bash |
|
pip install -r requirements.txt |
|
``` |
|
Then, start the app: |
|
```bash |
|
streamlit run app.py |
|
``` |
|
|
|
## How to Use |
|
1. **Download Model**: Select a base model (e.g., `distilgpt2`), then click **Download Model**. |
|
2. **Upload CSV**: The CSV must have two columns: `prompt` and `response`. |
|
3. **Fine-Tune Model**: Click **Fine-Tune Model** to start training. |
|
4. **Test Model**: Enter a text prompt and generate responses. |
|
|
|
## CSV Format |
|
Example format: |
|
```csv |
|
prompt,response |
|
"What is AI?","AI is artificial intelligence." |
|
"Explain machine learning","Machine learning is a subset of AI." |
|
``` |
|
|
|
## Model References |
|
| Model π | Description π | Link π | |
|
|---------|-------------|---------| |
|
| **GPT-2** π€ | Standard NLP model | [Hugging Face](https://huggingface.co/gpt2) | |
|
| **DistilGPT-2** β‘ | Lightweight version of GPT-2 | [Hugging Face](https://huggingface.co/distilgpt2) | |
|
| **EleutherAI Pythia** π¬ | Open-source GPT-like models | [Hugging Face](https://huggingface.co/EleutherAI/pythia-70m) | |
|
|
|
## Additional Notes |
|
- This app supports **PyTorch models**. |
|
- Default training parameters: `epochs=3`, `batch_size=4`. |
|
- Fine-tuned models are **saved locally** for future use. |
|
|
|
For more details, visit [Hugging Face Models](https://huggingface.co/models). π |
|
""" |
|
|
|
|
|
class SFTDataset(Dataset): |
|
def __init__(self, data, tokenizer, max_length=128): |
|
self.data = data |
|
self.tokenizer = tokenizer |
|
self.max_length = max_length |
|
|
|
def __len__(self): |
|
return len(self.data) |
|
|
|
def __getitem__(self, idx): |
|
prompt = self.data[idx]["prompt"] |
|
response = self.data[idx]["response"] |
|
input_text = f"{prompt} {response}" |
|
encoding = self.tokenizer( |
|
input_text, |
|
max_length=self.max_length, |
|
padding="max_length", |
|
truncation=True, |
|
return_tensors="pt" |
|
) |
|
return { |
|
"input_ids": encoding["input_ids"].squeeze(), |
|
"attention_mask": encoding["attention_mask"].squeeze(), |
|
"labels": encoding["input_ids"].squeeze() |
|
} |
|
|
|
|
|
class ModelBuilder: |
|
def __init__(self, model_name="distilgpt2"): |
|
self.model_name = model_name |
|
self.model = None |
|
self.tokenizer = None |
|
|
|
def load_model(self): |
|
st.spinner("Loading model... β³") |
|
self.model = AutoModelForCausalLM.from_pretrained(self.model_name) |
|
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) |
|
if self.tokenizer.pad_token is None: |
|
self.tokenizer.pad_token = self.tokenizer.eos_token |
|
st.success("Model loaded! β
") |
|
|
|
def fine_tune(self, csv_path, epochs=3, batch_size=4): |
|
"""Supervised Fine-Tuning with CSV data""" |
|
sft_data = [] |
|
with open(csv_path, "r") as f: |
|
reader = csv.DictReader(f) |
|
for row in reader: |
|
sft_data.append({"prompt": row["prompt"], "response": row["response"]}) |
|
|
|
dataset = SFTDataset(sft_data, self.tokenizer) |
|
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) |
|
optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5) |
|
|
|
self.model.train() |
|
for epoch in range(epochs): |
|
st.spinner(f"Training epoch {epoch + 1}/{epochs}... βοΈ") |
|
for batch in dataloader: |
|
optimizer.zero_grad() |
|
input_ids = batch["input_ids"].to(self.model.device) |
|
attention_mask = batch["attention_mask"].to(self.model.device) |
|
labels = batch["labels"].to(self.model.device) |
|
outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) |
|
loss = outputs.loss |
|
loss.backward() |
|
optimizer.step() |
|
st.write(f"Epoch {epoch + 1} completed.") |
|
st.success("Fine-tuning completed! π") |
|
|
|
|
|
st.title("SFT Model Builder π€π") |
|
model_builder = ModelBuilder() |
|
|
|
if st.button("Download Model β¬οΈ"): |
|
model_builder.load_model() |
|
|
|
csv_file = st.file_uploader("Upload CSV for Fine-Tuning", type="csv") |
|
if csv_file and st.button("Fine-Tune Model π"): |
|
model_builder.fine_tune(csv_file) |
|
|
|
|
|
st.markdown(HELP_DOC) |
|
|