File size: 5,358 Bytes
3cc7408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python3

"""
Requirements:
streamlit
torch
pandas
transformers
"""

# Standard library
import csv
import os

# Third-party
import streamlit as st
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Page Configuration — must be the first Streamlit call in the script.
st.set_page_config(
    page_title="SFT Model Builder 🚀",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Help Documentation as a Variable
HELP_DOC = """
# SFT Model Builder - Help Guide πŸš€

## Overview
This Streamlit app allows users to **download, fine-tune, and test Transformer models** with **Supervised Fine-Tuning (SFT)** using CSV data. It is designed for NLP tasks and can be expanded for **CV and Speech models**.

## Features
- βœ… **Download a pre-trained model** from Hugging Face.
- βœ… **Upload a CSV dataset** for fine-tuning.
- βœ… **Train the model** with multiple epochs and adjustable batch sizes.
- βœ… **Test the fine-tuned model** with text prompts.

## Installation
To run the app, install dependencies:
```bash
pip install -r requirements.txt
```
Then, start the app:
```bash
streamlit run app.py
```

## How to Use
1. **Download Model**: Select a base model (e.g., `distilgpt2`), then click **Download Model**.
2. **Upload CSV**: The CSV must have two columns: `prompt` and `response`.
3. **Fine-Tune Model**: Click **Fine-Tune Model** to start training.
4. **Test Model**: Enter a text prompt and generate responses.

## CSV Format
Example format:
```csv
prompt,response
"What is AI?","AI is artificial intelligence."
"Explain machine learning","Machine learning is a subset of AI."
```

## Model References
| Model πŸ† | Description πŸ“Œ | Link πŸ”— |
|---------|-------------|---------|
| **GPT-2** πŸ€– | Standard NLP model | [Hugging Face](https://huggingface.co/gpt2) |
| **DistilGPT-2** ⚑ | Lightweight version of GPT-2 | [Hugging Face](https://huggingface.co/distilgpt2) |
| **EleutherAI Pythia** πŸ”¬ | Open-source GPT-like models | [Hugging Face](https://huggingface.co/EleutherAI/pythia-70m) |

## Additional Notes
- This app supports **PyTorch models**.
- Default training parameters: `epochs=3`, `batch_size=4`.
- Fine-tuned models are **saved locally** for future use.

For more details, visit [Hugging Face Models](https://huggingface.co/models). πŸš€
"""

# Custom Dataset for Fine-Tuning
class SFTDataset(Dataset):
    """Causal-LM fine-tuning dataset over prompt/response pairs.

    Each item concatenates ``prompt`` and ``response`` into one sequence and
    tokenizes it to a fixed length. Labels mirror the input ids, except that
    padding positions are set to -100 so the Hugging Face loss ignores them
    (otherwise the model would be trained to predict pad tokens).
    """

    def __init__(self, data, tokenizer, max_length=128):
        """
        Args:
            data: list of dicts with ``"prompt"`` and ``"response"`` strings.
            tokenizer: a Hugging Face tokenizer (must have a pad token set).
            max_length: fixed sequence length after padding/truncation.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]["prompt"]
        response = self.data[idx]["response"]
        input_text = f"{prompt} {response}"
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # return_tensors="pt" yields shape (1, max_length); squeeze(0) removes
        # exactly the batch dim (plain squeeze() would also collapse a
        # length-1 sequence dim).
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        # Clone before masking so labels don't alias input_ids storage.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100  # ignore padding in the loss
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Model Loader and Trainer Class
class ModelBuilder:
    """Loads a Hugging Face causal-LM checkpoint and fine-tunes it on CSV data."""

    def __init__(self, model_name="distilgpt2"):
        self.model_name = model_name
        self.model = None       # populated by load_model()
        self.tokenizer = None   # populated by load_model()

    def load_model(self):
        """Download/load the model and tokenizer from the Hugging Face hub."""
        # st.spinner is a context manager; calling it bare is a no-op, so the
        # original never showed a spinner.
        with st.spinner("Loading model... ⏳"):
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            # GPT-2-family tokenizers ship without a pad token; reuse EOS so
            # padding="max_length" works in SFTDataset.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
        st.success("Model loaded! ✅")

    def fine_tune(self, csv_path, epochs=3, batch_size=4):
        """Supervised Fine-Tuning with CSV data.

        Args:
            csv_path: a filesystem path OR an open file-like object (e.g. the
                UploadedFile returned by st.file_uploader). The CSV must have
                ``prompt`` and ``response`` columns.
            epochs: number of passes over the dataset.
            batch_size: DataLoader batch size.
        """
        # The user may click Fine-Tune without having clicked Download first.
        if self.model is None or self.tokenizer is None:
            self.load_model()

        # st.file_uploader yields an in-memory file object, not a path, and
        # open() would raise TypeError on it — support both input kinds.
        if hasattr(csv_path, "read"):
            raw = csv_path.read()
            if isinstance(raw, bytes):
                raw = raw.decode("utf-8")
            reader = csv.DictReader(raw.splitlines())
            sft_data = [{"prompt": row["prompt"], "response": row["response"]}
                        for row in reader]
        else:
            with open(csv_path, newline="", encoding="utf-8") as f:
                reader = csv.DictReader(f)
                sft_data = [{"prompt": row["prompt"], "response": row["response"]}
                            for row in reader]

        if not sft_data:
            st.warning("CSV contained no rows; nothing to train on.")
            return

        dataset = SFTDataset(sft_data, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)

        self.model.train()
        for epoch in range(epochs):
            # Context-manager form so the spinner actually displays per epoch.
            with st.spinner(f"Training epoch {epoch + 1}/{epochs}... ⚙️"):
                for batch in dataloader:
                    optimizer.zero_grad()
                    input_ids = batch["input_ids"].to(self.model.device)
                    attention_mask = batch["attention_mask"].to(self.model.device)
                    labels = batch["labels"].to(self.model.device)
                    outputs = self.model(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         labels=labels)
                    loss = outputs.loss
                    loss.backward()
                    optimizer.step()
            st.write(f"Epoch {epoch + 1} completed.")
        st.success("Fine-tuning completed! 🎉")

# Main UI
st.title("SFT Model Builder 🤖🚀")
model_builder = ModelBuilder()

if st.button("Download Model ⬇️"):
    model_builder.load_model()

csv_file = st.file_uploader("Upload CSV for Fine-Tuning", type="csv")
if csv_file and st.button("Fine-Tune Model 🔄"):
    # Training needs a loaded model; the Download button may not have been
    # clicked in this session.
    if model_builder.model is None:
        model_builder.load_model()
    # st.file_uploader returns an in-memory UploadedFile, not a path, while
    # fine_tune() opens its argument with open() — persist the upload to a
    # temporary file and pass the path instead.
    import tempfile  # local import: keeps this script-level fix self-contained
    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
        tmp.write(csv_file.getvalue())
        tmp_path = tmp.name
    try:
        model_builder.fine_tune(tmp_path)
    finally:
        os.remove(tmp_path)  # always clean up the temp CSV

# Render Help Documentation at End
st.markdown(HELP_DOC)