BifrostTitan committed
Commit 8abbcd4 · verified · 1 Parent(s): 11d8028

Update app.py

Files changed (1)
app.py +123 -70
app.py CHANGED
@@ -1,4 +1,41 @@
- import os
  import sys
  import json
  import logging
@@ -12,15 +49,21 @@ from torch.utils.data import DataLoader, Dataset
  import transformers
  from transformers import AutoModelForCausalLM, AutoTokenizer
  from zeta.optim import StableAdamWUnfused
- import gradio as gr
- import os
- import subprocess
- os.system("pip install git+https://github.com/shumingma/transformers.git")
- os.system("pip install zetascale==2.8.0")
  # Suppress TorchDynamo errors (this will fallback to eager mode)
  import torch._dynamo
  torch._dynamo.config.suppress_errors = True

  ##################
  # Data Processing
  ##################
@@ -61,24 +104,25 @@ transformers.utils.logging.enable_explicit_format()
  # Load Hugging Face model and tokenizer
  # ---------------------------------------------------------------------------------
  model_id = "microsoft/bitnet-b1.58-2B-4T-bf16"
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- hf_save_dir = "./bitnet"
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
-     torch_dtype=torch.bfloat32,
-     device_map="auto"
  )
- device = model.device
  if torch.cuda.is_available():
      print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
  else:
      print("CUDA not available; using CPU.")
-
  print(f"Loaded pre-trained Hugging Face model '{model_id}'.")

  # ---------------------------------------------------------------------------------
  # Load new Hugging Face dataset and preprocess it using the new formatting_func
  # ---------------------------------------------------------------------------------
  full_dataset = load_dataset("Bifrost-AI/Solana-blockchain-360-Coding", split="train")

  def preprocess_function(example):
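Note on the hunk above: `torch.bfloat32` does not exist in PyTorch (the valid dtypes are `torch.bfloat16` and `torch.float32`), which is why the old load path was broken; the new side of this diff switches to `torch.bfloat16`. A minimal sketch of dtype selection that also honors the CPU tip from the updated instructions — the fallback logic here is an illustration, not part of this commit:

```
import torch
from transformers import AutoModelForCausalLM

model_id = "microsoft/bitnet-b1.58-2B-4T-bf16"
# bf16 is fine on recent GPUs; on CPU-only machines fall back to float32,
# as the troubleshooting tip in the updated app suggests.
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype)
model.to("cuda" if torch.cuda.is_available() else "cpu")
```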
@@ -101,7 +145,7 @@ def preprocess_function(example):

      input_ids = tokenized_full["input_ids"]
      labels = input_ids.copy()
-     # Mask out the prompt tokens (loss is computed only on answer tokens)
      for i in range(prompt_len):
          labels[i] = -100
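For reference, the masking idiom this hunk touches: any label set to -100 is skipped by PyTorch's cross-entropy loss, so only answer tokens contribute to the gradient. A compact sketch, assuming `tokenizer`, `prompt_text`, and `answer_text` as in the script (the variable names are illustrative):

```
# Standalone illustration of prompt masking for SFT.
prompt_ids = tokenizer(prompt_text)["input_ids"]
full_ids = tokenizer(prompt_text + answer_text)["input_ids"]

labels = full_ids.copy()
# -100 is the default ignore_index of the cross-entropy loss.
labels[:len(prompt_ids)] = [-100] * len(prompt_ids)
```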
@@ -113,6 +157,8 @@ def preprocess_function(example):

  # Apply preprocessing and remove the original columns.
  processed_dataset = full_dataset.map(preprocess_function, remove_columns=full_dataset.column_names)
  processed_dataset.set_format(type="torch", columns=["input_ids", "labels", "prompt_len"])

  # Split the processed dataset into train and validation sets (90/10 split).
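The split code itself falls outside this hunk; with the `datasets` library it would typically look like the following (the `seed` value is an illustrative choice, not taken from the commit):

```
# 90/10 train/validation split of the processed dataset (sketch).
split = processed_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = split["train"], split["test"]
```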
@@ -156,71 +202,78 @@ val_loader = cycle(DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
  optim = StableAdamWUnfused(model.parameters(), lr=LEARNING_RATE)

  # ---------------------------------------------------------------------------------
- # Define training function for Gradio UI
  # ---------------------------------------------------------------------------------
-
- def train_model():
-     """
-     Runs a training loop for a fixed number of batches and returns training logs.
-     """
      model.train()
-     logs = []
-     for i in range(NUM_BATCHES):
-         total_loss = 0.0
-         for _ in range(GRADIENT_ACCUMULATE_EVERY):
-             batch = next(train_loader)
          input_ids = batch["input_ids"].to(device)
          labels = batch["labels"].to(device)
-
          outputs = model(input_ids=input_ids, labels=labels)
-         loss = outputs.loss
-         loss.backward()
-         total_loss += loss.item()
-         avg_loss = total_loss / GRADIENT_ACCUMULATE_EVERY
-         logs.append(f"Batch {i}: Training loss = {avg_loss:.4f}")
-         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
-         optim.step()
-         optim.zero_grad()
-     return "\n".join(logs)

  # ---------------------------------------------------------------------------------
- # Define text generation function for Gradio UI
  # ---------------------------------------------------------------------------------
- def generate_text_from_prompt(prompt: str):
-     """
-     Generates output text from a given prompt.
-     """
-     model.eval()
-     # Ensure the prompt is formatted as expected for the model.
-     if not prompt.strip().startswith("### Question:"):
-         prompt = "### Question: " + prompt.strip() + "\n ### Answer:"
-     tokenized_input = tokenizer(prompt, return_tensors="pt").to(device)
-     generated_ids = model.generate(
-         input_ids=tokenized_input["input_ids"],
-         max_new_tokens=GENERATE_LENGTH,
-         do_sample=True,
-         temperature=1.0
-     )
-     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-     return generated_text

- # ---------------------------------------------------------------------------------
- # Gradio UI Setup for Auto-Trainer App
- # ---------------------------------------------------------------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("## Bitnet SFT Fine-Tuning & Generation App")
-     gr.Markdown("This app allows you to fine-tune the Bitnet model using your dataset and generate outputs from it.")

-     with gr.Tab("Train Model"):
-         train_button = gr.Button("Run Training")
-         train_output = gr.Textbox(label="Training Logs", lines=10)
-         train_button.click(fn=train_model, inputs=[], outputs=train_output)
-
-     with gr.Tab("Generate Text"):
-         instruction_input = gr.Textbox(label="Enter your question/instruction", placeholder="Type your question here...", lines=4)
-         generate_button = gr.Button("Generate Answer")
-         generation_output = gr.Textbox(label="Generated Output", lines=10)
-         generate_button.click(fn=generate_text_from_prompt, inputs=instruction_input, outputs=generation_output)
-
- if __name__ == "__main__":
-     demo.launch()
 
+ import gradio as gr
+
+ # Markdown text with instructions for running the script locally.
+ instructions = """
+ # How to Run the SFT Training Script Locally
+
+ This Space shows you how to run the SFT fine-tuning script on your own machine.
+
+ ## Instructions:
+
+ 1. **Clone or Copy the Repository:**
+    Make sure you have the repository containing the SFT training script. You can clone it or download the code.
+
+ 2. **Install Dependencies:**
+    Ensure you have Python 3.10 or above. Install the required packages by running:
+    ```
+    pip install -r requirements.txt
+    ```
+    Your `requirements.txt` should include all necessary packages and install your custom GitHub fork of `transformers` last (see the sample sketch below).
+
+ 3. **Review or Edit the Training Script:**
+    Open the `finetune_sft_training.py` file (or whichever file contains the SFT training script) to review the code and adjust hyperparameters, file paths, or other settings as needed.
+
+ 4. **Run the Script Locally:**
+    From the terminal, execute:
+    ```
+    python finetune_sft_training.py
+    ```
+    This will start the fine-tuning process. Check your terminal for training loss logs and progress messages.
+
+ 5. **Troubleshooting Tips:**
+    - If you’re running on a CPU-only machine, ensure the model is loaded in `torch.float32` instead of `torch.bfloat16`.
+    - Verify that your dataset paths and configurations match your local environment.
+
+ Enjoy fine-tuning your model locally!
+ """
+
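Step 2 above references a `requirements.txt` without showing one. A plausible sketch based on the packages this Space imports and the `os.system` installs removed earlier in the diff (versions are unpinned except where the old code pinned one):

```
torch
datasets
zetascale==2.8.0
gradio
tqdm
# install the custom transformers fork last, per step 2
git+https://github.com/shumingma/transformers.git
```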
+ sft_training_script = r'''import os
  import sys
  import json
  import logging
 
  import transformers
  from transformers import AutoModelForCausalLM, AutoTokenizer
  from zeta.optim import StableAdamWUnfused
+ import pkg_resources
+ import sys
+
  # Suppress TorchDynamo errors (this will fallback to eager mode)
  import torch._dynamo
  torch._dynamo.config.suppress_errors = True

+ print("Installed Packages:")
+ for dist in pkg_resources.working_set:
+     print(f"{dist.project_name}=={dist.version}")
+
+ print("Currently imported modules:")
+ for module_name in sys.modules.keys():
+     print(module_name)
+
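A side note on the listing added above: `pkg_resources` is deprecated in modern setuptools. The standard-library equivalent (a sketch of an alternative, not what the commit uses) would be:

```
# Stdlib replacement for the pkg_resources listing (Python 3.8+).
from importlib.metadata import distributions

for dist in distributions():
    print(f"{dist.metadata['Name']}=={dist.version}")
```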
  ##################
  # Data Processing
  ##################
 
  # Load Hugging Face model and tokenizer
  # ---------------------------------------------------------------------------------
  model_id = "microsoft/bitnet-b1.58-2B-4T-bf16"
+ tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
+     torch_dtype=torch.bfloat16
  )
+ hf_save_dir = "./bitnet"
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  if torch.cuda.is_available():
      print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
  else:
      print("CUDA not available; using CPU.")
+ model.to(device)
  print(f"Loaded pre-trained Hugging Face model '{model_id}'.")

  # ---------------------------------------------------------------------------------
  # Load new Hugging Face dataset and preprocess it using the new formatting_func
  # ---------------------------------------------------------------------------------
+ # Load the dataset from Hugging Face
  full_dataset = load_dataset("Bifrost-AI/Solana-blockchain-360-Coding", split="train")

  def preprocess_function(example):
 

      input_ids = tokenized_full["input_ids"]
      labels = input_ids.copy()
+     # Mask the prompt tokens (loss computed only on answer tokens)
      for i in range(prompt_len):
          labels[i] = -100

  # Apply preprocessing and remove the original columns.
  processed_dataset = full_dataset.map(preprocess_function, remove_columns=full_dataset.column_names)
+
+ # Set the format so that when the dataset is indexed, the fields are torch tensors.
  processed_dataset.set_format(type="torch", columns=["input_ids", "labels", "prompt_len"])

  # Split the processed dataset into train and validation sets (90/10 split).
 
  optim = StableAdamWUnfused(model.parameters(), lr=LEARNING_RATE)

  # ---------------------------------------------------------------------------------
+ # Training loop for SFT fine tuning.
+ #
+ # For Hugging Face causal LM models, supplying 'labels' automatically shifts inputs
+ # and computes the loss only on the unmasked portion (i.e. the answer tokens).
  # ---------------------------------------------------------------------------------
+ for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10.0, desc="training"):
      model.train()
+     total_loss = 0.0
+     for _ in range(GRADIENT_ACCUMULATE_EVERY):
+         batch = next(train_loader)
+         input_ids = batch["input_ids"].to(device)
+         labels = batch["labels"].to(device)
+
+         outputs = model(input_ids=input_ids, labels=labels)
+         loss = outputs.loss
+         loss.backward()
+         total_loss += loss.item()
+
+     print(f"training loss: {total_loss / GRADIENT_ACCUMULATE_EVERY}")
+     torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+     optim.step()
+     optim.zero_grad()
+
+     if i % VALIDATE_EVERY == 0:
+         model.eval()
+         with torch.no_grad():
+             batch = next(val_loader)
              input_ids = batch["input_ids"].to(device)
              labels = batch["labels"].to(device)
              outputs = model(input_ids=input_ids, labels=labels)
+             val_loss = outputs.loss
+             print(f"validation loss: {val_loss.item()}")
+
+     if i % GENERATE_EVERY == 5:
+         model.eval()
+         # For generation, pick a random validation sample and extract its prompt.
+         sample = random.choice(val_dataset)
+         prompt_len = sample["prompt_len"]
+         if prompt_len == 0:
+             continue
+         prime_ids = sample["input_ids"][:prompt_len].unsqueeze(0).to(device)
+         prime_text = tokenizer.decode(prime_ids[0], skip_special_tokens=True)
+         print(f"Prompt:\n{prime_text}\n{'*' * 100}")
+
+         generated_ids = model.generate(
+             input_ids=prime_ids,
+             max_new_tokens=GENERATE_LENGTH,
+             do_sample=True,
+             temperature=1.0
+         )
+         output_str = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+         print(f"Generated output:\n{output_str}")

  # ---------------------------------------------------------------------------------
+ # Save the final fine-tuned model after training.
  # ---------------------------------------------------------------------------------
+ output_checkpoint = "finetuned-bitnet.pt"
+ torch.save(model.state_dict(), output_checkpoint)
+ model.save_pretrained(hf_save_dir)
+ tokenizer.save_pretrained(hf_save_dir)
+ print(f"Model saved to '{output_checkpoint}' and Hugging Face artifacts saved to '{hf_save_dir}'!")
+ '''
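One subtlety in the accumulation loop above: each micro-batch backpropagates its unscaled loss, so gradients are summed rather than averaged over `GRADIENT_ACCUMULATE_EVERY` steps. A common variant (an alternative sketch, not what this commit does) scales each loss before `backward()` so the accumulated gradient matches one batch of `BATCH_SIZE * GRADIENT_ACCUMULATE_EVERY` samples:

```
# Averaged gradient accumulation; assumes model, train_loader, device,
# and GRADIENT_ACCUMULATE_EVERY as defined in the script above.
for _ in range(GRADIENT_ACCUMULATE_EVERY):
    batch = next(train_loader)
    outputs = model(input_ids=batch["input_ids"].to(device),
                    labels=batch["labels"].to(device))
    # Scale so the summed gradient equals the mean over the effective batch.
    (outputs.loss / GRADIENT_ACCUMULATE_EVERY).backward()
```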

+ # Build the Gradio interface with two tabs: one for instructions and one for the script.
  with gr.Blocks() as demo:
+     gr.Markdown("# Local SFT Training Script Viewer")
+     gr.Markdown("This app shows you the SFT training script along with detailed instructions on how to run it locally.")

+     with gr.Tabs():
+         with gr.TabItem("Instructions"):
+             gr.Markdown(instructions)
+         with gr.TabItem("SFT Training Script"):
+             gr.Textbox(value=sft_training_script, label="SFT Training Script", lines=40)
+
+ demo.launch()
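If syntax highlighting is wanted, Gradio's `gr.Code` component is a drop-in alternative to the plain `Textbox` used above — a hypothetical tweak, assuming `sft_training_script` holds the script string as defined in the new app:

```
import gradio as gr

# Placeholder so this sketch runs standalone; use the real script string.
sft_training_script = "print('hello')"

with gr.Blocks() as demo:
    # gr.Code renders the value with Python syntax highlighting.
    gr.Code(value=sft_training_script, language="python", label="SFT Training Script")

demo.launch()
```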