Gong Baitao committed
Commit 9c2f19e
1 Parent(s): fac119c
Update modeling_cpmbee.py and README.md

- README.md +36 -1
- modeling_cpmbee.py +2 -2
README.md CHANGED

@@ -67,4 +67,39 @@ res = model.generate(
 )
 print(res)
 
-```
+```
+
+We suggest using `bmtrain` to finetune CPM-Bee. You can also use `accelerate` or `deepspeed`. Here is a brief example of a training loop:
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from accelerate import Accelerator
+from torch.utils.data import Dataset, DataLoader
+
+accelerator = Accelerator()
+
+trainset = Dataset()  # make sure trainset.__getitem__() returns data in the correct format, e.g. {"input": "...", "<ans>": ""}
+# for details, see https://github.com/OpenBMB/CPM-Bee/tree/main/tutorials/basic_task_finetune
+train_loader = DataLoader(trainset, batch_size=1)
+
+tokenizer = AutoTokenizer.from_pretrained("openbmb/cpm-bee-2b", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained("openbmb/cpm-bee-2b", trust_remote_code=True).cuda()
+
+optimizer = torch.optim.Adam(model.parameters())
+
+model, optimizer, train_loader = accelerator.prepare(
+    model, optimizer, train_loader
+)
+
+for iter, data in enumerate(train_loader):
+    optimizer.zero_grad()
+
+    # convert the raw data into a tokenized, trainable batch
+    input_encoded = tokenizer.prepare_for_finetune(data, max_length=512).to(model.device)
+
+    outputs = model(**input_encoded)
+    loss = outputs.loss
+    accelerator.backward(loss)
+    optimizer.step()
+```
+You should design your own parallelism and mixed-precision training strategy on top of this example.
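In the training loop above, `trainset = Dataset()` is only a placeholder. A minimal sketch of a dataset that returns records in the `{"input": "...", "<ans>": ""}` format the tutorial expects could look like the following; the class name and the JSON-lines file path are illustrative, not part of the original example:

```python
import json
from torch.utils.data import Dataset

class BeeFinetuneDataset(Dataset):
    """Reads one JSON object per line, each shaped like {"input": "...", "<ans>": ""}."""

    def __init__(self, path):
        with open(path, encoding="utf-8") as f:
            self.samples = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # each item is a plain dict such as {"input": "...", "<ans>": ""},
        # which tokenizer.prepare_for_finetune() then turns into a batch
        return self.samples[idx]

# trainset = BeeFinetuneDataset("finetune_data.jsonl")  # hypothetical file name
```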
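Regarding the closing note on parallelism and mixed precision: one way to layer both onto the same loop is through `Accelerator`'s constructor options and its `accumulate()` helper. The sketch below assumes the standard `accelerate` API and reuses the `model`, `tokenizer`, `optimizer`, and `train_loader` from the example above; the bf16 and accumulation values are arbitrary examples, and multi-GPU data parallelism would come from launching the script with `accelerate launch`:

```python
from accelerate import Accelerator

# bf16 autocast plus gradient accumulation; both settings are example values
accelerator = Accelerator(mixed_precision="bf16", gradient_accumulation_steps=4)

model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

for iteration, data in enumerate(train_loader):
    # accumulate() defers the real optimizer step until enough micro-batches have been seen
    with accelerator.accumulate(model):
        input_encoded = tokenizer.prepare_for_finetune(data, max_length=512).to(model.device)
        loss = model(**input_encoded).loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
```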
modeling_cpmbee.py CHANGED

@@ -569,10 +569,10 @@ class CpmBeeRotaryEmbedding(nn.Module):
         self.inv_freq = inv_freq.to(config.torch_dtype)
 
     def forward(self, x: torch.Tensor, x_pos: torch.Tensor):
-        inv_freq = self.inv_freq.to(device=x.device, dtype=
+        inv_freq = self.inv_freq.to(device=x.device, dtype=x.dtype)
 
         x_pos = x_pos * self.distance_scale
-        freqs = x_pos[..., None]
+        freqs = x_pos[..., None] * inv_freq[None, :]  # (..., dim/2)
 
         emb = torch.cat((freqs, freqs), dim=-1)  # (..., dim)
         emb_cos = emb.cos()  # (..., dim)
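For context on what the two corrected lines in `CpmBeeRotaryEmbedding.forward` compute, here is a small standalone sketch of the rotary-embedding angle computation; the base-10000 inverse-frequency formula and the toy sizes are illustrative assumptions, not read from the model code:

```python
import torch

dim = 8                 # head dimension (must be even); toy value
distance_scale = 1.0    # plays the role of self.distance_scale above

# one inverse frequency per channel pair: shape (dim/2,)
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

x_pos = torch.arange(5, dtype=torch.float32) * distance_scale   # positions, shape (5,)

freqs = x_pos[..., None] * inv_freq[None, :]   # (5, dim/2), as in the corrected line
emb = torch.cat((freqs, freqs), dim=-1)        # (5, dim)
emb_cos, emb_sin = emb.cos(), emb.sin()        # rotation tables applied to the hidden states
```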