File size: 2,375 Bytes
fde374e
 
 
 
274c62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fde374e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
---
library_name: peft
base_model: unsloth/gemma-7b-bnb-4bit
---
Example prompt (English → Thai; the model completes the open `<translate>` tag):

```
<original>Ok. What do the drivers look like?</original>
<translate to="th">
```

Example response (a separate Thai → English sample, shown together with its prompt):

```
<original>กรุงเทพอยู่ที่ไหน</original>
<translate to="en">where is bangkok</translate><eos>
```

Code used to create the training dataset:
```python
import random


# Prompt template with three positional slots, in order: the source text,
# the target language code, and the translated text.
alpaca_prompt = (
    '<original>{}</original>\n'
    '<translate to="{}">{}'
)

# NOTE: `tokenizer` is not defined in this snippet; it must already exist
# before these lines run.
BOS_TOKEN = tokenizer.bos_token  # not referenced elsewhere in this snippet
# Every sample is terminated by closing the <translate> tag and then
# appending the tokenizer's end-of-sequence token.
EOS_TOKEN = "</translate>" + tokenizer.eos_token
def formatting_prompts_func(examples):
    """Pack consecutive EN/TH translation pairs into randomly sized prompts.

    Pairs from ``examples["translation"]`` are concatenated, in order, into
    groups. Inside a group the sentences are separated by 1-5 newline
    characters, and the English and Thai sides always receive the same
    separator so that both sides stay aligned. Each finished group is rendered
    with ``alpaca_prompt`` and terminated with ``EOS_TOKEN``.

    The first group of every call holds exactly one pair and translates to
    Thai; each later group holds 1-5 pairs and picks its direction at random.

    Args:
        examples: A batch mapping whose ``"translation"`` entry is an iterable
            of dicts with ``"en"`` and ``"th"`` strings.

    Returns:
        A dict ``{"text": [...]}`` with one rendered prompt per group. The
        list is empty when the batch contains no translations, and it is
        usually shorter than the input batch because pairs are merged.
    """
    texts = []
    en_chunks = []
    th_chunks = []
    translate_to = 'th'
    max_group_count = 1
    group_count = 0

    for translation in examples["translation"]:
        if group_count >= max_group_count:
            # Current group is full: emit it, then draw the size and the
            # direction of the next group.
            texts.append(_render_group(en_chunks, th_chunks, translate_to))
            en_chunks = []
            th_chunks = []
            max_group_count = random.randint(1, 5)
            group_count = 0
            translate_to = random.choice(['en', 'th'])

        # Drawn on every iteration, even for the first pair of a group where
        # it is unused, so the sequence of random draws is the same as before.
        separator = '\n' * random.randint(1, 5)
        # Decide "first pair of the group" by count rather than by testing for
        # an empty accumulated string: a pair whose English side is empty must
        # not cause the next pair to overwrite what was already collected.
        if group_count:
            en_chunks.append(separator)
            th_chunks.append(separator)
        en_chunks.append(translation['en'])
        th_chunks.append(translation['th'])
        group_count += 1

    # Emit the trailing group, but never emit an empty sample for an empty
    # batch.
    if group_count:
        texts.append(_render_group(en_chunks, th_chunks, translate_to))
    return {"text": texts}


def _render_group(en_chunks, th_chunks, translate_to):
    """Render one group as a finished prompt whose target is ``translate_to``."""
    text_en = "".join(en_chunks)
    text_th = "".join(th_chunks)
    if translate_to == 'th':
        source, target = text_en, text_th
    else:
        source, target = text_th, text_en
    return alpaca_prompt.format(source, translate_to, target) + EOS_TOKEN


from datasets import load_dataset
import datasets

# To bypass a stale local cache, additionally pass
# download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD to load_dataset().
dataset = load_dataset(
    "scb_mt_enth_2020",
    "enth",
    cache_dir="./cache",
)
# Fixed seed so the shuffled order is reproducible.
dataset = dataset.shuffle(seed=42)
# Batched mapping lets the formatter merge several pairs into one sample;
# both original columns are removed so only "text" remains.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=["translation", "subdataset"],
)
# Peek at the first five formatted samples.
dataset["train"][0:5]
```

[More Information Needed]
### Framework versions

- PEFT 0.10.0