---
library_name: peft
base_model: unsloth/gemma-7b-bnb-4bit
---

prompt

```
Ok. What do the drivers look like?
```

response

```
กรุงเทพอยู่ที่ไหน where is bangkok
```

code to create dataset

```python
import random

# NOTE(review): this template has two `{}` fields, but the formatting code
# below passes three arguments (source text, target language, target text).
# str.format() silently ignores surplus positional arguments, so as written
# the target text never reaches the output. Presumably markup around the
# fields was lost when this card was rendered -- TODO confirm against the
# training notebook before reusing this snippet.
alpaca_prompt = """{} {}"""

# `tokenizer` must already be loaded; it is not defined in this snippet.
BOS_TOKEN = tokenizer.bos_token  # not referenced anywhere in this snippet
EOS_TOKEN = ""+tokenizer.eos_token  # Must add EOS_TOKEN


def _format_example(text_en, text_th, translate_to):
    """Render one training string for a group of sentence pairs.

    The language named by `translate_to` is the target side; the other
    language is the source side. EOS_TOKEN is appended to the result.
    """
    if translate_to == 'th':
        source, target = text_en, text_th
    else:
        source, target = text_th, text_en
    return alpaca_prompt.format(source, translate_to, target) + EOS_TOKEN


def formatting_prompts_func(examples):
    """Group consecutive sentence pairs of a batch into training strings.

    Pairs are concatenated into groups. Within a group, segments are
    separated by 1-5 newline characters, and the same separator is used on
    the English and the Thai side. The first group of every batch holds a
    single pair translated to Thai; each later group holds 1-5 pairs and
    picks its target language at random. The last group of a batch may be
    cut short by the end of the batch.

    Args:
        examples: A batch mapping whose "translation" entry is a list of
            dicts with 'en' and 'th' keys.

    Returns:
        A dict {"text": [...]} with one formatted string per group. Since
        pairs are grouped, there are at most as many strings as input pairs.
    """
    translations = examples["translation"]
    if not translations:
        # Without this guard the final flush below would emit one bogus
        # example built from two empty strings.
        return {"text": []}

    texts = []
    text_en = ""
    text_th = ""
    translate_to = 'th'
    max_group_count = 1
    group_count = 0

    for translation in translations:
        if group_count >= max_group_count:
            # The current group is full: emit it, then draw the size and
            # direction of the next group.
            texts.append(_format_example(text_en, text_th, translate_to))
            text_en = ""
            text_th = ""
            max_group_count = random.randint(1, 5)
            group_count = 0
            translate_to = random.choice(['en', 'th'])

        # Drawn on every iteration, even for the first segment of a group
        # where it goes unused, so the sequence of random calls is unchanged.
        num_newlines = random.randint(1, 5)
        newlines = '\n' * num_newlines

        # Test the counter rather than `text_en == ""`: an empty English
        # segment must not cause the Thai text collected so far to be
        # overwritten by the next pair.
        if group_count == 0:
            text_en = translation['en']
            text_th = translation['th']
        else:
            text_en = text_en + newlines + translation['en']
            text_th = text_th + newlines + translation['th']
        group_count += 1

    # Flush the last (possibly incomplete) group.
    texts.append(_format_example(text_en, text_th, translate_to))
    return {"text": texts}


from datasets import load_dataset
import datasets

# Uncomment to force a fresh download instead of reusing ./cache:
# dataset = load_dataset("scb_mt_enth_2020",'enth', download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,cache_dir ="./cache")
dataset = load_dataset("scb_mt_enth_2020", 'enth', cache_dir="./cache")
dataset = dataset.shuffle(seed=42)
# The grouped output has a different row count than the input batch, so the
# original columns are removed rather than kept alongside "text".
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=["translation", 'subdataset'],
)
dataset['train'][0:5]
```

[More Information Needed]

### Framework versions

- PEFT 0.10.0