First_agent_template

Sleeping

Agathe1489 committed on Feb 19

Commit

44fa3df

verified ·

1 Parent(s): 012c153

Create convert_datasets.py

Files changed (1) hide show

convert_datasets.py ADDED Viewed

+from datasets import load_dataset
+from transformers import AutoTokenizer
+# Load one tokenizer per model; each is expected to bring its own chat template.
+mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
+qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat")  # NOTE(review): this repo may require trust_remote_code=True — confirm
+smol_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")
+messages = [  # sample conversation rendered by all three tokenizers below
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Hello!"},
+]
+# Render the same conversation with each tokenizer; tokenize=False asks for the formatted text rather than token IDs.
+mistral_chat = mistral_tokenizer.apply_chat_template(messages, tokenize=False)  # NOTE(review): this template may reject a "system" turn — confirm
+qwen_chat = qwen_tokenizer.apply_chat_template(messages, tokenize=False)
+smol_chat = smol_tokenizer.apply_chat_template(messages, tokenize=False)
+dataset = load_dataset("HuggingFaceTB/smoltalk")  # NOTE(review): this dataset may need an explicit config name — confirm
+def convert_to_chatml(example):  # Wrap one example's input/output pair in a two-turn "messages" list.
+    return {
+        "messages": [
+            {"role": "user", "content": example["input"]},  # assumes example has an "input" key — TODO confirm against the dataset columns
+            {"role": "assistant", "content": example["output"]},  # likewise assumes an "output" key — TODO confirm
+        ]
+    }  # a new dict is returned; the incoming example is only read, not modified