{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#!/usr/bin/env python\n", "# -*- coding: utf-8 -*-\n", "\n", "\"\"\"\n", "Sample script to finetune ModernBERT-Ja-130M on Japanese MT-bench (0~10 discrete scores).\n", "\"\"\"\n", "\n", "import os\n", "import gc\n", "import re\n", "import glob\n", "import json\n", "import random\n", "import pickle\n", "import numpy as np\n", "import pandas as pd\n", "from typing import Dict, Any\n", "\n", "from matplotlib import pyplot as plt\n", "\n", "import torch\n", "from torch import nn\n", "from datasets import Dataset, DatasetDict\n", "\n", "\n", "\n", "from transformers import (\n", " AutoTokenizer,\n", " AutoConfig,\n", " ModernBertForSequenceClassification,\n", " DataCollatorWithPadding,\n", " Trainer,\n", " TrainingArguments,\n", ")\n", "\n", "from transformers.modeling_outputs import SequenceClassifierOutput\n", "\n", "\n", "\n", "\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", "\n", "# -------------------------------------------------------\n", "# 1. パラメータ設定(適宜変更)\n", "# -------------------------------------------------------\n", "CSV_FILE_PATH = r\"/media/kurogane/kioxia1/dataset/sss/pixiv/JMTB_1_rescore_float.csv\" # Japanese MT benchのCSVファイルパスを指定\n", "MODEL_NAME = \"sbintuitions/modernbert-ja-130m\" # ModernBERT-Ja-130M\n", "NUM_LABELS = 1 # 0~10の11クラス分類とする\n", "SEED = 42\n", "\n", "\n", "BASE_PROMPT = \"AIアシスタントがユーザーの質問に対して提供した回答の質を、公平な立場で評価してください。評価の際は、回答の有用性、関連性、正確性、深さ、創造性、詳細さを考慮してください。評価の前に短い説明を提供し、できるだけ客観的に評価してください。期待される言語は日本語です。日本語以外の言語での回答は、特に要求されない限り減点対象となります。全く日本語を使用しない場合、最低評価となります。ただし、Pythonのスクリプトや計算結果のみを提供する場合、日本語は必須ではありません。評価を0から1.0の範囲で小数点第一位までの数値で示し、floatで記載してください。例:\\\"0.5\\\"。\"\n", "\n", "# -------------------------------------------------------\n", "# 2. CSV読み込み & データ前処理\n", "# -------------------------------------------------------\n", "def load_jmtb_data(csv_path: str) -> pd.DataFrame:\n", " \"\"\"\n", " CSVを読み込んでDataFrameを返す。\n", " CSVの列名例:\n", " ['model_name', 'question_id', 'category', 'question', 'answer', 'judge', 'user_prompt',\n", " 'judgment', 'score', 'turn', 'tstamp', 'sub_category']\n", " \"\"\"\n", " df = pd.read_csv(csv_path)\n", " return df\n", "\n", "\n", "def build_input_text(row: pd.Series, df: pd.DataFrame) -> str:\n", " \"\"\"\n", " turn=1 の場合は「ターン1のみ」のテキストを構築。\n", " turn=2 の場合は「ターン1のQ&A + ターン2のQ&A」を一つに連結したテキストを構築。\n", " \"\"\"\n", " turn = row[\"turn\"]\n", " if turn == 1:\n", " # シングルターン\n", " # text = (\n", " # f\"{BASE_PROMPT}\\n\\n\",\n", " # f\"{row['question']}\",\n", " # f\"{row['answer']}\",\n", " # )\n", " text = f\"{BASE_PROMPT}{row['question']}{row['answer']}\"\n", " else:\n", " # 2ターン目のscore行なので、同じquestion_idのturn=1を探す\n", " qid = row[\"question_id\"]\n", " # 同じquestion_id & turn=1の行を検索\n", " df_turn1 = df[(df[\"question_id\"] == qid) & (df[\"turn\"] == 1)]\n", " if len(df_turn1) > 0:\n", " # 1行だけのはずだが、複数ある場合はiloc[0]\n", " r1 = df_turn1.iloc[0]\n", " # text = (\n", " # f\"{BASE_PROMPT}\\n\\n\",\n", " # f\"{r1['question']}\\n\\n{row['question']}\",\n", " # f\"{row['answer']}\",\n", " # )\n", " text = f\"{BASE_PROMPT}{r1['question']}\\n\\n{row['question']}{row['answer']}\"\n", " else:\n", " # turn=1が見当たらない不備データの場合 -> 仕方ないのでターン2だけ\n", " # text = (\n", " # f\"{BASE_PROMPT}\\n\\n\",\n", " # f\"{row['question']}\",\n", " # f\"{row['answer']}\",\n", " # )\n", " text = f\"{BASE_PROMPT}{row['question']}{row['answer']}\"\n", " return text\n", "\n", "\n", "def create_dataset_from_df(df: pd.DataFrame) -> Dataset:\n", " \"\"\"\n", " pandas DataFrame から [input_text, label] を作り、Hugging Face Datasets の Dataset を返す。\n", " - label は score (0~10) をそのまま格納。\n", " \"\"\"\n", " # 新しい列 input_text と label を作成\n", " # 参照しやすいようにデータフレームをコピー\n", " df2 = df.copy()\n", "\n", " # テキスト列を作成\n", " df2[\"input_text\"] = df2.apply(lambda row: build_input_text(row, df2), axis=1)\n", " # スコアを整数化(既にintなら不要)\n", " df2[\"label\"] = df2[\"score\"].astype(float)\n", "\n", " # 必要な列のみ残す\n", " used_cols = [\"input_text\", \"label\"]\n", " df2 = df2[used_cols]\n", "\n", " # Pandas -> Huggingface Dataset\n", " dataset = Dataset.from_pandas(df2, preserve_index=False)\n", " return dataset\n", "\n", "\n", "# -------------------------------------------------------\n", "# 3. データセットの分割: train/valid/test\n", "# -------------------------------------------------------\n", "def split_dataset(\n", " dataset: Dataset,\n", " split_ratio=(0.8, 0.1, 0.1),\n", " seed=SEED\n", ") -> DatasetDict:\n", " \"\"\"\n", " Dataset を train/dev/test に分割 (ランダム).\n", " デフォルトは 8:1:1\n", " \"\"\"\n", " train_ratio, valid_ratio, test_ratio = split_ratio\n", " # assert sum(split_ratio) == 1.0\n", " n_samples = len(dataset)\n", "\n", " # まず shuffle\n", " dataset = dataset.shuffle(seed=seed)\n", "\n", " train_end = int(n_samples * train_ratio)\n", " valid_end = int(n_samples * (train_ratio + valid_ratio))\n", "\n", " train_dataset = dataset.select(range(0, train_end))\n", " valid_dataset = dataset.select(range(train_end, valid_end))\n", " test_dataset = dataset.select(range(valid_end, n_samples))\n", "\n", " return DatasetDict({\n", " \"train\": train_dataset,\n", " \"validation\": valid_dataset,\n", " \"test\": test_dataset\n", " })\n", "\n", "\n", "# -------------------------------------------------------\n", "# 4. トークナイズ関数\n", "# -------------------------------------------------------\n", "def tokenize_function(examples, tokenizer, max_length=None):\n", " \"\"\"\n", " 文章をトークナイズ。max_lengthは適宜設定(Noneの場合は基本無制限、FlashAttention2でpadding無視)\n", " \"\"\"\n", " return tokenizer(\n", " examples[\"input_text\"],\n", " truncation=(max_length is not None),\n", " max_length=max_length,\n", " )\n", "\n", "\n", "# -------------------------------------------------------\n", "# 5. 評価指標: 分類タスク (単純にAccuracyを例示)\n", "# 必要に応じて MAE, F1, MSE などを追加実装してください。\n", "# -------------------------------------------------------\n", "def compute_metrics_regression(eval_pred):\n", " logits, labels = eval_pred\n", " # logits: shape (batch_size, 1)\n", " predictions = logits.reshape(-1)\n", " mae = mean_absolute_error(labels, predictions)\n", " mse = mean_squared_error(labels, predictions)\n", " return {\n", " \"mae\": mae,\n", " \"mse\": mse\n", " }\n", "\n", "class ModernBertForScoring(ModernBertForSequenceClassification):\n", " \"\"\"\n", " ModernBertForSequenceClassificationを継承し、\n", " 出力層にシグモイドをかけて 0~1 の範囲にマッピングするカスタムクラス。\n", " \"\"\"\n", "\n", " def __init__(self, config):\n", " super().__init__(config)\n", " # num_labels=1 + 回帰タスク想定なので、classification_head は linear + activation とする\n", " # 既存の self.classifier を再利用しつつ、最後にシグモイドを追加するイメージ\n", " self.sigmoid = nn.Sigmoid()\n", " # もし self.classifier が 1 出力以外になっている場合は要調整\n", " # (ModernBertForSequenceClassification の場合は config.num_labels に応じた Linear が作られる想定)\n", "\n", " def forward(\n", " self,\n", " input_ids=None,\n", " attention_mask=None,\n", " token_type_ids=None,\n", " labels=None,\n", " **kwargs,\n", " ):\n", " # 親クラス(ModernBertForSequenceClassification)の forward を実行\n", " # ただし 親クラスは [loss, logits] を返す実装なので、それを受け取り再加工する\n", " outputs = super().forward(\n", " input_ids=input_ids,\n", " attention_mask=attention_mask,\n", " token_type_ids=token_type_ids,\n", " labels=None, # ここでは一旦親の loss 計算を無効化し、自前でやる\n", " **kwargs,\n", " )\n", "\n", " # 親から返される logits は shape = (batch_size, num_labels=1) のはず\n", " logits = outputs.logits # => [B,1]\n", "\n", " # ここでシグモイドをかけて 0~1 に収まるようにする\n", " preds = self.sigmoid(logits) # => [B,1], range(0,1)\n", "\n", " loss = None\n", " if labels is not None:\n", " labels = labels.view(-1, 1).float()\n", " loss_fct = nn.MSELoss()\n", " loss = loss_fct(preds, labels)\n", " \n", " # hidden_states / attentions が None の場合も型的に問題なく格納できる\n", " return SequenceClassifierOutput(\n", " loss=loss,\n", " logits=preds, # シグモイド後の出力 (shape=[B,1])\n", " hidden_states=outputs.hidden_states,\n", " attentions=outputs.attentions,\n", " )\n", "\n", "\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", "\n", "def compute_metrics_regression(eval_pred):\n", " logits, labels = eval_pred\n", " # logits: [batch_size, 1], labels: [batch_size]\n", " preds = logits.reshape(-1)\n", " mse = mean_squared_error(labels, preds)\n", " mae = mean_absolute_error(labels, preds)\n", " return {\n", " \"mse\": mse,\n", " \"mae\": mae\n", " }\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Info] Loading CSV from: /media/kurogane/kioxia1/dataset/sss/pixiv/JMTB_1_rescore_float.csv\n", "[Info] CSV loaded: 6480 rows.\n", "[Info] Built dataset with columns: ['input_text', 'label']\n", "DatasetDict({\n", " train: Dataset({\n", " features: ['input_text', 'label'],\n", " num_rows: 5184\n", " })\n", " validation: Dataset({\n", " features: ['input_text', 'label'],\n", " num_rows: 648\n", " })\n", " test: Dataset({\n", " features: ['input_text', 'label'],\n", " num_rows: 648\n", " })\n", "})\n", "[Info] Loading tokenizer for sbintuitions/modernbert-ja-130m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "851e20ba48b845b995288997e95a58c5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/5184 [00:00\n", " \n", " \n", " [2592/2592 18:44, Epoch 32/32]\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossMseMae
10.1210000.0507170.0507170.172666
20.0891000.0413920.0413920.158076
30.0789000.0295730.0295730.121381
40.1049000.0648990.0648990.209115
50.0505000.0299660.0299660.131566
60.0245000.0687390.0687390.215073
70.0176000.0326280.0326280.140590
80.0115000.0240800.0240800.107284
90.0096000.0235500.0235500.106661
100.0089000.0196720.0196720.098421
110.0079000.0208090.0208090.108778
120.0050000.0187930.0187930.098439
130.0036000.0176990.0176990.098569
140.0029000.0202240.0202240.100133
150.0034000.0172070.0172070.096104
160.0012000.0177200.0177200.095289
170.0015000.0179830.0179830.096090
180.0008000.0177090.0177090.095045
190.0009000.0174560.0174560.094618
200.0003000.0174870.0174870.095387
210.0002000.0174180.0174180.094866
220.0001000.0173750.0173750.095027
230.0001000.0171700.0171700.095647
240.0001000.0173440.0173440.095632
250.0000000.0171270.0171270.095365
260.0000000.0171530.0171530.095548
270.0000000.0172620.0172620.095495
280.0000000.0172040.0172040.095659
290.0000000.0173180.0173180.095501
300.0000000.0172860.0172860.095896
310.0000000.0173630.0173630.095974
320.0000000.0172500.0172500.095597

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[Info] Evaluating on test set ...\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [81/81 00:01]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Test set metrics: {'eval_loss': 0.022432124242186546, 'eval_mse': 0.022432127967476845, 'eval_mae': 0.10348472744226456, 'eval_runtime': 1.4185, 'eval_samples_per_second': 456.805, 'eval_steps_per_second': 57.101, 'epoch': 32.0}\n", "[Info] Done. Saving final model ...\n", "[Info] Finished.\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# -------------------------------------------------------\n", "# 6. 実行メイン\n", "# -------------------------------------------------------\n", "\n", "6\n", "# 学習関連ハイパーパラメータ\n", "TRAIN_EPOCHS = 32\n", "TRAIN_BATCH_SIZE = 64\n", "EVAL_BATCH_SIZE = 8\n", "LEARNING_RATE = 4e-5\n", "\n", "\n", "SAVE_DIR = \"./modernbert_jamt_finetune_ckpt_{:0=2}\".format(len(glob.glob(\"./modernbert_jamt_finetune_ckpt_*\")))\n", "SPLIT_RATIO = (0.8, 0.1, 0.1) # train:valid:test = 8:1:1\n", "\n", "\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "random.seed(SEED)\n", "np.random.seed(SEED)\n", "torch.manual_seed(SEED)\n", "\n", "print(f\"[Info] Loading CSV from: {CSV_FILE_PATH}\")\n", "df = load_jmtb_data(CSV_FILE_PATH)\n", "print(f\"[Info] CSV loaded: {len(df)} rows.\")\n", "\n", "# Dataset化\n", "dataset_all = create_dataset_from_df(df)\n", "print(\"[Info] Built dataset with columns:\", dataset_all.column_names)\n", "\n", "# train/dev/test split\n", "dataset_dict = split_dataset(dataset_all, split_ratio=SPLIT_RATIO, seed=SEED)\n", "\n", "# 変数をpickle形式で保存する\n", "with open(\"./dataset_dict_float.pkl\", \"wb\") as file:\n", " pickle.dump(dataset_dict, file)\n", "# # pickle形式で保存された変数を読み込む\n", "# with open(\"./dataset_dict_float.pkl\", \"rb\") as file:\n", "# dataset_dict = pickle.load(file)\n", "\n", "\n", "# dataset_dict = DatasetDict.load_from_disk(\"./jmtb_dataset_splits\")\n", "\n", "print(dataset_dict)\n", "\n", "# トークナイザ準備\n", "print(f\"[Info] Loading tokenizer for {MODEL_NAME}\")\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", "\n", "def tokenize_fn(examples):\n", " return tokenize_function(examples, tokenizer, max_length=None)\n", "\n", "dataset_dict = dataset_dict.map(tokenize_fn, batched=True)\n", "\n", "# モデルConfigとモデル本体\n", "# num_labels=11クラス分類 (score=0..10)\n", "config = AutoConfig.from_pretrained(\n", " MODEL_NAME,\n", " num_labels=1,\n", " problem_type=\"single_label_regression\"\n", ")\n", "# 注意: AutoConfig で problem_type 指定しても、上書きするのは親クラスの forward.\n", "# ここでは主に「情報として入れておく」ため\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", "\n", "model = ModernBertForScoring.from_pretrained(\n", " MODEL_NAME,\n", " config=config\n", ")\n", "\n", "\n", "\n", "# 学習データと評価データへ正しく入力されるようにcollator準備\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n", "\n", "# Trainer用の引数設定\n", "training_args = TrainingArguments(\n", " output_dir=SAVE_DIR,\n", " num_train_epochs=TRAIN_EPOCHS,\n", " learning_rate=LEARNING_RATE,\n", " per_device_train_batch_size=TRAIN_BATCH_SIZE,\n", " per_device_eval_batch_size=EVAL_BATCH_SIZE,\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " logging_strategy=\"epoch\",\n", " load_best_model_at_end=True,\n", " bf16=True, # Ampere以降のGPUでMixed Precision(BF16)学習\n", " bf16_full_eval=True,\n", " report_to=\"none\", # レポート先をOFF(W&Bなど使わない場合)\n", " seed=SEED,\n", " warmup_ratio=0.1,\n", " lr_scheduler_type=\"cosine\",\n", " weight_decay=0.01,\n", " # logging_dir=SAVE_DIR,\n", ")\n", "\n", "# Trainer生成\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=dataset_dict[\"train\"],\n", " eval_dataset=dataset_dict[\"validation\"],\n", " tokenizer=tokenizer,\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics_regression, #compute_metrics_classification,\n", ")\n", "\n", "print(\"[Info] Starting training ...\")\n", "trainer.train()\n", "\n", "# 学習完了後、テストセットで評価\n", "print(\"[Info] Evaluating on test set ...\")\n", "metrics_test = trainer.evaluate(dataset_dict[\"test\"])\n", "print(\"Test set metrics:\", metrics_test)\n", "\n", "# 終了処理\n", "print(\"[Info] Done. Saving final model ...\")\n", "trainer.save_model(SAVE_DIR)\n", "print(\"[Info] Finished.\")\n", "\n", "\n", "\n", "# ロスなどの結果を別途保存\n", "dir_checkpoints = glob.glob(os.path.join(SAVE_DIR, \"checkpoint-*\", \"trainer_state.json\"))\n", "def atoi(text):\n", " return int(text) if text.isdigit() else text\n", "\n", "def natural_keys(text):\n", " return [ atoi(c) for c in re.split(r'(\\d+)', text) ]\n", "\n", "dir_checkpoints = sorted(dir_checkpoints,key=natural_keys)\n", "\n", "l_data_eval_mae = []\n", "l_data_eval_mse = []\n", "l_data_eval_loss = []\n", "l_data_loss = []\n", "for i_checkpoint in dir_checkpoints:\n", " with open(i_checkpoint, \"r\", encoding=\"utf-8\") as reader:\n", " data_check = json.load(reader)\n", " l_data_eval_mae.append(data_check[\"log_history\"][-1][\"eval_mae\"])\n", " l_data_eval_mse.append(data_check[\"log_history\"][-1][\"eval_mse\"])\n", " l_data_eval_loss.append(data_check[\"log_history\"][-1][\"eval_loss\"])\n", " l_data_loss.append(data_check[\"log_history\"][-2][\"loss\"])\n", "\n", "d_logs = {\n", " \"eval_mae\": l_data_eval_mae,\n", " \"eval_mse\": l_data_eval_mse,\n", " \"eval_loss\": l_data_eval_loss,\n", " \"loss\": l_data_loss,\n", "}\n", "\n", "with open(os.path.join(SAVE_DIR, \"log_epochs.json\"), \"w\", encoding=\"utf-8\") as writer:\n", " json.dump(d_logs, writer, indent=4, ensure_ascii=False)\n", "\n", "# 可視化\n", "fig, ax = plt.subplots(ncols=2)\n", "\n", "ax[0].plot(l_data_eval_mae, label=\"eval_mae\")\n", "ax[0].plot(l_data_eval_mse, label=\"eval_mse\")\n", "ax[1].plot(l_data_eval_loss, label=\"eval_loss\")\n", "ax[1].plot(l_data_loss, label=\"loss\")\n", "\n", "ax[0].set_xlabel(\"epochs\")\n", "ax[1].set_xlabel(\"epochs\")\n", "\n", "ax[0].legend()\n", "ax[1].legend()\n", "\n", "plt.savefig(os.path.join(SAVE_DIR, \"log_epochs.png\"))\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "vllmtest", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }