{ "best_metric": 0.6833301782608032, "best_model_checkpoint": "mistralai/Mistral-7B-Instruct-v0.2_trail-/checkpoint-1600", "epoch": 0.9696969696969697, "eval_steps": 200, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2.0257179737091064, "learning_rate": 2e-05, "loss": 1.4524, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.6968803405761719, "learning_rate": 2e-05, "loss": 0.9393, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.7078455686569214, "learning_rate": 2e-05, "loss": 0.8432, "step": 30 }, { "epoch": 0.02, "grad_norm": 1.6603143215179443, "learning_rate": 2e-05, "loss": 0.8382, "step": 40 }, { "epoch": 0.03, "grad_norm": 1.6785234212875366, "learning_rate": 2e-05, "loss": 0.7994, "step": 50 }, { "epoch": 0.04, "grad_norm": 1.273807406425476, "learning_rate": 2e-05, "loss": 0.8302, "step": 60 }, { "epoch": 0.04, "grad_norm": 1.3506838083267212, "learning_rate": 2e-05, "loss": 0.7588, "step": 70 }, { "epoch": 0.05, "grad_norm": 1.3038816452026367, "learning_rate": 2e-05, "loss": 0.7462, "step": 80 }, { "epoch": 0.05, "grad_norm": 1.4663119316101074, "learning_rate": 2e-05, "loss": 0.7391, "step": 90 }, { "epoch": 0.06, "grad_norm": 1.2945542335510254, "learning_rate": 2e-05, "loss": 0.7768, "step": 100 }, { "epoch": 0.07, "grad_norm": 1.258899211883545, "learning_rate": 2e-05, "loss": 0.7737, "step": 110 }, { "epoch": 0.07, "grad_norm": 1.416283369064331, "learning_rate": 2e-05, "loss": 0.7422, "step": 120 }, { "epoch": 0.08, "grad_norm": 1.200121283531189, "learning_rate": 2e-05, "loss": 0.7306, "step": 130 }, { "epoch": 0.08, "grad_norm": 1.1042457818984985, "learning_rate": 2e-05, "loss": 0.733, "step": 140 }, { "epoch": 0.09, "grad_norm": 1.3173305988311768, "learning_rate": 2e-05, "loss": 0.7454, "step": 150 }, { "epoch": 0.1, "grad_norm": 1.2873802185058594, "learning_rate": 2e-05, "loss": 0.7218, "step": 160 }, { "epoch": 0.1, "grad_norm": 1.2658429145812988, "learning_rate": 2e-05, "loss": 0.763, "step": 170 }, { "epoch": 0.11, "grad_norm": 1.1320024728775024, "learning_rate": 2e-05, "loss": 0.754, "step": 180 }, { "epoch": 0.12, "grad_norm": 1.074420690536499, "learning_rate": 2e-05, "loss": 0.6971, "step": 190 }, { "epoch": 0.12, "grad_norm": 1.051477074623108, "learning_rate": 2e-05, "loss": 0.7318, "step": 200 }, { "epoch": 0.12, "eval_loss": 0.72369384765625, "eval_runtime": 576.3119, "eval_samples_per_second": 0.953, "eval_steps_per_second": 0.095, "step": 200 }, { "epoch": 0.13, "grad_norm": 1.084792971611023, "learning_rate": 2e-05, "loss": 0.7234, "step": 210 }, { "epoch": 0.13, "grad_norm": 1.2191482782363892, "learning_rate": 2e-05, "loss": 0.7117, "step": 220 }, { "epoch": 0.14, "grad_norm": 1.0391403436660767, "learning_rate": 2e-05, "loss": 0.7561, "step": 230 }, { "epoch": 0.15, "grad_norm": 1.1228817701339722, "learning_rate": 2e-05, "loss": 0.7488, "step": 240 }, { "epoch": 0.15, "grad_norm": 1.03204345703125, "learning_rate": 2e-05, "loss": 0.7096, "step": 250 }, { "epoch": 0.16, "grad_norm": 1.0652753114700317, "learning_rate": 2e-05, "loss": 0.7035, "step": 260 }, { "epoch": 0.16, "grad_norm": 1.0256322622299194, "learning_rate": 2e-05, "loss": 0.7151, "step": 270 }, { "epoch": 0.17, "grad_norm": 0.9747219681739807, "learning_rate": 2e-05, "loss": 0.7161, "step": 280 }, { "epoch": 0.18, "grad_norm": 1.028686761856079, "learning_rate": 2e-05, "loss": 0.7268, "step": 290 }, { "epoch": 0.18, "grad_norm": 1.129952073097229, "learning_rate": 2e-05, "loss": 0.812, "step": 300 }, { "epoch": 0.19, "grad_norm": 1.1518712043762207, "learning_rate": 2e-05, "loss": 0.7328, "step": 310 }, { "epoch": 0.19, "grad_norm": 1.0078867673873901, "learning_rate": 2e-05, "loss": 0.7201, "step": 320 }, { "epoch": 0.2, "grad_norm": 1.050847053527832, "learning_rate": 2e-05, "loss": 0.7003, "step": 330 }, { "epoch": 0.21, "grad_norm": 0.9970359802246094, "learning_rate": 2e-05, "loss": 0.6904, "step": 340 }, { "epoch": 0.21, "grad_norm": 0.9830156564712524, "learning_rate": 2e-05, "loss": 0.7167, "step": 350 }, { "epoch": 0.22, "grad_norm": 1.121741771697998, "learning_rate": 2e-05, "loss": 0.7053, "step": 360 }, { "epoch": 0.22, "grad_norm": 1.11673104763031, "learning_rate": 2e-05, "loss": 0.7162, "step": 370 }, { "epoch": 0.23, "grad_norm": 0.94569993019104, "learning_rate": 2e-05, "loss": 0.6991, "step": 380 }, { "epoch": 0.24, "grad_norm": 0.9942061305046082, "learning_rate": 2e-05, "loss": 0.7207, "step": 390 }, { "epoch": 0.24, "grad_norm": 1.0249754190444946, "learning_rate": 2e-05, "loss": 0.7364, "step": 400 }, { "epoch": 0.24, "eval_loss": 0.7096507549285889, "eval_runtime": 576.6307, "eval_samples_per_second": 0.952, "eval_steps_per_second": 0.095, "step": 400 }, { "epoch": 0.25, "grad_norm": 1.0410763025283813, "learning_rate": 2e-05, "loss": 0.7104, "step": 410 }, { "epoch": 0.25, "grad_norm": 1.0619620084762573, "learning_rate": 2e-05, "loss": 0.7713, "step": 420 }, { "epoch": 0.26, "grad_norm": 1.0299198627471924, "learning_rate": 2e-05, "loss": 0.6684, "step": 430 }, { "epoch": 0.27, "grad_norm": 0.946596086025238, "learning_rate": 2e-05, "loss": 0.7197, "step": 440 }, { "epoch": 0.27, "grad_norm": 1.00041663646698, "learning_rate": 2e-05, "loss": 0.7406, "step": 450 }, { "epoch": 0.28, "grad_norm": 0.971476674079895, "learning_rate": 2e-05, "loss": 0.6926, "step": 460 }, { "epoch": 0.28, "grad_norm": 0.9575595259666443, "learning_rate": 2e-05, "loss": 0.7121, "step": 470 }, { "epoch": 0.29, "grad_norm": 1.0066837072372437, "learning_rate": 2e-05, "loss": 0.7501, "step": 480 }, { "epoch": 0.3, "grad_norm": 1.1008684635162354, "learning_rate": 2e-05, "loss": 0.7413, "step": 490 }, { "epoch": 0.3, "grad_norm": 1.004758358001709, "learning_rate": 2e-05, "loss": 0.726, "step": 500 }, { "epoch": 0.31, "grad_norm": 1.0132404565811157, "learning_rate": 2e-05, "loss": 0.7446, "step": 510 }, { "epoch": 0.32, "grad_norm": 0.928865909576416, "learning_rate": 2e-05, "loss": 0.6484, "step": 520 }, { "epoch": 0.32, "grad_norm": 1.0739837884902954, "learning_rate": 2e-05, "loss": 0.693, "step": 530 }, { "epoch": 0.33, "grad_norm": 0.9825330376625061, "learning_rate": 2e-05, "loss": 0.7004, "step": 540 }, { "epoch": 0.33, "grad_norm": 1.034639596939087, "learning_rate": 2e-05, "loss": 0.7102, "step": 550 }, { "epoch": 0.34, "grad_norm": 0.9825816750526428, "learning_rate": 2e-05, "loss": 0.6763, "step": 560 }, { "epoch": 0.35, "grad_norm": 0.872897744178772, "learning_rate": 2e-05, "loss": 0.7211, "step": 570 }, { "epoch": 0.35, "grad_norm": 1.0789843797683716, "learning_rate": 2e-05, "loss": 0.744, "step": 580 }, { "epoch": 0.36, "grad_norm": 0.8900558352470398, "learning_rate": 2e-05, "loss": 0.7098, "step": 590 }, { "epoch": 0.36, "grad_norm": 0.9795206785202026, "learning_rate": 2e-05, "loss": 0.6974, "step": 600 }, { "epoch": 0.36, "eval_loss": 0.7024338245391846, "eval_runtime": 576.7068, "eval_samples_per_second": 0.952, "eval_steps_per_second": 0.095, "step": 600 }, { "epoch": 0.37, "grad_norm": 0.8986483812332153, "learning_rate": 2e-05, "loss": 0.6655, "step": 610 }, { "epoch": 0.38, "grad_norm": 1.0255687236785889, "learning_rate": 2e-05, "loss": 0.6921, "step": 620 }, { "epoch": 0.38, "grad_norm": 1.1219247579574585, "learning_rate": 2e-05, "loss": 0.7142, "step": 630 }, { "epoch": 0.39, "grad_norm": 1.0987035036087036, "learning_rate": 2e-05, "loss": 0.7698, "step": 640 }, { "epoch": 0.39, "grad_norm": 0.976980984210968, "learning_rate": 2e-05, "loss": 0.7185, "step": 650 }, { "epoch": 0.4, "grad_norm": 0.9381852149963379, "learning_rate": 2e-05, "loss": 0.6631, "step": 660 }, { "epoch": 0.41, "grad_norm": 1.0329375267028809, "learning_rate": 2e-05, "loss": 0.6894, "step": 670 }, { "epoch": 0.41, "grad_norm": 0.921355128288269, "learning_rate": 2e-05, "loss": 0.753, "step": 680 }, { "epoch": 0.42, "grad_norm": 0.9028052687644958, "learning_rate": 2e-05, "loss": 0.7371, "step": 690 }, { "epoch": 0.42, "grad_norm": 0.9813818335533142, "learning_rate": 2e-05, "loss": 0.6981, "step": 700 }, { "epoch": 0.43, "grad_norm": 1.2183573246002197, "learning_rate": 2e-05, "loss": 0.7199, "step": 710 }, { "epoch": 0.44, "grad_norm": 0.9314076900482178, "learning_rate": 2e-05, "loss": 0.7096, "step": 720 }, { "epoch": 0.44, "grad_norm": 0.8565794825553894, "learning_rate": 2e-05, "loss": 0.6797, "step": 730 }, { "epoch": 0.45, "grad_norm": 0.8477567434310913, "learning_rate": 2e-05, "loss": 0.6874, "step": 740 }, { "epoch": 0.45, "grad_norm": 1.0110888481140137, "learning_rate": 2e-05, "loss": 0.7495, "step": 750 }, { "epoch": 0.46, "grad_norm": 0.9757490158081055, "learning_rate": 2e-05, "loss": 0.6809, "step": 760 }, { "epoch": 0.47, "grad_norm": 0.8909426927566528, "learning_rate": 2e-05, "loss": 0.6746, "step": 770 }, { "epoch": 0.47, "grad_norm": 0.8148036003112793, "learning_rate": 2e-05, "loss": 0.7196, "step": 780 }, { "epoch": 0.48, "grad_norm": 0.9427577257156372, "learning_rate": 2e-05, "loss": 0.7082, "step": 790 }, { "epoch": 0.48, "grad_norm": 0.8812272548675537, "learning_rate": 2e-05, "loss": 0.7031, "step": 800 }, { "epoch": 0.48, "eval_loss": 0.6963450908660889, "eval_runtime": 576.5372, "eval_samples_per_second": 0.952, "eval_steps_per_second": 0.095, "step": 800 }, { "epoch": 0.49, "grad_norm": 1.184993028640747, "learning_rate": 2e-05, "loss": 0.6883, "step": 810 }, { "epoch": 0.5, "grad_norm": 1.032034993171692, "learning_rate": 2e-05, "loss": 0.687, "step": 820 }, { "epoch": 0.5, "grad_norm": 0.8716715574264526, "learning_rate": 2e-05, "loss": 0.7196, "step": 830 }, { "epoch": 0.51, "grad_norm": 1.0160155296325684, "learning_rate": 2e-05, "loss": 0.7549, "step": 840 }, { "epoch": 0.52, "grad_norm": 0.9079670906066895, "learning_rate": 2e-05, "loss": 0.6564, "step": 850 }, { "epoch": 0.52, "grad_norm": 0.9788998365402222, "learning_rate": 2e-05, "loss": 0.698, "step": 860 }, { "epoch": 0.53, "grad_norm": 0.9993180632591248, "learning_rate": 2e-05, "loss": 0.7041, "step": 870 }, { "epoch": 0.53, "grad_norm": 0.8905129432678223, "learning_rate": 2e-05, "loss": 0.7061, "step": 880 }, { "epoch": 0.54, "grad_norm": 0.9284083247184753, "learning_rate": 2e-05, "loss": 0.7213, "step": 890 }, { "epoch": 0.55, "grad_norm": 0.9979264140129089, "learning_rate": 2e-05, "loss": 0.7067, "step": 900 }, { "epoch": 0.55, "grad_norm": 0.9622723460197449, "learning_rate": 2e-05, "loss": 0.7078, "step": 910 }, { "epoch": 0.56, "grad_norm": 1.0288429260253906, "learning_rate": 2e-05, "loss": 0.7018, "step": 920 }, { "epoch": 0.56, "grad_norm": 0.9015412926673889, "learning_rate": 2e-05, "loss": 0.7013, "step": 930 }, { "epoch": 0.57, "grad_norm": 0.9322758316993713, "learning_rate": 2e-05, "loss": 0.7006, "step": 940 }, { "epoch": 0.58, "grad_norm": 0.875117301940918, "learning_rate": 2e-05, "loss": 0.6632, "step": 950 }, { "epoch": 0.58, "grad_norm": 0.9057668447494507, "learning_rate": 2e-05, "loss": 0.7009, "step": 960 }, { "epoch": 0.59, "grad_norm": 0.8785907030105591, "learning_rate": 2e-05, "loss": 0.7029, "step": 970 }, { "epoch": 0.59, "grad_norm": 0.896600067615509, "learning_rate": 2e-05, "loss": 0.6583, "step": 980 }, { "epoch": 0.6, "grad_norm": 0.9632496237754822, "learning_rate": 2e-05, "loss": 0.7226, "step": 990 }, { "epoch": 0.61, "grad_norm": 1.0047279596328735, "learning_rate": 2e-05, "loss": 0.7201, "step": 1000 }, { "epoch": 0.61, "eval_loss": 0.692192018032074, "eval_runtime": 576.3596, "eval_samples_per_second": 0.953, "eval_steps_per_second": 0.095, "step": 1000 }, { "epoch": 0.61, "grad_norm": 0.8589789271354675, "learning_rate": 2e-05, "loss": 0.6875, "step": 1010 }, { "epoch": 0.62, "grad_norm": 0.9107193946838379, "learning_rate": 2e-05, "loss": 0.6563, "step": 1020 }, { "epoch": 0.62, "grad_norm": 0.9334760308265686, "learning_rate": 2e-05, "loss": 0.7311, "step": 1030 }, { "epoch": 0.63, "grad_norm": 0.8864697813987732, "learning_rate": 2e-05, "loss": 0.7121, "step": 1040 }, { "epoch": 0.64, "grad_norm": 0.952052891254425, "learning_rate": 2e-05, "loss": 0.6842, "step": 1050 }, { "epoch": 0.64, "grad_norm": 0.9001901745796204, "learning_rate": 2e-05, "loss": 0.6769, "step": 1060 }, { "epoch": 0.65, "grad_norm": 0.963742733001709, "learning_rate": 2e-05, "loss": 0.7003, "step": 1070 }, { "epoch": 0.65, "grad_norm": 0.9565225839614868, "learning_rate": 2e-05, "loss": 0.7478, "step": 1080 }, { "epoch": 0.66, "grad_norm": 0.9401019811630249, "learning_rate": 2e-05, "loss": 0.7475, "step": 1090 }, { "epoch": 0.67, "grad_norm": 0.8856386542320251, "learning_rate": 2e-05, "loss": 0.6879, "step": 1100 }, { "epoch": 0.67, "grad_norm": 0.8875776529312134, "learning_rate": 2e-05, "loss": 0.705, "step": 1110 }, { "epoch": 0.68, "grad_norm": 0.9274523854255676, "learning_rate": 2e-05, "loss": 0.7378, "step": 1120 }, { "epoch": 0.68, "grad_norm": 0.9206425547599792, "learning_rate": 2e-05, "loss": 0.6691, "step": 1130 }, { "epoch": 0.69, "grad_norm": 0.8944551944732666, "learning_rate": 2e-05, "loss": 0.684, "step": 1140 }, { "epoch": 0.7, "grad_norm": 1.1095612049102783, "learning_rate": 2e-05, "loss": 0.7046, "step": 1150 }, { "epoch": 0.7, "grad_norm": 0.9099195599555969, "learning_rate": 2e-05, "loss": 0.6839, "step": 1160 }, { "epoch": 0.71, "grad_norm": 0.8802533745765686, "learning_rate": 2e-05, "loss": 0.6668, "step": 1170 }, { "epoch": 0.72, "grad_norm": 0.9142954349517822, "learning_rate": 2e-05, "loss": 0.6869, "step": 1180 }, { "epoch": 0.72, "grad_norm": 0.8676968812942505, "learning_rate": 2e-05, "loss": 0.7255, "step": 1190 }, { "epoch": 0.73, "grad_norm": 0.8218028545379639, "learning_rate": 2e-05, "loss": 0.7424, "step": 1200 }, { "epoch": 0.73, "eval_loss": 0.6879069209098816, "eval_runtime": 576.2219, "eval_samples_per_second": 0.953, "eval_steps_per_second": 0.095, "step": 1200 }, { "epoch": 0.73, "grad_norm": 0.8519162535667419, "learning_rate": 2e-05, "loss": 0.7128, "step": 1210 }, { "epoch": 0.74, "grad_norm": 0.8599090576171875, "learning_rate": 2e-05, "loss": 0.7309, "step": 1220 }, { "epoch": 0.75, "grad_norm": 0.9774763584136963, "learning_rate": 2e-05, "loss": 0.7092, "step": 1230 }, { "epoch": 0.75, "grad_norm": 0.9119387865066528, "learning_rate": 2e-05, "loss": 0.682, "step": 1240 }, { "epoch": 0.76, "grad_norm": 0.9051375985145569, "learning_rate": 2e-05, "loss": 0.7042, "step": 1250 }, { "epoch": 0.76, "grad_norm": 0.9292181134223938, "learning_rate": 2e-05, "loss": 0.6991, "step": 1260 }, { "epoch": 0.77, "grad_norm": 0.9441558718681335, "learning_rate": 2e-05, "loss": 0.6838, "step": 1270 }, { "epoch": 0.78, "grad_norm": 0.8878441452980042, "learning_rate": 2e-05, "loss": 0.6815, "step": 1280 }, { "epoch": 0.78, "grad_norm": 0.9556174874305725, "learning_rate": 2e-05, "loss": 0.6765, "step": 1290 }, { "epoch": 0.79, "grad_norm": 0.8373817205429077, "learning_rate": 2e-05, "loss": 0.7348, "step": 1300 }, { "epoch": 0.79, "grad_norm": 0.9634573459625244, "learning_rate": 2e-05, "loss": 0.6846, "step": 1310 }, { "epoch": 0.8, "grad_norm": 0.8429288268089294, "learning_rate": 2e-05, "loss": 0.678, "step": 1320 }, { "epoch": 0.81, "grad_norm": 0.9088312387466431, "learning_rate": 2e-05, "loss": 0.6949, "step": 1330 }, { "epoch": 0.81, "grad_norm": 0.9536995887756348, "learning_rate": 2e-05, "loss": 0.6794, "step": 1340 }, { "epoch": 0.82, "grad_norm": 0.9287436008453369, "learning_rate": 2e-05, "loss": 0.7111, "step": 1350 }, { "epoch": 0.82, "grad_norm": 0.8534522652626038, "learning_rate": 2e-05, "loss": 0.6892, "step": 1360 }, { "epoch": 0.83, "grad_norm": 0.9239506721496582, "learning_rate": 2e-05, "loss": 0.7138, "step": 1370 }, { "epoch": 0.84, "grad_norm": 0.9194501042366028, "learning_rate": 2e-05, "loss": 0.7232, "step": 1380 }, { "epoch": 0.84, "grad_norm": 0.961037278175354, "learning_rate": 2e-05, "loss": 0.697, "step": 1390 }, { "epoch": 0.85, "grad_norm": 0.8385772109031677, "learning_rate": 2e-05, "loss": 0.7016, "step": 1400 }, { "epoch": 0.85, "eval_loss": 0.6852899789810181, "eval_runtime": 576.2672, "eval_samples_per_second": 0.953, "eval_steps_per_second": 0.095, "step": 1400 }, { "epoch": 0.85, "grad_norm": 1.0281404256820679, "learning_rate": 2e-05, "loss": 0.7012, "step": 1410 }, { "epoch": 0.86, "grad_norm": 1.0333633422851562, "learning_rate": 2e-05, "loss": 0.7501, "step": 1420 }, { "epoch": 0.87, "grad_norm": 0.9055823683738708, "learning_rate": 2e-05, "loss": 0.738, "step": 1430 }, { "epoch": 0.87, "grad_norm": 0.9356181025505066, "learning_rate": 2e-05, "loss": 0.6842, "step": 1440 }, { "epoch": 0.88, "grad_norm": 0.8437691926956177, "learning_rate": 2e-05, "loss": 0.6658, "step": 1450 }, { "epoch": 0.88, "grad_norm": 1.0060279369354248, "learning_rate": 2e-05, "loss": 0.6954, "step": 1460 }, { "epoch": 0.89, "grad_norm": 0.8564276099205017, "learning_rate": 2e-05, "loss": 0.684, "step": 1470 }, { "epoch": 0.9, "grad_norm": 0.9397062659263611, "learning_rate": 2e-05, "loss": 0.7047, "step": 1480 }, { "epoch": 0.9, "grad_norm": 0.9040536284446716, "learning_rate": 2e-05, "loss": 0.7001, "step": 1490 }, { "epoch": 0.91, "grad_norm": 0.9114941358566284, "learning_rate": 2e-05, "loss": 0.6796, "step": 1500 }, { "epoch": 0.92, "grad_norm": 0.9135074615478516, "learning_rate": 2e-05, "loss": 0.7335, "step": 1510 }, { "epoch": 0.92, "grad_norm": 0.8748692274093628, "learning_rate": 2e-05, "loss": 0.6922, "step": 1520 }, { "epoch": 0.93, "grad_norm": 0.8640381097793579, "learning_rate": 2e-05, "loss": 0.6638, "step": 1530 }, { "epoch": 0.93, "grad_norm": 0.9141325950622559, "learning_rate": 2e-05, "loss": 0.6729, "step": 1540 }, { "epoch": 0.94, "grad_norm": 0.925592839717865, "learning_rate": 2e-05, "loss": 0.7008, "step": 1550 }, { "epoch": 0.95, "grad_norm": 0.9290179014205933, "learning_rate": 2e-05, "loss": 0.7058, "step": 1560 }, { "epoch": 0.95, "grad_norm": 0.8498104810714722, "learning_rate": 2e-05, "loss": 0.6838, "step": 1570 }, { "epoch": 0.96, "grad_norm": 0.9508926868438721, "learning_rate": 2e-05, "loss": 0.72, "step": 1580 }, { "epoch": 0.96, "grad_norm": 0.8391693830490112, "learning_rate": 2e-05, "loss": 0.6693, "step": 1590 }, { "epoch": 0.97, "grad_norm": 0.8421803712844849, "learning_rate": 2e-05, "loss": 0.6953, "step": 1600 }, { "epoch": 0.97, "eval_loss": 0.6833301782608032, "eval_runtime": 576.54, "eval_samples_per_second": 0.952, "eval_steps_per_second": 0.095, "step": 1600 } ], "logging_steps": 10, "max_steps": 1649, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 4.1937076224e+17, "train_batch_size": 10, "trial_name": null, "trial_params": null }