{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 1000, "global_step": 1110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04504504504504504, "grad_norm": 3.3412909507751465, "learning_rate": 1.801801801801802e-05, "loss": 1.2791, "step": 10 }, { "epoch": 0.09009009009009009, "grad_norm": 1.4792481660842896, "learning_rate": 3.603603603603604e-05, "loss": 1.0799, "step": 20 }, { "epoch": 0.13513513513513514, "grad_norm": 1.3788402080535889, "learning_rate": 5.405405405405406e-05, "loss": 0.7657, "step": 30 }, { "epoch": 0.18018018018018017, "grad_norm": 0.7668061256408691, "learning_rate": 7.207207207207208e-05, "loss": 0.5807, "step": 40 }, { "epoch": 0.22522522522522523, "grad_norm": 0.7166613936424255, "learning_rate": 9.009009009009009e-05, "loss": 0.6035, "step": 50 }, { "epoch": 0.2702702702702703, "grad_norm": 0.9661350846290588, "learning_rate": 0.00010810810810810812, "loss": 0.5441, "step": 60 }, { "epoch": 0.3153153153153153, "grad_norm": 0.7341681122779846, "learning_rate": 0.00012612612612612612, "loss": 0.6031, "step": 70 }, { "epoch": 0.36036036036036034, "grad_norm": 1.3319752216339111, "learning_rate": 0.00014414414414414415, "loss": 0.5686, "step": 80 }, { "epoch": 0.40540540540540543, "grad_norm": 0.7269447445869446, "learning_rate": 0.00016216216216216218, "loss": 0.4978, "step": 90 }, { "epoch": 0.45045045045045046, "grad_norm": 0.46780362725257874, "learning_rate": 0.00018018018018018018, "loss": 0.473, "step": 100 }, { "epoch": 0.4954954954954955, "grad_norm": 0.728823184967041, "learning_rate": 0.0001981981981981982, "loss": 0.5892, "step": 110 }, { "epoch": 0.5405405405405406, "grad_norm": 0.5054831504821777, "learning_rate": 0.0001999599507118322, "loss": 0.5117, "step": 120 }, { "epoch": 0.5855855855855856, "grad_norm": 0.6193355321884155, "learning_rate": 0.00019982154991201608, "loss": 0.4759, "step": 130 }, { "epoch": 0.6306306306306306, "grad_norm": 0.3995501399040222, "learning_rate": 0.00019958443999073397, "loss": 0.3928, "step": 140 }, { "epoch": 0.6756756756756757, "grad_norm": 0.5497131943702698, "learning_rate": 0.0001992488554155135, "loss": 0.4507, "step": 150 }, { "epoch": 0.7207207207207207, "grad_norm": 1.0290075540542603, "learning_rate": 0.00019881512803111796, "loss": 0.4766, "step": 160 }, { "epoch": 0.7657657657657657, "grad_norm": 0.6826834678649902, "learning_rate": 0.00019828368673139947, "loss": 0.5236, "step": 170 }, { "epoch": 0.8108108108108109, "grad_norm": 0.6135373711585999, "learning_rate": 0.00019765505703518496, "loss": 0.4454, "step": 180 }, { "epoch": 0.8558558558558559, "grad_norm": 0.6352598667144775, "learning_rate": 0.00019692986056661356, "loss": 0.508, "step": 190 }, { "epoch": 0.9009009009009009, "grad_norm": 0.5680545568466187, "learning_rate": 0.0001961088144404403, "loss": 0.5896, "step": 200 }, { "epoch": 0.9459459459459459, "grad_norm": 0.3386252820491791, "learning_rate": 0.00019519273055291266, "loss": 0.4729, "step": 210 }, { "epoch": 0.990990990990991, "grad_norm": 0.358553409576416, "learning_rate": 0.0001941825147789225, "loss": 0.45, "step": 220 }, { "epoch": 1.0360360360360361, "grad_norm": 0.668021023273468, "learning_rate": 0.0001930791660762262, "loss": 0.4162, "step": 230 }, { "epoch": 1.0810810810810811, "grad_norm": 0.5463367700576782, "learning_rate": 0.00019188377549761963, "loss": 0.4445, "step": 240 }, { "epoch": 1.1261261261261262, "grad_norm": 0.7385880351066589, "learning_rate": 0.000190597525112044, "loss": 0.3849, "step": 250 }, { "epoch": 1.1711711711711712, "grad_norm": 0.6837536692619324, "learning_rate": 0.0001892216868356904, "loss": 0.4652, "step": 260 }, { "epoch": 1.2162162162162162, "grad_norm": 0.866578221321106, "learning_rate": 0.00018775762117425777, "loss": 0.4648, "step": 270 }, { "epoch": 1.2612612612612613, "grad_norm": 0.6583455204963684, "learning_rate": 0.00018620677587760916, "loss": 0.3848, "step": 280 }, { "epoch": 1.3063063063063063, "grad_norm": 0.6937561631202698, "learning_rate": 0.00018457068450815562, "loss": 0.4532, "step": 290 }, { "epoch": 1.3513513513513513, "grad_norm": 0.5930050611495972, "learning_rate": 0.00018285096492438424, "loss": 0.5282, "step": 300 }, { "epoch": 1.3963963963963963, "grad_norm": 1.1432991027832031, "learning_rate": 0.0001810493176810292, "loss": 0.4369, "step": 310 }, { "epoch": 1.4414414414414414, "grad_norm": 0.5736434459686279, "learning_rate": 0.00017916752434746856, "loss": 0.4434, "step": 320 }, { "epoch": 1.4864864864864864, "grad_norm": 0.5792914032936096, "learning_rate": 0.00017720744574600863, "loss": 0.4434, "step": 330 }, { "epoch": 1.5315315315315314, "grad_norm": 0.7626290917396545, "learning_rate": 0.00017517102011179933, "loss": 0.4226, "step": 340 }, { "epoch": 1.5765765765765765, "grad_norm": 0.6746386289596558, "learning_rate": 0.00017306026117619889, "loss": 0.4126, "step": 350 }, { "epoch": 1.6216216216216215, "grad_norm": 0.5064342617988586, "learning_rate": 0.00017087725617548385, "loss": 0.3926, "step": 360 }, { "epoch": 1.6666666666666665, "grad_norm": 0.793991208076477, "learning_rate": 0.0001686241637868734, "loss": 0.4437, "step": 370 }, { "epoch": 1.7117117117117115, "grad_norm": 0.6197868585586548, "learning_rate": 0.00016630321199390867, "loss": 0.3932, "step": 380 }, { "epoch": 1.7567567567567568, "grad_norm": 0.5004612803459167, "learning_rate": 0.0001639166958832985, "loss": 0.3883, "step": 390 }, { "epoch": 1.8018018018018018, "grad_norm": 0.7265865206718445, "learning_rate": 0.00016146697537540924, "loss": 0.4453, "step": 400 }, { "epoch": 1.8468468468468469, "grad_norm": 0.5155379772186279, "learning_rate": 0.00015895647289064396, "loss": 0.48, "step": 410 }, { "epoch": 1.8918918918918919, "grad_norm": 0.5756716132164001, "learning_rate": 0.0001563876709540178, "loss": 0.4874, "step": 420 }, { "epoch": 1.936936936936937, "grad_norm": 0.7919459342956543, "learning_rate": 0.00015376310974029873, "loss": 0.4075, "step": 430 }, { "epoch": 1.981981981981982, "grad_norm": 0.5977569818496704, "learning_rate": 0.0001510853845621409, "loss": 0.504, "step": 440 }, { "epoch": 2.027027027027027, "grad_norm": 0.601466178894043, "learning_rate": 0.00014835714330369446, "loss": 0.3732, "step": 450 }, { "epoch": 2.0720720720720722, "grad_norm": 0.575406014919281, "learning_rate": 0.00014558108380223012, "loss": 0.3317, "step": 460 }, { "epoch": 2.1171171171171173, "grad_norm": 1.0440267324447632, "learning_rate": 0.00014275995118036693, "loss": 0.3896, "step": 470 }, { "epoch": 2.1621621621621623, "grad_norm": 0.6646713614463806, "learning_rate": 0.00013989653513154165, "loss": 0.3478, "step": 480 }, { "epoch": 2.2072072072072073, "grad_norm": 0.7202288508415222, "learning_rate": 0.00013699366716140435, "loss": 0.3712, "step": 490 }, { "epoch": 2.2522522522522523, "grad_norm": 0.6566169261932373, "learning_rate": 0.00013405421778786737, "loss": 0.3548, "step": 500 }, { "epoch": 2.2972972972972974, "grad_norm": 1.0158584117889404, "learning_rate": 0.00013108109370257712, "loss": 0.3404, "step": 510 }, { "epoch": 2.3423423423423424, "grad_norm": 0.7582752108573914, "learning_rate": 0.00012807723489661495, "loss": 0.374, "step": 520 }, { "epoch": 2.3873873873873874, "grad_norm": 0.7685467600822449, "learning_rate": 0.00012504561175326985, "loss": 0.3127, "step": 530 }, { "epoch": 2.4324324324324325, "grad_norm": 0.9077286124229431, "learning_rate": 0.00012198922211075778, "loss": 0.353, "step": 540 }, { "epoch": 2.4774774774774775, "grad_norm": 0.9107437133789062, "learning_rate": 0.00011891108829779165, "loss": 0.3531, "step": 550 }, { "epoch": 2.5225225225225225, "grad_norm": 1.1385325193405151, "learning_rate": 0.0001158142541449341, "loss": 0.3695, "step": 560 }, { "epoch": 2.5675675675675675, "grad_norm": 0.9225629568099976, "learning_rate": 0.00011270178197468789, "loss": 0.3606, "step": 570 }, { "epoch": 2.6126126126126126, "grad_norm": 0.6338076591491699, "learning_rate": 0.00010957674957330042, "loss": 0.312, "step": 580 }, { "epoch": 2.6576576576576576, "grad_norm": 1.3998136520385742, "learning_rate": 0.00010644224714727681, "loss": 0.4027, "step": 590 }, { "epoch": 2.7027027027027026, "grad_norm": 0.598822832107544, "learning_rate": 0.00010330137426761135, "loss": 0.3496, "step": 600 }, { "epoch": 2.7477477477477477, "grad_norm": 0.9068642854690552, "learning_rate": 0.00010015723680475846, "loss": 0.3489, "step": 610 }, { "epoch": 2.7927927927927927, "grad_norm": 0.4025176167488098, "learning_rate": 9.70129438573747e-05, "loss": 0.3296, "step": 620 }, { "epoch": 2.8378378378378377, "grad_norm": 0.6708613634109497, "learning_rate": 9.38716046778684e-05, "loss": 0.3004, "step": 630 }, { "epoch": 2.8828828828828827, "grad_norm": 0.7858556509017944, "learning_rate": 9.07363255977973e-05, "loss": 0.3716, "step": 640 }, { "epoch": 2.9279279279279278, "grad_norm": 0.6855165958404541, "learning_rate": 8.76102069561545e-05, "loss": 0.311, "step": 650 }, { "epoch": 2.972972972972973, "grad_norm": 0.6526620388031006, "learning_rate": 8.449634003358022e-05, "loss": 0.3488, "step": 660 }, { "epoch": 3.018018018018018, "grad_norm": 0.3698066174983978, "learning_rate": 8.13978039955308e-05, "loss": 0.2858, "step": 670 }, { "epoch": 3.063063063063063, "grad_norm": 0.8586738705635071, "learning_rate": 7.831766284742807e-05, "loss": 0.2565, "step": 680 }, { "epoch": 3.108108108108108, "grad_norm": 0.8718597292900085, "learning_rate": 7.525896240479976e-05, "loss": 0.2173, "step": 690 }, { "epoch": 3.153153153153153, "grad_norm": 0.7671772241592407, "learning_rate": 7.222472728140695e-05, "loss": 0.2548, "step": 700 }, { "epoch": 3.1981981981981984, "grad_norm": 1.2702572345733643, "learning_rate": 6.921795789833723e-05, "loss": 0.2638, "step": 710 }, { "epoch": 3.2432432432432434, "grad_norm": 1.4898873567581177, "learning_rate": 6.624162751702076e-05, "loss": 0.2623, "step": 720 }, { "epoch": 3.2882882882882885, "grad_norm": 1.0137726068496704, "learning_rate": 6.329867929910347e-05, "loss": 0.2938, "step": 730 }, { "epoch": 3.3333333333333335, "grad_norm": 0.9631416201591492, "learning_rate": 6.039202339608432e-05, "loss": 0.2443, "step": 740 }, { "epoch": 3.3783783783783785, "grad_norm": 0.8912140130996704, "learning_rate": 5.752453407159522e-05, "loss": 0.2359, "step": 750 }, { "epoch": 3.4234234234234235, "grad_norm": 0.9686083793640137, "learning_rate": 5.469904685916861e-05, "loss": 0.2465, "step": 760 }, { "epoch": 3.4684684684684686, "grad_norm": 1.942658543586731, "learning_rate": 5.191835575830352e-05, "loss": 0.3042, "step": 770 }, { "epoch": 3.5135135135135136, "grad_norm": 1.2755067348480225, "learning_rate": 4.918521047160308e-05, "loss": 0.2885, "step": 780 }, { "epoch": 3.5585585585585586, "grad_norm": 0.8992679715156555, "learning_rate": 4.650231368571486e-05, "loss": 0.2728, "step": 790 }, { "epoch": 3.6036036036036037, "grad_norm": 2.1154401302337646, "learning_rate": 4.387231839876349e-05, "loss": 0.258, "step": 800 }, { "epoch": 3.6486486486486487, "grad_norm": 0.6957826018333435, "learning_rate": 4.129782529691815e-05, "loss": 0.3219, "step": 810 }, { "epoch": 3.6936936936936937, "grad_norm": 0.8093072175979614, "learning_rate": 3.878138018268866e-05, "loss": 0.2318, "step": 820 }, { "epoch": 3.7387387387387387, "grad_norm": 0.900164008140564, "learning_rate": 3.632547145749395e-05, "loss": 0.3025, "step": 830 }, { "epoch": 3.7837837837837838, "grad_norm": 1.3732051849365234, "learning_rate": 3.393252766099187e-05, "loss": 0.2744, "step": 840 }, { "epoch": 3.828828828828829, "grad_norm": 1.3438997268676758, "learning_rate": 3.1604915069603436e-05, "loss": 0.2663, "step": 850 }, { "epoch": 3.873873873873874, "grad_norm": 0.7277224063873291, "learning_rate": 2.9344935356606773e-05, "loss": 0.2058, "step": 860 }, { "epoch": 3.918918918918919, "grad_norm": 0.9671199321746826, "learning_rate": 2.7154823316113932e-05, "loss": 0.2466, "step": 870 }, { "epoch": 3.963963963963964, "grad_norm": 1.0856068134307861, "learning_rate": 2.5036744653181753e-05, "loss": 0.2695, "step": 880 }, { "epoch": 4.009009009009009, "grad_norm": 0.7191686034202576, "learning_rate": 2.29927938422419e-05, "loss": 0.2252, "step": 890 }, { "epoch": 4.054054054054054, "grad_norm": 0.9094095826148987, "learning_rate": 2.102499205596743e-05, "loss": 0.2067, "step": 900 }, { "epoch": 4.099099099099099, "grad_norm": 1.2016669511795044, "learning_rate": 1.913528516662452e-05, "loss": 0.2165, "step": 910 }, { "epoch": 4.1441441441441444, "grad_norm": 1.6922552585601807, "learning_rate": 1.7325541821885384e-05, "loss": 0.2102, "step": 920 }, { "epoch": 4.1891891891891895, "grad_norm": 1.52359139919281, "learning_rate": 1.5597551597004966e-05, "loss": 0.1765, "step": 930 }, { "epoch": 4.2342342342342345, "grad_norm": 1.333765983581543, "learning_rate": 1.3953023225189243e-05, "loss": 0.2147, "step": 940 }, { "epoch": 4.2792792792792795, "grad_norm": 0.9832772016525269, "learning_rate": 1.23935829079042e-05, "loss": 0.2068, "step": 950 }, { "epoch": 4.324324324324325, "grad_norm": 0.7258216738700867, "learning_rate": 1.0920772706797167e-05, "loss": 0.1884, "step": 960 }, { "epoch": 4.36936936936937, "grad_norm": 1.0229756832122803, "learning_rate": 9.536049018820192e-06, "loss": 0.2135, "step": 970 }, { "epoch": 4.414414414414415, "grad_norm": 1.0085179805755615, "learning_rate": 8.240781136063346e-06, "loss": 0.1831, "step": 980 }, { "epoch": 4.45945945945946, "grad_norm": 0.7446288466453552, "learning_rate": 7.03624989172228e-06, "loss": 0.198, "step": 990 }, { "epoch": 4.504504504504505, "grad_norm": 0.8291650414466858, "learning_rate": 5.9236463935389065e-06, "loss": 0.2189, "step": 1000 }, { "epoch": 4.504504504504505, "eval_loss": 0.9809222221374512, "eval_runtime": 10.6739, "eval_samples_per_second": 35.039, "eval_steps_per_second": 4.403, "step": 1000 }, { "epoch": 4.54954954954955, "grad_norm": 1.1298563480377197, "learning_rate": 4.904070845967468e-06, "loss": 0.1889, "step": 1010 }, { "epoch": 4.594594594594595, "grad_norm": 1.0232703685760498, "learning_rate": 3.9785314622310495e-06, "loss": 0.1891, "step": 1020 }, { "epoch": 4.63963963963964, "grad_norm": 1.2712104320526123, "learning_rate": 3.1479434673440167e-06, "loss": 0.1879, "step": 1030 }, { "epoch": 4.684684684684685, "grad_norm": 1.564489722251892, "learning_rate": 2.4131281930864002e-06, "loss": 0.1972, "step": 1040 }, { "epoch": 4.72972972972973, "grad_norm": 1.4100459814071655, "learning_rate": 1.7748122658251876e-06, "loss": 0.201, "step": 1050 }, { "epoch": 4.774774774774775, "grad_norm": 1.3149417638778687, "learning_rate": 1.2336268879856727e-06, "loss": 0.1876, "step": 1060 }, { "epoch": 4.81981981981982, "grad_norm": 0.8505904674530029, "learning_rate": 7.901072138831511e-07, "loss": 0.1722, "step": 1070 }, { "epoch": 4.864864864864865, "grad_norm": 2.1957037448883057, "learning_rate": 4.44691820532539e-07, "loss": 0.1917, "step": 1080 }, { "epoch": 4.90990990990991, "grad_norm": 1.9867583513259888, "learning_rate": 1.977222739588891e-07, "loss": 0.2082, "step": 1090 }, { "epoch": 4.954954954954955, "grad_norm": 1.539480447769165, "learning_rate": 4.9442791437848136e-08, "loss": 0.2052, "step": 1100 }, { "epoch": 5.0, "grad_norm": 0.6789027452468872, "learning_rate": 0.0, "loss": 0.197, "step": 1110 }, { "epoch": 5.0, "step": 1110, "total_flos": 1.01086802968209e+18, "train_loss": 0.36593411194311604, "train_runtime": 2755.6936, "train_samples_per_second": 12.872, "train_steps_per_second": 0.403 } ], "logging_steps": 10, "max_steps": 1110, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.01086802968209e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }