{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9963379544860058, "eval_steps": 500, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020925974365681402, "grad_norm": 0.5565130669449413, "learning_rate": 2.0833333333333334e-06, "loss": 0.0288, "step": 10 }, { "epoch": 0.041851948731362804, "grad_norm": 0.18606951178093903, "learning_rate": 4.166666666666667e-06, "loss": 0.0118, "step": 20 }, { "epoch": 0.0627779230970442, "grad_norm": 0.03950245765434584, "learning_rate": 6.25e-06, "loss": 0.0039, "step": 30 }, { "epoch": 0.08370389746272561, "grad_norm": 0.03819057800914349, "learning_rate": 8.333333333333334e-06, "loss": 0.0029, "step": 40 }, { "epoch": 0.10462987182840701, "grad_norm": 0.0544544775809417, "learning_rate": 1.0416666666666668e-05, "loss": 0.0028, "step": 50 }, { "epoch": 0.1255558461940884, "grad_norm": 0.017750069891728056, "learning_rate": 1.25e-05, "loss": 0.0024, "step": 60 }, { "epoch": 0.14648182055976983, "grad_norm": 0.025688827192495198, "learning_rate": 1.4583333333333333e-05, "loss": 0.0021, "step": 70 }, { "epoch": 0.16740779492545121, "grad_norm": 0.009892162090487124, "learning_rate": 1.6666666666666667e-05, "loss": 0.0013, "step": 80 }, { "epoch": 0.18833376929113263, "grad_norm": 0.013992833360284824, "learning_rate": 1.8750000000000002e-05, "loss": 0.0006, "step": 90 }, { "epoch": 0.20925974365681402, "grad_norm": 0.009570194364057852, "learning_rate": 1.9998927475076107e-05, "loss": 0.0029, "step": 100 }, { "epoch": 0.23018571802249543, "grad_norm": 0.011885486768552911, "learning_rate": 1.998686421164407e-05, "loss": 0.0019, "step": 110 }, { "epoch": 0.2511116923881768, "grad_norm": 0.00535124159298834, "learning_rate": 1.9961413253717214e-05, "loss": 0.0007, "step": 120 }, { "epoch": 0.2720376667538582, "grad_norm": 0.014087761849564343, "learning_rate": 1.9922608719076874e-05, "loss": 0.002, "step": 130 }, { "epoch": 0.29296364111953965, "grad_norm": 0.014243222134916782, "learning_rate": 1.9870502626379127e-05, "loss": 0.0011, "step": 140 }, { "epoch": 0.31388961548522104, "grad_norm": 0.02081790679071402, "learning_rate": 1.980516482542224e-05, "loss": 0.0019, "step": 150 }, { "epoch": 0.33481558985090243, "grad_norm": 0.017403596614001808, "learning_rate": 1.972668290351084e-05, "loss": 0.0022, "step": 160 }, { "epoch": 0.3557415642165838, "grad_norm": 0.014181950638291016, "learning_rate": 1.9635162068042547e-05, "loss": 0.0013, "step": 170 }, { "epoch": 0.37666753858226526, "grad_norm": 0.004266630945852344, "learning_rate": 1.9530725005474195e-05, "loss": 0.0016, "step": 180 }, { "epoch": 0.39759351294794665, "grad_norm": 0.003584919486125673, "learning_rate": 1.9413511716856973e-05, "loss": 0.0017, "step": 190 }, { "epoch": 0.41851948731362804, "grad_norm": 0.010407187744097775, "learning_rate": 1.9283679330160726e-05, "loss": 0.0006, "step": 200 }, { "epoch": 0.4394454616793094, "grad_norm": 0.011994286058888401, "learning_rate": 1.9141401889639167e-05, "loss": 0.004, "step": 210 }, { "epoch": 0.46037143604499087, "grad_norm": 0.002559284420598525, "learning_rate": 1.898687012251826e-05, "loss": 0.0014, "step": 220 }, { "epoch": 0.48129741041067226, "grad_norm": 0.01635207542112704, "learning_rate": 1.8820291183320602e-05, "loss": 0.0035, "step": 230 }, { "epoch": 0.5022233847763536, "grad_norm": 0.019845090979002694, "learning_rate": 1.8641888376168483e-05, "loss": 0.0004, "step": 240 }, { "epoch": 0.5231493591420351, "grad_norm": 0.037240037127226344, "learning_rate": 1.845190085543795e-05, "loss": 0.0032, "step": 250 }, { "epoch": 0.5440753335077164, "grad_norm": 0.006653507161649646, "learning_rate": 1.8250583305165098e-05, "loss": 0.0009, "step": 260 }, { "epoch": 0.5650013078733979, "grad_norm": 0.6020150741013875, "learning_rate": 1.8038205597634392e-05, "loss": 0.0011, "step": 270 }, { "epoch": 0.5859272822390793, "grad_norm": 0.05961677548525192, "learning_rate": 1.7815052431606702e-05, "loss": 0.0042, "step": 280 }, { "epoch": 0.6068532566047606, "grad_norm": 0.03667937697835753, "learning_rate": 1.7581422950671942e-05, "loss": 0.0029, "step": 290 }, { "epoch": 0.6277792309704421, "grad_norm": 0.02203193747208106, "learning_rate": 1.733763034223804e-05, "loss": 0.0017, "step": 300 }, { "epoch": 0.6487052053361234, "grad_norm": 0.03047813653712161, "learning_rate": 1.7084001417693702e-05, "loss": 0.0013, "step": 310 }, { "epoch": 0.6696311797018049, "grad_norm": 0.02403172913701344, "learning_rate": 1.682087617430782e-05, "loss": 0.001, "step": 320 }, { "epoch": 0.6905571540674863, "grad_norm": 0.020885112433327038, "learning_rate": 1.6548607339452853e-05, "loss": 0.0009, "step": 330 }, { "epoch": 0.7114831284331676, "grad_norm": 0.02143325180735118, "learning_rate": 1.626755989776303e-05, "loss": 0.0007, "step": 340 }, { "epoch": 0.7324091027988491, "grad_norm": 0.028502523632724423, "learning_rate": 1.5978110601861408e-05, "loss": 0.0016, "step": 350 }, { "epoch": 0.7533350771645305, "grad_norm": 0.00832352415812105, "learning_rate": 1.568064746731156e-05, "loss": 0.0009, "step": 360 }, { "epoch": 0.7742610515302119, "grad_norm": 0.009527918206519135, "learning_rate": 1.5375569252470897e-05, "loss": 0.0018, "step": 370 }, { "epoch": 0.7951870258958933, "grad_norm": 0.025970668054977514, "learning_rate": 1.506328492394303e-05, "loss": 0.0006, "step": 380 }, { "epoch": 0.8161130002615746, "grad_norm": 0.026371283237352123, "learning_rate": 1.4744213108345605e-05, "loss": 0.0023, "step": 390 }, { "epoch": 0.8370389746272561, "grad_norm": 0.030148449665476192, "learning_rate": 1.4418781531128636e-05, "loss": 0.0021, "step": 400 }, { "epoch": 0.8579649489929375, "grad_norm": 0.009123367946997817, "learning_rate": 1.4087426443195549e-05, "loss": 0.0009, "step": 410 }, { "epoch": 0.8788909233586188, "grad_norm": 0.033901495227343266, "learning_rate": 1.375059203609562e-05, "loss": 0.0023, "step": 420 }, { "epoch": 0.8998168977243003, "grad_norm": 0.007725818014371622, "learning_rate": 1.3408729846571716e-05, "loss": 0.0006, "step": 430 }, { "epoch": 0.9207428720899817, "grad_norm": 0.01076304174801131, "learning_rate": 1.3062298151261592e-05, "loss": 0.0009, "step": 440 }, { "epoch": 0.9416688464556631, "grad_norm": 0.08792335870716929, "learning_rate": 1.2711761352364172e-05, "loss": 0.0016, "step": 450 }, { "epoch": 0.9625948208213445, "grad_norm": 0.019193508970073957, "learning_rate": 1.2357589355094275e-05, "loss": 0.0005, "step": 460 }, { "epoch": 0.9835207951870258, "grad_norm": 0.008676099200769616, "learning_rate": 1.2000256937760446e-05, "loss": 0.0019, "step": 470 }, { "epoch": 1.0044467695527073, "grad_norm": 0.0014044382775048332, "learning_rate": 1.1640243115310219e-05, "loss": 0.0013, "step": 480 }, { "epoch": 1.0253727439183886, "grad_norm": 0.0047799061066043395, "learning_rate": 1.127803049719605e-05, "loss": 0.0008, "step": 490 }, { "epoch": 1.0462987182840702, "grad_norm": 0.006651240484213586, "learning_rate": 1.091410464042268e-05, "loss": 0.0004, "step": 500 }, { "epoch": 1.0672246926497515, "grad_norm": 0.007370492651937019, "learning_rate": 1.0548953398643276e-05, "loss": 0.0002, "step": 510 }, { "epoch": 1.0881506670154328, "grad_norm": 0.004999091099852188, "learning_rate": 1.0183066268176775e-05, "loss": 0.0013, "step": 520 }, { "epoch": 1.1090766413811144, "grad_norm": 0.0067648135423557494, "learning_rate": 9.81693373182323e-06, "loss": 0.0004, "step": 530 }, { "epoch": 1.1300026157467957, "grad_norm": 0.006355010008632378, "learning_rate": 9.451046601356725e-06, "loss": 0.0005, "step": 540 }, { "epoch": 1.150928590112477, "grad_norm": 0.0019993701626922662, "learning_rate": 9.085895359577324e-06, "loss": 0.0012, "step": 550 }, { "epoch": 1.1718545644781586, "grad_norm": 0.07841604107285226, "learning_rate": 8.721969502803954e-06, "loss": 0.0007, "step": 560 }, { "epoch": 1.19278053884384, "grad_norm": 0.008878084655677647, "learning_rate": 8.359756884689785e-06, "loss": 0.0006, "step": 570 }, { "epoch": 1.2137065132095213, "grad_norm": 0.022433859996456522, "learning_rate": 7.999743062239557e-06, "loss": 0.001, "step": 580 }, { "epoch": 1.2346324875752028, "grad_norm": 0.007321094493119974, "learning_rate": 7.642410644905726e-06, "loss": 0.0002, "step": 590 }, { "epoch": 1.2555584619408842, "grad_norm": 0.004130666592411099, "learning_rate": 7.2882386476358304e-06, "loss": 0.0008, "step": 600 }, { "epoch": 1.2764844363065655, "grad_norm": 0.009136384794392362, "learning_rate": 6.937701848738407e-06, "loss": 0.0005, "step": 610 }, { "epoch": 1.297410410672247, "grad_norm": 0.014936959834984399, "learning_rate": 6.591270153428288e-06, "loss": 0.0015, "step": 620 }, { "epoch": 1.3183363850379284, "grad_norm": 0.00995968283183629, "learning_rate": 6.249407963904381e-06, "loss": 0.0003, "step": 630 }, { "epoch": 1.3392623594036097, "grad_norm": 0.007922264005744792, "learning_rate": 5.912573556804453e-06, "loss": 0.002, "step": 640 }, { "epoch": 1.3601883337692913, "grad_norm": 0.009732474218704226, "learning_rate": 5.581218468871365e-06, "loss": 0.0015, "step": 650 }, { "epoch": 1.3811143081349726, "grad_norm": 0.029322482465715245, "learning_rate": 5.2557868916543996e-06, "loss": 0.0004, "step": 660 }, { "epoch": 1.402040282500654, "grad_norm": 0.002219147933917833, "learning_rate": 4.9367150760569746e-06, "loss": 0.0013, "step": 670 }, { "epoch": 1.4229662568663353, "grad_norm": 0.016601760824032873, "learning_rate": 4.6244307475291025e-06, "loss": 0.0007, "step": 680 }, { "epoch": 1.4438922312320168, "grad_norm": 0.03618373662395389, "learning_rate": 4.319352532688444e-06, "loss": 0.0016, "step": 690 }, { "epoch": 1.4648182055976982, "grad_norm": 0.008705175714719602, "learning_rate": 4.0218893981385935e-06, "loss": 0.0009, "step": 700 }, { "epoch": 1.4857441799633795, "grad_norm": 0.0029313903115019873, "learning_rate": 3.732440102236975e-06, "loss": 0.0002, "step": 710 }, { "epoch": 1.5066701543290608, "grad_norm": 0.020710402409599028, "learning_rate": 3.4513926605471504e-06, "loss": 0.0007, "step": 720 }, { "epoch": 1.5275961286947424, "grad_norm": 0.03507089834475249, "learning_rate": 3.1791238256921785e-06, "loss": 0.0021, "step": 730 }, { "epoch": 1.5485221030604237, "grad_norm": 0.07040984119700786, "learning_rate": 2.9159985823062997e-06, "loss": 0.0012, "step": 740 }, { "epoch": 1.569448077426105, "grad_norm": 0.0046045007369338465, "learning_rate": 2.662369657761963e-06, "loss": 0.0005, "step": 750 }, { "epoch": 1.5903740517917866, "grad_norm": 0.044927829447691095, "learning_rate": 2.418577049328058e-06, "loss": 0.001, "step": 760 }, { "epoch": 1.611300026157468, "grad_norm": 0.005367586480852439, "learning_rate": 2.1849475683932996e-06, "loss": 0.0005, "step": 770 }, { "epoch": 1.6322260005231493, "grad_norm": 0.014755038810438978, "learning_rate": 1.961794402365611e-06, "loss": 0.0012, "step": 780 }, { "epoch": 1.6531519748888308, "grad_norm": 0.007586271120637286, "learning_rate": 1.7494166948349057e-06, "loss": 0.0023, "step": 790 }, { "epoch": 1.6740779492545121, "grad_norm": 0.01450560934843513, "learning_rate": 1.5480991445620541e-06, "loss": 0.001, "step": 800 }, { "epoch": 1.6950039236201935, "grad_norm": 0.007197310242295822, "learning_rate": 1.3581116238315194e-06, "loss": 0.0004, "step": 810 }, { "epoch": 1.715929897985875, "grad_norm": 0.011429665649141593, "learning_rate": 1.1797088166794002e-06, "loss": 0.0009, "step": 820 }, { "epoch": 1.7368558723515564, "grad_norm": 0.005479589436376736, "learning_rate": 1.013129877481741e-06, "loss": 0.0015, "step": 830 }, { "epoch": 1.7577818467172377, "grad_norm": 0.010284806366489312, "learning_rate": 8.585981103608343e-07, "loss": 0.0003, "step": 840 }, { "epoch": 1.7787078210829192, "grad_norm": 0.004805391506128972, "learning_rate": 7.163206698392744e-07, "loss": 0.0007, "step": 850 }, { "epoch": 1.7996337954486006, "grad_norm": 0.00832006380541767, "learning_rate": 5.864882831430274e-07, "loss": 0.0013, "step": 860 }, { "epoch": 1.820559769814282, "grad_norm": 0.0016488023765299498, "learning_rate": 4.6927499452580574e-07, "loss": 0.0004, "step": 870 }, { "epoch": 1.8414857441799635, "grad_norm": 0.010828047107950923, "learning_rate": 3.6483793195745686e-07, "loss": 0.002, "step": 880 }, { "epoch": 1.8624117185456448, "grad_norm": 0.00029272379323776047, "learning_rate": 2.733170964891607e-07, "loss": 0.0008, "step": 890 }, { "epoch": 1.8833376929113261, "grad_norm": 0.011245028659871065, "learning_rate": 1.9483517457776436e-07, "loss": 0.0013, "step": 900 }, { "epoch": 1.9042636672770077, "grad_norm": 0.005356139523384608, "learning_rate": 1.2949737362087156e-07, "loss": 0.0009, "step": 910 }, { "epoch": 1.925189641642689, "grad_norm": 0.006490951580098314, "learning_rate": 7.73912809231292e-08, "loss": 0.0005, "step": 920 }, { "epoch": 1.9461156160083704, "grad_norm": 0.003383501070683301, "learning_rate": 3.858674628278825e-08, "loss": 0.003, "step": 930 }, { "epoch": 1.967041590374052, "grad_norm": 0.012526402652665877, "learning_rate": 1.3135788355934652e-08, "loss": 0.0004, "step": 940 }, { "epoch": 1.9879675647397332, "grad_norm": 0.016656680718419487, "learning_rate": 1.0725249238940916e-09, "loss": 0.0006, "step": 950 }, { "epoch": 1.9963379544860058, "step": 954, "total_flos": 841298254757888.0, "train_loss": 0.001730952451793395, "train_runtime": 49706.0079, "train_samples_per_second": 2.461, "train_steps_per_second": 0.019 } ], "logging_steps": 10, "max_steps": 954, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 841298254757888.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }