| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9963379544860058, | |
| "eval_steps": 500, | |
| "global_step": 954, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.020925974365681402, | |
| "grad_norm": 0.5565130669449413, | |
| "learning_rate": 2.0833333333333334e-06, | |
| "loss": 0.0288, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.041851948731362804, | |
| "grad_norm": 0.18606951178093903, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 0.0118, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0627779230970442, | |
| "grad_norm": 0.03950245765434584, | |
| "learning_rate": 6.25e-06, | |
| "loss": 0.0039, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.08370389746272561, | |
| "grad_norm": 0.03819057800914349, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.0029, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10462987182840701, | |
| "grad_norm": 0.0544544775809417, | |
| "learning_rate": 1.0416666666666668e-05, | |
| "loss": 0.0028, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1255558461940884, | |
| "grad_norm": 0.017750069891728056, | |
| "learning_rate": 1.25e-05, | |
| "loss": 0.0024, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14648182055976983, | |
| "grad_norm": 0.025688827192495198, | |
| "learning_rate": 1.4583333333333333e-05, | |
| "loss": 0.0021, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.16740779492545121, | |
| "grad_norm": 0.009892162090487124, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.0013, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.18833376929113263, | |
| "grad_norm": 0.013992833360284824, | |
| "learning_rate": 1.8750000000000002e-05, | |
| "loss": 0.0006, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.20925974365681402, | |
| "grad_norm": 0.009570194364057852, | |
| "learning_rate": 1.9998927475076107e-05, | |
| "loss": 0.0029, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.23018571802249543, | |
| "grad_norm": 0.011885486768552911, | |
| "learning_rate": 1.998686421164407e-05, | |
| "loss": 0.0019, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2511116923881768, | |
| "grad_norm": 0.00535124159298834, | |
| "learning_rate": 1.9961413253717214e-05, | |
| "loss": 0.0007, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2720376667538582, | |
| "grad_norm": 0.014087761849564343, | |
| "learning_rate": 1.9922608719076874e-05, | |
| "loss": 0.002, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.29296364111953965, | |
| "grad_norm": 0.014243222134916782, | |
| "learning_rate": 1.9870502626379127e-05, | |
| "loss": 0.0011, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.31388961548522104, | |
| "grad_norm": 0.02081790679071402, | |
| "learning_rate": 1.980516482542224e-05, | |
| "loss": 0.0019, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.33481558985090243, | |
| "grad_norm": 0.017403596614001808, | |
| "learning_rate": 1.972668290351084e-05, | |
| "loss": 0.0022, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3557415642165838, | |
| "grad_norm": 0.014181950638291016, | |
| "learning_rate": 1.9635162068042547e-05, | |
| "loss": 0.0013, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.37666753858226526, | |
| "grad_norm": 0.004266630945852344, | |
| "learning_rate": 1.9530725005474195e-05, | |
| "loss": 0.0016, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.39759351294794665, | |
| "grad_norm": 0.003584919486125673, | |
| "learning_rate": 1.9413511716856973e-05, | |
| "loss": 0.0017, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.41851948731362804, | |
| "grad_norm": 0.010407187744097775, | |
| "learning_rate": 1.9283679330160726e-05, | |
| "loss": 0.0006, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4394454616793094, | |
| "grad_norm": 0.011994286058888401, | |
| "learning_rate": 1.9141401889639167e-05, | |
| "loss": 0.004, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.46037143604499087, | |
| "grad_norm": 0.002559284420598525, | |
| "learning_rate": 1.898687012251826e-05, | |
| "loss": 0.0014, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.48129741041067226, | |
| "grad_norm": 0.01635207542112704, | |
| "learning_rate": 1.8820291183320602e-05, | |
| "loss": 0.0035, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5022233847763536, | |
| "grad_norm": 0.019845090979002694, | |
| "learning_rate": 1.8641888376168483e-05, | |
| "loss": 0.0004, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5231493591420351, | |
| "grad_norm": 0.037240037127226344, | |
| "learning_rate": 1.845190085543795e-05, | |
| "loss": 0.0032, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5440753335077164, | |
| "grad_norm": 0.006653507161649646, | |
| "learning_rate": 1.8250583305165098e-05, | |
| "loss": 0.0009, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5650013078733979, | |
| "grad_norm": 0.6020150741013875, | |
| "learning_rate": 1.8038205597634392e-05, | |
| "loss": 0.0011, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5859272822390793, | |
| "grad_norm": 0.05961677548525192, | |
| "learning_rate": 1.7815052431606702e-05, | |
| "loss": 0.0042, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6068532566047606, | |
| "grad_norm": 0.03667937697835753, | |
| "learning_rate": 1.7581422950671942e-05, | |
| "loss": 0.0029, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6277792309704421, | |
| "grad_norm": 0.02203193747208106, | |
| "learning_rate": 1.733763034223804e-05, | |
| "loss": 0.0017, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6487052053361234, | |
| "grad_norm": 0.03047813653712161, | |
| "learning_rate": 1.7084001417693702e-05, | |
| "loss": 0.0013, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6696311797018049, | |
| "grad_norm": 0.02403172913701344, | |
| "learning_rate": 1.682087617430782e-05, | |
| "loss": 0.001, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6905571540674863, | |
| "grad_norm": 0.020885112433327038, | |
| "learning_rate": 1.6548607339452853e-05, | |
| "loss": 0.0009, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7114831284331676, | |
| "grad_norm": 0.02143325180735118, | |
| "learning_rate": 1.626755989776303e-05, | |
| "loss": 0.0007, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7324091027988491, | |
| "grad_norm": 0.028502523632724423, | |
| "learning_rate": 1.5978110601861408e-05, | |
| "loss": 0.0016, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7533350771645305, | |
| "grad_norm": 0.00832352415812105, | |
| "learning_rate": 1.568064746731156e-05, | |
| "loss": 0.0009, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7742610515302119, | |
| "grad_norm": 0.009527918206519135, | |
| "learning_rate": 1.5375569252470897e-05, | |
| "loss": 0.0018, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.7951870258958933, | |
| "grad_norm": 0.025970668054977514, | |
| "learning_rate": 1.506328492394303e-05, | |
| "loss": 0.0006, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8161130002615746, | |
| "grad_norm": 0.026371283237352123, | |
| "learning_rate": 1.4744213108345605e-05, | |
| "loss": 0.0023, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8370389746272561, | |
| "grad_norm": 0.030148449665476192, | |
| "learning_rate": 1.4418781531128636e-05, | |
| "loss": 0.0021, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8579649489929375, | |
| "grad_norm": 0.009123367946997817, | |
| "learning_rate": 1.4087426443195549e-05, | |
| "loss": 0.0009, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8788909233586188, | |
| "grad_norm": 0.033901495227343266, | |
| "learning_rate": 1.375059203609562e-05, | |
| "loss": 0.0023, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8998168977243003, | |
| "grad_norm": 0.007725818014371622, | |
| "learning_rate": 1.3408729846571716e-05, | |
| "loss": 0.0006, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9207428720899817, | |
| "grad_norm": 0.01076304174801131, | |
| "learning_rate": 1.3062298151261592e-05, | |
| "loss": 0.0009, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9416688464556631, | |
| "grad_norm": 0.08792335870716929, | |
| "learning_rate": 1.2711761352364172e-05, | |
| "loss": 0.0016, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9625948208213445, | |
| "grad_norm": 0.019193508970073957, | |
| "learning_rate": 1.2357589355094275e-05, | |
| "loss": 0.0005, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9835207951870258, | |
| "grad_norm": 0.008676099200769616, | |
| "learning_rate": 1.2000256937760446e-05, | |
| "loss": 0.0019, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.0044467695527073, | |
| "grad_norm": 0.0014044382775048332, | |
| "learning_rate": 1.1640243115310219e-05, | |
| "loss": 0.0013, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.0253727439183886, | |
| "grad_norm": 0.0047799061066043395, | |
| "learning_rate": 1.127803049719605e-05, | |
| "loss": 0.0008, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.0462987182840702, | |
| "grad_norm": 0.006651240484213586, | |
| "learning_rate": 1.091410464042268e-05, | |
| "loss": 0.0004, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0672246926497515, | |
| "grad_norm": 0.007370492651937019, | |
| "learning_rate": 1.0548953398643276e-05, | |
| "loss": 0.0002, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0881506670154328, | |
| "grad_norm": 0.004999091099852188, | |
| "learning_rate": 1.0183066268176775e-05, | |
| "loss": 0.0013, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.1090766413811144, | |
| "grad_norm": 0.0067648135423557494, | |
| "learning_rate": 9.81693373182323e-06, | |
| "loss": 0.0004, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.1300026157467957, | |
| "grad_norm": 0.006355010008632378, | |
| "learning_rate": 9.451046601356725e-06, | |
| "loss": 0.0005, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.150928590112477, | |
| "grad_norm": 0.0019993701626922662, | |
| "learning_rate": 9.085895359577324e-06, | |
| "loss": 0.0012, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1718545644781586, | |
| "grad_norm": 0.07841604107285226, | |
| "learning_rate": 8.721969502803954e-06, | |
| "loss": 0.0007, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.19278053884384, | |
| "grad_norm": 0.008878084655677647, | |
| "learning_rate": 8.359756884689785e-06, | |
| "loss": 0.0006, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.2137065132095213, | |
| "grad_norm": 0.022433859996456522, | |
| "learning_rate": 7.999743062239557e-06, | |
| "loss": 0.001, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.2346324875752028, | |
| "grad_norm": 0.007321094493119974, | |
| "learning_rate": 7.642410644905726e-06, | |
| "loss": 0.0002, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.2555584619408842, | |
| "grad_norm": 0.004130666592411099, | |
| "learning_rate": 7.2882386476358304e-06, | |
| "loss": 0.0008, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2764844363065655, | |
| "grad_norm": 0.009136384794392362, | |
| "learning_rate": 6.937701848738407e-06, | |
| "loss": 0.0005, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.297410410672247, | |
| "grad_norm": 0.014936959834984399, | |
| "learning_rate": 6.591270153428288e-06, | |
| "loss": 0.0015, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.3183363850379284, | |
| "grad_norm": 0.00995968283183629, | |
| "learning_rate": 6.249407963904381e-06, | |
| "loss": 0.0003, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.3392623594036097, | |
| "grad_norm": 0.007922264005744792, | |
| "learning_rate": 5.912573556804453e-06, | |
| "loss": 0.002, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.3601883337692913, | |
| "grad_norm": 0.009732474218704226, | |
| "learning_rate": 5.581218468871365e-06, | |
| "loss": 0.0015, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.3811143081349726, | |
| "grad_norm": 0.029322482465715245, | |
| "learning_rate": 5.2557868916543996e-06, | |
| "loss": 0.0004, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.402040282500654, | |
| "grad_norm": 0.002219147933917833, | |
| "learning_rate": 4.9367150760569746e-06, | |
| "loss": 0.0013, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.4229662568663353, | |
| "grad_norm": 0.016601760824032873, | |
| "learning_rate": 4.6244307475291025e-06, | |
| "loss": 0.0007, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.4438922312320168, | |
| "grad_norm": 0.03618373662395389, | |
| "learning_rate": 4.319352532688444e-06, | |
| "loss": 0.0016, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.4648182055976982, | |
| "grad_norm": 0.008705175714719602, | |
| "learning_rate": 4.0218893981385935e-06, | |
| "loss": 0.0009, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.4857441799633795, | |
| "grad_norm": 0.0029313903115019873, | |
| "learning_rate": 3.732440102236975e-06, | |
| "loss": 0.0002, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.5066701543290608, | |
| "grad_norm": 0.020710402409599028, | |
| "learning_rate": 3.4513926605471504e-06, | |
| "loss": 0.0007, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.5275961286947424, | |
| "grad_norm": 0.03507089834475249, | |
| "learning_rate": 3.1791238256921785e-06, | |
| "loss": 0.0021, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.5485221030604237, | |
| "grad_norm": 0.07040984119700786, | |
| "learning_rate": 2.9159985823062997e-06, | |
| "loss": 0.0012, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.569448077426105, | |
| "grad_norm": 0.0046045007369338465, | |
| "learning_rate": 2.662369657761963e-06, | |
| "loss": 0.0005, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.5903740517917866, | |
| "grad_norm": 0.044927829447691095, | |
| "learning_rate": 2.418577049328058e-06, | |
| "loss": 0.001, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.611300026157468, | |
| "grad_norm": 0.005367586480852439, | |
| "learning_rate": 2.1849475683932996e-06, | |
| "loss": 0.0005, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.6322260005231493, | |
| "grad_norm": 0.014755038810438978, | |
| "learning_rate": 1.961794402365611e-06, | |
| "loss": 0.0012, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.6531519748888308, | |
| "grad_norm": 0.007586271120637286, | |
| "learning_rate": 1.7494166948349057e-06, | |
| "loss": 0.0023, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.6740779492545121, | |
| "grad_norm": 0.01450560934843513, | |
| "learning_rate": 1.5480991445620541e-06, | |
| "loss": 0.001, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.6950039236201935, | |
| "grad_norm": 0.007197310242295822, | |
| "learning_rate": 1.3581116238315194e-06, | |
| "loss": 0.0004, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.715929897985875, | |
| "grad_norm": 0.011429665649141593, | |
| "learning_rate": 1.1797088166794002e-06, | |
| "loss": 0.0009, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.7368558723515564, | |
| "grad_norm": 0.005479589436376736, | |
| "learning_rate": 1.013129877481741e-06, | |
| "loss": 0.0015, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.7577818467172377, | |
| "grad_norm": 0.010284806366489312, | |
| "learning_rate": 8.585981103608343e-07, | |
| "loss": 0.0003, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.7787078210829192, | |
| "grad_norm": 0.004805391506128972, | |
| "learning_rate": 7.163206698392744e-07, | |
| "loss": 0.0007, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.7996337954486006, | |
| "grad_norm": 0.00832006380541767, | |
| "learning_rate": 5.864882831430274e-07, | |
| "loss": 0.0013, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.820559769814282, | |
| "grad_norm": 0.0016488023765299498, | |
| "learning_rate": 4.6927499452580574e-07, | |
| "loss": 0.0004, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.8414857441799635, | |
| "grad_norm": 0.010828047107950923, | |
| "learning_rate": 3.6483793195745686e-07, | |
| "loss": 0.002, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.8624117185456448, | |
| "grad_norm": 0.00029272379323776047, | |
| "learning_rate": 2.733170964891607e-07, | |
| "loss": 0.0008, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.8833376929113261, | |
| "grad_norm": 0.011245028659871065, | |
| "learning_rate": 1.9483517457776436e-07, | |
| "loss": 0.0013, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.9042636672770077, | |
| "grad_norm": 0.005356139523384608, | |
| "learning_rate": 1.2949737362087156e-07, | |
| "loss": 0.0009, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.925189641642689, | |
| "grad_norm": 0.006490951580098314, | |
| "learning_rate": 7.73912809231292e-08, | |
| "loss": 0.0005, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.9461156160083704, | |
| "grad_norm": 0.003383501070683301, | |
| "learning_rate": 3.858674628278825e-08, | |
| "loss": 0.003, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.967041590374052, | |
| "grad_norm": 0.012526402652665877, | |
| "learning_rate": 1.3135788355934652e-08, | |
| "loss": 0.0004, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.9879675647397332, | |
| "grad_norm": 0.016656680718419487, | |
| "learning_rate": 1.0725249238940916e-09, | |
| "loss": 0.0006, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.9963379544860058, | |
| "step": 954, | |
| "total_flos": 841298254757888.0, | |
| "train_loss": 0.001730952451793395, | |
| "train_runtime": 49706.0079, | |
| "train_samples_per_second": 2.461, | |
| "train_steps_per_second": 0.019 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 954, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 841298254757888.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |