{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.16, "eval_steps": 1000, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008, "grad_norm": 0.0, "learning_rate": 0.0005, "loss": 9.9524, "step": 1 }, { "epoch": 0.0016, "grad_norm": 1.437297100892465e-06, "learning_rate": 0.0005, "loss": 9.9048, "step": 2 }, { "epoch": 0.0024, "grad_norm": 1.1676747798919678, "learning_rate": 0.0005, "loss": 9.8558, "step": 3 }, { "epoch": 0.0032, "grad_norm": 11.5746431350708, "learning_rate": 0.0005, "loss": 9.5539, "step": 4 }, { "epoch": 0.004, "grad_norm": 3.968721628189087, "learning_rate": 0.0005, "loss": 9.2858, "step": 5 }, { "epoch": 0.0048, "grad_norm": 6.475513935089111, "learning_rate": 0.0005, "loss": 8.3592, "step": 6 }, { "epoch": 0.0056, "grad_norm": 11.020626068115234, "learning_rate": 0.0005, "loss": 7.1505, "step": 7 }, { "epoch": 0.0064, "grad_norm": 16.665830612182617, "learning_rate": 0.0005, "loss": 3.6912, "step": 8 }, { "epoch": 0.0072, "grad_norm": 5.093105316162109, "learning_rate": 0.0005, "loss": 1.7986, "step": 9 }, { "epoch": 0.008, "grad_norm": 5.546494007110596, "learning_rate": 0.0005, "loss": 2.6467, "step": 10 }, { "epoch": 0.0088, "grad_norm": 20.05790138244629, "learning_rate": 0.0005, "loss": 0.9881, "step": 11 }, { "epoch": 0.0096, "grad_norm": 5.210952281951904, "learning_rate": 0.0005, "loss": 1.7098, "step": 12 }, { "epoch": 0.0104, "grad_norm": 7.218165397644043, "learning_rate": 0.0005, "loss": 1.7179, "step": 13 }, { "epoch": 0.0112, "grad_norm": 3.7357559204101562, "learning_rate": 0.0005, "loss": 0.7633, "step": 14 }, { "epoch": 0.012, "grad_norm": 5.467613220214844, "learning_rate": 0.0005, "loss": 1.2452, "step": 15 }, { "epoch": 0.0128, "grad_norm": 13.217657089233398, "learning_rate": 0.0005, "loss": 2.4923, "step": 16 }, { "epoch": 0.0136, "grad_norm": 1.690468430519104, "learning_rate": 0.0005, "loss": 0.9401, "step": 17 }, { "epoch": 0.0144, "grad_norm": 1.6662808656692505, "learning_rate": 0.0005, "loss": 1.7828, "step": 18 }, { "epoch": 0.0152, "grad_norm": 1.3934170007705688, "learning_rate": 0.0005, "loss": 1.5044, "step": 19 }, { "epoch": 0.016, "grad_norm": 1.0231975317001343, "learning_rate": 0.0005, "loss": 1.2216, "step": 20 }, { "epoch": 0.0168, "grad_norm": 1.9770835638046265, "learning_rate": 0.0005, "loss": 2.5877, "step": 21 }, { "epoch": 0.0176, "grad_norm": 1.619942307472229, "learning_rate": 0.0005, "loss": 1.9136, "step": 22 }, { "epoch": 0.0184, "grad_norm": 3.4306373596191406, "learning_rate": 0.0005, "loss": 1.7088, "step": 23 }, { "epoch": 0.0192, "grad_norm": 0.9691258072853088, "learning_rate": 0.0005, "loss": 1.2861, "step": 24 }, { "epoch": 0.02, "grad_norm": 1.9319524765014648, "learning_rate": 0.0005, "loss": 1.9669, "step": 25 }, { "epoch": 0.0208, "grad_norm": 4.606762409210205, "learning_rate": 0.0005, "loss": 2.1804, "step": 26 }, { "epoch": 0.0216, "grad_norm": 0.6729273200035095, "learning_rate": 0.0005, "loss": 0.5727, "step": 27 }, { "epoch": 0.0224, "grad_norm": 0.9792982339859009, "learning_rate": 0.0005, "loss": 0.7054, "step": 28 }, { "epoch": 0.0232, "grad_norm": 3.6252601146698, "learning_rate": 0.0005, "loss": 2.201, "step": 29 }, { "epoch": 0.024, "grad_norm": 1.40730619430542, "learning_rate": 0.0005, "loss": 1.6824, "step": 30 }, { "epoch": 0.0248, "grad_norm": 10.60210132598877, "learning_rate": 0.0005, "loss": 2.8354, "step": 31 }, { "epoch": 0.0256, "grad_norm": 4.060218811035156, "learning_rate": 0.0005, "loss": 3.0198, "step": 32 }, { "epoch": 0.0264, "grad_norm": 2.5272302627563477, "learning_rate": 0.0005, "loss": 1.8971, "step": 33 }, { "epoch": 0.0272, "grad_norm": 3.3175830841064453, "learning_rate": 0.0005, "loss": 2.2358, "step": 34 }, { "epoch": 0.028, "grad_norm": 13.316032409667969, "learning_rate": 0.0005, "loss": 0.9997, "step": 35 }, { "epoch": 0.0288, "grad_norm": 7.873894691467285, "learning_rate": 0.0005, "loss": 4.109, "step": 36 }, { "epoch": 0.0296, "grad_norm": 12.632922172546387, "learning_rate": 0.0005, "loss": 4.6703, "step": 37 }, { "epoch": 0.0304, "grad_norm": 5.1721110343933105, "learning_rate": 0.0005, "loss": 0.2371, "step": 38 }, { "epoch": 0.0312, "grad_norm": 4.0092854499816895, "learning_rate": 0.0005, "loss": 1.6763, "step": 39 }, { "epoch": 0.032, "grad_norm": 13.548662185668945, "learning_rate": 0.0005, "loss": 3.3977, "step": 40 }, { "epoch": 0.0328, "grad_norm": 10.380196571350098, "learning_rate": 0.0005, "loss": 2.8333, "step": 41 }, { "epoch": 0.0336, "grad_norm": 10.124567031860352, "learning_rate": 0.0005, "loss": 1.8736, "step": 42 }, { "epoch": 0.0344, "grad_norm": 33.22747039794922, "learning_rate": 0.0005, "loss": 4.0325, "step": 43 }, { "epoch": 0.0352, "grad_norm": 23.466447830200195, "learning_rate": 0.0005, "loss": 3.209, "step": 44 }, { "epoch": 0.036, "grad_norm": 97.76129150390625, "learning_rate": 0.0005, "loss": 3.4155, "step": 45 }, { "epoch": 0.0368, "grad_norm": 12.2396879196167, "learning_rate": 0.0005, "loss": 1.5068, "step": 46 }, { "epoch": 0.0376, "grad_norm": 18.19890785217285, "learning_rate": 0.0005, "loss": 1.5872, "step": 47 }, { "epoch": 0.0384, "grad_norm": 33.10652160644531, "learning_rate": 0.0005, "loss": 0.4156, "step": 48 }, { "epoch": 0.0392, "grad_norm": 3.613457202911377, "learning_rate": 0.0005, "loss": 0.8332, "step": 49 }, { "epoch": 0.04, "grad_norm": 23.454383850097656, "learning_rate": 0.0005, "loss": 2.4502, "step": 50 }, { "epoch": 0.0408, "grad_norm": 5.318363189697266, "learning_rate": 0.0005, "loss": 1.3799, "step": 51 }, { "epoch": 0.0416, "grad_norm": 6.815697193145752, "learning_rate": 0.0005, "loss": 2.1557, "step": 52 }, { "epoch": 0.0424, "grad_norm": 11.527610778808594, "learning_rate": 0.0005, "loss": 1.298, "step": 53 }, { "epoch": 0.0432, "grad_norm": 4.768402099609375, "learning_rate": 0.0005, "loss": 1.107, "step": 54 }, { "epoch": 0.044, "grad_norm": 1.4602067470550537, "learning_rate": 0.0005, "loss": 0.3143, "step": 55 }, { "epoch": 0.0448, "grad_norm": 9.193857192993164, "learning_rate": 0.0005, "loss": 1.1428, "step": 56 }, { "epoch": 0.0456, "grad_norm": 11.116622924804688, "learning_rate": 0.0005, "loss": 1.788, "step": 57 }, { "epoch": 0.0464, "grad_norm": 4.629782676696777, "learning_rate": 0.0005, "loss": 1.0826, "step": 58 }, { "epoch": 0.0472, "grad_norm": 12.43929672241211, "learning_rate": 0.0005, "loss": 1.8785, "step": 59 }, { "epoch": 0.048, "grad_norm": 36.3989372253418, "learning_rate": 0.0005, "loss": 2.4311, "step": 60 }, { "epoch": 0.0488, "grad_norm": 22.378089904785156, "learning_rate": 0.0005, "loss": 2.036, "step": 61 }, { "epoch": 0.0496, "grad_norm": 37.49986267089844, "learning_rate": 0.0005, "loss": 2.8407, "step": 62 }, { "epoch": 0.0504, "grad_norm": 27.25046730041504, "learning_rate": 0.0005, "loss": 1.4435, "step": 63 }, { "epoch": 0.0512, "grad_norm": 10.643092155456543, "learning_rate": 0.0005, "loss": 0.8146, "step": 64 }, { "epoch": 0.052, "grad_norm": 25.00066566467285, "learning_rate": 0.0005, "loss": 1.4657, "step": 65 }, { "epoch": 0.0528, "grad_norm": 45.32073974609375, "learning_rate": 0.0005, "loss": 1.1125, "step": 66 }, { "epoch": 0.0536, "grad_norm": 19.11292266845703, "learning_rate": 0.0005, "loss": 1.5612, "step": 67 }, { "epoch": 0.0544, "grad_norm": 16.346227645874023, "learning_rate": 0.0005, "loss": 1.7633, "step": 68 }, { "epoch": 0.0552, "grad_norm": 27.169462203979492, "learning_rate": 0.0005, "loss": 1.3909, "step": 69 }, { "epoch": 0.056, "grad_norm": 6.169379711151123, "learning_rate": 0.0005, "loss": 0.6378, "step": 70 }, { "epoch": 0.0568, "grad_norm": 20.114524841308594, "learning_rate": 0.0005, "loss": 1.0569, "step": 71 }, { "epoch": 0.0576, "grad_norm": 29.240468978881836, "learning_rate": 0.0005, "loss": 1.118, "step": 72 }, { "epoch": 0.0584, "grad_norm": 49.98060989379883, "learning_rate": 0.0005, "loss": 2.3733, "step": 73 }, { "epoch": 0.0592, "grad_norm": 7.8704118728637695, "learning_rate": 0.0005, "loss": 0.9115, "step": 74 }, { "epoch": 0.06, "grad_norm": 14.570856094360352, "learning_rate": 0.0005, "loss": 1.5051, "step": 75 }, { "epoch": 0.0608, "grad_norm": 8.354825973510742, "learning_rate": 0.0005, "loss": 0.6045, "step": 76 }, { "epoch": 0.0616, "grad_norm": 6.444540977478027, "learning_rate": 0.0005, "loss": 1.5749, "step": 77 }, { "epoch": 0.0624, "grad_norm": 20.538169860839844, "learning_rate": 0.0005, "loss": 1.6162, "step": 78 }, { "epoch": 0.0632, "grad_norm": 29.843812942504883, "learning_rate": 0.0005, "loss": 2.057, "step": 79 }, { "epoch": 0.064, "grad_norm": 12.291327476501465, "learning_rate": 0.0005, "loss": 1.8096, "step": 80 }, { "epoch": 0.0648, "grad_norm": 4.892437934875488, "learning_rate": 0.0005, "loss": 0.4261, "step": 81 }, { "epoch": 0.0656, "grad_norm": 8.39865779876709, "learning_rate": 0.0005, "loss": 0.9653, "step": 82 }, { "epoch": 0.0664, "grad_norm": 33.09463882446289, "learning_rate": 0.0005, "loss": 2.1087, "step": 83 }, { "epoch": 0.0672, "grad_norm": 12.01011848449707, "learning_rate": 0.0005, "loss": 0.9802, "step": 84 }, { "epoch": 0.068, "grad_norm": 4.892343044281006, "learning_rate": 0.0005, "loss": 1.0565, "step": 85 }, { "epoch": 0.0688, "grad_norm": 29.355051040649414, "learning_rate": 0.0005, "loss": 3.1704, "step": 86 }, { "epoch": 0.0696, "grad_norm": 12.124067306518555, "learning_rate": 0.0005, "loss": 3.0637, "step": 87 }, { "epoch": 0.0704, "grad_norm": 7.9273762702941895, "learning_rate": 0.0005, "loss": 1.93, "step": 88 }, { "epoch": 0.0712, "grad_norm": 14.156713485717773, "learning_rate": 0.0005, "loss": 2.8417, "step": 89 }, { "epoch": 0.072, "grad_norm": 8.770395278930664, "learning_rate": 0.0005, "loss": 2.9201, "step": 90 }, { "epoch": 0.0728, "grad_norm": 8.601066589355469, "learning_rate": 0.0005, "loss": 2.7391, "step": 91 }, { "epoch": 0.0736, "grad_norm": 11.746129989624023, "learning_rate": 0.0005, "loss": 1.79, "step": 92 }, { "epoch": 0.0744, "grad_norm": 83.34435272216797, "learning_rate": 0.0005, "loss": 3.4365, "step": 93 }, { "epoch": 0.0752, "grad_norm": 52.08599090576172, "learning_rate": 0.0005, "loss": 3.0587, "step": 94 }, { "epoch": 0.076, "grad_norm": 9.474879264831543, "learning_rate": 0.0005, "loss": 1.8807, "step": 95 }, { "epoch": 0.0768, "grad_norm": 4.777618408203125, "learning_rate": 0.0005, "loss": 1.1716, "step": 96 }, { "epoch": 0.0776, "grad_norm": 8.437329292297363, "learning_rate": 0.0005, "loss": 1.897, "step": 97 }, { "epoch": 0.0784, "grad_norm": 9.909850120544434, "learning_rate": 0.0005, "loss": 1.026, "step": 98 }, { "epoch": 0.0792, "grad_norm": 18.70866584777832, "learning_rate": 0.0005, "loss": 2.8539, "step": 99 }, { "epoch": 0.08, "grad_norm": 16.791696548461914, "learning_rate": 0.0005, "loss": 1.5087, "step": 100 }, { "epoch": 0.0808, "grad_norm": 10.638335227966309, "learning_rate": 0.0005, "loss": 1.9842, "step": 101 }, { "epoch": 0.0816, "grad_norm": 8.751673698425293, "learning_rate": 0.0005, "loss": 1.8033, "step": 102 }, { "epoch": 0.0824, "grad_norm": 10.246282577514648, "learning_rate": 0.0005, "loss": 2.2316, "step": 103 }, { "epoch": 0.0832, "grad_norm": 15.839383125305176, "learning_rate": 0.0005, "loss": 1.7817, "step": 104 }, { "epoch": 0.084, "grad_norm": 8.457541465759277, "learning_rate": 0.0005, "loss": 1.7869, "step": 105 }, { "epoch": 0.0848, "grad_norm": 6.870094299316406, "learning_rate": 0.0005, "loss": 1.8344, "step": 106 }, { "epoch": 0.0856, "grad_norm": 10.195223808288574, "learning_rate": 0.0005, "loss": 2.8778, "step": 107 }, { "epoch": 0.0864, "grad_norm": 12.052081108093262, "learning_rate": 0.0005, "loss": 2.1871, "step": 108 }, { "epoch": 0.0872, "grad_norm": 9.184932708740234, "learning_rate": 0.0005, "loss": 1.5251, "step": 109 }, { "epoch": 0.088, "grad_norm": 11.652626991271973, "learning_rate": 0.0005, "loss": 2.5694, "step": 110 }, { "epoch": 0.0888, "grad_norm": 17.4293212890625, "learning_rate": 0.0005, "loss": 2.753, "step": 111 }, { "epoch": 0.0896, "grad_norm": 16.894798278808594, "learning_rate": 0.0005, "loss": 2.4887, "step": 112 }, { "epoch": 0.0904, "grad_norm": 8.354668617248535, "learning_rate": 0.0005, "loss": 1.9092, "step": 113 }, { "epoch": 0.0912, "grad_norm": 7.03963565826416, "learning_rate": 0.0005, "loss": 1.364, "step": 114 }, { "epoch": 0.092, "grad_norm": 9.681304931640625, "learning_rate": 0.0005, "loss": 2.539, "step": 115 }, { "epoch": 0.0928, "grad_norm": 7.012979030609131, "learning_rate": 0.0005, "loss": 3.0951, "step": 116 }, { "epoch": 0.0936, "grad_norm": 6.343447208404541, "learning_rate": 0.0005, "loss": 1.1429, "step": 117 }, { "epoch": 0.0944, "grad_norm": 8.117640495300293, "learning_rate": 0.0005, "loss": 1.6064, "step": 118 }, { "epoch": 0.0952, "grad_norm": 7.953670501708984, "learning_rate": 0.0005, "loss": 2.1902, "step": 119 }, { "epoch": 0.096, "grad_norm": 21.180335998535156, "learning_rate": 0.0005, "loss": 3.1061, "step": 120 }, { "epoch": 0.0968, "grad_norm": 5.133480548858643, "learning_rate": 0.0005, "loss": 0.5414, "step": 121 }, { "epoch": 0.0976, "grad_norm": 5.382786750793457, "learning_rate": 0.0005, "loss": 1.3347, "step": 122 }, { "epoch": 0.0984, "grad_norm": 10.311848640441895, "learning_rate": 0.0005, "loss": 1.9284, "step": 123 }, { "epoch": 0.0992, "grad_norm": 7.540424823760986, "learning_rate": 0.0005, "loss": 1.8246, "step": 124 }, { "epoch": 0.1, "grad_norm": 4.958598613739014, "learning_rate": 0.0005, "loss": 0.954, "step": 125 }, { "epoch": 0.1008, "grad_norm": 5.362274646759033, "learning_rate": 0.0005, "loss": 1.6635, "step": 126 }, { "epoch": 0.1016, "grad_norm": 15.276226997375488, "learning_rate": 0.0005, "loss": 4.6421, "step": 127 }, { "epoch": 0.1024, "grad_norm": 10.148447036743164, "learning_rate": 0.0005, "loss": 1.8489, "step": 128 }, { "epoch": 0.1032, "grad_norm": 10.659564018249512, "learning_rate": 0.0005, "loss": 1.9508, "step": 129 }, { "epoch": 0.104, "grad_norm": 5.988990306854248, "learning_rate": 0.0005, "loss": 1.1783, "step": 130 }, { "epoch": 0.1048, "grad_norm": 7.680914402008057, "learning_rate": 0.0005, "loss": 1.6839, "step": 131 }, { "epoch": 0.1056, "grad_norm": 6.716479301452637, "learning_rate": 0.0005, "loss": 1.6122, "step": 132 }, { "epoch": 0.1064, "grad_norm": 14.102774620056152, "learning_rate": 0.0005, "loss": 2.3155, "step": 133 }, { "epoch": 0.1072, "grad_norm": 9.768166542053223, "learning_rate": 0.0005, "loss": 2.0646, "step": 134 }, { "epoch": 0.108, "grad_norm": 6.7697834968566895, "learning_rate": 0.0005, "loss": 1.1913, "step": 135 }, { "epoch": 0.1088, "grad_norm": 6.375694274902344, "learning_rate": 0.0005, "loss": 2.243, "step": 136 }, { "epoch": 0.1096, "grad_norm": 12.372509002685547, "learning_rate": 0.0005, "loss": 2.4898, "step": 137 }, { "epoch": 0.1104, "grad_norm": 14.37572193145752, "learning_rate": 0.0005, "loss": 3.1866, "step": 138 }, { "epoch": 0.1112, "grad_norm": 19.542118072509766, "learning_rate": 0.0005, "loss": 3.4329, "step": 139 }, { "epoch": 0.112, "grad_norm": 25.670761108398438, "learning_rate": 0.0005, "loss": 4.2204, "step": 140 }, { "epoch": 0.1128, "grad_norm": 3.7592780590057373, "learning_rate": 0.0005, "loss": 0.6329, "step": 141 }, { "epoch": 0.1136, "grad_norm": 13.57589340209961, "learning_rate": 0.0005, "loss": 3.4199, "step": 142 }, { "epoch": 0.1144, "grad_norm": 15.773186683654785, "learning_rate": 0.0005, "loss": 1.6622, "step": 143 }, { "epoch": 0.1152, "grad_norm": 21.95498275756836, "learning_rate": 0.0005, "loss": 2.502, "step": 144 }, { "epoch": 0.116, "grad_norm": 14.796022415161133, "learning_rate": 0.0005, "loss": 3.3742, "step": 145 }, { "epoch": 0.1168, "grad_norm": 11.765939712524414, "learning_rate": 0.0005, "loss": 3.1213, "step": 146 }, { "epoch": 0.1176, "grad_norm": 14.750462532043457, "learning_rate": 0.0005, "loss": 4.411, "step": 147 }, { "epoch": 0.1184, "grad_norm": 13.234830856323242, "learning_rate": 0.0005, "loss": 3.3631, "step": 148 }, { "epoch": 0.1192, "grad_norm": 6.553050518035889, "learning_rate": 0.0005, "loss": 1.8951, "step": 149 }, { "epoch": 0.12, "grad_norm": 14.43946647644043, "learning_rate": 0.0005, "loss": 1.8703, "step": 150 }, { "epoch": 0.1208, "grad_norm": 9.733112335205078, "learning_rate": 0.0005, "loss": 2.8055, "step": 151 }, { "epoch": 0.1216, "grad_norm": 10.737160682678223, "learning_rate": 0.0005, "loss": 2.9521, "step": 152 }, { "epoch": 0.1224, "grad_norm": 12.203970909118652, "learning_rate": 0.0005, "loss": 2.9785, "step": 153 }, { "epoch": 0.1232, "grad_norm": 10.63853645324707, "learning_rate": 0.0005, "loss": 2.6857, "step": 154 }, { "epoch": 0.124, "grad_norm": 10.457780838012695, "learning_rate": 0.0005, "loss": 1.743, "step": 155 }, { "epoch": 0.1248, "grad_norm": 12.22805404663086, "learning_rate": 0.0005, "loss": 3.9353, "step": 156 }, { "epoch": 0.1256, "grad_norm": 7.321228981018066, "learning_rate": 0.0005, "loss": 2.2439, "step": 157 }, { "epoch": 0.1264, "grad_norm": 8.463542938232422, "learning_rate": 0.0005, "loss": 3.9458, "step": 158 }, { "epoch": 0.1272, "grad_norm": 8.441742897033691, "learning_rate": 0.0005, "loss": 3.0918, "step": 159 }, { "epoch": 0.128, "grad_norm": 15.034650802612305, "learning_rate": 0.0005, "loss": 4.1888, "step": 160 }, { "epoch": 0.1288, "grad_norm": 21.658571243286133, "learning_rate": 0.0005, "loss": 5.6277, "step": 161 }, { "epoch": 0.1296, "grad_norm": 9.453361511230469, "learning_rate": 0.0005, "loss": 3.1392, "step": 162 }, { "epoch": 0.1304, "grad_norm": 17.79288101196289, "learning_rate": 0.0005, "loss": 3.6066, "step": 163 }, { "epoch": 0.1312, "grad_norm": 14.434952735900879, "learning_rate": 0.0005, "loss": 4.2082, "step": 164 }, { "epoch": 0.132, "grad_norm": 7.758333206176758, "learning_rate": 0.0005, "loss": 1.6918, "step": 165 }, { "epoch": 0.1328, "grad_norm": 11.610477447509766, "learning_rate": 0.0005, "loss": 3.4442, "step": 166 }, { "epoch": 0.1336, "grad_norm": 17.194080352783203, "learning_rate": 0.0005, "loss": 4.6679, "step": 167 }, { "epoch": 0.1344, "grad_norm": 11.869499206542969, "learning_rate": 0.0005, "loss": 4.1471, "step": 168 }, { "epoch": 0.1352, "grad_norm": 8.291399955749512, "learning_rate": 0.0005, "loss": 2.4761, "step": 169 }, { "epoch": 0.136, "grad_norm": 10.868124008178711, "learning_rate": 0.0005, "loss": 3.1574, "step": 170 }, { "epoch": 0.1368, "grad_norm": 28.47736167907715, "learning_rate": 0.0005, "loss": 3.4398, "step": 171 }, { "epoch": 0.1376, "grad_norm": 7.985611438751221, "learning_rate": 0.0005, "loss": 1.7169, "step": 172 }, { "epoch": 0.1384, "grad_norm": 7.796197414398193, "learning_rate": 0.0005, "loss": 1.6151, "step": 173 }, { "epoch": 0.1392, "grad_norm": 31.824750900268555, "learning_rate": 0.0005, "loss": 5.5771, "step": 174 }, { "epoch": 0.14, "grad_norm": 11.322871208190918, "learning_rate": 0.0005, "loss": 1.7649, "step": 175 }, { "epoch": 0.1408, "grad_norm": 12.81942081451416, "learning_rate": 0.0005, "loss": 4.2761, "step": 176 }, { "epoch": 0.1416, "grad_norm": 14.586856842041016, "learning_rate": 0.0005, "loss": 3.7712, "step": 177 }, { "epoch": 0.1424, "grad_norm": 8.232019424438477, "learning_rate": 0.0005, "loss": 3.6163, "step": 178 }, { "epoch": 0.1432, "grad_norm": 12.344880104064941, "learning_rate": 0.0005, "loss": 2.9607, "step": 179 }, { "epoch": 0.144, "grad_norm": 15.326787948608398, "learning_rate": 0.0005, "loss": 3.7187, "step": 180 }, { "epoch": 0.1448, "grad_norm": 8.811168670654297, "learning_rate": 0.0005, "loss": 1.5032, "step": 181 }, { "epoch": 0.1456, "grad_norm": 26.123863220214844, "learning_rate": 0.0005, "loss": 2.9496, "step": 182 }, { "epoch": 0.1464, "grad_norm": 7.073259353637695, "learning_rate": 0.0005, "loss": 2.0154, "step": 183 }, { "epoch": 0.1472, "grad_norm": 10.833226203918457, "learning_rate": 0.0005, "loss": 3.3223, "step": 184 }, { "epoch": 0.148, "grad_norm": 12.570619583129883, "learning_rate": 0.0005, "loss": 3.9599, "step": 185 }, { "epoch": 0.1488, "grad_norm": 10.237825393676758, "learning_rate": 0.0005, "loss": 1.4809, "step": 186 }, { "epoch": 0.1496, "grad_norm": 9.36881160736084, "learning_rate": 0.0005, "loss": 2.7919, "step": 187 }, { "epoch": 0.1504, "grad_norm": 7.162893772125244, "learning_rate": 0.0005, "loss": 1.7637, "step": 188 }, { "epoch": 0.1512, "grad_norm": 13.637219429016113, "learning_rate": 0.0005, "loss": 1.7047, "step": 189 }, { "epoch": 0.152, "grad_norm": 13.284880638122559, "learning_rate": 0.0005, "loss": 3.9825, "step": 190 }, { "epoch": 0.1528, "grad_norm": 17.694873809814453, "learning_rate": 0.0005, "loss": 4.8066, "step": 191 }, { "epoch": 0.1536, "grad_norm": 20.54854965209961, "learning_rate": 0.0005, "loss": 6.1623, "step": 192 }, { "epoch": 0.1544, "grad_norm": 8.24622917175293, "learning_rate": 0.0005, "loss": 2.0798, "step": 193 }, { "epoch": 0.1552, "grad_norm": 7.209560871124268, "learning_rate": 0.0005, "loss": 1.9318, "step": 194 }, { "epoch": 0.156, "grad_norm": 12.79217529296875, "learning_rate": 0.0005, "loss": 2.8844, "step": 195 }, { "epoch": 0.1568, "grad_norm": 5.256202697753906, "learning_rate": 0.0005, "loss": 1.4686, "step": 196 }, { "epoch": 0.1576, "grad_norm": 12.968066215515137, "learning_rate": 0.0005, "loss": 2.5109, "step": 197 }, { "epoch": 0.1584, "grad_norm": 11.350977897644043, "learning_rate": 0.0005, "loss": 4.5141, "step": 198 }, { "epoch": 0.1592, "grad_norm": 9.951147079467773, "learning_rate": 0.0005, "loss": 3.2528, "step": 199 }, { "epoch": 0.16, "grad_norm": 20.016956329345703, "learning_rate": 0.0005, "loss": 5.7353, "step": 200 } ], "logging_steps": 1.0, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.02021397118976e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }