daviddongdong commited on
Commit
049bce0
·
verified ·
1 Parent(s): c356c52

Upload 8 files

Browse files
adapter_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen2.5-14B-Instruct",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": false,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 32,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0.0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "up_proj",
27
+ "gate_proj",
28
+ "o_proj",
29
+ "k_proj",
30
+ "v_proj",
31
+ "q_proj",
32
+ "down_proj"
33
+ ],
34
+ "task_type": "CAUSAL_LM",
35
+ "use_dora": false,
36
+ "use_rslora": false
37
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3edbcc01d656f24ff31bf7c8e46431c79642b4f4665ca85ef0164a2b426a0016
3
+ size 275341720
additional_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bd9af23ddd12b10d8c87087f93cabaca9fc51a3c2566a3c823fd1b1c7cb472b
3
+ size 551070514
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5bfdcce041c9ea47daf7124b9987c48c3cd52a01a7502e37b705c7697288a10
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,577 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.57263505,
3
+ "best_model_checkpoint": "/export/home2/zli/kc/mm_rag/Qwen2.5-14B-Instruct_lora/checkpoint-256",
4
+ "epoch": 0.9978075517661389,
5
+ "eval_steps": 100,
6
+ "global_step": 256,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.00389768574908648,
13
+ "grad_norm": 0.3756321966648102,
14
+ "learning_rate": 7.692307692307694e-06,
15
+ "loss": 0.760678231716156,
16
+ "memory(GiB)": 64.79,
17
+ "step": 1,
18
+ "token_acc": 0.794377180381695,
19
+ "train_speed(iter/s)": 0.008942
20
+ },
21
+ {
22
+ "epoch": 0.0194884287454324,
23
+ "grad_norm": 0.4006075859069824,
24
+ "learning_rate": 3.846153846153846e-05,
25
+ "loss": 0.8511521816253662,
26
+ "memory(GiB)": 69.21,
27
+ "step": 5,
28
+ "token_acc": 0.7628462851792697,
29
+ "train_speed(iter/s)": 0.009262
30
+ },
31
+ {
32
+ "epoch": 0.0389768574908648,
33
+ "grad_norm": 0.40541699528694153,
34
+ "learning_rate": 7.692307692307693e-05,
35
+ "loss": 0.8799286842346191,
36
+ "memory(GiB)": 69.21,
37
+ "step": 10,
38
+ "token_acc": 0.755640420079393,
39
+ "train_speed(iter/s)": 0.009459
40
+ },
41
+ {
42
+ "epoch": 0.058465286236297195,
43
+ "grad_norm": 0.28694042563438416,
44
+ "learning_rate": 9.998328666948438e-05,
45
+ "loss": 0.7268682479858398,
46
+ "memory(GiB)": 69.21,
47
+ "step": 15,
48
+ "token_acc": 0.7857048253940223,
49
+ "train_speed(iter/s)": 0.009584
50
+ },
51
+ {
52
+ "epoch": 0.0779537149817296,
53
+ "grad_norm": 0.29239076375961304,
54
+ "learning_rate": 9.979538999730047e-05,
55
+ "loss": 0.6990458011627197,
56
+ "memory(GiB)": 69.21,
57
+ "step": 20,
58
+ "token_acc": 0.7923689929968607,
59
+ "train_speed(iter/s)": 0.009633
60
+ },
61
+ {
62
+ "epoch": 0.09744214372716199,
63
+ "grad_norm": 0.25992509722709656,
64
+ "learning_rate": 9.939949247384046e-05,
65
+ "loss": 0.7058870792388916,
66
+ "memory(GiB)": 71.47,
67
+ "step": 25,
68
+ "token_acc": 0.7879081525173794,
69
+ "train_speed(iter/s)": 0.009549
70
+ },
71
+ {
72
+ "epoch": 0.11693057247259439,
73
+ "grad_norm": 0.1924816370010376,
74
+ "learning_rate": 9.879724780684519e-05,
75
+ "loss": 0.7370085716247559,
76
+ "memory(GiB)": 71.47,
77
+ "step": 30,
78
+ "token_acc": 0.7818104986383698,
79
+ "train_speed(iter/s)": 0.009632
80
+ },
81
+ {
82
+ "epoch": 0.1364190012180268,
83
+ "grad_norm": 0.2375946342945099,
84
+ "learning_rate": 9.799117163889559e-05,
85
+ "loss": 0.6951110363006592,
86
+ "memory(GiB)": 71.47,
87
+ "step": 35,
88
+ "token_acc": 0.7915360501567398,
89
+ "train_speed(iter/s)": 0.009685
90
+ },
91
+ {
92
+ "epoch": 0.1559074299634592,
93
+ "grad_norm": 0.19938509166240692,
94
+ "learning_rate": 9.698463103929542e-05,
95
+ "loss": 0.634189510345459,
96
+ "memory(GiB)": 71.47,
97
+ "step": 40,
98
+ "token_acc": 0.8057028291378926,
99
+ "train_speed(iter/s)": 0.009727
100
+ },
101
+ {
102
+ "epoch": 0.1753958587088916,
103
+ "grad_norm": 0.2671877145767212,
104
+ "learning_rate": 9.57818304394503e-05,
105
+ "loss": 0.6699491500854492,
106
+ "memory(GiB)": 71.47,
107
+ "step": 45,
108
+ "token_acc": 0.8010264675941635,
109
+ "train_speed(iter/s)": 0.009776
110
+ },
111
+ {
112
+ "epoch": 0.19488428745432398,
113
+ "grad_norm": 0.3463566303253174,
114
+ "learning_rate": 9.438779407049281e-05,
115
+ "loss": 0.6760294437408447,
116
+ "memory(GiB)": 71.47,
117
+ "step": 50,
118
+ "token_acc": 0.793778801843318,
119
+ "train_speed(iter/s)": 0.009808
120
+ },
121
+ {
122
+ "epoch": 0.2143727161997564,
123
+ "grad_norm": 0.22926518321037292,
124
+ "learning_rate": 9.280834497651334e-05,
125
+ "loss": 0.6952155113220215,
126
+ "memory(GiB)": 71.47,
127
+ "step": 55,
128
+ "token_acc": 0.7856186209763081,
129
+ "train_speed(iter/s)": 0.009819
130
+ },
131
+ {
132
+ "epoch": 0.23386114494518878,
133
+ "grad_norm": 0.26200923323631287,
134
+ "learning_rate": 9.105008069106093e-05,
135
+ "loss": 0.6922572135925293,
136
+ "memory(GiB)": 71.47,
137
+ "step": 60,
138
+ "token_acc": 0.784200469035004,
139
+ "train_speed(iter/s)": 0.009839
140
+ },
141
+ {
142
+ "epoch": 0.25334957369062117,
143
+ "grad_norm": 0.2608520984649658,
144
+ "learning_rate": 8.912034567851599e-05,
145
+ "loss": 0.6162422180175782,
146
+ "memory(GiB)": 71.47,
147
+ "step": 65,
148
+ "token_acc": 0.8122029543994862,
149
+ "train_speed(iter/s)": 0.009864
150
+ },
151
+ {
152
+ "epoch": 0.2728380024360536,
153
+ "grad_norm": 0.23958414793014526,
154
+ "learning_rate": 8.702720065545024e-05,
155
+ "loss": 0.6201987266540527,
156
+ "memory(GiB)": 71.47,
157
+ "step": 70,
158
+ "token_acc": 0.807689317368994,
159
+ "train_speed(iter/s)": 0.00982
160
+ },
161
+ {
162
+ "epoch": 0.292326431181486,
163
+ "grad_norm": 0.24707700312137604,
164
+ "learning_rate": 8.47793889201221e-05,
165
+ "loss": 0.6751769542694092,
166
+ "memory(GiB)": 71.47,
167
+ "step": 75,
168
+ "token_acc": 0.7933965994464215,
169
+ "train_speed(iter/s)": 0.009843
170
+ },
171
+ {
172
+ "epoch": 0.3118148599269184,
173
+ "grad_norm": 0.280536413192749,
174
+ "learning_rate": 8.238629983075294e-05,
175
+ "loss": 0.6537846088409424,
176
+ "memory(GiB)": 71.47,
177
+ "step": 80,
178
+ "token_acc": 0.800564772326156,
179
+ "train_speed(iter/s)": 0.00983
180
+ },
181
+ {
182
+ "epoch": 0.3313032886723508,
183
+ "grad_norm": 0.29046493768692017,
184
+ "learning_rate": 7.985792958513931e-05,
185
+ "loss": 0.6765205383300781,
186
+ "memory(GiB)": 71.47,
187
+ "step": 85,
188
+ "token_acc": 0.7913814331445457,
189
+ "train_speed(iter/s)": 0.009828
190
+ },
191
+ {
192
+ "epoch": 0.3507917174177832,
193
+ "grad_norm": 0.2879982888698578,
194
+ "learning_rate": 7.720483946542914e-05,
195
+ "loss": 0.6516193866729736,
196
+ "memory(GiB)": 71.47,
197
+ "step": 90,
198
+ "token_acc": 0.7983723394009439,
199
+ "train_speed(iter/s)": 0.009817
200
+ },
201
+ {
202
+ "epoch": 0.3702801461632156,
203
+ "grad_norm": 0.28098541498184204,
204
+ "learning_rate": 7.443811172247821e-05,
205
+ "loss": 0.6455563545227051,
206
+ "memory(GiB)": 71.47,
207
+ "step": 95,
208
+ "token_acc": 0.8004983012457532,
209
+ "train_speed(iter/s)": 0.009833
210
+ },
211
+ {
212
+ "epoch": 0.38976857490864797,
213
+ "grad_norm": 0.2407151460647583,
214
+ "learning_rate": 7.156930328406268e-05,
215
+ "loss": 0.6356846332550049,
216
+ "memory(GiB)": 71.47,
217
+ "step": 100,
218
+ "token_acc": 0.8042028604018602,
219
+ "train_speed(iter/s)": 0.009804
220
+ },
221
+ {
222
+ "epoch": 0.38976857490864797,
223
+ "eval_loss": 0.598721981048584,
224
+ "eval_runtime": 15.1098,
225
+ "eval_samples_per_second": 0.265,
226
+ "eval_steps_per_second": 0.132,
227
+ "step": 100
228
+ },
229
+ {
230
+ "epoch": 0.4092570036540804,
231
+ "grad_norm": 0.2836802899837494,
232
+ "learning_rate": 6.861039748031351e-05,
233
+ "loss": 0.6598537921905517,
234
+ "memory(GiB)": 77.63,
235
+ "step": 105,
236
+ "token_acc": 0.7957189390414147,
237
+ "train_speed(iter/s)": 0.009789
238
+ },
239
+ {
240
+ "epoch": 0.4287454323995128,
241
+ "grad_norm": 0.26613113284111023,
242
+ "learning_rate": 6.557375398802123e-05,
243
+ "loss": 0.6290350437164307,
244
+ "memory(GiB)": 77.63,
245
+ "step": 110,
246
+ "token_acc": 0.8080845540515483,
247
+ "train_speed(iter/s)": 0.009743
248
+ },
249
+ {
250
+ "epoch": 0.4482338611449452,
251
+ "grad_norm": 0.3952692747116089,
252
+ "learning_rate": 6.247205720289907e-05,
253
+ "loss": 0.6128222942352295,
254
+ "memory(GiB)": 77.63,
255
+ "step": 115,
256
+ "token_acc": 0.8121368342872122,
257
+ "train_speed(iter/s)": 0.009719
258
+ },
259
+ {
260
+ "epoch": 0.46772228989037756,
261
+ "grad_norm": 0.29315847158432007,
262
+ "learning_rate": 5.9318263255459116e-05,
263
+ "loss": 0.6240291595458984,
264
+ "memory(GiB)": 77.63,
265
+ "step": 120,
266
+ "token_acc": 0.8027493895269965,
267
+ "train_speed(iter/s)": 0.009694
268
+ },
269
+ {
270
+ "epoch": 0.48721071863581,
271
+ "grad_norm": 0.2671997547149658,
272
+ "learning_rate": 5.6125545891822274e-05,
273
+ "loss": 0.6649426460266114,
274
+ "memory(GiB)": 77.63,
275
+ "step": 125,
276
+ "token_acc": 0.7922591927838373,
277
+ "train_speed(iter/s)": 0.009676
278
+ },
279
+ {
280
+ "epoch": 0.5066991473812423,
281
+ "grad_norm": 0.26974523067474365,
282
+ "learning_rate": 5.290724144552379e-05,
283
+ "loss": 0.6172929763793945,
284
+ "memory(GiB)": 77.63,
285
+ "step": 130,
286
+ "token_acc": 0.8085726351351351,
287
+ "train_speed(iter/s)": 0.009667
288
+ },
289
+ {
290
+ "epoch": 0.5261875761266748,
291
+ "grad_norm": 0.2798590660095215,
292
+ "learning_rate": 4.967679313017303e-05,
293
+ "loss": 0.6145929336547852,
294
+ "memory(GiB)": 77.63,
295
+ "step": 135,
296
+ "token_acc": 0.8118081180811808,
297
+ "train_speed(iter/s)": 0.009683
298
+ },
299
+ {
300
+ "epoch": 0.5456760048721072,
301
+ "grad_norm": 0.27218812704086304,
302
+ "learning_rate": 4.6447694885663514e-05,
303
+ "loss": 0.618541955947876,
304
+ "memory(GiB)": 77.63,
305
+ "step": 140,
306
+ "token_acc": 0.8050674649659422,
307
+ "train_speed(iter/s)": 0.009706
308
+ },
309
+ {
310
+ "epoch": 0.5651644336175395,
311
+ "grad_norm": 0.29498329758644104,
312
+ "learning_rate": 4.323343501249346e-05,
313
+ "loss": 0.6341890335083008,
314
+ "memory(GiB)": 77.63,
315
+ "step": 145,
316
+ "token_acc": 0.801550316061459,
317
+ "train_speed(iter/s)": 0.009735
318
+ },
319
+ {
320
+ "epoch": 0.584652862362972,
321
+ "grad_norm": 0.2799496352672577,
322
+ "learning_rate": 4.004743982964298e-05,
323
+ "loss": 0.6000133037567139,
324
+ "memory(GiB)": 77.63,
325
+ "step": 150,
326
+ "token_acc": 0.8125188083057479,
327
+ "train_speed(iter/s)": 0.009742
328
+ },
329
+ {
330
+ "epoch": 0.6041412911084044,
331
+ "grad_norm": 0.28971585631370544,
332
+ "learning_rate": 3.6903017591354706e-05,
333
+ "loss": 0.6476753711700439,
334
+ "memory(GiB)": 77.63,
335
+ "step": 155,
336
+ "token_acc": 0.8055159053833605,
337
+ "train_speed(iter/s)": 0.009734
338
+ },
339
+ {
340
+ "epoch": 0.6236297198538368,
341
+ "grad_norm": 0.2888241708278656,
342
+ "learning_rate": 3.381330289708396e-05,
343
+ "loss": 0.6604040622711181,
344
+ "memory(GiB)": 77.63,
345
+ "step": 160,
346
+ "token_acc": 0.7972270363951474,
347
+ "train_speed(iter/s)": 0.009743
348
+ },
349
+ {
350
+ "epoch": 0.6431181485992692,
351
+ "grad_norm": 0.31141144037246704,
352
+ "learning_rate": 3.079120182682412e-05,
353
+ "loss": 0.5943418025970459,
354
+ "memory(GiB)": 77.63,
355
+ "step": 165,
356
+ "token_acc": 0.8125655328607977,
357
+ "train_speed(iter/s)": 0.009733
358
+ },
359
+ {
360
+ "epoch": 0.6626065773447016,
361
+ "grad_norm": 0.28593260049819946,
362
+ "learning_rate": 2.7849338030983257e-05,
363
+ "loss": 0.6077968120574951,
364
+ "memory(GiB)": 77.63,
365
+ "step": 170,
366
+ "token_acc": 0.811592125382263,
367
+ "train_speed(iter/s)": 0.009731
368
+ },
369
+ {
370
+ "epoch": 0.682095006090134,
371
+ "grad_norm": 0.3223704993724823,
372
+ "learning_rate": 2.500000000000001e-05,
373
+ "loss": 0.6201191425323487,
374
+ "memory(GiB)": 77.63,
375
+ "step": 175,
376
+ "token_acc": 0.806087783666259,
377
+ "train_speed(iter/s)": 0.009728
378
+ },
379
+ {
380
+ "epoch": 0.7015834348355664,
381
+ "grad_norm": 0.3051086962223053,
382
+ "learning_rate": 2.225508973396016e-05,
383
+ "loss": 0.6003672122955322,
384
+ "memory(GiB)": 77.63,
385
+ "step": 180,
386
+ "token_acc": 0.8096822043421585,
387
+ "train_speed(iter/s)": 0.009738
388
+ },
389
+ {
390
+ "epoch": 0.7210718635809987,
391
+ "grad_norm": 0.33248814940452576,
392
+ "learning_rate": 1.9626073026625818e-05,
393
+ "loss": 0.6649184226989746,
394
+ "memory(GiB)": 77.63,
395
+ "step": 185,
396
+ "token_acc": 0.7927942418271744,
397
+ "train_speed(iter/s)": 0.009754
398
+ },
399
+ {
400
+ "epoch": 0.7405602923264312,
401
+ "grad_norm": 0.32766395807266235,
402
+ "learning_rate": 1.7123931571546827e-05,
403
+ "loss": 0.620220422744751,
404
+ "memory(GiB)": 77.63,
405
+ "step": 190,
406
+ "token_acc": 0.8047315233083407,
407
+ "train_speed(iter/s)": 0.009761
408
+ },
409
+ {
410
+ "epoch": 0.7600487210718636,
411
+ "grad_norm": 0.3414992392063141,
412
+ "learning_rate": 1.4759117090312197e-05,
413
+ "loss": 0.622031831741333,
414
+ "memory(GiB)": 77.63,
415
+ "step": 195,
416
+ "token_acc": 0.807596541264883,
417
+ "train_speed(iter/s)": 0.009765
418
+ },
419
+ {
420
+ "epoch": 0.7795371498172959,
421
+ "grad_norm": 0.32977092266082764,
422
+ "learning_rate": 1.25415076745532e-05,
423
+ "loss": 0.6251283168792725,
424
+ "memory(GiB)": 77.63,
425
+ "step": 200,
426
+ "token_acc": 0.8072234762979684,
427
+ "train_speed(iter/s)": 0.009771
428
+ },
429
+ {
430
+ "epoch": 0.7795371498172959,
431
+ "eval_loss": 0.5760576725006104,
432
+ "eval_runtime": 14.2053,
433
+ "eval_samples_per_second": 0.282,
434
+ "eval_steps_per_second": 0.141,
435
+ "step": 200
436
+ },
437
+ {
438
+ "epoch": 0.7990255785627284,
439
+ "grad_norm": 0.2861402630805969,
440
+ "learning_rate": 1.0480366524062042e-05,
441
+ "loss": 0.5975393772125244,
442
+ "memory(GiB)": 77.63,
443
+ "step": 205,
444
+ "token_acc": 0.8152735624641129,
445
+ "train_speed(iter/s)": 0.009771
446
+ },
447
+ {
448
+ "epoch": 0.8185140073081608,
449
+ "grad_norm": 0.30515122413635254,
450
+ "learning_rate": 8.584303253381847e-06,
451
+ "loss": 0.6246855735778809,
452
+ "memory(GiB)": 77.63,
453
+ "step": 210,
454
+ "token_acc": 0.8046837508398119,
455
+ "train_speed(iter/s)": 0.009789
456
+ },
457
+ {
458
+ "epoch": 0.8380024360535931,
459
+ "grad_norm": 0.3188154101371765,
460
+ "learning_rate": 6.861237928494579e-06,
461
+ "loss": 0.5521413803100585,
462
+ "memory(GiB)": 77.63,
463
+ "step": 215,
464
+ "token_acc": 0.8276072283910298,
465
+ "train_speed(iter/s)": 0.009786
466
+ },
467
+ {
468
+ "epoch": 0.8574908647990256,
469
+ "grad_norm": 0.3368200361728668,
470
+ "learning_rate": 5.318367983829392e-06,
471
+ "loss": 0.5954047679901123,
472
+ "memory(GiB)": 77.63,
473
+ "step": 220,
474
+ "token_acc": 0.8130686517783292,
475
+ "train_speed(iter/s)": 0.009788
476
+ },
477
+ {
478
+ "epoch": 0.876979293544458,
479
+ "grad_norm": 0.3377918601036072,
480
+ "learning_rate": 3.962138157783085e-06,
481
+ "loss": 0.6312428951263428,
482
+ "memory(GiB)": 77.63,
483
+ "step": 225,
484
+ "token_acc": 0.8009544725061238,
485
+ "train_speed(iter/s)": 0.009783
486
+ },
487
+ {
488
+ "epoch": 0.8964677222898904,
489
+ "grad_norm": 0.34851112961769104,
490
+ "learning_rate": 2.798213572335001e-06,
491
+ "loss": 0.5601920127868653,
492
+ "memory(GiB)": 77.63,
493
+ "step": 230,
494
+ "token_acc": 0.8258236434108527,
495
+ "train_speed(iter/s)": 0.009778
496
+ },
497
+ {
498
+ "epoch": 0.9159561510353228,
499
+ "grad_norm": 0.3354482650756836,
500
+ "learning_rate": 1.8314560692059835e-06,
501
+ "loss": 0.5771200656890869,
502
+ "memory(GiB)": 77.63,
503
+ "step": 235,
504
+ "token_acc": 0.8201089968594125,
505
+ "train_speed(iter/s)": 0.009786
506
+ },
507
+ {
508
+ "epoch": 0.9354445797807551,
509
+ "grad_norm": 0.29914259910583496,
510
+ "learning_rate": 1.0659039014077944e-06,
511
+ "loss": 0.6100361347198486,
512
+ "memory(GiB)": 77.63,
513
+ "step": 240,
514
+ "token_acc": 0.8085165989634403,
515
+ "train_speed(iter/s)": 0.009798
516
+ },
517
+ {
518
+ "epoch": 0.9549330085261876,
519
+ "grad_norm": 0.36409541964530945,
520
+ "learning_rate": 5.047548650136513e-07,
521
+ "loss": 0.6194799423217774,
522
+ "memory(GiB)": 77.63,
523
+ "step": 245,
524
+ "token_acc": 0.807660813392776,
525
+ "train_speed(iter/s)": 0.009801
526
+ },
527
+ {
528
+ "epoch": 0.97442143727162,
529
+ "grad_norm": 0.40651243925094604,
530
+ "learning_rate": 1.503529416103988e-07,
531
+ "loss": 0.5671232223510743,
532
+ "memory(GiB)": 77.63,
533
+ "step": 250,
534
+ "token_acc": 0.8216745683871545,
535
+ "train_speed(iter/s)": 0.009809
536
+ },
537
+ {
538
+ "epoch": 0.9939098660170523,
539
+ "grad_norm": 0.3596270978450775,
540
+ "learning_rate": 4.178507228136397e-09,
541
+ "loss": 0.6255767822265625,
542
+ "memory(GiB)": 77.63,
543
+ "step": 255,
544
+ "token_acc": 0.8013068013068013,
545
+ "train_speed(iter/s)": 0.009788
546
+ },
547
+ {
548
+ "epoch": 0.9978075517661389,
549
+ "eval_loss": 0.5726350545883179,
550
+ "eval_runtime": 14.0103,
551
+ "eval_samples_per_second": 0.286,
552
+ "eval_steps_per_second": 0.143,
553
+ "step": 256
554
+ }
555
+ ],
556
+ "logging_steps": 5,
557
+ "max_steps": 256,
558
+ "num_input_tokens_seen": 0,
559
+ "num_train_epochs": 1,
560
+ "save_steps": 50,
561
+ "stateful_callbacks": {
562
+ "TrainerControl": {
563
+ "args": {
564
+ "should_epoch_stop": false,
565
+ "should_evaluate": false,
566
+ "should_log": false,
567
+ "should_save": true,
568
+ "should_training_stop": true
569
+ },
570
+ "attributes": {}
571
+ }
572
+ },
573
+ "total_flos": 1.3517329026916086e+18,
574
+ "train_batch_size": 1,
575
+ "trial_name": null,
576
+ "trial_params": null
577
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45ea9cbee06f408dfc4621ffd928ab710798716a3b78cc757f3da447b7a31ed0
3
+ size 5816