llama3_8b_sft_alpaca / trainer_state.json
MeharBhatia's picture
Upload folder using huggingface_hub
cb5de1a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9979494190020506,
"eval_steps": 100,
"global_step": 1095,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02734107997265892,
"grad_norm": 706.2386474609375,
"learning_rate": 2.0000000000000003e-06,
"loss": 3.6456,
"mean_token_accuracy": 0.43679042160511017,
"num_tokens": 171126.0,
"step": 10
},
{
"epoch": 0.05468215994531784,
"grad_norm": 40.858299255371094,
"learning_rate": 4.000000000000001e-06,
"loss": 1.9492,
"mean_token_accuracy": 0.6433063127100468,
"num_tokens": 340272.0,
"step": 20
},
{
"epoch": 0.08202323991797676,
"grad_norm": 10.141822814941406,
"learning_rate": 6e-06,
"loss": 1.3747,
"mean_token_accuracy": 0.7164170637726783,
"num_tokens": 512138.0,
"step": 30
},
{
"epoch": 0.10936431989063568,
"grad_norm": 5.630201816558838,
"learning_rate": 8.000000000000001e-06,
"loss": 1.2492,
"mean_token_accuracy": 0.7324269533157348,
"num_tokens": 682029.0,
"step": 40
},
{
"epoch": 0.1367053998632946,
"grad_norm": 4.846619129180908,
"learning_rate": 1e-05,
"loss": 1.1189,
"mean_token_accuracy": 0.7480411291122436,
"num_tokens": 850280.0,
"step": 50
},
{
"epoch": 0.16404647983595352,
"grad_norm": 2.89782452583313,
"learning_rate": 1.2e-05,
"loss": 0.9996,
"mean_token_accuracy": 0.7543315425515175,
"num_tokens": 1018134.0,
"step": 60
},
{
"epoch": 0.19138755980861244,
"grad_norm": 2.9257192611694336,
"learning_rate": 1.4e-05,
"loss": 0.9432,
"mean_token_accuracy": 0.7699216932058335,
"num_tokens": 1187053.0,
"step": 70
},
{
"epoch": 0.21872863978127136,
"grad_norm": 2.4271352291107178,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.8963,
"mean_token_accuracy": 0.7777562007308007,
"num_tokens": 1355716.0,
"step": 80
},
{
"epoch": 0.24606971975393027,
"grad_norm": 3.1461400985717773,
"learning_rate": 1.8e-05,
"loss": 0.8491,
"mean_token_accuracy": 0.780939394235611,
"num_tokens": 1523424.0,
"step": 90
},
{
"epoch": 0.2734107997265892,
"grad_norm": 1.6774334907531738,
"learning_rate": 2e-05,
"loss": 0.8691,
"mean_token_accuracy": 0.7762249544262886,
"num_tokens": 1692335.0,
"step": 100
},
{
"epoch": 0.2734107997265892,
"eval_loss": 0.8699743747711182,
"eval_mean_token_accuracy": 0.7742338619349193,
"eval_num_tokens": 1692335.0,
"eval_runtime": 14.2578,
"eval_samples_per_second": 364.783,
"eval_steps_per_second": 11.432,
"step": 100
},
{
"epoch": 0.3007518796992481,
"grad_norm": 1.4909802675247192,
"learning_rate": 1.999501589126174e-05,
"loss": 0.8914,
"mean_token_accuracy": 0.7690163850784302,
"num_tokens": 1866084.0,
"step": 110
},
{
"epoch": 0.32809295967190705,
"grad_norm": 6.061567783355713,
"learning_rate": 1.9980068533314937e-05,
"loss": 0.9556,
"mean_token_accuracy": 0.7586096897721291,
"num_tokens": 2037177.0,
"step": 120
},
{
"epoch": 0.35543403964456594,
"grad_norm": 1.5067142248153687,
"learning_rate": 1.995517282601106e-05,
"loss": 0.9423,
"mean_token_accuracy": 0.7592512294650078,
"num_tokens": 2206650.0,
"step": 130
},
{
"epoch": 0.3827751196172249,
"grad_norm": 1.3846155405044556,
"learning_rate": 1.992035358593258e-05,
"loss": 0.8981,
"mean_token_accuracy": 0.7691041335463524,
"num_tokens": 2377519.0,
"step": 140
},
{
"epoch": 0.4101161995898838,
"grad_norm": 1.385062575340271,
"learning_rate": 1.987564552165524e-05,
"loss": 0.8706,
"mean_token_accuracy": 0.7754243835806847,
"num_tokens": 2543887.0,
"step": 150
},
{
"epoch": 0.4374572795625427,
"grad_norm": 1.2568836212158203,
"learning_rate": 1.9821093199149806e-05,
"loss": 0.9014,
"mean_token_accuracy": 0.767622135579586,
"num_tokens": 2717105.0,
"step": 160
},
{
"epoch": 0.46479835953520166,
"grad_norm": 1.3307958841323853,
"learning_rate": 1.9756750997357738e-05,
"loss": 0.8707,
"mean_token_accuracy": 0.7744425281882286,
"num_tokens": 2885656.0,
"step": 170
},
{
"epoch": 0.49213943950786054,
"grad_norm": 1.3584340810775757,
"learning_rate": 1.9682683053985073e-05,
"loss": 0.8685,
"mean_token_accuracy": 0.7746870696544648,
"num_tokens": 3055414.0,
"step": 180
},
{
"epoch": 0.5194805194805194,
"grad_norm": 1.4741311073303223,
"learning_rate": 1.959896320156857e-05,
"loss": 0.8758,
"mean_token_accuracy": 0.7729167580604553,
"num_tokens": 3224208.0,
"step": 190
},
{
"epoch": 0.5468215994531784,
"grad_norm": 1.252274990081787,
"learning_rate": 1.950567489387783e-05,
"loss": 0.8767,
"mean_token_accuracy": 0.7731393739581108,
"num_tokens": 3394113.0,
"step": 200
},
{
"epoch": 0.5468215994531784,
"eval_loss": 0.8808718323707581,
"eval_mean_token_accuracy": 0.7713644226635892,
"eval_num_tokens": 3394113.0,
"eval_runtime": 14.1481,
"eval_samples_per_second": 367.611,
"eval_steps_per_second": 11.521,
"step": 200
},
{
"epoch": 0.5741626794258373,
"grad_norm": 1.2876965999603271,
"learning_rate": 1.9402911122726756e-05,
"loss": 0.8797,
"mean_token_accuracy": 0.7735408559441567,
"num_tokens": 3561161.0,
"step": 210
},
{
"epoch": 0.6015037593984962,
"grad_norm": 2.8176612854003906,
"learning_rate": 1.9290774325277305e-05,
"loss": 0.8902,
"mean_token_accuracy": 0.7713056340813637,
"num_tokens": 3732561.0,
"step": 220
},
{
"epoch": 0.6288448393711552,
"grad_norm": 1.2533715963363647,
"learning_rate": 1.916937628192789e-05,
"loss": 0.8588,
"mean_token_accuracy": 0.7773521527647972,
"num_tokens": 3900320.0,
"step": 230
},
{
"epoch": 0.6561859193438141,
"grad_norm": 1.3737971782684326,
"learning_rate": 1.903883800488824e-05,
"loss": 0.8796,
"mean_token_accuracy": 0.7744974941015244,
"num_tokens": 4068430.0,
"step": 240
},
{
"epoch": 0.683526999316473,
"grad_norm": 1.3005715608596802,
"learning_rate": 1.8899289617551803e-05,
"loss": 0.8767,
"mean_token_accuracy": 0.7730578362941742,
"num_tokens": 4238745.0,
"step": 250
},
{
"epoch": 0.7108680792891319,
"grad_norm": 1.2251440286636353,
"learning_rate": 1.875087022478594e-05,
"loss": 0.872,
"mean_token_accuracy": 0.7743289664387702,
"num_tokens": 4408233.0,
"step": 260
},
{
"epoch": 0.7382091592617909,
"grad_norm": 1.1912628412246704,
"learning_rate": 1.8593727774269122e-05,
"loss": 0.8857,
"mean_token_accuracy": 0.7705358847975731,
"num_tokens": 4581540.0,
"step": 270
},
{
"epoch": 0.7655502392344498,
"grad_norm": 1.1127012968063354,
"learning_rate": 1.842801890901351e-05,
"loss": 0.8716,
"mean_token_accuracy": 0.7735901325941086,
"num_tokens": 4752315.0,
"step": 280
},
{
"epoch": 0.7928913192071086,
"grad_norm": 1.1501351594924927,
"learning_rate": 1.8253908811219764e-05,
"loss": 0.8572,
"mean_token_accuracy": 0.7767810940742492,
"num_tokens": 4922291.0,
"step": 290
},
{
"epoch": 0.8202323991797676,
"grad_norm": 1.1722129583358765,
"learning_rate": 1.8071571037619856e-05,
"loss": 0.8814,
"mean_token_accuracy": 0.7732006222009659,
"num_tokens": 5091028.0,
"step": 300
},
{
"epoch": 0.8202323991797676,
"eval_loss": 0.8661420345306396,
"eval_mean_token_accuracy": 0.774554943745853,
"eval_num_tokens": 5091028.0,
"eval_runtime": 14.1269,
"eval_samples_per_second": 368.164,
"eval_steps_per_second": 11.538,
"step": 300
},
{
"epoch": 0.8475734791524265,
"grad_norm": 1.1900874376296997,
"learning_rate": 1.7881187346471924e-05,
"loss": 0.878,
"mean_token_accuracy": 0.7729042619466782,
"num_tokens": 5263685.0,
"step": 310
},
{
"epoch": 0.8749145591250854,
"grad_norm": 1.1753484010696411,
"learning_rate": 1.7682947516379706e-05,
"loss": 0.8649,
"mean_token_accuracy": 0.775020281970501,
"num_tokens": 5433324.0,
"step": 320
},
{
"epoch": 0.9022556390977443,
"grad_norm": 1.0580289363861084,
"learning_rate": 1.7477049157117093e-05,
"loss": 0.873,
"mean_token_accuracy": 0.7735655456781387,
"num_tokens": 5602221.0,
"step": 330
},
{
"epoch": 0.9295967190704033,
"grad_norm": 1.3067166805267334,
"learning_rate": 1.7263697512646397e-05,
"loss": 0.8579,
"mean_token_accuracy": 0.7792476788163185,
"num_tokens": 5770719.0,
"step": 340
},
{
"epoch": 0.9569377990430622,
"grad_norm": 1.161084532737732,
"learning_rate": 1.7043105256526723e-05,
"loss": 0.8507,
"mean_token_accuracy": 0.7788742691278457,
"num_tokens": 5939298.0,
"step": 350
},
{
"epoch": 0.9842788790157211,
"grad_norm": 1.1978732347488403,
"learning_rate": 1.681549227991634e-05,
"loss": 0.8597,
"mean_token_accuracy": 0.7768805012106895,
"num_tokens": 6109792.0,
"step": 360
},
{
"epoch": 1.0136705399863295,
"grad_norm": 1.2675771713256836,
"learning_rate": 1.658108547238038e-05,
"loss": 0.803,
"mean_token_accuracy": 0.8084554207034227,
"num_tokens": 6283339.0,
"step": 370
},
{
"epoch": 1.0410116199589883,
"grad_norm": 1.265122890472412,
"learning_rate": 1.634011849572239e-05,
"loss": 0.5533,
"mean_token_accuracy": 0.8468987360596657,
"num_tokens": 6451470.0,
"step": 380
},
{
"epoch": 1.0683526999316473,
"grad_norm": 1.2544903755187988,
"learning_rate": 1.609283155106517e-05,
"loss": 0.534,
"mean_token_accuracy": 0.8496995747089386,
"num_tokens": 6620528.0,
"step": 390
},
{
"epoch": 1.0956937799043063,
"grad_norm": 1.2964367866516113,
"learning_rate": 1.5839471139413065e-05,
"loss": 0.5464,
"mean_token_accuracy": 0.8472872495651245,
"num_tokens": 6789027.0,
"step": 400
},
{
"epoch": 1.0956937799043063,
"eval_loss": 0.923022449016571,
"eval_mean_token_accuracy": 0.7719546443114251,
"eval_num_tokens": 6789027.0,
"eval_runtime": 14.2222,
"eval_samples_per_second": 365.696,
"eval_steps_per_second": 11.461,
"step": 400
},
{
"epoch": 1.123034859876965,
"grad_norm": 1.214882254600525,
"learning_rate": 1.55802898159344e-05,
"loss": 0.5407,
"mean_token_accuracy": 0.848370036482811,
"num_tokens": 6958294.0,
"step": 410
},
{
"epoch": 1.150375939849624,
"grad_norm": 1.1957063674926758,
"learning_rate": 1.5315545938209016e-05,
"loss": 0.5303,
"mean_token_accuracy": 0.8504750013351441,
"num_tokens": 7126382.0,
"step": 420
},
{
"epoch": 1.177717019822283,
"grad_norm": 1.2198673486709595,
"learning_rate": 1.5045503408691776e-05,
"loss": 0.5361,
"mean_token_accuracy": 0.8481929019093514,
"num_tokens": 7296187.0,
"step": 430
},
{
"epoch": 1.2050580997949418,
"grad_norm": 1.2924712896347046,
"learning_rate": 1.4770431411648898e-05,
"loss": 0.5327,
"mean_token_accuracy": 0.8494196251034737,
"num_tokens": 7467368.0,
"step": 440
},
{
"epoch": 1.2323991797676008,
"grad_norm": 1.169443130493164,
"learning_rate": 1.4490604144829204e-05,
"loss": 0.5543,
"mean_token_accuracy": 0.8449285939335823,
"num_tokens": 7638452.0,
"step": 450
},
{
"epoch": 1.2597402597402598,
"grad_norm": 1.2594044208526611,
"learning_rate": 1.4206300546137844e-05,
"loss": 0.5341,
"mean_token_accuracy": 0.8485873684287071,
"num_tokens": 7810489.0,
"step": 460
},
{
"epoch": 1.2870813397129186,
"grad_norm": 1.2319672107696533,
"learning_rate": 1.3917804015584932e-05,
"loss": 0.5496,
"mean_token_accuracy": 0.8453727856278419,
"num_tokens": 7980694.0,
"step": 470
},
{
"epoch": 1.3144224196855776,
"grad_norm": 1.2721346616744995,
"learning_rate": 1.3625402132786247e-05,
"loss": 0.5241,
"mean_token_accuracy": 0.8517711386084557,
"num_tokens": 8150413.0,
"step": 480
},
{
"epoch": 1.3417634996582364,
"grad_norm": 1.2490406036376953,
"learning_rate": 1.3329386370297615e-05,
"loss": 0.5359,
"mean_token_accuracy": 0.848786735534668,
"num_tokens": 8320550.0,
"step": 490
},
{
"epoch": 1.3691045796308954,
"grad_norm": 1.2758678197860718,
"learning_rate": 1.3030051803068729e-05,
"loss": 0.5357,
"mean_token_accuracy": 0.8479269713163375,
"num_tokens": 8490464.0,
"step": 500
},
{
"epoch": 1.3691045796308954,
"eval_loss": 0.8972102999687195,
"eval_mean_token_accuracy": 0.7732659333322677,
"eval_num_tokens": 8490464.0,
"eval_runtime": 14.2312,
"eval_samples_per_second": 365.464,
"eval_steps_per_second": 11.454,
"step": 500
},
{
"epoch": 1.3964456596035544,
"grad_norm": 1.197234034538269,
"learning_rate": 1.2727696814306034e-05,
"loss": 0.5424,
"mean_token_accuracy": 0.8464474871754646,
"num_tokens": 8658228.0,
"step": 510
},
{
"epoch": 1.4237867395762134,
"grad_norm": 1.2052682638168335,
"learning_rate": 1.2422622798037833e-05,
"loss": 0.5344,
"mean_token_accuracy": 0.849498587846756,
"num_tokens": 8827640.0,
"step": 520
},
{
"epoch": 1.4511278195488722,
"grad_norm": 1.2217556238174438,
"learning_rate": 1.2115133858678192e-05,
"loss": 0.5329,
"mean_token_accuracy": 0.8495344504714012,
"num_tokens": 8998630.0,
"step": 530
},
{
"epoch": 1.4784688995215312,
"grad_norm": 1.2305293083190918,
"learning_rate": 1.1805536507889021e-05,
"loss": 0.5324,
"mean_token_accuracy": 0.8484964698553086,
"num_tokens": 9165678.0,
"step": 540
},
{
"epoch": 1.50580997949419,
"grad_norm": 1.2119832038879395,
"learning_rate": 1.1494139359042612e-05,
"loss": 0.5365,
"mean_token_accuracy": 0.8479975119233132,
"num_tokens": 9336809.0,
"step": 550
},
{
"epoch": 1.533151059466849,
"grad_norm": 1.2407721281051636,
"learning_rate": 1.1181252819589081e-05,
"loss": 0.5316,
"mean_token_accuracy": 0.8489836812019348,
"num_tokens": 9507494.0,
"step": 560
},
{
"epoch": 1.560492139439508,
"grad_norm": 1.2563666105270386,
"learning_rate": 1.086718878163551e-05,
"loss": 0.5372,
"mean_token_accuracy": 0.8482470452785492,
"num_tokens": 9678069.0,
"step": 570
},
{
"epoch": 1.587833219412167,
"grad_norm": 1.1326807737350464,
"learning_rate": 1.0552260311045082e-05,
"loss": 0.5207,
"mean_token_accuracy": 0.8524438634514808,
"num_tokens": 9844214.0,
"step": 580
},
{
"epoch": 1.6151742993848257,
"grad_norm": 1.2195802927017212,
"learning_rate": 1.0236781335366239e-05,
"loss": 0.5437,
"mean_token_accuracy": 0.8467420265078545,
"num_tokens": 10013802.0,
"step": 590
},
{
"epoch": 1.6425153793574845,
"grad_norm": 1.2044719457626343,
"learning_rate": 9.92106633090287e-06,
"loss": 0.5234,
"mean_token_accuracy": 0.8509897753596306,
"num_tokens": 10183182.0,
"step": 600
},
{
"epoch": 1.6425153793574845,
"eval_loss": 0.8916485905647278,
"eval_mean_token_accuracy": 0.7742225652092074,
"eval_num_tokens": 10183182.0,
"eval_runtime": 14.1444,
"eval_samples_per_second": 367.707,
"eval_steps_per_second": 11.524,
"step": 600
},
{
"epoch": 1.6698564593301435,
"grad_norm": 1.2906745672225952,
"learning_rate": 9.605430009237474e-06,
"loss": 0.549,
"mean_token_accuracy": 0.8446809872984886,
"num_tokens": 10355702.0,
"step": 610
},
{
"epoch": 1.6971975393028025,
"grad_norm": 1.1947541236877441,
"learning_rate": 9.290187003519841e-06,
"loss": 0.5447,
"mean_token_accuracy": 0.8470652237534523,
"num_tokens": 10527901.0,
"step": 620
},
{
"epoch": 1.7245386192754615,
"grad_norm": 1.2121537923812866,
"learning_rate": 8.975651554833869e-06,
"loss": 0.5352,
"mean_token_accuracy": 0.8479515925049782,
"num_tokens": 10698378.0,
"step": 630
},
{
"epoch": 1.7518796992481203,
"grad_norm": 1.274902105331421,
"learning_rate": 8.662137198955211e-06,
"loss": 0.5317,
"mean_token_accuracy": 0.8490415424108505,
"num_tokens": 10863945.0,
"step": 640
},
{
"epoch": 1.7792207792207793,
"grad_norm": 1.270011305809021,
"learning_rate": 8.349956453812009e-06,
"loss": 0.5193,
"mean_token_accuracy": 0.8525262147188186,
"num_tokens": 11032272.0,
"step": 650
},
{
"epoch": 1.806561859193438,
"grad_norm": 1.1416285037994385,
"learning_rate": 8.03942050796022e-06,
"loss": 0.538,
"mean_token_accuracy": 0.8476535528898239,
"num_tokens": 11200924.0,
"step": 660
},
{
"epoch": 1.833902939166097,
"grad_norm": 1.182759165763855,
"learning_rate": 7.730838910384098e-06,
"loss": 0.5182,
"mean_token_accuracy": 0.8533438310027123,
"num_tokens": 11369904.0,
"step": 670
},
{
"epoch": 1.861244019138756,
"grad_norm": 1.2786790132522583,
"learning_rate": 7.424519261931036e-06,
"loss": 0.5427,
"mean_token_accuracy": 0.846244253218174,
"num_tokens": 11541083.0,
"step": 680
},
{
"epoch": 1.888585099111415,
"grad_norm": 1.2706736326217651,
"learning_rate": 7.1207669086883366e-06,
"loss": 0.5438,
"mean_token_accuracy": 0.8467564389109612,
"num_tokens": 11715458.0,
"step": 690
},
{
"epoch": 1.9159261790840738,
"grad_norm": 1.23281991481781,
"learning_rate": 6.819884637607619e-06,
"loss": 0.522,
"mean_token_accuracy": 0.85111885368824,
"num_tokens": 11885424.0,
"step": 700
},
{
"epoch": 1.9159261790840738,
"eval_loss": 0.8901246190071106,
"eval_mean_token_accuracy": 0.7761346054223418,
"eval_num_tokens": 11885424.0,
"eval_runtime": 14.2093,
"eval_samples_per_second": 366.027,
"eval_steps_per_second": 11.471,
"step": 700
},
{
"epoch": 1.9432672590567326,
"grad_norm": 1.181417465209961,
"learning_rate": 6.522172374680177e-06,
"loss": 0.5152,
"mean_token_accuracy": 0.8532154947519303,
"num_tokens": 12053676.0,
"step": 710
},
{
"epoch": 1.9706083390293916,
"grad_norm": 1.2707254886627197,
"learning_rate": 6.2279268859642396e-06,
"loss": 0.5259,
"mean_token_accuracy": 0.8510755389928818,
"num_tokens": 12221973.0,
"step": 720
},
{
"epoch": 1.9979494190020506,
"grad_norm": 1.197986125946045,
"learning_rate": 5.937441481762112e-06,
"loss": 0.5305,
"mean_token_accuracy": 0.8508959412574768,
"num_tokens": 12391740.0,
"step": 730
},
{
"epoch": 2.027341079972659,
"grad_norm": 1.7105273008346558,
"learning_rate": 5.651005724242072e-06,
"loss": 0.331,
"mean_token_accuracy": 0.9215623707306094,
"num_tokens": 12566261.0,
"step": 740
},
{
"epoch": 2.054682159945318,
"grad_norm": 1.1565046310424805,
"learning_rate": 5.368905138796523e-06,
"loss": 0.2543,
"mean_token_accuracy": 0.9277019128203392,
"num_tokens": 12734646.0,
"step": 750
},
{
"epoch": 2.0820232399179766,
"grad_norm": 1.153232216835022,
"learning_rate": 5.091420929424065e-06,
"loss": 0.2451,
"mean_token_accuracy": 0.9302394583821296,
"num_tokens": 12905690.0,
"step": 760
},
{
"epoch": 2.1093643198906356,
"grad_norm": 1.2222003936767578,
"learning_rate": 4.818829698419225e-06,
"loss": 0.2436,
"mean_token_accuracy": 0.9303286671638489,
"num_tokens": 13079445.0,
"step": 770
},
{
"epoch": 2.1367053998632946,
"grad_norm": 1.2984336614608765,
"learning_rate": 4.551403170649299e-06,
"loss": 0.2528,
"mean_token_accuracy": 0.9280653029680253,
"num_tokens": 13249001.0,
"step": 780
},
{
"epoch": 2.1640464798359536,
"grad_norm": 1.2998665571212769,
"learning_rate": 4.289407922693053e-06,
"loss": 0.2474,
"mean_token_accuracy": 0.9294205904006958,
"num_tokens": 13418283.0,
"step": 790
},
{
"epoch": 2.1913875598086126,
"grad_norm": 1.1589492559432983,
"learning_rate": 4.033105117111441e-06,
"loss": 0.244,
"mean_token_accuracy": 0.9301661103963852,
"num_tokens": 13591481.0,
"step": 800
},
{
"epoch": 2.1913875598086126,
"eval_loss": 1.0690910816192627,
"eval_mean_token_accuracy": 0.7694480528860735,
"eval_num_tokens": 13591481.0,
"eval_runtime": 14.1576,
"eval_samples_per_second": 367.366,
"eval_steps_per_second": 11.513,
"step": 800
},
{
"epoch": 2.2187286397812715,
"grad_norm": 1.2524975538253784,
"learning_rate": 3.7827502421150497e-06,
"loss": 0.2465,
"mean_token_accuracy": 0.9289533108472824,
"num_tokens": 13759603.0,
"step": 810
},
{
"epoch": 2.24606971975393,
"grad_norm": 1.2752741575241089,
"learning_rate": 3.5385928568879012e-06,
"loss": 0.2489,
"mean_token_accuracy": 0.9280223950743676,
"num_tokens": 13928891.0,
"step": 820
},
{
"epoch": 2.273410799726589,
"grad_norm": 1.1992969512939453,
"learning_rate": 3.300876342821451e-06,
"loss": 0.2555,
"mean_token_accuracy": 0.9266896471381187,
"num_tokens": 14099181.0,
"step": 830
},
{
"epoch": 2.300751879699248,
"grad_norm": 1.2335748672485352,
"learning_rate": 3.0698376609066828e-06,
"loss": 0.2531,
"mean_token_accuracy": 0.9276198267936706,
"num_tokens": 14269195.0,
"step": 840
},
{
"epoch": 2.328092959671907,
"grad_norm": 1.2644842863082886,
"learning_rate": 2.8457071155262885e-06,
"loss": 0.2509,
"mean_token_accuracy": 0.9287231177091598,
"num_tokens": 14438098.0,
"step": 850
},
{
"epoch": 2.355434039644566,
"grad_norm": 1.2391464710235596,
"learning_rate": 2.628708124882212e-06,
"loss": 0.2467,
"mean_token_accuracy": 0.9289599403738975,
"num_tokens": 14609269.0,
"step": 860
},
{
"epoch": 2.382775119617225,
"grad_norm": 1.1992994546890259,
"learning_rate": 2.419056998287547e-06,
"loss": 0.2471,
"mean_token_accuracy": 0.9298149168491363,
"num_tokens": 14775550.0,
"step": 870
},
{
"epoch": 2.4101161995898837,
"grad_norm": 1.2357077598571777,
"learning_rate": 2.216962720544703e-06,
"loss": 0.2499,
"mean_token_accuracy": 0.9283073455095291,
"num_tokens": 14946021.0,
"step": 880
},
{
"epoch": 2.4374572795625427,
"grad_norm": 1.1988837718963623,
"learning_rate": 2.022626743624807e-06,
"loss": 0.2461,
"mean_token_accuracy": 0.9295058876276017,
"num_tokens": 15116026.0,
"step": 890
},
{
"epoch": 2.4647983595352017,
"grad_norm": 1.1734275817871094,
"learning_rate": 1.8362427858560094e-06,
"loss": 0.2454,
"mean_token_accuracy": 0.9299230113625526,
"num_tokens": 15287643.0,
"step": 900
},
{
"epoch": 2.4647983595352017,
"eval_loss": 1.0691540241241455,
"eval_mean_token_accuracy": 0.7695598562070929,
"eval_num_tokens": 15287643.0,
"eval_runtime": 14.2047,
"eval_samples_per_second": 366.146,
"eval_steps_per_second": 11.475,
"step": 900
},
{
"epoch": 2.4921394395078607,
"grad_norm": 1.1861553192138672,
"learning_rate": 1.6579966388208257e-06,
"loss": 0.2415,
"mean_token_accuracy": 0.9315058842301369,
"num_tokens": 15455425.0,
"step": 910
},
{
"epoch": 2.5194805194805197,
"grad_norm": 1.1142207384109497,
"learning_rate": 1.4880659821550547e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.931833279132843,
"num_tokens": 15625317.0,
"step": 920
},
{
"epoch": 2.546821599453178,
"grad_norm": 1.2019150257110596,
"learning_rate": 1.3266202064328548e-06,
"loss": 0.2424,
"mean_token_accuracy": 0.9311563596129417,
"num_tokens": 15795191.0,
"step": 930
},
{
"epoch": 2.574162679425837,
"grad_norm": 1.2547236680984497,
"learning_rate": 1.1738202443145307e-06,
"loss": 0.2449,
"mean_token_accuracy": 0.9302171498537064,
"num_tokens": 15966331.0,
"step": 940
},
{
"epoch": 2.601503759398496,
"grad_norm": 1.1526901721954346,
"learning_rate": 1.029818410125365e-06,
"loss": 0.2457,
"mean_token_accuracy": 0.9300970509648323,
"num_tokens": 16132046.0,
"step": 950
},
{
"epoch": 2.628844839371155,
"grad_norm": 1.288517951965332,
"learning_rate": 8.94758248025378e-07,
"loss": 0.2487,
"mean_token_accuracy": 0.9283649578690529,
"num_tokens": 16300373.0,
"step": 960
},
{
"epoch": 2.656185919343814,
"grad_norm": 1.21402108669281,
"learning_rate": 7.687743889213939e-07,
"loss": 0.2415,
"mean_token_accuracy": 0.930217319726944,
"num_tokens": 16469055.0,
"step": 970
},
{
"epoch": 2.6835269993164728,
"grad_norm": 1.2375903129577637,
"learning_rate": 6.519924162640168e-07,
"loss": 0.2408,
"mean_token_accuracy": 0.9317809835076332,
"num_tokens": 16638392.0,
"step": 980
},
{
"epoch": 2.7108680792891318,
"grad_norm": 1.1859582662582397,
"learning_rate": 5.445287408633304e-07,
"loss": 0.2416,
"mean_token_accuracy": 0.9303795635700226,
"num_tokens": 16807749.0,
"step": 990
},
{
"epoch": 2.7382091592617908,
"grad_norm": 1.150184154510498,
"learning_rate": 4.464904848480522e-07,
"loss": 0.2423,
"mean_token_accuracy": 0.9296372309327126,
"num_tokens": 16982293.0,
"step": 1000
},
{
"epoch": 2.7382091592617908,
"eval_loss": 1.0727962255477905,
"eval_mean_token_accuracy": 0.7691918337271989,
"eval_num_tokens": 16982293.0,
"eval_runtime": 14.2647,
"eval_samples_per_second": 364.607,
"eval_steps_per_second": 11.427,
"step": 1000
},
{
"epoch": 2.7655502392344498,
"grad_norm": 1.2391895055770874,
"learning_rate": 3.5797537488388326e-07,
"loss": 0.2407,
"mean_token_accuracy": 0.9306937634944916,
"num_tokens": 17151239.0,
"step": 1010
},
{
"epoch": 2.7928913192071088,
"grad_norm": 1.3776847124099731,
"learning_rate": 2.790716447574304e-07,
"loss": 0.2454,
"mean_token_accuracy": 0.9301917076110839,
"num_tokens": 17320214.0,
"step": 1020
},
{
"epoch": 2.8202323991797678,
"grad_norm": 1.2018030881881714,
"learning_rate": 2.098579474228546e-07,
"loss": 0.2423,
"mean_token_accuracy": 0.9306885093450546,
"num_tokens": 17490016.0,
"step": 1030
},
{
"epoch": 2.8475734791524268,
"grad_norm": 1.2227405309677124,
"learning_rate": 1.504032765988961e-07,
"loss": 0.2388,
"mean_token_accuracy": 0.9320418834686279,
"num_tokens": 17661813.0,
"step": 1040
},
{
"epoch": 2.8749145591250853,
"grad_norm": 1.286917805671692,
"learning_rate": 1.0076689799442874e-07,
"loss": 0.2444,
"mean_token_accuracy": 0.9299480274319649,
"num_tokens": 17829411.0,
"step": 1050
},
{
"epoch": 2.9022556390977443,
"grad_norm": 1.239208459854126,
"learning_rate": 6.099829023112236e-08,
"loss": 0.249,
"mean_token_accuracy": 0.9299103036522866,
"num_tokens": 17998820.0,
"step": 1060
},
{
"epoch": 2.9295967190704033,
"grad_norm": 1.1208025217056274,
"learning_rate": 3.1137095522068006e-08,
"loss": 0.2402,
"mean_token_accuracy": 0.9313350632786751,
"num_tokens": 18167974.0,
"step": 1070
},
{
"epoch": 2.9569377990430623,
"grad_norm": 1.7012122869491577,
"learning_rate": 1.1213080155564327e-08,
"loss": 0.2473,
"mean_token_accuracy": 0.9298618510365486,
"num_tokens": 18335256.0,
"step": 1080
},
{
"epoch": 2.9842788790157213,
"grad_norm": 1.2129054069519043,
"learning_rate": 1.246104823426908e-09,
"loss": 0.2468,
"mean_token_accuracy": 0.9303196057677269,
"num_tokens": 18506697.0,
"step": 1090
}
],
"logging_steps": 10,
"max_steps": 1095,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4034604245899018e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}