zgrgr's picture
Upload model files with Nebius access
21b0b84 verified
{
"best_metric": 1.1622273921966553,
"best_model_checkpoint": "./outputs/instruct-lora-8b-aplly_chat_template-land/checkpoint-740",
"epoch": 1.0652463382157125,
"eval_steps": 20,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013315579227696406,
"eval_loss": 1.4733461141586304,
"eval_runtime": 59.4361,
"eval_samples_per_second": 22.461,
"eval_steps_per_second": 5.619,
"step": 1
},
{
"epoch": 0.02663115845539281,
"grad_norm": 0.7614122629165649,
"learning_rate": 2.666666666666667e-06,
"loss": 1.4194,
"step": 20
},
{
"epoch": 0.02663115845539281,
"eval_loss": 1.471280813217163,
"eval_runtime": 57.1574,
"eval_samples_per_second": 23.357,
"eval_steps_per_second": 5.844,
"step": 20
},
{
"epoch": 0.05326231691078562,
"grad_norm": 0.7800308465957642,
"learning_rate": 5.333333333333334e-06,
"loss": 1.376,
"step": 40
},
{
"epoch": 0.05326231691078562,
"eval_loss": 1.4474345445632935,
"eval_runtime": 57.2352,
"eval_samples_per_second": 23.325,
"eval_steps_per_second": 5.836,
"step": 40
},
{
"epoch": 0.07989347536617843,
"grad_norm": 0.8508164286613464,
"learning_rate": 8e-06,
"loss": 1.3563,
"step": 60
},
{
"epoch": 0.07989347536617843,
"eval_loss": 1.3645858764648438,
"eval_runtime": 57.1364,
"eval_samples_per_second": 23.365,
"eval_steps_per_second": 5.846,
"step": 60
},
{
"epoch": 0.10652463382157124,
"grad_norm": 0.8896499276161194,
"learning_rate": 1.0666666666666667e-05,
"loss": 1.2653,
"step": 80
},
{
"epoch": 0.10652463382157124,
"eval_loss": 1.303858757019043,
"eval_runtime": 57.1088,
"eval_samples_per_second": 23.376,
"eval_steps_per_second": 5.848,
"step": 80
},
{
"epoch": 0.13315579227696406,
"grad_norm": 0.9267684817314148,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.2094,
"step": 100
},
{
"epoch": 0.13315579227696406,
"eval_loss": 1.279226541519165,
"eval_runtime": 59.6928,
"eval_samples_per_second": 22.365,
"eval_steps_per_second": 5.595,
"step": 100
},
{
"epoch": 0.15978695073235685,
"grad_norm": 1.0457453727722168,
"learning_rate": 1.6e-05,
"loss": 1.1917,
"step": 120
},
{
"epoch": 0.15978695073235685,
"eval_loss": 1.2594722509384155,
"eval_runtime": 57.1101,
"eval_samples_per_second": 23.376,
"eval_steps_per_second": 5.848,
"step": 120
},
{
"epoch": 0.18641810918774968,
"grad_norm": 1.1883381605148315,
"learning_rate": 1.866666666666667e-05,
"loss": 1.2034,
"step": 140
},
{
"epoch": 0.18641810918774968,
"eval_loss": 1.2453105449676514,
"eval_runtime": 57.085,
"eval_samples_per_second": 23.386,
"eval_steps_per_second": 5.851,
"step": 140
},
{
"epoch": 0.21304926764314247,
"grad_norm": 1.2522987127304077,
"learning_rate": 2.1333333333333335e-05,
"loss": 1.1147,
"step": 160
},
{
"epoch": 0.21304926764314247,
"eval_loss": 1.2352497577667236,
"eval_runtime": 59.6977,
"eval_samples_per_second": 22.363,
"eval_steps_per_second": 5.595,
"step": 160
},
{
"epoch": 0.2396804260985353,
"grad_norm": 1.3950749635696411,
"learning_rate": 2.4e-05,
"loss": 1.1172,
"step": 180
},
{
"epoch": 0.2396804260985353,
"eval_loss": 1.2247178554534912,
"eval_runtime": 57.1298,
"eval_samples_per_second": 23.368,
"eval_steps_per_second": 5.846,
"step": 180
},
{
"epoch": 0.2663115845539281,
"grad_norm": 1.3889997005462646,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.1148,
"step": 200
},
{
"epoch": 0.2663115845539281,
"eval_loss": 1.2236417531967163,
"eval_runtime": 57.1101,
"eval_samples_per_second": 23.376,
"eval_steps_per_second": 5.848,
"step": 200
},
{
"epoch": 0.2929427430093209,
"grad_norm": 1.4289050102233887,
"learning_rate": 2.9333333333333333e-05,
"loss": 1.0828,
"step": 220
},
{
"epoch": 0.2929427430093209,
"eval_loss": 1.217771291732788,
"eval_runtime": 57.09,
"eval_samples_per_second": 23.384,
"eval_steps_per_second": 5.85,
"step": 220
},
{
"epoch": 0.3195739014647137,
"grad_norm": 1.48817777633667,
"learning_rate": 2.9995950624188135e-05,
"loss": 1.0756,
"step": 240
},
{
"epoch": 0.3195739014647137,
"eval_loss": 1.2135677337646484,
"eval_runtime": 57.0597,
"eval_samples_per_second": 23.397,
"eval_steps_per_second": 5.854,
"step": 240
},
{
"epoch": 0.34620505992010653,
"grad_norm": 1.4912829399108887,
"learning_rate": 2.9977957806883764e-05,
"loss": 1.0463,
"step": 260
},
{
"epoch": 0.34620505992010653,
"eval_loss": 1.207130789756775,
"eval_runtime": 57.0489,
"eval_samples_per_second": 23.401,
"eval_steps_per_second": 5.855,
"step": 260
},
{
"epoch": 0.37283621837549935,
"grad_norm": 1.4056388139724731,
"learning_rate": 2.99455888692835e-05,
"loss": 1.0452,
"step": 280
},
{
"epoch": 0.37283621837549935,
"eval_loss": 1.2046023607254028,
"eval_runtime": 57.0853,
"eval_samples_per_second": 23.386,
"eval_steps_per_second": 5.851,
"step": 280
},
{
"epoch": 0.3994673768308921,
"grad_norm": 1.4942606687545776,
"learning_rate": 2.989887487969095e-05,
"loss": 1.0261,
"step": 300
},
{
"epoch": 0.3994673768308921,
"eval_loss": 1.1982561349868774,
"eval_runtime": 57.1051,
"eval_samples_per_second": 23.378,
"eval_steps_per_second": 5.849,
"step": 300
},
{
"epoch": 0.42609853528628494,
"grad_norm": 1.6378928422927856,
"learning_rate": 2.983786067505537e-05,
"loss": 1.0198,
"step": 320
},
{
"epoch": 0.42609853528628494,
"eval_loss": 1.197502851486206,
"eval_runtime": 59.5901,
"eval_samples_per_second": 22.403,
"eval_steps_per_second": 5.605,
"step": 320
},
{
"epoch": 0.45272969374167776,
"grad_norm": 1.569143533706665,
"learning_rate": 2.9762604817936267e-05,
"loss": 1.0101,
"step": 340
},
{
"epoch": 0.45272969374167776,
"eval_loss": 1.197273850440979,
"eval_runtime": 57.1144,
"eval_samples_per_second": 23.374,
"eval_steps_per_second": 5.848,
"step": 340
},
{
"epoch": 0.4793608521970706,
"grad_norm": 1.6125699281692505,
"learning_rate": 2.9673179540294035e-05,
"loss": 1.0121,
"step": 360
},
{
"epoch": 0.4793608521970706,
"eval_loss": 1.1948621273040771,
"eval_runtime": 57.1203,
"eval_samples_per_second": 23.372,
"eval_steps_per_second": 5.847,
"step": 360
},
{
"epoch": 0.5059920106524634,
"grad_norm": 1.5121594667434692,
"learning_rate": 2.9569670674160343e-05,
"loss": 1.0169,
"step": 380
},
{
"epoch": 0.5059920106524634,
"eval_loss": 1.1911152601242065,
"eval_runtime": 60.0674,
"eval_samples_per_second": 22.225,
"eval_steps_per_second": 5.56,
"step": 380
},
{
"epoch": 0.5326231691078562,
"grad_norm": 1.5439465045928955,
"learning_rate": 2.945217756925498e-05,
"loss": 0.9799,
"step": 400
},
{
"epoch": 0.5326231691078562,
"eval_loss": 1.1894199848175049,
"eval_runtime": 57.1247,
"eval_samples_per_second": 23.37,
"eval_steps_per_second": 5.847,
"step": 400
},
{
"epoch": 0.559254327563249,
"grad_norm": 1.857911229133606,
"learning_rate": 2.9320812997628184e-05,
"loss": 0.9872,
"step": 420
},
{
"epoch": 0.559254327563249,
"eval_loss": 1.1862047910690308,
"eval_runtime": 57.1282,
"eval_samples_per_second": 23.368,
"eval_steps_per_second": 5.847,
"step": 420
},
{
"epoch": 0.5858854860186418,
"grad_norm": 1.6074450016021729,
"learning_rate": 2.9175703045419906e-05,
"loss": 0.988,
"step": 440
},
{
"epoch": 0.5858854860186418,
"eval_loss": 1.184722661972046,
"eval_runtime": 57.1666,
"eval_samples_per_second": 23.353,
"eval_steps_per_second": 5.843,
"step": 440
},
{
"epoch": 0.6125166444740346,
"grad_norm": 1.587011456489563,
"learning_rate": 2.9016986991840035e-05,
"loss": 0.9861,
"step": 460
},
{
"epoch": 0.6125166444740346,
"eval_loss": 1.1814427375793457,
"eval_runtime": 57.1111,
"eval_samples_per_second": 23.375,
"eval_steps_per_second": 5.848,
"step": 460
},
{
"epoch": 0.6391478029294274,
"grad_norm": 1.6503058671951294,
"learning_rate": 2.8844817175485628e-05,
"loss": 0.9997,
"step": 480
},
{
"epoch": 0.6391478029294274,
"eval_loss": 1.1827510595321655,
"eval_runtime": 59.6344,
"eval_samples_per_second": 22.386,
"eval_steps_per_second": 5.601,
"step": 480
},
{
"epoch": 0.6657789613848203,
"grad_norm": 1.4606473445892334,
"learning_rate": 2.865935884812353e-05,
"loss": 0.9756,
"step": 500
},
{
"epoch": 0.6657789613848203,
"eval_loss": 1.177931785583496,
"eval_runtime": 57.1613,
"eval_samples_per_second": 23.355,
"eval_steps_per_second": 5.843,
"step": 500
},
{
"epoch": 0.6924101198402131,
"grad_norm": 1.6386032104492188,
"learning_rate": 2.8460790016078664e-05,
"loss": 0.9704,
"step": 520
},
{
"epoch": 0.6924101198402131,
"eval_loss": 1.1767512559890747,
"eval_runtime": 57.128,
"eval_samples_per_second": 23.369,
"eval_steps_per_second": 5.847,
"step": 520
},
{
"epoch": 0.7190412782956058,
"grad_norm": 1.5629956722259521,
"learning_rate": 2.824930126938027e-05,
"loss": 0.9575,
"step": 540
},
{
"epoch": 0.7190412782956058,
"eval_loss": 1.1756982803344727,
"eval_runtime": 59.3596,
"eval_samples_per_second": 22.49,
"eval_steps_per_second": 5.627,
"step": 540
},
{
"epoch": 0.7456724367509987,
"grad_norm": 1.9192149639129639,
"learning_rate": 2.8025095598830108e-05,
"loss": 0.9845,
"step": 560
},
{
"epoch": 0.7456724367509987,
"eval_loss": 1.1744287014007568,
"eval_runtime": 57.1096,
"eval_samples_per_second": 23.376,
"eval_steps_per_second": 5.848,
"step": 560
},
{
"epoch": 0.7723035952063915,
"grad_norm": 1.5297322273254395,
"learning_rate": 2.7788388201168096e-05,
"loss": 0.9635,
"step": 580
},
{
"epoch": 0.7723035952063915,
"eval_loss": 1.1726077795028687,
"eval_runtime": 57.1106,
"eval_samples_per_second": 23.376,
"eval_steps_per_second": 5.848,
"step": 580
},
{
"epoch": 0.7989347536617842,
"grad_norm": 1.5995993614196777,
"learning_rate": 2.7539406272522557e-05,
"loss": 1.0019,
"step": 600
},
{
"epoch": 0.7989347536617842,
"eval_loss": 1.1684755086898804,
"eval_runtime": 59.1165,
"eval_samples_per_second": 22.583,
"eval_steps_per_second": 5.65,
"step": 600
},
{
"epoch": 0.8255659121171771,
"grad_norm": 1.989475131034851,
"learning_rate": 2.7278388790343133e-05,
"loss": 0.965,
"step": 620
},
{
"epoch": 0.8255659121171771,
"eval_loss": 1.16959547996521,
"eval_runtime": 57.5389,
"eval_samples_per_second": 23.202,
"eval_steps_per_second": 5.805,
"step": 620
},
{
"epoch": 0.8521970705725699,
"grad_norm": 1.581007719039917,
"learning_rate": 2.7005586284025857e-05,
"loss": 0.9521,
"step": 640
},
{
"epoch": 0.8521970705725699,
"eval_loss": 1.1685765981674194,
"eval_runtime": 57.0994,
"eval_samples_per_second": 23.38,
"eval_steps_per_second": 5.849,
"step": 640
},
{
"epoch": 0.8788282290279628,
"grad_norm": 1.8926242589950562,
"learning_rate": 2.6721260594450408e-05,
"loss": 0.9714,
"step": 660
},
{
"epoch": 0.8788282290279628,
"eval_loss": 1.1654787063598633,
"eval_runtime": 57.0989,
"eval_samples_per_second": 23.38,
"eval_steps_per_second": 5.849,
"step": 660
},
{
"epoch": 0.9054593874833555,
"grad_norm": 1.7182027101516724,
"learning_rate": 2.6425684622660387e-05,
"loss": 0.9893,
"step": 680
},
{
"epoch": 0.9054593874833555,
"eval_loss": 1.1642155647277832,
"eval_runtime": 57.0492,
"eval_samples_per_second": 23.401,
"eval_steps_per_second": 5.855,
"step": 680
},
{
"epoch": 0.9320905459387483,
"grad_norm": 1.7494959831237793,
"learning_rate": 2.6119142067927872e-05,
"loss": 0.9581,
"step": 700
},
{
"epoch": 0.9320905459387483,
"eval_loss": 1.164635419845581,
"eval_runtime": 59.4597,
"eval_samples_per_second": 22.452,
"eval_steps_per_second": 5.617,
"step": 700
},
{
"epoch": 0.9587217043941412,
"grad_norm": 1.9605196714401245,
"learning_rate": 2.5801927155453614e-05,
"loss": 0.9165,
"step": 720
},
{
"epoch": 0.9587217043941412,
"eval_loss": 1.164476752281189,
"eval_runtime": 59.4987,
"eval_samples_per_second": 22.437,
"eval_steps_per_second": 5.614,
"step": 720
},
{
"epoch": 0.9853528628495339,
"grad_norm": 1.636960744857788,
"learning_rate": 2.5474344353964275e-05,
"loss": 0.9849,
"step": 740
},
{
"epoch": 0.9853528628495339,
"eval_loss": 1.1622273921966553,
"eval_runtime": 57.4882,
"eval_samples_per_second": 23.222,
"eval_steps_per_second": 5.81,
"step": 740
},
{
"epoch": 1.0119840213049267,
"grad_norm": 1.6740643978118896,
"learning_rate": 2.513670808347771e-05,
"loss": 0.905,
"step": 760
},
{
"epoch": 1.0119840213049267,
"eval_loss": 1.1645617485046387,
"eval_runtime": 57.4263,
"eval_samples_per_second": 23.247,
"eval_steps_per_second": 5.816,
"step": 760
},
{
"epoch": 1.0386151797603196,
"grad_norm": 1.7723573446273804,
"learning_rate": 2.4789342413516838e-05,
"loss": 0.8868,
"step": 780
},
{
"epoch": 1.0386151797603196,
"eval_loss": 1.1635513305664062,
"eval_runtime": 57.091,
"eval_samples_per_second": 23.384,
"eval_steps_per_second": 5.85,
"step": 780
},
{
"epoch": 1.0652463382157125,
"grad_norm": 1.7861186265945435,
"learning_rate": 2.4432580752061735e-05,
"loss": 0.8853,
"step": 800
},
{
"epoch": 1.0652463382157125,
"eval_loss": 1.1627150774002075,
"eval_runtime": 57.0672,
"eval_samples_per_second": 23.393,
"eval_steps_per_second": 5.853,
"step": 800
}
],
"logging_steps": 20,
"max_steps": 2253,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1795547152069427e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}